0. 效果 ?
- 数据库 ?
- 手机图片 ?
1. 项目搭建 (创建 springboot 项目,集成 jpa,lombok)
- 项目结构 ?
- 数据库表结构 ?
- pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the crawlerjd Spring Boot application. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Spring Boot parent POM; dependencies below that omit <version> rely on its dependency management. -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.6.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<!-- Coordinates of this project. -->
<groupId>mr.s</groupId>
<artifactId>crawlerjd</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>crawlerjd</name>
<description>crawler-jd</description>
<properties>
<!-- Java language level used for compilation. -->
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- Spring Data JPA: backs the ItemDao repository and the Item entity. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- NOTE(review): the data-jpa starter presumably already pulls in the JDBC starter transitively; confirm whether this explicit entry is needed. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Development-time tooling: runtime scope and optional, so it is not passed on to dependent projects. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<!-- MySQL JDBC driver; needed only at runtime (see spring.datasource.driver-class-name). -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Lombok: provides the @Data annotation used on the Item entity. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- Apache HttpClient: used by HttpUtils to fetch pages and images. The version is pinned explicitly here. -->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<!-- Jsoup: used by ItemTask to parse the fetched HTML. -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- NOTE(review): no commons-lang3 usage is visible in the classes shown in this document; confirm it is still required. -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.8.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Spring Boot Maven plugin (packaging/running the application). -->
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
- application.properties
# DB configuration: MySQL connection used by the application.
# Driver class for the MySQL connector.
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# Connects to the "crawler" database on 127.0.0.1:3306 with UTF-8 encoding and the Asia/Shanghai server time zone.
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&serverTimezone=Asia/Shanghai&characterEncoding=utf-8&nullCatalogMeansCurrent=true
# NOTE(review): credentials are stored in plain text here; replace them with your own and consider externalizing them.
spring.datasource.username=root
spring.datasource.password=123
# JPA configuration
spring.jpa.database=MySQL
# Print the generated SQL statements.
spring.jpa.show-sql=true
# Disable the open-session-in-view behaviour.
spring.jpa.open-in-view=false
2. 代码编写
- pojo 下的 Item 类编写
package mr.s.jd.pojo;
import lombok.Data;
import javax.persistence.*;
import java.util.Date;
/**
 * JPA entity mapped to the {@code jd_item} table; one row per crawled product variant.
 *
 * <p>Getters, setters, {@code equals}, {@code hashCode} and {@code toString} are generated
 * by Lombok's {@code @Data}.
 */
@Data
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated by the database (identity column). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** SPU identifier of the product. */
    private Long spu;

    /** SKU identifier of the product variant. */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Name of the product picture. */
    private String pic;

    /** URL of the product page. */
    private String url;

    /** Creation time of the record. */
    private Date created;

    /** Last update time of the record. */
    private Date updated;
}
- dao 下的 ItemDao 接口编写 (继承 JpaRepository,无需额外方法)
package mr.s.jd.dao;
import mr.s.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link Item} entities with a {@code Long} primary key.
 *
 * <p>Declares no methods of its own; everything comes from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {

}
- util 下的 HttpUtils 工具类编写 (注意一下,图片下载保存的地址)
package mr.s.jd.util;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
/**
 * HTTP helper that fetches pages and downloads images through a pooled Apache HttpClient
 * connection manager.
 *
 * <p>Both public methods report failure by returning an empty string rather than throwing
 * {@link IOException}; the exception is printed to standard error.
 */
@Component
public class HttpUtils {

    /** Directory that downloaded images are written to; adjust it for the local machine. */
    private static final String DOWNLOAD_DIR = "C:\\Users\\xxx\\Desktop\\download\\";

    // Connection pool manager shared by every request made through this component.
    private PoolingHttpClientConnectionManager cm;

    /** Creates the connection pool: at most 100 connections in total, 10 per route. */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of pooled connections overall.
        this.cm.setMaxTotal(100);
        // Maximum number of pooled connections per host (route).
        this.cm.setDefaultMaxPerRoute(10);
    }

    /**
     * Fetches a page with an HTTP GET request.
     *
     * @param url address of the page to fetch
     * @return the response body, or an empty string when the request fails, the status is
     *         not 200, or the response has no body
     */
    public String doGetHtml(String url) {
        // The client is built on top of the shared pool, so it is intentionally not closed here.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
        httpGet.setHeader("Referer", "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=b1a43153d64f4920a10f8ca31aa6fa6b");
        httpGet.setConfig(this.getConfig());

        // try-with-resources releases the response even when reading the body fails.
        try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            if (httpResponse.getStatusLine().getStatusCode() == 200
                    && httpResponse.getEntity() != null) {
                return EntityUtils.toString(httpResponse.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Request failed: return an empty string.
        return "";
    }

    /**
     * Downloads an image with an HTTP GET request and stores it in the download directory
     * under a random UUID-based file name.
     *
     * @param url address of the image to download
     * @return the generated file name, or an empty string when the download fails, the
     *         status is not 200, or the response has no body
     */
    public String doGetImage(String url) {
        // The client is built on top of the shared pool, so it is intentionally not closed here.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());

        try (CloseableHttpResponse httpResponse = httpClient.execute(httpGet)) {
            if (httpResponse.getStatusLine().getStatusCode() == 200
                    && httpResponse.getEntity() != null) {
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                // Closing the stream here guarantees the data is flushed and the file handle
                // is released before the name is handed back to the caller.
                try (OutputStream outputStream =
                        new FileOutputStream(new File(DOWNLOAD_DIR + picName))) {
                    httpResponse.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Download failed: return an empty string.
        return "";
    }

    /**
     * Extracts the file extension (including the leading dot) from the last path segment
     * of a URL, ignoring any query string or fragment.
     *
     * @param url the URL to inspect
     * @return the extension such as {@code ".jpg"}, or an empty string when the last path
     *         segment contains no dot
     */
    private static String extensionOf(String url) {
        String path = url;
        int cut = path.indexOf('?');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        int dot = path.lastIndexOf('.');
        // A dot located before the last slash belongs to the host or a directory name.
        if (dot < 0 || dot < path.lastIndexOf('/')) {
            return "";
        }
        return path.substring(dot);
    }

    /**
     * Builds the per-request timeouts: 1000 ms to connect, 500 ms to obtain a connection
     * from the pool, and 10000 ms socket timeout.
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10000)
                .build();
    }
}
- service 下的 ItemService 服务接口编写
package mr.s.jd.service;
import mr.s.jd.pojo.Item;
import java.util.List;
/**
 * Service contract for storing and looking up {@link Item} records.
 */
public interface ItemService {

    /**
     * Saves an item.
     *
     * @param item the item to save
     */
    void save(Item item);

    /**
     * Queries items using the given item as the search criteria.
     *
     * @param item the item carrying the values to search by
     * @return the items found
     */
    List<Item> findAll(Item item);
}
- service.impl 下的 ItemServiceImpl 服务实现类编写
package mr.s.jd.service.impl;
import mr.s.jd.dao.ItemDao;
import mr.s.jd.pojo.Item;
import mr.s.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.List;
/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Saves the item through the repository inside a transaction.
     *
     * @param item the item to save
     */
    @Override
    @Transactional
    public void save(Item item) {
        this.itemDao.save(item);
    }

    /**
     * Runs a query-by-example lookup, using the given item as the example.
     *
     * @param item the example item
     * @return the items returned by the repository for that example
     */
    @Override
    public List<Item> findAll(Item item) {
        return this.itemDao.findAll(Example.of(item));
    }
}
- 最重要的 task 下的 ItemTask 定时任务编写
package mr.s.jd.task;
import com.fasterxml.jackson.databind.ObjectMapper;
import mr.s.jd.pojo.Item;
import mr.s.jd.service.ItemService;
import mr.s.jd.util.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import java.util.Date;
import java.util.List;
@Configuration //1.主要用于标记配置类,兼备Component的效果。
@EnableScheduling // 2.开启定时任务
public class ItemTask {
@Autowired
private HttpUtils httpUtils;
@Autowired
private ItemService itemService;
private static final ObjectMapper MAPPER = new ObjectMapper();
// 当下载任务完成后,间隔多长时间进行下一次的任务,单位是毫秒
@Scheduled(fixedDelay = 100 * 1000)
public void itemTask() throws Exception{
// 声明解析初始地址
String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=57&click=0&page=";
// 按照页码进行遍历
for (int page = 1; page < 10; page = page + 2){
String html = httpUtils.doGetHtml(url + page);
// 解析页面,获取商品数据并存储
this.parse(html);
}
System.out.println("手机数据抓取完成");
}
// 解析页面,获取商品数据并存储
private void parse(String html) throws Exception{
// 解析页面,获取商品数据并存储
Document doc = Jsoup.parse(html);
// 获取 spu 信息
Elements spuElements = doc.select("div#J_goodsList > ul > li");
for (Element spuElement : spuElements){
// 获取 spu
Long spu = Long.parseLong(spuElement.attr("data-spu"));
// 获取 sku 信息
Elements skuElements = spuElement.select("li.ps-item");
for (Element skuElement : skuElements) {
// 获取 sku
Long sku = Long.parseLong(skuElement.select("[data-sku]").attr("data-sku"));
// 根据 sku 查询商品数据
Item item = new Item();
item.setSku(sku);
List<Item> itemList = itemService.findAll(item);
// 如果不存在,则进行保存操作
if (itemList.size() == 0){
// 设置 spu
item.setSpu(spu);
// 拼接商品详情地址
String itemUrl = "https://item.jd.com/"+sku+".html";
item.setUrl(itemUrl);
// 获取商品图片
String picUrl = "https:" + skuElement.select("img[data-sku]").attr("data-lazy-img");
picUrl = picUrl.replace("/n9/", "/n1/");
String picName = httpUtils.doGetImage(picUrl);
item.setPic(picName);
// 获取商品价格
String priceJson = httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
item.setPrice(price);
// 获取商品标题
String itemInfo = httpUtils.doGetHtml(item.getUrl());
String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
item.setTitle(title);
item.setCreated(new Date());
item.setUpdated(item.getCreated());
// 保存商品数据
itemService.save(item);
}
}
}
}
}
- 超级重要的 CrawlerjdApplication 启动类的编写!!!
package mr.s.jd;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Entry point of the crawler application.
 *
 * <p>{@code @EnableScheduling} turns on scheduled tasks.
 */
@EnableScheduling
@SpringBootApplication
public class CrawlerjdApplication {

    /**
     * Starts the Spring Boot application.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(CrawlerjdApplication.class, args);
    }
}