掘金原文传送门
说在开头的话:以下的代码存在不严谨的做法,可自行更改:)
项目结构(使用maven管理)
application.properties
#DB Configuration
# JDBC driver class (MySQL Connector/J).
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# Local MySQL server on 127.0.0.1:3306, schema "corejava"; UTF-8 encoding and the Asia/Shanghai server time zone are set through URL parameters.
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/corejava?useUnicode=true&serverTimezone=Asia/Shanghai&characterEncoding=utf-8&nullCatalogMeansCurrent=true
# NOTE(review): sample credentials stored in plain text - replace or externalize before any real deployment.
spring.datasource.username=root
spring.datasource.password=23333
#Jpa Configuration
spring.jpa.database=MySQL
# Print the SQL statements issued through JPA.
spring.jpa.show-sql=true
spring.jpa.open-in-view=true
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the JD crawler: Spring Boot (web, JPA, JDBC) plus HttpClient and jsoup. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- The Spring Boot parent supplies plugin configuration and managed dependency versions. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <!-- NOTE(review): the Java packages use "cn.itcast"; this groupId may be a typo - confirm before publishing. -->
    <groupId>icn.tcast</groupId>
    <artifactId>itcast-crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <!-- Declared exactly once: the original listed httpclient twice (once pinned to 4.5.2, once
             without a version), which makes Maven warn that dependency declarations must be unique.
             The version is left to the Spring Boot parent's dependency management. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.8.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
MySQL表的结构
HTML源码分析
Item类、ItemService接口、ItemDao接口、ItemServiceImpl类的编写
Item类
package cn.itcast.jd.pojo;
import javax.persistence.*;
import java.util.Date;
/**
 * JPA entity describing one crawled product; each instance maps to a row of the
 * {@code jd_item} table.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated by the database (identity / auto-increment strategy). */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Id of the product collection (SPU) this item belongs to. */
    private Long spu;

    /** Id of the smallest sellable unit (SKU). */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product picture. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation time of the record. */
    private Date created;

    /** Last update time of the record. */
    private Date updated;

    // ---- accessors, listed in the same order as the fields above ----

    public Long getId() {
        return this.id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public Long getSpu() {
        return this.spu;
    }

    public void setSpu(Long spu) {
        this.spu = spu;
    }

    public Long getSku() {
        return this.sku;
    }

    public void setSku(Long sku) {
        this.sku = sku;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Double getPrice() {
        return this.price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public String getPic() {
        return this.pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Date getCreated() {
        return this.created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return this.updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}
ItemDao
package cn.itcast.jd.dao;
import cn.itcast.jd.pojo.Item;
import org.springframework.data.jpa.repository.JpaRepository;
// Spring Data JPA repository for Item. The two type arguments name the entity
// being operated on (Item) and the type of its primary key (Long).
public interface ItemDao extends JpaRepository<Item,Long> {
}
ItemService
package cn.itcast.jd.service;
import cn.itcast.jd.pojo.Item;
import java.util.List;
/**
 * Service-layer operations on crawled products.
 */
public interface ItemService {

    /**
     * Saves a product.
     *
     * @param item the product to save
     */
    void save(Item item);

    /**
     * Queries products.
     *
     * @param item the product object carrying the query condition
     * @return the products found
     */
    List<Item> findAll(Item item);
}
ItemServiceImpl
package cn.itcast.jd.impl;
import cn.itcast.jd.dao.ItemDao;
import cn.itcast.jd.pojo.Item;
import cn.itcast.jd.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.List;
/**
 * {@link ItemService} implementation whose instance is created by Spring; every
 * operation is delegated to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    // Product DAO, autowired by Spring.
    @Autowired
    private ItemDao itemDao;

    /**
     * Saves the product through the DAO inside a transaction.
     *
     * @param item the product to save
     */
    @Transactional
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Wraps the given product in a Spring Data {@link Example} and returns every
     * product the DAO finds for that query condition.
     *
     * @param item the product used as the query condition
     * @return the products found
     */
    @Override
    public List<Item> findAll(Item item) {
        return itemDao.findAll(Example.of(item));
    }
}
HttpClient类的封装
package cn.itcast.jd.util;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
/**
 * HTTP helper (instance created by Spring) built on Apache HttpClient with a pooled
 * connection manager. It downloads pages as text and saves images to a local directory.
 *
 * <p>Both public methods report failure by returning an empty string rather than throwing.
 */
@Component
public class HttpUtils {

    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36";

    /** Directory (including the trailing separator) that downloaded images are written to. */
    private static final String IMAGE_DIR = "C:\\Users\\mac12\\Desktop\\手机图片\\";

    /** Connection pool shared by all requests made through this helper. */
    private final PoolingHttpClientConnectionManager cm;

    /** Creates the pool with at most 100 connections in total and 10 per route. */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        this.cm.setMaxTotal(100);
        this.cm.setDefaultMaxPerRoute(10);
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url address to request with HTTP GET
     * @return the response body decoded as UTF-8, or {@code ""} when the status is not 200,
     *         the response has no body, or an I/O error occurs
     */
    public String DoGetHtml(String url) {
        // The client is deliberately not closed: it is built on the shared pool, and closing a
        // client that owns its connection manager would shut the pool down for later calls.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = newGet(url);
        // try-with-resources releases the response (and its pooled connection) on every path.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads an image and stores it in the image directory under a random (UUID) name that
     * keeps the extension found in the URL.
     *
     * @param url address of the image
     * @return the generated file name, or {@code ""} when the status is not 200, the response
     *         has no body, or an I/O error occurs
     */
    public String doGetImages(String url) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = newGet(url);
        // The original closed the response in a finally block without a null check, which threw
        // a NullPointerException whenever execute() itself failed.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                // Close the file stream even when the copy fails; the original leaked it.
                try (OutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /** Builds a GET request for the URL with the shared User-Agent and timeout settings. */
    private HttpGet newGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", USER_AGENT);
        httpGet.setConfig(this.getConfig());
        return httpGet;
    }

    /**
     * Returns the file extension (including the dot) of the last path segment of the URL,
     * ignoring any query string or fragment, or {@code ""} when that segment has no dot.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int dot = path.lastIndexOf('.');
        // A dot that sits before the last '/' belongs to the host or a directory, not the file name.
        if (dot < 0 || dot < path.lastIndexOf('/')) {
            return "";
        }
        return path.substring(dot);
    }

    /**
     * Request settings: 1 s to establish a connection, 0.5 s to obtain a connection from the
     * pool, 10 s socket (read) timeout.
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10 * 1000)
                .build();
    }
}
Application引导类的编写
package cn.itcast.jd;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Bootstrap class of the crawler application.
 *
 * <p>{@code @EnableScheduling} turns on scheduled-task support, which must be enabled for
 * scheduled tasks to be used.
 */
@EnableScheduling
@SpringBootApplication
public class Application {

    /**
     * Starts the Spring Boot application.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}
ItemTask定时抓取任务
package cn.itcast.jd.task;
import cn.itcast.jd.pojo.Item;
import cn.itcast.jd.service.ItemService;
import cn.itcast.jd.util.HttpUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.http.client.methods.HttpGet;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.Date;
import java.util.List;
//表示由Spring创建实例
@Component
public class ItemTask {
@Autowired
private HttpUtils httpUtils;
@Autowired
private ItemService itemService;
//解析json的工具类
private static final ObjectMapper MAPPER = new ObjectMapper();
//当下载任务完成后 间隔多长时间进行下一次的任务
@Scheduled(fixedDelay = 100*1000)
public void itemtsk() throws Exception{
//声明解析的初始地址
String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=110&click=0&page=";
//遍历页面 按照页面对手机的搜索进行遍历
for(int i = 1;i < 100;i = i+2){
//获取页面
String html =httpUtils.DoGetHtml(url+i);
//解析页面
if(html != null) {
this.parse(html);
}
}
System.out.println("手机数据抓取完成...");
}
//解析页面并存储数据
private void parse(String html) throws Exception {
//解析HTML获取DoM对象
Document doc = Jsoup.parse(html);
//获取spu
Elements spuEles=doc.select("div#J_goodsList >ul > li");
//Element spuEle = spuEles.first();
for(Element spuEle:spuEles){
//获取spu
Long spu = Long.parseLong(spuEle.attr("data-spu"));
//获取sku
Elements skuEles = spuEle.select("li.ps-item");
for(Element skuEle: skuEles){
//获取sku
Long sku =Long.parseLong(skuEle.select("[data-sku]").attr("data-sku"));
//根据sku查询商品数据
Item item = new Item();
item.setSku(sku);
List<Item>list = this.itemService.findAll(item);
//如果商品存在 就执行下一个
if(list.size() > 0){
continue;
}
//s设置商品的spu
item.setSpu(spu);
//获取商品的url
String itemUrl = "https://item.jd.com/"+sku+".html";
item.setUrl(itemUrl);
//商品的图片
String picUrl = "https:"+skuEle.select("img[data-sku]").first().attr("data-lazy-img");
picUrl = picUrl.replace("/n9/","/n1/");
String picName = this.httpUtils.doGetImages(picUrl);
item.setPic(picName);
//商品的价格
String priceJson = this.httpUtils.DoGetHtml("https://p.3.cn/prices/mgets?skuIds=J_"+sku);
//取第一行里面的p对应的字符串并转化为Double
double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble();
item.setPrice(price);
//商品的标题
String itemInfo = this.httpUtils.DoGetHtml(itemUrl);
String text = Jsoup.parse(itemInfo).select("div.sku-name").text();
item.setTitle(text);
//商品的创建时间
item.setCreated(new Date());
//商品的更新时间
item.setUpdated(item.getCreated());
this.itemService.save(item);
}
}
}
}