1 回顾
l io
n File
u 文件目录操作的对象
n FileInputStream/FileOutputStream
u 文件流
n ObjectInputStream/ObjectOutputStream
u 对象序列化
u 被序列化的对象要实现Serializable
u writeObject()
u readObject()
n InputStreamReader/OutputStreamWriter
u 编码转换流
u java - Unicode
u UTF-8
u GBK
n text - BufferedReader, PrintWriter
n properties - Properties
n xml - DOM4J
n json - Jackson
n yaml - Jackson
l 线程
n 创建
u 继承Thread
u 实现Runnable
n 方法
u Thread.currentThread()
u Thread.sleep()
u Thread.yield()
u getName(),setName()
u start()
u interrupt()
u join()
u setDaemon(true)
u setPriority(优先级)
n 同步 synchronized
u 步调一致地执行,不会引起数据混乱
u synchronized(对象) {
}
抢指定对象的锁
u synchronized void f() {
}
抢当前实例的锁(this)
u static synchronized void f() {
}
抢"类对象"的锁
n 生产者,消费者模型
u 中间用一个集合来传递数据
u 解耦
n 等待和通知
u wait()
u notify()
u notifyAll()
u 必须在synchronized内调用
u 等待通知的对象,必须是加锁的对象
u wait()外面总应该是一个循环
n Lock
u 显式锁(注:ReentrantLock属于互斥的悲观锁;"乐观锁"一般指CAS方式,此处原笔记有误)
u Lock
l ReentrantLock
l ReentrantReadWriteLock
n 工具辅助创建,控制线程
u 线程池 ExecutorService/Executors
l Executors.newFixedThreadPool(5)
l Executors.newCachedThreadPool()
l Executors.newSingleThreadExecutor()
l pool.execute(Runnable任务)
u Callable/Future
l Future future = pool.submit(Callable任务)
Object r = future.get();
n ThreadLocal
u 线程绑定
u 线程当做流水线,上游放入数据,下游访问数据
u threadLocal.set(数据)
u threadLocal.get()
u threadLocal.remove()
2 第十八天:实战:爬虫京东
2.1 http协议
向服务器发送的 http 协议数据
GET / HTTP/1.1
Host: www.tedu.cn
Connection: keep-alive
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
服务器返回的数据
HTTP/1.1 200 OK
Date: Tue, 24 Sep 2019 15:30:45 GMT
Content-Type: text/html
Content-Length: 275688
Connection: keep-alive
Server: tarena
Last-Modified: Tue, 24 Sep 2019 01:14:40 GMT
ETag: "5d896e00-434e8"
Accept-Ranges: bytes
Age: 7092
X-Via: 1.1 PShbsjzsxqo180:5 (Cdn Cache Server V2.0), 1.1 PSjlbswt4dm34:3 (Cdn Cache Server V2.0), 1.1 bdwt64:8 (Cdn Cache Server V2.0)
<!DOCTYPE html>
......
package day18;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.Socket;
public class Test2 {
    /**
     * Sends a hand-written HTTP/1.1 GET request to www.tedu.cn over a raw TCP
     * socket and prints the server's response line by line.
     *
     * Fixes over the original:
     * - HTTP header lines must end with CRLF ("\r\n") per RFC 7230; the bare
     *   "\n" only worked because some servers are lenient.
     * - The socket is now closed via try-with-resources.
     * - getBytes() now names an explicit charset instead of the platform default.
     */
    public static void main(String[] args) throws Exception {
        String host = "www.tedu.cn";
        try (Socket s = new Socket(host, 80)) {
            System.out.println("已连接");
            OutputStream out = s.getOutputStream();
            // Request line + headers; the final blank line ("\r\n\r\n")
            // terminates the header section.
            String http = "GET / HTTP/1.1\r\n"+
                    "Host: "+host+"\r\n"+
                    "Connection: keep-alive\r\n"+
                    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36\r\n"+
                    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3\r\n"+
                    "Accept-Language: zh-CN,zh;q=0.9\r\n\r\n";
            out.write(http.getBytes("UTF-8"));
            out.flush();
            System.out.println("已发送");
            // Read the response as UTF-8 text and echo it.
            BufferedReader in = new BufferedReader(new InputStreamReader(s.getInputStream(), "UTF-8"));
            String line;
            while((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}
2.2 html和css
<html>
<head>
<style>
div {
...
}
#id1 {
font-size: 50px
}
.c1 {
....
}
div.c0 .c1 {
...
}
</style>
</head>
<body>
<div id="id1">
<a href="www.tedu.cn">点击访问达内</a>
</div>
<div class="c0">
<div class="c1">xxx</div>
<div class="c1">xxx</div>
</div>
<div>
<div class="c1">xxx</div>
<div class="c1">xxx</div>
</div>
</body>
</html>
2.3 爬虫
Jsoup 第三方开源API,方便的执行http请求,并处理响应,方便的从html中提取需要的内容
package day18;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
public class Test3 {
    // NOTE(review): every test in this class performs live HTTP requests against
    // qq.com / jd.com / dangdang.com — behavior depends on those sites'
    // availability and current HTML structure.

    @Test // fetch an entire page as raw HTML text
    public void html() throws IOException{
        String url = "http://tech.qq.com/a/20170330/003855.htm";
        // execute() sends the request; body() returns the raw response HTML
        String html = Jsoup.connect(url).execute().body();
        System.out.println(html);
    }

    @Test // whole-site crawling: collect all <a> links, then traverse breadth-first/depth-first
    public void getAllATag() throws IOException{
        String url = "http://tech.qq.com/a/20170330/003855.htm";
        // download and parse the page
        Document doc = Jsoup.connect(url).get();
        // all <a> tags in the page
        Elements eles = doc.getElementsByTag("a");
        for(Element ele : eles){
            String title = ele.text(); // text content of the <a> tag
            String aurl = ele.attr("href"); // href attribute of the <a> tag
            System.out.println(title+" – "+aurl);
        }
    }

    @Test // JD mall: product title
    public void getItemTile() throws IOException{
        String url = "https://item.jd.com/3882469.html";
        String title = getTitle(url);
        System.out.println(title);
    }

    // Extracts the product title from a JD detail page: the text of the first
    // element matching the CSS selector "div.sku-name".
    private String getTitle(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Element ele = doc.select("div.sku-name").get(0);
        String title = ele.text();
        return title;
    }

    @Test // Dangdang mall: product title
    public void getDDItemTile() throws IOException{
        String url = "http://product.dangdang.com/23579654.html";
        Document doc = Jsoup.connect(url).get();
        Element ele = doc.select("div.name_info h1").get(0);
        String title = ele.text();
        System.out.println(title);
    }

    @Test
    public void price() throws IOException {
        double price = getPrice("J_100003717483");
        System.out.println(price);
    }

    // Queries JD's price endpoint for one SKU id and parses the price out of
    // the returned JSON. The browser User-Agent is required: the endpoint
    // rejects obvious non-browser clients (anti-crawler measure).
    public double getPrice(String id) throws IOException {
        String url = "https://p.3.cn/prices/mgets?skuIds="+id;
        String userAgent = "Mozilla/5.0 (Windows NT 5.1; zh-CN) AppleWebKit/535.12 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/535.12";
        String json = Jsoup
                .connect(url)
                .userAgent(userAgent)
                .ignoreContentType(true) // response is JSON, not HTML
                .execute()
                .body();
        //System.out.println(json);
        ObjectMapper mapper =new ObjectMapper();
        /*
         * A JsonNode can be
         * an object: {"a":"1", "b":"2"}
         * or an array: [{...}, {...}, {...}]
         *
         * isArray()  whether this node is an array
         * get(i)     element at index i of an array node
         * get(name)  value of that property on an object node
         */
        JsonNode node = mapper.readTree(json);
        // response shape: [{"p":"<price>", ...}]
        double price = node.get(0).get("p").asDouble();
        return price;
    }

    @Test // product description (served as JSONP, not plain JSON)
    public void getItemDesc() throws IOException{
        ObjectMapper mapper =new ObjectMapper();
        String url = "http://d.3.cn/desc/3882469";
        String body = Jsoup.connect(url).ignoreContentType(true).execute().body();
        System.out.println(body);
        String json = body.substring(9, body.length()-1); // strip the JSONP wrapper (function name + parentheses)
        JsonNode jsonNode = mapper.readTree(json);
        String desc = jsonNode.get("content").asText();
        System.out.println(desc);
    }

    @Test
    public void testCat3() throws IOException {
        List<String> catList = getCat3();
        for(String url : catList) {
            System.out.println(url);
        }
        System.out.println("处理标准三级分类:"+catList.size());
    }

    // Returns all level-3 category links. JD lists 1286 level-3 categories;
    // 1190 of them use the standard list-link format kept by the filter below.
    public static List<String> getCat3() throws IOException{
        // page that lists every category
        String url = "http://www.jd.com/allSort.aspx";
        // CSS selector picks the level-3 category anchors
        Elements els = Jsoup.connect(url).get()
                .select("dl.clearfix dd a");
        // collected level-3 category links
        List<String> catList = new ArrayList<String>();
        for(Element e : els) {
            // link value of the level-3 category <a> tag
            String catUrl = e.attr("href");
            // keep only the standard list links, skip everything else
            if(catUrl.startsWith("//list.jd.com/list.html?cat=")) {
                catList.add("http:"+catUrl); // hrefs are protocol-relative
            }
        }
        return catList;
    }

    @Test
    public void pageNum() throws IOException {
        String catUrl = "http://list.jd.com/list.html?cat=1713,3274&jth=i";
        int num = getPageNum(catUrl);
        System.out.println(num);
    }

    // Total number of result pages for one category; the count is the text of
    // the <i> element inside ".fp-text" on the category list page.
    public static int getPageNum(String catUrl) throws IOException {
        String p = Jsoup.connect(catUrl).get().select(".fp-text I").get(0).text();
        int num = Integer.parseInt(p);
        return num;
    }

    @Test // all paging links of one category
    public void pageList() throws IOException {
        String catUrl = "https://list.jd.com/list.html?cat=1713,3274";
        Integer maxNum = getPageNum(catUrl);
        List<String> itemList = getPageUrlList(catUrl, maxNum);
        for(String item : itemList) {
            System.out.println(item);
        }
    }

    //https://list.jd.com/list.html?cat=1713,3274&page=1
    //https://list.jd.com/list.html?cat=1713,3274&page=2
    // Builds every paging link of one category by appending "&page=1..maxNum".
    public static List<String> getPageUrlList(String catUrl, Integer maxNum){
        List<String> pageUrlList = new ArrayList<String>();
        // page numbers are 1-based and inclusive of maxNum
        for(int i=1; i<=maxNum; i++) {
            String pageUrl = catUrl + "&page=" + i;
            pageUrlList.add(pageUrl);
        }
        return pageUrlList;
    }

    @Test // all product links on one list page
    public void itemList() throws IOException {
        String pageUrl = "http://list.jd.com/list.html?cat=1713,3274&page=2";
        List<String> itemUrlList = getItemUrlList(pageUrl);
        for(String itemUrl : itemUrlList) {
            System.out.println(itemUrl);
        }
    }

    // Scrapes the product detail-page links from one list page
    // (the anchors inside ".p-img").
    public static List<String> getItemUrlList(String pageUrl) throws IOException{
        Elements els = Jsoup.connect(pageUrl).get().select(".p-img a");
        List<String> itemUrlList = new ArrayList<String>();
        // the selected (already filtered) <a> tags
        for(Element e : els) {
            // e is an <a> tag; read its href attribute
            String itemUrl = e.attr("href");
            itemUrlList.add("http:"+itemUrl); // hrefs are protocol-relative
        }
        return itemUrlList;
    }

    @Test
    public void site() throws IOException, InterruptedException {
        // all level-3 categories
        List<String> catUrlList = getCat3();
        // iterate the level-3 categories
        for(String catUrl : catUrlList) {
            // small delay between requests
            Thread.sleep(10);
            // total page count of the current category
            int maxNum = getPageNum(catUrl);
            // all paging links of the category
            List<String> pageUrlList = getPageUrlList(catUrl, maxNum);
            // iterate the paging links of the current category
            for(String pageUrl : pageUrlList) {
                // product links on the current page
                List<String> itemUrlList = getItemUrlList(pageUrl);
                for(String itemUrl : itemUrlList) {
                    // fetch title, price, ... the item id is the numeric part
                    // of the URL between the last '/' and the '.'
                    String id = itemUrl.substring(itemUrl.lastIndexOf("/")+1, itemUrl.lastIndexOf("."));
                    double price = getPrice(id);
                    String title = getTitle(itemUrl);
                    System.out.println(itemUrl + " - "+ price + " - "+title);
                    System.out.println();
                }
            }
        }
    }
}
2.4 抓取的五种方式
2.4.1 抓取页面
@Test // download one page and dump its raw HTML
public void html() throws IOException{
    String pageUrl = "http://tech.qq.com/a/20170330/003855.htm";
    // execute() performs the request; body() yields the response text
    String rawHtml = Jsoup.connect(pageUrl).execute().body();
    System.out.println(rawHtml);
}
2.4.2 抓取整个网站
@Test // whole-site crawl entry point: list every <a> link for BFS/DFS traversal
public void getAllATag() throws IOException{
    Document page = Jsoup.connect("http://tech.qq.com/a/20170330/003855.htm").get();
    // walk every anchor element in the parsed document
    for(Element anchor : page.getElementsByTag("a")){
        String text = anchor.text();        // link text
        String target = anchor.attr("href"); // link target
        log.debug(text+" – "+target);
    }
}
2.4.3 抓取标题 – 页面上的内容
可以多级父子样式嵌套
@Test // JD mall: extract the product title from a detail page
public void getItemTile() throws IOException{
    String itemUrl = "https://item.jd.com/3882469.html";
    Document page = Jsoup.connect(itemUrl).get();
    // nested selector: ".sku-name" descendant of ".itemInfo-wrap"
    Element nameEle = page.select(".itemInfo-wrap .sku-name").get(0);
    log.debug(nameEle.text());
}
@Test // Dangdang mall: extract the product title
public void getDDItemTile() throws IOException{
    Document page = Jsoup.connect("http://product.dangdang.com/1052875306.html").get();
    // the first <article> element carries the title text
    Element titleEle = page.select("article").get(0);
    log.debug(titleEle.text());
}
2.4.4 抓取价格 – json
2017年4月,京东开始对价格进行反爬虫控制,访问过多的IP地址会被禁止。
2019年8月,京东开始限定爬虫爬取价格,必须伪装请求头,让它觉得是浏览器访问
@Test
public void json() throws IOException {
    // the price endpoint rejects non-browser clients, so spoof a browser UA
    String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_100003717483";
    Connection cn = Jsoup.connect(priceUrl).userAgent("Mozilla/5.0 (Windows NT 5.1; zh-CN) AppleWebKit/535.12 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/535.12");
    String json = cn.ignoreContentType(true).execute().body();
    System.out.println(json);
    // response is a JSON array: [{"p":"<price>", ...}]
    JsonNode root = new ObjectMapper().readTree(json);
    Double price = root.get(0).get("p").asDouble();
    System.out.println(price);
}
2.4.5 抓取描述 – jsonp
@Test // product description, served as JSONP
public void getItemDesc() throws IOException{
    String descUrl = "http://d.3.cn/desc/3882469";
    String jsonp = Jsoup.connect(descUrl).ignoreContentType(true).execute().body();
    // strip the JSONP wrapper (9-char function prefix and trailing ')')
    String json = jsonp.substring(9, jsonp.length()-1);
    JsonNode root = MAPPER.readTree(json);
    log.debug(root.get("content").asText());
}
2.5 爬取京东
抓取商品先要找到商品ID,有两个方案:
方案一:商品ID是一串数字,猜测它是自增的,于是我们可以做一个自增的循环。但如果商品的ID不连续,会产生大量无效访问,报连接超时。
方案二:找到网站的所有商品的列表页面,解析html找到商品的ID,这个方式解析麻烦些,但商品ID直接可以获得。
所以一般来说都是采用第二种方案。
分类、商品列表、商品详情
那抓取京东网站就变成抓取所有分类,按分类找到商品列表页面,从商品列表页面抓取出商品ID,最终循环商品ID,抓取所有商品详情页面,解析商品详情页面,找到所有商品的详细信息。
断点抓取、离线分析
京东有22个大类、143个二级分类、1286个三级分类,共8615683种商品,近九百万种。如果持续在线抓取,会很快被屏蔽,也不方便测试。所以我们采取断点抓取、离线分析:先将分类抓取下来,把抓取到的信息保存到磁盘中,后期再对磁盘中的文件进行分析入库。
2.5.1 商品三级分类
@Test
public void testCat3() throws IOException {
    List<String> cat3Urls = JD.getCat3();
    // print every standard level-3 category link, then the total count
    for(String u : cat3Urls) {
        System.out.println(u);
    }
    System.out.println("处理标准三级分类:"+cat3Urls.size());
}
// Returns all level-3 category links. JD lists 1286 level-3 categories;
// 1190 of them use the standard list-link format kept by the filter below.
public static List<String> getCat3() throws IOException{
    // single page that lists every category
    String allSortUrl = "http://www.jd.com/allSort.aspx";
    // select the level-3 category anchors
    Elements anchors = Jsoup.connect(allSortUrl).get().select("div dl dd a");
    List<String> cat3Urls = new ArrayList<String>();
    for(Element a : anchors) {
        String href = a.attr("href");
        // keep only standard list links; hrefs are protocol-relative
        if(href.startsWith("//list.jd.com/list.html?cat=")) {
            cat3Urls.add("http:"+href);
        }
    }
    return cat3Urls;
}
2.5.2 某个分类下的列表总数
@Test
public void pageNum() throws IOException {
    String categoryUrl = "http://list.jd.com/list.html?cat=1713,3274&jth=i";
    // print how many result pages this category has
    System.out.println(JD.getPageNum(categoryUrl));
}
// Total number of result pages for one category; the count is the text of the
// <i> element inside ".fp-text" on the category list page.
public static int getPageNum(String catUrl) throws IOException {
    String totalText = Jsoup.connect(catUrl).get()
            .select(".fp-text I").get(0).text();
    return Integer.parseInt(totalText);
}
2.5.3 某个分类下所有分页链接
@Test // print every paging link of one category
public void pageList() throws IOException {
    String categoryUrl = "https://list.jd.com/list.html?cat=1713,3274";
    Integer total = JD.getPageNum(categoryUrl);
    for(String pageUrl : JD.getPageUrlList(categoryUrl, total)) {
        System.out.println(pageUrl);
    }
}
//https://list.jd.com/list.html?cat=1713,3274&page=1
//https://list.jd.com/list.html?cat=1713,3274&page=2
/**
 * Builds the URL of every result page for one category by appending
 * "&page=1" .. "&page=maxNum" to the category URL (see examples above).
 *
 * @param catUrl category list URL, e.g. https://list.jd.com/list.html?cat=1713,3274
 * @param maxNum total number of pages (inclusive; 1-based)
 * @return one URL per page, in ascending page order (empty when maxNum < 1)
 */
public static List<String> getPageUrlList(String catUrl, Integer maxNum){
    List<String> pageUrlList = new ArrayList<String>();
    for(int i=1; i<=maxNum; i++) {
        // BUG FIX: the original appended "+ I" (capital I), which does not
        // compile — the loop variable is lowercase i.
        String pageUrl = catUrl + "&page=" + i;
        pageUrlList.add(pageUrl);
    }
    return pageUrlList;
}
2.5.4 某个分页下所有商品链接
@Test // print every product link found on one list page
public void itemList() throws IOException {
    String listPageUrl = "http://list.jd.com/list.html?cat=1713,3274&page=210";
    for(String productUrl : JD.getItemUrlList(listPageUrl)) {
        System.out.println(productUrl);
    }
}
// Scrapes the product detail-page links from one list page
// (the anchors inside ".p-img").
public static List<String> getItemUrlList(String pageUrl) throws IOException{
    Elements anchors = Jsoup.connect(pageUrl).get()
            .select(".p-img a");
    List<String> itemUrls = new ArrayList<String>();
    for(Element a : anchors) {
        // hrefs are protocol-relative ("//item.jd.com/..."), so prepend "http:"
        itemUrls.add("http:"+a.attr("href"));
    }
    return itemUrls;
}
2.5.5 获取京东所有商品链接
@Test
public void site() throws IOException, InterruptedException {
    // the level-3 category pages are the crawl roots
    for(String catUrl : JD.getCat3()) {
        // brief pause between categories to go easier on the server
        Thread.sleep(10);
        // how many result pages this category has
        int pages = JD.getPageNum(catUrl);
        // walk every paging URL of the category
        for(String pageUrl : JD.getPageUrlList(catUrl, pages)) {
            // then every product link on that page
            for(String itemUrl : JD.getItemUrlList(pageUrl)) {
                System.out.println(itemUrl);
                // next step: fetch the title, price, ... for this product link
            }
        }
    }
}
https://item.jd.com/100002795959.html
https://p.3.cn/prices/mgets?skuIds=100002795959