网络爬虫1之HttpClient抓取数据、Jsoup解析数据

网络爬虫1

1.入门介绍

网络爬虫（Web crawler），是一种按照一定的规则，自动地抓取万维网信息的程序或者脚本。

环境准备
JDK1.8
IntelliJ IDEA
IDEA自带的Maven
创建Maven工程，并在pom.xml中加入以下依赖

<dependencies>
    <!-- Apache HttpClient: the HTTP client library used by the examples to fetch pages -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <!-- Logging: SLF4J binding that routes log output to log4j 1.2 -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
</dependencies>

加入log4j.properties

# Root logger: level DEBUG, output sent to the appender named A1.
log4j.rootLogger=DEBUG,A1
# Loggers under the cn.itcast package also log at DEBUG.
log4j.logger.cn.itcast = DEBUG

# A1 writes to the console using a pattern layout.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Pattern: timestamp, [thread], [logger name]-[level], message, line separator.
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

编写最简单的爬虫，抓取传智播客首页：http://www.itcast.cn/

package cn.itcast.carwler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Minimal crawler example: fetches the itcast home page with Apache HttpClient
 * and prints the response body.
 */
public class CarwlerFirst {

    /**
     * Sends a GET request to http://www.itcast.cn/ and prints the body when the
     * server answers with status 200.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the request fails or the response cannot be read
     */
    public static void main(String[] args) throws Exception {
        // "Type the address": build the GET request for the target page.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn/");

        // "Open the browser and press Enter": create the client and execute the request.
        // try-with-resources closes the response and then the client even when an
        // exception is thrown; the original version never closed either of them.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {

            // Only read the body when the status code is 200.
            if (response.getStatusLine().getStatusCode() == 200) {
                // "UTF-8" is the fallback charset passed to EntityUtils.
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content);
            }
        }
    }

}

2. HttpClient抓取数据

HttpClient
网络爬虫就是用程序帮助我们访问网络上的资源，我们一直以来都是使用HTTP协议访问互联网的网页，网络爬虫需要编写程序，在这里使用同样的HTTP协议访问网页。
这里我们使用Java的HTTP协议客户端 HttpClient这个技术，来实现抓取网页数据。

GET请求
访问传智官网，请求url地址：http://www.itcast.cn/

package cn.itcast.carwler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import sun.net.www.http.HttpClient;

import javax.xml.ws.Response;
import java.io.IOException;

/**
 * GET request example: fetches the itcast home page and prints the length of
 * the response body.
 */
public class HttpGetTest {

    /**
     * Executes a GET request against http://www.itcast.cn/ and prints the body
     * length when the status code is 200. I/O failures are printed to stderr.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Build the GET request with the target URL.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
        System.out.println("发起的请求信息:"+httpGet);

        // try-with-resources closes the response and the client in reverse order.
        // The previous finally block called response.close() unconditionally, which
        // threw a NullPointerException (and skipped closing the client) whenever
        // execute() failed before a response was assigned.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {

            // Only read the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                // Pass "UTF-8" as the fallback charset, consistent with the first example.
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

带参数的GET请求
在传智中搜索学习视频，地址为: http://yun.itheima.com/search?keys=Java

package cn.itcast.carwler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * GET request with a query parameter: searches yun.itheima.com for "Java"
 * and prints the response body and its length.
 */
public class HttpGetParamTest {

    /**
     * Builds http://yun.itheima.com/search?keys=Java with {@code URIBuilder},
     * executes the request and prints the body when the status code is 200.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the base URL is not a valid URI
     */
    public static void main(String[] args) throws Exception {
        // Build the request address: http://yun.itheima.com/search?keys=Java
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        // Add the query parameter; further parameters can be added by chaining
        // more setParameter(name, value) calls on the builder.
        uriBuilder.setParameter("keys","Java");

        // Create the GET request from the assembled URI.
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("发起的请求信息"+httpGet);

        // try-with-resources closes the response and the client in reverse order.
        // The previous finally block dereferenced a null response when execute()
        // failed, which raised a NullPointerException and left the client open.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {

            // Only read the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                // Pass "UTF-8" as the fallback charset, consistent with the first example.
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content);
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

POST请求
使用POST访问传智官网，请求url地址：http://www.itcast.cn/

package cn.itcast.carwler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * POST request example: posts to the itcast home page without a body and
 * prints the length of the response.
 */
public class HttpPostTest {

    /**
     * Executes a POST request against http://www.itcast.cn/ and prints the body
     * length when the status code is 200. I/O failures are printed to stderr.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Build the POST request with the target URL.
        HttpPost httpPost = new HttpPost("http://www.itcast.cn/");
        System.out.println("发起的请求信息:"+httpPost);

        // try-with-resources closes the response and the client in reverse order.
        // The previous finally block called response.close() on a null reference
        // when execute() failed, which also prevented the client from being closed.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {

            // Only read the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                // Pass "UTF-8" as the fallback charset, consistent with the first example.
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

带参数的POST请求
在传智中搜索学习视频，使用POST请求，url地址为：
http://yun.itheima.com/search
url地址没有参数，参数keys=java放到表单中进行提交

package cn.itcast.carwler.test;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;

/**
 * POST request with form parameters: submits {@code keys=java} as a
 * URL-encoded form to the search endpoint and prints the response length.
 */
public class HttpPostParamTest {

    /**
     * Posts the form to http://yun.itheima.com/search and prints the body length
     * when the status code is 200. I/O failures are printed to stderr.
     *
     * @param args command-line arguments (unused)
     * @throws UnsupportedEncodingException if the form charset name is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        // Build the POST request. The URL carries no query string; the search
        // keyword travels in the form body. The earlier version posted to
        // http://www.itcast.cn/, which did not match the documented search endpoint.
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        System.out.println("发起的请求信息:"+httpPost);

        // Collect the form fields.
        ArrayList<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("keys","java"));

        // Encode the fields as a URL-encoded form entity and attach it to the request.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "UTF-8");
        httpPost.setEntity(formEntity);

        // try-with-resources closes the response and the client in reverse order.
        // The previous finally block dereferenced a null response when execute()
        // failed, which raised a NullPointerException and left the client open.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {

            // Only read the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                // Pass "UTF-8" as the fallback charset, consistent with the first example.
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

连接池
如果每次请求都要创建HttpClient，会有频繁创建和销毁的问题，可以使用连接池来解决这个问题。
测试以下代码，并断点查看每次获取的HttpClient都是不一样的。

package cn.itcast.carwler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Connection-pool example: two requests share one
 * {@code PoolingHttpClientConnectionManager}.
 */
public class HttpClientPoolTest {

    /**
     * Configures the pool, issues two GET requests through it and finally shuts
     * the pool down.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the pooling connection manager.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

        // Upper bound on connections across all routes.
        cm.setMaxTotal(200);

        // Upper bound on concurrent connections per route (host).
        cm.setDefaultMaxPerRoute(20);

        try {
            // Issue the requests through the shared connection manager.
            doGet(cm);
            doGet(cm);
        } finally {
            // Release the pooled connections once all requests are done.
            cm.shutdown();
        }
    }

    /**
     * Builds a client on top of the given connection manager, requests
     * http://www.itcast.cn/ and prints the body length when the status is 200.
     * Any failure is printed to stderr so that later requests can still run.
     *
     * @param cm the shared connection manager that supplies connections
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // The client is deliberately NOT closed here: it is backed by the shared
        // connection manager, which the caller shuts down when it is finished.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn/");

        // try-with-resources closes the response, handing its connection back to
        // the pool. The previous code guarded the close with "response == null",
        // so a real response was never closed and a null one caused an NPE.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only read the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

请求参数
有时候因为网络，或者目标服务器的原因，请求需要更长的时间才能完成，我们需要自定义相关时间

package cn.itcast.carwler.test;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Request-configuration example: sets connect, connection-request and socket
 * timeouts on a GET request.
 */
public class HttpConfigTest {

    /**
     * Executes a GET request against http://www.itcast.cn/ with custom timeouts
     * and prints the body length when the status code is 200. I/O failures are
     * printed to stderr.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Build the GET request with the target URL.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn/");
        System.out.println("发起的请求信息:"+httpGet);

        // Per-request timeouts, all in milliseconds.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(1000)            // max time to establish the connection
                .setConnectionRequestTimeout(500)   // max time to obtain a connection from the manager
                .setSocketTimeout(10 * 1000)        // max time waiting for data on the socket
                .build();

        httpGet.setConfig(requestConfig);

        // try-with-resources closes the response and the client in reverse order.
        // The previous finally block called response.close() on a null reference
        // when execute() failed (for example on a timeout), which raised a
        // NullPointerException and left the client open.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {

            // Only read the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                // Pass "UTF-8" as the fallback charset, consistent with the first example.
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

3. Jsoup

我们抓取到页面之后，还需要对页面进行解析。可以使用字符串处理工具解析页面，也可以使用正则表达式，但是这些方法都会带来很大的开发成本，所以我们需要使用一款专门解析html页面的技术。

jsoup介绍
jsoup 是一款Java 的HTML解析器，可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API，可通过DOM，CSS以及类似于jQuery的操作方法来取出和操作数据。
jsoup的主要功能如下：
1. 从一个URL，文件或字符串中解析HTML；
2. 使用DOM或CSS选择器来查找、取出数据；
3. 可操作HTML元素、属性、文本；
先加入Jsoup依赖：

<!-- jsoup: HTML parser used by the parsing examples -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
<!-- JUnit: runs the @Test methods in the examples -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
</dependency>
<!-- Utilities: Apache Commons Lang and Commons IO -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.7</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>

jsoup 解析url:
Jsoup可以直接输入url，它会发起请求并获取数据，封装为Document对象

  /**
    * Parses a page that jsoup fetches directly from a URL and prints the text
    * of its first title element.
    *
    * @throws Exception if the page cannot be fetched or parsed
    */
   @Test
   public void testJsoupUrl() throws Exception {
       // First argument: the address to fetch; second argument: the timeout for the fetch.
       URL pageAddress = new URL("http://www.itcast.cn/");
       int timeout = 1000;
       Document page = Jsoup.parse(pageAddress, timeout);

       // Look up the title by tag name and print its text.
       String pageTitle = page.getElementsByTag("title").first().text();
       System.out.println(pageTitle);
   }

PS：虽然使用Jsoup可以替代HttpClient直接发起请求解析数据，但是往往不会这样用，因为实际的开发过程中，需要使用到多线程，连接池，代理等等方式，而jsoup对这些的支持并不是很好，所以我们一般把jsoup仅仅作为Html解析工具使用

解析字符串
先准备一个用于测试的html文件（下面的代码假设它保存在 D:\jsoup.html）
Jsoup可以直接输入字符串，并封装为Document对象

    /**
     * Reads an HTML file into a string, parses that string with jsoup and
     * prints the text of the first title element.
     *
     * @throws Exception if the file cannot be read
     */
    @Test
    public void testJsoupString() throws Exception {
        // Load the whole file as UTF-8 text.
        File source = new File("D:\\jsoup.html");
        String markup = FileUtils.readFileToString(source, "UTF-8");

        // Parse the in-memory string into a Document.
        Document parsed = Jsoup.parse(markup);

        // Print the text of the first title element.
        String titleText = parsed.getElementsByTag("title").first().text();
        System.out.println(titleText);
    }

解析文件
Jsoup可以直接解析文件，并封装为Document对象

    /**
     * Lets jsoup parse an HTML file directly and prints the text of the first
     * title element.
     *
     * @throws Exception if the file cannot be read or parsed
     */
    @Test
    public void testJsoupHtml() throws Exception {
        // Hand the file and its charset to jsoup.
        File source = new File("D:\\jsoup.html");
        Document parsed = Jsoup.parse(source, "UTF-8");

        // Print the text of the first title element.
        String titleText = parsed.getElementsByTag("title").first().text();
        System.out.println(titleText);
    }

使用dom方式遍历文档
7.1 元素获取

根据id查询元素getElementById
根据标签获取元素getElementsByTag
根据class获取元素getElementsByClass
根据属性获取元素getElementsByAttribute

/**
 * DOM-style traversal: looks elements up by id, tag, class and attribute and
 * prints the text of each one found.
 *
 * @throws Exception if the file cannot be read or parsed
 */
@Test
public void testDom() throws Exception {
    // Parse the file into a Document.
    Document doc = Jsoup.parse(new File("D:\\jsoup.html"), "utf-8");

    Element[] found = {
        // 1. by id: getElementById
        doc.getElementById("city_bj"),
        // 2. by tag name: getElementsByTag
        doc.getElementsByTag("span").first(),
        // 3. by class: getElementsByClass
        // NOTE(review): the first argument holds two space-separated names -
        // confirm against the jsoup documentation that this matches as intended.
        doc.getElementsByClass("class_a class_b").first(),
        doc.getElementsByClass("class_b").first(),
        // 4. by attribute name, and by attribute name plus value
        doc.getElementsByAttribute("abc").first(),
        doc.getElementsByAttributeValue("href", "http://www.itcast.cn").first()
    };

    // Print the text of every element, in lookup order.
    for (Element hit : found) {
        System.out.println("获取到的元素内容是：" + hit.text());
    }
}

7.2 元素中获取数据

从元素中获取id
从元素中获取className
从元素中获取属性的值attr
从元素中获取所有属性attributes
从元素中获取文本内容text

/**
 * DOM-style traversal: reads the id, class name, a single attribute, all
 * attributes and the text of one element, then prints each value.
 *
 * @throws Exception if the file cannot be read or parsed
 */
@Test
public void testData() throws Exception {
    // Parse the file into a Document.
    Document doc = Jsoup.parse(new File("D:\\jsoup.html"), "utf-8");

    // Locate the element whose id is "test".
    Element target = doc.getElementById("test");

    String[] values = {
        target.id(),                    // 1. the element id
        target.className(),             // 2. the class name
        target.attr("id"),              // 3. the value of one attribute
        target.attributes().toString(), // 4. all attributes rendered as text
        target.text()                   // 5. the text content
    };

    // Print the collected values in the order they were read.
    for (String value : values) {
        System.out.println("获取到的数据是：" + value);
    }
}

使用选择器语法查找元素
jsoup elements对象支持类似于CSS (或jquery)的选择器语法，来实现非常强大和灵活的查找功能。这个select 方法在Document, Element,或Elements对象中都可以使用。且是上下文相关的，因此可实现指定元素的过滤，或者链式选择访问。
Select方法将返回一个Elements集合，并提供一组方法来抽取和处理结果。

8.1 Selector选择器概述

tagname: 通过标签查找元素，比如：span
#id: 通过ID查找元素，比如：#city_bj
.class: 通过class名称查找元素，比如：.class_a
[attribute]: 利用属性查找元素，比如：[abc]
[attr=value]: 利用属性值来查找元素，比如：[class=s_name]

  /**
     * Basic selector syntax: tag name, #id, .class, [attribute] and
     * [attribute=value]; prints the text matched by each selector.
     *
     * @throws Exception if the file cannot be read or parsed
     */
    @Test
    public void testSelector() throws Exception{
        // Parse the file into a Document.
        Document doc = Jsoup.parse(new File("D:\\jsoup.html"), "utf-8");

        // tagname: select by tag, e.g. span - print each match on its own line.
        Elements spans = doc.select("span");
        spans.forEach(span -> System.out.println(span.text()));

        // #id: select by id, e.g. #city_bj
        Elements byId = doc.select("#city_bj");
        System.out.println(byId.text());

        // .class: select by class name, e.g. .class_a
        Elements byClass = doc.select(".class_a");
        System.out.println(byClass.text());

        // [attribute]: select by attribute presence, e.g. [abc]
        Elements byAttribute = doc.select("[abc]");
        System.out.println(byAttribute.text());

        // [attr=value]: select by attribute value, e.g. [class=s_name] - one line per match.
        Elements byAttributeValue = doc.select("[class=s_name]");
        byAttributeValue.forEach(match -> System.out.println(match.text()));
    }

8.2 Selector选择器组合使用

el#id: 元素+ID，比如：h3#city_bj
el.class: 元素+class，比如：li.class_a
el[attr]: 元素+属性名，比如：span[abc]
任意组合: 比如：span[abc].s_name
ancestor child: 查找某个元素下子元素，比如：.city_con li 查找"city_con"下的所有li
parent > child: 查找某个父元素下的直接子元素，比如：.city_con > ul > li 查找city_con第一级（直接子元素）的ul，再找所有ul下的第一级li
parent > *: 查找某个父元素下所有直接子元素

  /**
     * Combined selector syntax: runs a series of compound selectors against
     * the document and prints the text matched by each one.
     *
     * @throws Exception if the file cannot be read or parsed
     */
    @Test
    public void testSelect2() throws Exception {
        // Parse the file into a Document.
        Document doc = Jsoup.parse(new File("D:\\jsoup.html"), "utf-8");

        String[] selectors = {
            // el#id: tag plus id
            "h3#city_bj",
            // el.class: tag plus class
            "li.class_a",
            // el[attr]: tag plus attribute name
            "span[abc]",
            // any combination of the above
            "span[abc].s_name",
            // ancestor child: every li anywhere below .city_con
            ".city_con li",
            // parent > child: ul directly under .city_con, then li directly under those ul
            ".city_con > ul > li",
            // parent > *: every direct child of .city_con
            ".city_con > *"
        };

        // Evaluate each selector and print the combined text of its matches.
        for (String selector : selectors) {
            String matchedText = doc.select(selector).text();
            System.out.println("获取到的数据是：" + matchedText);
        }
    }