页面爬取

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/River741132472/article/details/86238012
package com.shengdun.demo.controller;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

/**
 * Small command-line demo that downloads one Amazon product page with Apache
 * HttpClient and then pulls a handful of fields out of the HTML with jsoup
 * (color swatches, size options and the price), printing them to stdout.
 *
 * <p>The selectors used here are tied to the markup of the page at the time the
 * code was written; when an element is missing jsoup yields empty results and
 * the corresponding output is simply empty.
 */
public class test {

    /**
     * Fetches the product page, prints the raw HTML, then prints the extracted
     * color, size and price information.
     *
     * @param args unused
     * @throws Exception if the HTTP request or reading the response body fails
     */
    public static void main(String[] args) throws Exception {

        String webContent;
        // try-with-resources guarantees that both the client and the response are
        // released even when the request or the body decoding throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(
                     new HttpGet("https://www.amazon.cn/dp/B075LGPY95/ref=lp_665002051_1_2?s=wireless&ie=UTF8&qid=1547102558&sr=1-2"))) {
            HttpEntity entity = response.getEntity();
            // Decode the response body as UTF-8 and echo it.
            webContent = EntityUtils.toString(entity, "utf-8");
            System.out.println("网页内容:" + webContent);
        }

        Document document = Jsoup.parse(webContent);

        // Raw "data-a-dynamic-image" attribute values of the two image elements.
        // NOTE(review): these values (and several lookups below) are extracted but
        // not consumed yet; they are kept as a record of the selectors to use.
        String frontImageData = document.getElementsByAttributeValue("id", "imgBlkFront").attr("data-a-dynamic-image");
        String landingImage = document.getElementsByAttributeValue("id", "landingImage").attr("data-a-dynamic-image");
        // Strip every ":[ddd,ddd]" suffix (two three-digit numbers) from the attribute text.
        String frontImageUrls = frontImageData.replaceAll("\\:\\[[0-9]{3}\\,[0-9]{3}\\]", "");

        // Image list; nothing is added to it yet, so it prints as an empty list below.
        List<String> list = new ArrayList<>();
        // TODO: test code - all <img> elements of the page, currently unused.
        Elements imgs = document.getElementsByTag("img");

        // Colors: every element whose id matches color_name_<digit>...
        Elements colorNames = document.getElementsByAttributeValueMatching("id", "color_name_[0-9].*?");
        for (Element colorName : colorNames) {
            Elements swatchImages = colorName.getElementsByTag("img");
            System.out.println(swatchImages);
            System.out.println(swatchImages.attr("src"));
            System.out.println(swatchImages.attr("alt"));
        }

        // Sizes: every element whose id matches size_name_<digit>...
        Elements sizeNames = document.getElementsByAttributeValueMatching("id", "size_name_[0-9].*?");
        for (Element sizeName : sizeNames) {
            System.out.println(sizeName.attr("title"));
            System.out.println(sizeName.text());
        }

        // Product title (currently unused).
        String productTitle = document.getElementsByAttributeValue("id", "productTitle").text();

        // Price: the id varies (e.g. priceblock_ourprice / priceblock_saleprice), so
        // match on the pattern and keep the text of the last matching <span>.
        Elements priceCandidates = document.getElementsByAttributeValueMatching("id", "priceblock_.*?price");
        String price = "";
        for (Element candidate : priceCandidates) {
            if ("span".equals(candidate.tagName())) {
                price = candidate.text();
            }
        }
        System.out.println(price);

        // Whole product page container (currently unused).
        Elements pageContent = document.getElementsByAttributeValue("id", "dp-container");
        // Product description block (currently unused).
        Elements productDescription = document.getElementsByAttributeValue("id", "productDescription");

        System.out.println(list);
    }

}

猜你喜欢

转载自blog.csdn.net/River741132472/article/details/86238012