Implementing a crawler with jsoup and JsoupXpath

package com.bigdata.project.util.reptile;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;

/**
 *@description Spider utility
 *@author DXH
 *@date 2018-06-26 08:35:24
 */
public class JsoupSpiderUtils {
	/**
	 *Fetch the title, abstract, and title link of the top Baidu search results
	 */
	public static void main(String[] args) {
		String keyword="足球";                                 // search keyword ("football")
		String url="http://www.baidu.com/s?wd="+keyword;
		String encoding="UTF-8";
		String xpath1="//div[@id='content_left']/div";        // one <div> per search-result block; e.g. /li[first()]/div/h2/allText()
		String xpath3=".//h3/a";                              // title link: text and href
		String xpath4=".//div[@class='c-abstract']/html()";   // abstract, common layout
		String xpath5=".//div[@class='c-row']/html()";        // abstract, alternative layout
		String xpath6=".//div[@class='op-tieba-general-main-col op-tieba-general-main-con']/html()";  // Tieba result card
		String xpath7=".//div[@class='c-span18 c-span-last']/html()";  // abstract, grid layout
		String result =	getHtmlResourceByUrl(url,encoding);
		//result container (currently unused)
		//List<Map<String,String>> returnLi = new ArrayList<Map<String,String>>();
		if(result!=null){
			//parse the page source
	        Document doc = Jsoup.parse(result);
	        JXDocument jxdoc= new JXDocument(doc);
	        List<Object> rs = null;
	        try {
	            rs = jxdoc.sel(xpath1);
	            //System.out.println(rs.size());
	            if(null!=rs && !rs.isEmpty()){
	            	for(int i=0;i<rs.size();i++){
	            		
	            		Object obj = rs.get(i);
	            		if(obj instanceof Element){
	            			 int index = ((Element)obj).siblingIndex();
		                     System.out.println(index);
	            		}
	            		
	            		 //re-parse this result block so relative XPath can be applied to it
	            		 Document doc3 = Jsoup.parse(obj.toString());
	            		 JXDocument jxdoc3= new JXDocument(doc3);
	            		 List<Object> titleList = jxdoc3.sel(xpath3);
	            		 if(titleList.isEmpty()){
	            			 continue;  //skip blocks without a title link (ads, widgets)
	            		 }
	            		 Element title = (Element)titleList.get(0);
	            		 System.out.println(title.html());
	            		 System.out.println(title.attr("href"));
	            		 
	            		 //try the abstract container variants in order and print the first non-empty one
	            		 Object obj4 = jxdoc3.sel(xpath4);
	            		 //System.out.println(isEmpty(obj4));
	            		 if(isEmpty(obj4)){
	            			 Object obj5 = jxdoc3.sel(xpath5);
	            			 if(isEmpty(obj5)){
	            				 Object obj6 = jxdoc3.sel(xpath6);
	            				 if(isEmpty(obj6)){
	            					 //System.out.println("====");
	            					 Object obj7 = jxdoc3.sel(xpath7);
	            					 if(isEmpty(obj7)){
		            					 System.out.println("====");
			            			 }else{
			            				 System.out.println(obj7.toString());
			            			 }
		            			 }else{
		            				 System.out.println(obj6.toString());
		            			 }
	            			 }else{
	            				 System.out.println(obj5.toString());
	            			 }
	            		 }else{
	            			 System.out.println(obj4.toString());
	            		 }
	            		 
	            		 
	            	}
	            }
	        } catch (XpathSyntaxErrorException e) {
	            System.err.println("Invalid XPath expression!");
	            return;
	        }
		}
	}
	public static boolean isEmpty(Object obj){
        if (obj == null){
            return true;
        }
        if (obj instanceof List){
            return ((List) obj).size() == 0;
        }
        if (obj instanceof String){
            return ((String) obj).trim().equals("");
        }
        return false;
    }
	/**
	 *Fetch page content from a URL
	 */
	public static String getHtmlResourceByUrl(String url, String encoding){
       //buffer that accumulates the page source
       StringBuffer buff = new StringBuffer();
       URL urlObj = null;
       URLConnection uc = null;
       InputStreamReader in = null;
       BufferedReader reader = null;
       try {
          //build the URL
          urlObj = new URL(url);
          //open the connection
          uc = urlObj.openConnection();
          //wrap the network stream with the requested encoding
          in = new InputStreamReader(uc.getInputStream(),encoding);
          //buffered reader over the stream
          reader = new BufferedReader(in);
          String tempLine = null;
          //read the response line by line
          while((tempLine = reader.readLine()) != null){
               buff.append(tempLine + "\n");  //append each line
           }
       } catch (Exception e) {
           e.printStackTrace();
           System.out.println("Connection timeout ...");
       } finally {
           //closing the reader also closes the underlying stream reader
           if(reader != null){
               try {
                   reader.close();
               } catch (IOException e) {
                   e.printStackTrace();
               }
           } else if(in != null){
               try {
                   in.close();
               } catch (IOException e) {
                   e.printStackTrace();
               }
           }
       }
       return buff.toString();
   }
}
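
The fetch step above can also be done with jsoup's own Connection API, which issues the request and parses the response in a single call. The sketch below is a minimal alternative, not part of the original post; the user-agent string and timeout value are illustrative assumptions (Baidu may serve reduced or different markup to clients without a browser-like user agent).

// Hedged sketch: fetching and parsing with Jsoup.connect() instead of URLConnection.
// The user-agent and timeout values are illustrative assumptions, not from the original post.
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupFetchSketch {
	public static Document fetch(String url) throws IOException {
		return Jsoup.connect(url)
				.userAgent("Mozilla/5.0")  // assumed browser-like UA so Baidu returns full markup
				.timeout(10000)            // connect/read timeout in milliseconds
				.get();                    // performs the GET and parses the response into a Document
	}
}

The returned Document can be passed straight to new JXDocument(doc), skipping getHtmlResourceByUrl and the extra Jsoup.parse call entirely.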

        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/cn.wanghaomiao/JsoupXpath -->
        <dependency>
            <groupId>cn.wanghaomiao</groupId>
            <artifactId>JsoupXpath</artifactId>
            <version>0.3.2</version>
        </dependency>
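
The commented-out List<Map<String,String>> container in main() hints at collecting results instead of printing them. Below is a minimal sketch of that idea, using the same JXDocument/sel API and XPath expressions shown above; the map keys ("title", "href") and the class name are illustrative assumptions.

// Hedged sketch: collect each Baidu result as a Map instead of printing it.
// Map keys and the class name are illustrative; the XPath expressions are taken from the post.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;

public class ResultCollectorSketch {
	public static List<Map<String, String>> collect(String html) throws XpathSyntaxErrorException {
		List<Map<String, String>> results = new ArrayList<Map<String, String>>();
		JXDocument jxdoc = new JXDocument(Jsoup.parse(html));
		for (Object node : jxdoc.sel("//div[@id='content_left']/div")) {
			// re-parse the block's HTML, as the original code does, so relative XPath works on it
			JXDocument item = new JXDocument(Jsoup.parse(node.toString()));
			List<Object> links = item.sel(".//h3/a");
			if (links.isEmpty()) {
				continue;  // skip blocks without a title link (ads, widgets)
			}
			Element a = (Element) links.get(0);
			Map<String, String> row = new HashMap<String, String>();
			row.put("title", a.text());
			row.put("href", a.attr("href"));
			results.add(row);
		}
		return results;
	}
}

A caller could feed it the same fetched source, e.g. collect(getHtmlResourceByUrl(url, "UTF-8")), and then iterate over the returned rows.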


Reprinted from blog.csdn.net/rentian1/article/details/81104349