package com.bigdata.project.util.reptile;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.List;

import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
 * Small crawler demo: downloads a Baidu search result page and prints, for every result
 * block found on it, the title markup, the title link and a summary snippet. Parsing is
 * done with jsoup, element selection with JsoupXpath.
 *
 * @author DXH
 * @date 2018-06-26 08:35:24
 */
public class JsoupSpiderUtils {

    /** Keyword searched when none is supplied on the command line. */
    private static final String DEFAULT_KEYWORD = "足球";

    /** Search URL without the (percent-encoded) keyword. */
    private static final String SEARCH_URL_PREFIX = "http://www.baidu.com/s?wd=";

    /** Charset used both to encode the query and to decode the response body. */
    private static final String ENCODING = "UTF-8";

    /** Selects one node per search result block. */
    private static final String RESULT_BLOCKS_XPATH = "//div[@id='content_left']/div";

    /** Selects the title link inside a single result block. */
    private static final String TITLE_LINK_XPATH = ".//h3/a";

    /** Candidate summary locations inside a result block, tried in this order. */
    private static final String[] SUMMARY_XPATHS = {
        ".//div[@class='c-abstract']/html()",
        ".//div[@class='c-row']/html()",
        ".//div[@class='op-tieba-general-main-col op-tieba-general-main-con']/html()",
        ".//div[@class='c-span18 c-span-last']/html()"
    };

    /** Printed in place of the summary when none of {@link #SUMMARY_XPATHS} matches. */
    private static final String NO_SUMMARY_MARKER = "====";

    /** Upper bound for establishing the connection, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Upper bound for a single blocking read, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 10000;

    /**
     * Searches Baidu for a keyword and prints title, link and summary of each result block
     * to standard output.
     *
     * @param args optional; {@code args[0]} is the search keyword, otherwise
     *             {@link #DEFAULT_KEYWORD} is used
     */
    public static void main(String[] args) {
        String keyword = DEFAULT_KEYWORD;
        if (args != null && args.length > 0 && !isEmpty(args[0])) {
            keyword = args[0];
        }

        String url;
        try {
            // The keyword may contain non-ASCII characters, which must be percent-encoded
            // before they are placed in a query string.
            url = SEARCH_URL_PREFIX + URLEncoder.encode(keyword, ENCODING);
        } catch (UnsupportedEncodingException e) {
            System.err.println("Unsupported encoding: " + ENCODING);
            return;
        }

        String html = getHtmlResourceByUrl(url, ENCODING);
        // getHtmlResourceByUrl never returns null; an empty string means nothing was fetched.
        if (isEmpty(html)) {
            System.err.println("No content fetched from " + url);
            return;
        }

        Document doc = Jsoup.parse(html);
        JXDocument page = new JXDocument(doc);
        try {
            List<Object> blocks = page.sel(RESULT_BLOCKS_XPATH);
            if (isEmpty(blocks)) {
                return;
            }
            for (Object block : blocks) {
                printResult(block);
            }
        } catch (XpathSyntaxErrorException e) {
            System.err.println("输入Xpath不合法!");
        }
    }

    /**
     * Prints the sibling index, title markup, title link and summary of one result block.
     * Blocks without a title link are skipped without output.
     *
     * @param block a node selected by {@link #RESULT_BLOCKS_XPATH}
     * @throws XpathSyntaxErrorException if one of the XPath expressions is rejected
     */
    private static void printResult(Object block) throws XpathSyntaxErrorException {
        // The relative XPaths are evaluated against a standalone re-parse of the block so
        // that they only see this block's content.
        JXDocument fragment = new JXDocument(Jsoup.parse(block.toString()));

        List<Object> links = fragment.sel(TITLE_LINK_XPATH);
        if (isEmpty(links) || !(links.get(0) instanceof Element)) {
            // No title link in this block; it has nothing to report.
            return;
        }

        if (block instanceof Element) {
            System.out.println(((Element) block).siblingIndex());
        }
        Element title = (Element) links.get(0);
        System.out.println(title.html());
        System.out.println(title.attr("href"));
        System.out.println(findSummary(fragment));
    }

    /**
     * Returns the first non-empty selection among {@link #SUMMARY_XPATHS}, rendered with
     * {@code List.toString()}, or {@link #NO_SUMMARY_MARKER} when none matches.
     *
     * @param fragment the parsed result block
     * @return the summary text to print
     * @throws XpathSyntaxErrorException if one of the XPath expressions is rejected
     */
    private static String findSummary(JXDocument fragment) throws XpathSyntaxErrorException {
        for (int i = 0; i < SUMMARY_XPATHS.length; i++) {
            List<Object> hits = fragment.sel(SUMMARY_XPATHS[i]);
            if (!isEmpty(hits)) {
                return hits.toString();
            }
        }
        return NO_SUMMARY_MARKER;
    }

    /**
     * Tells whether a value counts as empty.
     *
     * @param obj the value to test
     * @return {@code true} for {@code null}, a {@link List} with no elements, or a
     *         {@link String} that is empty after trimming; {@code false} for anything else
     */
    public static boolean isEmpty(Object obj) {
        if (obj == null) {
            return true;
        }
        if (obj instanceof List) {
            return ((List<?>) obj).isEmpty();
        }
        if (obj instanceof String) {
            return ((String) obj).trim().isEmpty();
        }
        return false;
    }

    /**
     * Downloads the content behind a URL as text.
     *
     * <p>This method is best-effort and does not throw: any failure is reported on standard
     * error and whatever was read up to that point (possibly the empty string) is returned.
     * Every line of the response is terminated with {@code '\n'} in the result.
     *
     * @param url      address to fetch
     * @param encoding charset name used to decode the response body
     * @return the decoded content, never {@code null}
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        StringBuilder buff = new StringBuilder();
        InputStream stream = null;
        try {
            URLConnection uc = new URL(url).openConnection();
            // Without explicit timeouts an unresponsive server blocks the caller forever.
            uc.setConnectTimeout(CONNECT_TIMEOUT_MS);
            uc.setReadTimeout(READ_TIMEOUT_MS);

            stream = uc.getInputStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream, encoding));
            String line;
            while ((line = reader.readLine()) != null) {
                buff.append(line).append('\n');
            }
        } catch (Exception e) {
            // Broad on purpose: callers rely on this method never throwing.
            e.printStackTrace();
            System.err.println("Failed to fetch " + url + ": " + e);
        } finally {
            // Closing the raw stream also covers the case where the reader was never built.
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return buff.toString();
    }
}
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/cn.wanghaomiao/JsoupXpath -->
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>0.3.2</version>
</dependency>