// Copyright notice: when reposting, please credit the source: http://blog.csdn.net/update7?viewmode=contents https://blog.csdn.net/update7/article/details/80955633
package util;
import org.jsoup.*;
import org.jsoup.select.*;
import beans.InfoBeans;
import org.jsoup.nodes.*;
import java.util.List;
import java.util.ArrayList;
import java.io.*;
import java.net.*;
/**
 * Helper for a small web crawler: downloads HTML pages with jsoup, keeps a
 * copy of them on the local disk, and extracts province entries from a page.
 */
public class HtmlClass {

    /** Exclusive upper bound, in milliseconds, of the random pause taken before a web request. */
    private static final int MAX_DELAY_MS = 1000;

    /** Prefix prepended to every file name written by {@link #saveFileToLocal(String, String)}. */
    private static final String LOCAL_ROOT = "e:/";

    /** Sub-directory of {@link #LOCAL_ROOT} in which cached HTML pages are looked up. */
    private static final String CACHE_DIR = "htmls/";

    /**
     * Downloads the page at {@code url} and parses it into a jsoup document.
     *
     * <p>A POST request is tried first; if it throws any exception, the stack
     * trace is printed and a plain GET request is tried instead.
     *
     * @param url address of the page to download
     * @return the parsed document, or {@code null} when the GET fallback fails
     *         with an {@link IOException}
     */
    public Document getHtmlTextByWeb(String url) {
        // Space requests out a little so the target site is less likely to block us.
        pauseBeforeRequest();
        try {
            return Jsoup.connect(url)
                    .data("query", "JavaEE")
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(300000)
                    .post();
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            // POST did not work; fall back to fetching the page with GET.
            return Jsoup.connect(url).timeout(50000).get();
        } catch (IOException e1) {
            e1.printStackTrace();
            return null;
        }
    }

    /**
     * Sleeps for a random time below {@link #MAX_DELAY_MS} milliseconds.
     * If the thread is interrupted, the interrupt flag is restored and the
     * method returns immediately.
     */
    private static void pauseBeforeRequest() {
        long delayMs = (long) (Math.random() * MAX_DELAY_MS);
        try {
            Thread.sleep(delayMs);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Returns the document stored locally as {@code e:/htmls/<name>.html},
     * downloading and caching it first when no usable local copy exists.
     *
     * <p>The local file is parsed with the GBK charset. When it is missing or
     * cannot be read, the page is fetched from {@code path} and, if the fetch
     * succeeded, saved under the same local file name for later calls.
     *
     * @param name base name (without extension) of the local HTML file
     * @param path URL to download the page from when there is no local copy
     * @return the local or freshly downloaded document, or {@code null} when
     *         neither could be obtained
     */
    public Document getHtmlDocumentByPath(String name, String path) {
        String cacheRelative = CACHE_DIR + name + ".html";
        File cached = new File(LOCAL_ROOT + cacheRelative);

        if (cached.isFile()) {
            try {
                Document local = Jsoup.parse(cached, "GBK");
                if (!local.children().isEmpty()) {
                    System.out.print("文档已存在..");
                    return local;
                }
            } catch (IOException e1) {
                // Unreadable local copy: report it and fall through to the download.
                e1.printStackTrace();
            }
        }

        System.out.print("本地文件未找到,从网络上进行加载。。");
        Document doc = this.getHtmlTextByWeb(path);
        if (doc != null) {
            // Store the page where the lookup above expects to find it.
            this.saveFileToLocal(path, cacheRelative);
        }
        return doc;
    }

    /**
     * Downloads the resource at {@code url} and writes its bytes to the local
     * file {@code "e:/" + name}.
     *
     * <p>Missing parent directories of the destination are created. Failures
     * are reported by printing the stack trace; nothing is thrown to the
     * caller. The input stream is opened before the output file so that a bad
     * or unreachable URL does not truncate an existing file.
     *
     * @param url  address of the resource to download
     * @param name file name, relative to {@code e:/}, to store the resource under
     */
    public void saveFileToLocal(String url, String name) {
        File dest = new File(LOCAL_ROOT + name);
        File parent = dest.getParentFile();
        if (parent != null) {
            // Result deliberately ignored: if the directory is still missing,
            // opening the output stream below fails and is reported there.
            parent.mkdirs();
        }

        try (InputStream in = new BufferedInputStream(new URL(url).openStream());
             OutputStream out = new BufferedOutputStream(new FileOutputStream(dest))) {
            byte[] buffer = new byte[1024 * 24];
            int len;
            while ((len = in.read(buffer, 0, buffer.length)) != -1) {
                out.write(buffer, 0, len);
            }
        } catch (IOException e) {
            // Covers MalformedURLException, connection errors and file errors alike.
            e.printStackTrace();
        }
    }

    /**
     * Selects the elements of {@code doc} that match a selector.
     *
     * @param doc      document to search; must not be {@code null}
     * @param attrName selector passed unchanged to {@code Document.select},
     *                 e.g. {@code ".provincetr"}
     * @return the matching elements
     */
    public Elements getEleByAttr(Document doc, String attrName) {
        return doc.select(attrName);
    }

    /**
     * Downloads the page at {@code url} and collects one {@link InfoBeans} per
     * province entry found in it.
     *
     * <p>For every element matching {@code type}, each child that itself has
     * at least one child contributes an entry: the first grandchild's own text
     * becomes the province name and its absolute {@code href} becomes the link.
     *
     * @param name unused by this method
     * @param url  address of the page listing the provinces
     * @param type selector identifying the elements that contain the entries
     * @return the collected entries; empty when the page could not be loaded
     *         or nothing matched
     */
    public List<InfoBeans> getProvince(String name, String url, String type) {
        List<InfoBeans> provinces = new ArrayList<>();
        Document doc = this.getHtmlTextByWeb(url);
        if (doc == null) {
            return provinces;
        }

        for (Element row : this.getEleByAttr(doc, type)) {
            for (Element cell : row.children()) {
                Element link = cell.children().first();
                if (link == null) {
                    // Cell without child elements: nothing to extract.
                    continue;
                }
                InfoBeans info = new InfoBeans();
                info.setUrl(url);
                info.setProvince(link.ownText());
                info.setAlink(link.attr("abs:href"));
                info.setAttrValue(type);
                provinces.add(info);
            }
        }
        return provinces;
    }

    /**
     * Manual smoke run: loads the province entries from the 2015 index page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";
        HtmlClass hc = new HtmlClass();
        List<InfoBeans> ps = hc.getProvince("", url, ".provincetr");
    }
}