Java 简单爬虫

所有代码下载
package util;
import org.jsoup.*;
import org.jsoup.select.*;

import beans.InfoBeans;

import org.jsoup.nodes.*;

import java.util.List;
import java.util.ArrayList;
import java.io.*;
import java.net.*;

/**
 * Helper for a small jsoup-based crawler: fetches HTML documents from the
 * network, probes a local directory for previously stored pages, copies a
 * remote resource to disk and extracts link entries from a fetched page.
 */
public class HtmlClass {

	/** Exclusive upper bound, in milliseconds, of the random pause taken before each request. */
	private static final int MAX_DELAY_MILLIS = 1000;

	/** Timeout, in milliseconds, of the initial POST request. */
	private static final int POST_TIMEOUT_MILLIS = 300000;

	/** Timeout, in milliseconds, of the fallback GET request. */
	private static final int GET_TIMEOUT_MILLIS = 50000;

	/** Size, in bytes, of the buffer used when copying a remote resource to disk. */
	private static final int COPY_BUFFER_SIZE = 1024 * 24;

	/**
	 * Fetches the document at {@code url} from the network.
	 *
	 * <p>The method first pauses for a random time below
	 * {@link #MAX_DELAY_MILLIS} so that consecutive requests are spaced out,
	 * then issues a POST request and, if that fails for any reason, retries
	 * with a plain GET.
	 *
	 * @param url address of the page to fetch
	 * @return the parsed document, or {@code null} if both requests failed or
	 *         the calling thread was interrupted while pausing
	 */
	public Document getHtmlTextByWeb(String url)
	{
		// Space requests out a little to reduce the chance of being blocked.
		// (The previous countdown loop finished instantly and gave no delay.)
		try
		{
			Thread.sleep((long) (Math.random() * MAX_DELAY_MILLIS));
		}
		catch (InterruptedException e)
		{
			// Keep the interrupt visible to the caller and abandon the fetch.
			Thread.currentThread().interrupt();
			return null;
		}

		// NOTE(review): the form field, user agent and cookie below look like
		// placeholder values; confirm the target site actually needs them.
		try
		{
			return Jsoup.connect(url).data("query","JavaEE").userAgent("Mozilla")
					.cookie("auth","token").timeout(POST_TIMEOUT_MILLIS).post();
		}
		catch (Exception e)
		{
			// Any POST failure is deliberately tolerated: fall back to GET below.
			e.printStackTrace();
		}

		try
		{
			return Jsoup.connect(url).timeout(GET_TIMEOUT_MILLIS).get();
		}
		catch (Exception e)
		{
			// Covers I/O failures as well as unchecked errors such as an
			// invalid URL, so every failure is reported the same way: null.
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Looks for a locally stored copy of a page and, when none can be read,
	 * loads the page from the network and stores it on disk.
	 *
	 * <p>The local copy is looked up at {@code e:/htmls/<name>.html} and
	 * parsed as GBK.
	 *
	 * <p>NOTE(review): when a local copy exists this method prints a message
	 * and returns {@code null} instead of the parsed document. That looks
	 * like a deliberate "already downloaded, skip" signal; confirm against
	 * the callers before changing it.
	 *
	 * <p>NOTE(review): {@link #saveFileToLocal(String, String)} writes to
	 * {@code e:/<name>}, which is not the path probed here, so the copy saved
	 * by this method is not found by a later call; confirm which location is
	 * intended.
	 *
	 * @param name base name of the local file, without directory or extension
	 * @param path URL the page is loaded from when no local copy can be read
	 * @return {@code null} if a local copy exists; otherwise the document
	 *         loaded from the network, which is itself {@code null} when the
	 *         download failed
	 */
	public Document getHtmlDocumentByPath(String name,String path)
	{
		File file = new File("e:/htmls/" + name + ".html");

		// Check for the file explicitly instead of relying on a
		// NullPointerException from a failed parse to detect its absence.
		if (file.isFile())
		{
			try
			{
				Document local = Jsoup.parse(file,"GBK");
				if (!local.children().isEmpty())
				{
					System.out.print("文档已存在..");
					return null;
				}
				return local;
			}
			catch (IOException e)
			{
				// Unreadable local copy: report it and load from the network.
				e.printStackTrace();
			}
		}

		System.out.print("本地文件未找到，从网络上进行加载。。");
		Document doc = this.getHtmlTextByWeb(path);
		// Store the remote resource on disk as well.
		this.saveFileToLocal(path,name);
		return doc;
	}

	/**
	 * Downloads the resource at {@code url} and writes its bytes to
	 * {@code e:/<name>}.
	 *
	 * <p>Failures (malformed URL, unreachable resource, unwritable target)
	 * are reported on standard error and otherwise ignored. The network
	 * stream is opened before the target file, so an unreachable URL does
	 * not create or truncate the file. Both streams are always closed.
	 *
	 * @param url  address of the resource to download
	 * @param name file name, relative to {@code e:/}, to store the bytes under
	 */
	public void saveFileToLocal(String url,String name)
	{
		File dest = new File("e:/" + name);
		try (InputStream in = new URL(url).openStream();
				OutputStream out = new FileOutputStream(dest))
		{
			byte[] buffer = new byte[COPY_BUFFER_SIZE];
			int len;
			while ((len = in.read(buffer, 0, buffer.length)) != -1)
			{
				out.write(buffer, 0, len);
			}
		}
		catch (IOException e)
		{
			// MalformedURLException and FileNotFoundException are IOExceptions too.
			e.printStackTrace();
		}
	}

	/**
	 * Selects elements from a document.
	 *
	 * @param doc      document to search; must not be {@code null}
	 * @param attrName selector passed unchanged to {@code Document.select},
	 *                 for example {@code ".provincetr"}
	 * @return the elements matched by the selector
	 */
	public Elements getEleByAttr(Document doc , String attrName)
	{
		return doc.select(attrName);
	}

	/**
	 * Fetches the page at {@code url} and builds one {@link InfoBeans} per
	 * link-bearing child of the elements matched by {@code type}.
	 *
	 * <p>For every child of every matched element, the child's first child
	 * element supplies the entry: its own text is stored through
	 * {@code setProvince} and its absolute {@code href} through
	 * {@code setAlink}. Children without any child element are skipped.
	 *
	 * @param name currently unused
	 * @param url  address of the page to fetch; also stored in every entry
	 * @param type selector used to locate the elements; also stored in every
	 *             entry through {@code setAttrValue}
	 * @return the collected entries; empty (never {@code null}) when the page
	 *         could not be fetched or nothing matched
	 */
	public List<InfoBeans> getProvince(String name , 
			String url, String type)
	{
		List<InfoBeans> provinces = new ArrayList<>();
		Document doc = this.getHtmlTextByWeb(url);
		if (doc == null)
		{
			return provinces;
		}

		for (Element matched : this.getEleByAttr(doc, type))
		{
			for (Element child : matched.children())
			{
				// Look the first grandchild up once; null means there is none.
				Element link = child.children().first();
				if (link == null)
				{
					continue;
				}
				InfoBeans info = new InfoBeans();
				info.setUrl(url);
				info.setProvince(link.ownText());
				// "abs:" makes jsoup resolve the href against the document's base URI.
				info.setAlink(link.attr("abs:href"));
				info.setAttrValue(type);
				provinces.add(info);
			}
		}
		return provinces;
	}

	/**
	 * Demo entry point: collects the entries of the 2015 index page. The
	 * result is currently discarded.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";
		HtmlClass hc = new HtmlClass();
		hc.getProvince("", url, ".provincetr");
	}
}
猜你喜欢