用 jsoup 工具编写 Java 爬虫就变得极为简单:可以直接获取 DOM,并通过操作 DOM 的方式读取其中的数据。下面是我写的一些方法:
package com.soft;
import java.io.IOException;
//引入相应jar包
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Jsouptest {

	// Document parsed once in main() and shared by all methods.
	// NOTE(review): static mutable state — not thread-safe; fine for a
	// single-threaded demo crawler.
	static Document doc;

	public static void main(String[] args) throws IOException {
		// Fetch the target page and parse it into a DOM document.
		doc = Jsoup.connect("https://www.aliyun.com/jiaocheng/787161.html?spm=5176.100033.2.11.3cce65065LUEW3").get();
		gettitle();
		gettextfromarticlecontent();
	}

	/** Prints the page title and the full visible text of the document. */
	public static void gettitle() {
		String title = doc.title();
		System.out.println("title is: " + title);
		String content = doc.text();
		System.out.println("content is:" + content);
	}

	/** Prints src/height/width/alt for every image whose src ends in png/jpg/jpeg/gif. */
	public void getpic() {
		Elements images = doc.select("img[src~=(?i)\\.(png|jpe?g|gif)]");
		for (Element image : images) {
			System.out.println("src : " + image.attr("src"));
			System.out.println("height : " + image.attr("height"));
			System.out.println("width : " + image.attr("width"));
			System.out.println("alt : " + image.attr("alt"));
		}
	}

	/** Prints text and href of every absolute (http/https) link on the page. */
	public void geturl() {
		Elements links = doc.select("a[href]");
		System.out.println("连接列表");
		for (Element link : links) {
			String href = link.attr("href");
			// BUG FIX: original compared substring(0, 4) with == (reference
			// equality, an anti-pattern even though an || .equals() masked it).
			// startsWith also subsumes the manual length check.
			if (href.startsWith("http")) {
				System.out.println(link.text() + " : " + href);
			}
		}
	}

	/** Prints the text of every element with CSS class "text". */
	public void gettextfromclasstext() {
		Elements texts = doc.getElementsByClass("text");
		for (Element text : texts) {
			System.out.println(text.text());
		}
	}

	/** Prints the outer HTML of every &lt;li&gt; element that contains no &lt;a&gt; tag. */
	public void gettextfromlitext() {
		Elements lis = doc.select("li");
		for (Element li : lis) {
			String html = li.toString();
			// Keep only plain-text list items; skip those wrapping a link.
			if (!html.contains("<a ")) {
				System.out.println(html);
			}
		}
	}

	/** Prints the text of every element with CSS class "article-content". */
	public static void gettextfromarticlecontent() {
		Elements texts = doc.getElementsByClass("article-content");
		for (Element text : texts) {
			System.out.println(text.text());
		}
	}

	/** Prints the text of every element with CSS class "content". */
	public static void gettextfromcontent() {
		Elements texts = doc.getElementsByClass("content");
		for (Element text : texts) {
			System.out.println(text.text());
		}
	}

	/**
	 * Prints the page's favicon URL. Tries a &lt;link&gt; in &lt;head&gt; whose
	 * href ends in .ico or .png first, then falls back to
	 * &lt;meta itemprop="image"&gt;; prints "Not Found" if neither exists.
	 */
	public void getfavImage() {
		String favImage = "Not Found";
		Element element = doc.head().select("link[href~=.*\\.(ico|png)]").first();
		if (element != null) {
			favImage = element.attr("href");
		} else {
			// Fallback: some pages expose their icon via a meta itemprop tag.
			element = doc.head().select("meta[itemprop=image]").first();
			if (element != null) {
				favImage = element.attr("content");
			}
		}
		System.out.println("favImage" + favImage);
	}
}
针对不同的网站,可以基于上述方法编写专门的爬虫:先获取数据,再对数据进行过滤,最后保存数据。
我的资源里也提供了相应的 jsoup jar 包。