用 jsoup 工具编写 Java 爬虫就变得极为简单:可以直接获取 DOM,并通过操作 DOM 的方式读取其中的数据。下面是我写的一些方法:
package com.soft;
import java.io.IOException;
//引入相应jar包
import org.jsoup.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Jsouptest {

	// Document parsed once in main() and shared by all methods.
	// NOTE(review): static mutable state — not thread-safe; fine for a
	// single-threaded demo crawler.
	static Document doc;

	public static void main(String[] args) throws IOException {
		// Fetch the target page and parse it into a DOM document.
		doc = Jsoup.connect("https://www.aliyun.com/jiaocheng/787161.html?spm=5176.100033.2.11.3cce65065LUEW3").get();
		gettitle();
		gettextfromarticlecontent();
	}

	/** Prints the page title and the full visible text of the document. */
	public static void gettitle() {
		String title = doc.title();
		System.out.println("title is: " + title);
		String content = doc.text();
		System.out.println("content is:" + content);
	}

	/** Prints src/height/width/alt for every image whose src ends in png/jpg/jpeg/gif. */
	public void getpic() {
		Elements images = doc.select("img[src~=(?i)\\.(png|jpe?g|gif)]");
		for (Element image : images) {
			System.out.println("src : " + image.attr("src"));
			System.out.println("height : " + image.attr("height"));
			System.out.println("width : " + image.attr("width"));
			System.out.println("alt : " + image.attr("alt"));
		}
	}

	/** Prints text and href of every absolute (http/https) link on the page. */
	public void geturl() {
		Elements links = doc.select("a[href]");
		System.out.println("连接列表");
		for (Element link : links) {
			String href = link.attr("href");
			// BUG FIX: original compared substring(0, 4) with == (reference
			// equality, an anti-pattern even though an || .equals() masked it).
			// startsWith also subsumes the manual length check.
			if (href.startsWith("http")) {
				System.out.println(link.text() + " : " + href);
			}
		}
	}

	/** Prints the text of every element with CSS class "text". */
	public void gettextfromclasstext() {
		Elements texts = doc.getElementsByClass("text");
		for (Element text : texts) {
			System.out.println(text.text());
		}
	}

	/** Prints the outer HTML of every &lt;li&gt; element that contains no &lt;a&gt; tag. */
	public void gettextfromlitext() {
		Elements lis = doc.select("li");
		for (Element li : lis) {
			String html = li.toString();
			// Keep only plain-text list items; skip those wrapping a link.
			if (!html.contains("<a ")) {
				System.out.println(html);
			}
		}
	}

	/** Prints the text of every element with CSS class "article-content". */
	public static void gettextfromarticlecontent() {
		Elements texts = doc.getElementsByClass("article-content");
		for (Element text : texts) {
			System.out.println(text.text());
		}
	}

	/** Prints the text of every element with CSS class "content". */
	public static void gettextfromcontent() {
		Elements texts = doc.getElementsByClass("content");
		for (Element text : texts) {
			System.out.println(text.text());
		}
	}

	/**
	 * Prints the page's favicon URL. Tries a &lt;link&gt; in &lt;head&gt; whose
	 * href ends in .ico or .png first, then falls back to
	 * &lt;meta itemprop="image"&gt;; prints "Not Found" if neither exists.
	 */
	public void getfavImage() {
		String favImage = "Not Found";
		Element element = doc.head().select("link[href~=.*\\.(ico|png)]").first();
		if (element != null) {
			favImage = element.attr("href");
		} else {
			// Fallback: some pages expose their icon via a meta itemprop tag.
			element = doc.head().select("meta[itemprop=image]").first();
			if (element != null) {
				favImage = element.attr("content");
			}
		}
		System.out.println("favImage" + favImage);
	}
}
针对不同的网站,可以基于上述方法编写专门的爬虫:先获取数据,再对数据进行过滤,最后保存数据。
我的资源里也提供了相应的 jsoup jar 包。