/**
* 此实例用于采集tianya wenda的贴子及回复,组成一个map
*/
package org.apache.nutch.our;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Page parser: extracts a question (title and description) and its replies
 * from the HTML of a Tianya Wenda thread page using jsoup.
 *
 * @author LJ
 *
 */
public class JsoupParse {

    /** CSS selector for the element whose own text is the question title. */
    private static final String TITLE_SELECTOR = "div.wpcpsCSS";

    /**
     * Selector for the question description: a div whose style attribute
     * matches this regular expression (no space after the colon, no trailing
     * semicolon).
     */
    private static final String DESCRIPTION_SELECTOR =
            "div[style~=(margin:2px 5px 3px 4px)]";

    /**
     * Selector for the reply blocks: divs whose style attribute matches this
     * regular expression (space after the colon, trailing semicolon).
     */
    private static final String REPLY_SELECTOR =
            "div[style~=(margin: 2px 5px 3px 4px;)]";

    /**
     * Parses a thread page and collects the question together with its replies.
     *
     * <p>The returned map always holds exactly one entry. Its key is
     * {@code title + "," + description}; its value is the list of reply texts
     * in document order. A title or description that cannot be found in the
     * page contributes an empty string to the key instead of causing a
     * {@link NullPointerException}. The title, description and every reply
     * are also printed to standard output.
     *
     * @param content HTML source of the page; {@code null} is treated as an
     *                empty document
     * @return a single-entry map from "title,description" to the reply texts
     */
    public static Map<String,List<String>> parser(String content) {
        Map<String,List<String>> map = new HashMap<String,List<String>>();
        Document doc = Jsoup.parse(content == null ? "" : content);

        // Title: only the element's own text, excluding text of child elements.
        // first() yields null when the selector matches nothing, so guard it.
        Element titleElement = doc.select(TITLE_SELECTOR).first();
        String name = titleElement == null ? "" : titleElement.ownText();
        System.out.println("标题" + name);

        // Description of the question (full text, including descendants).
        Element descriptionElement = doc.select(DESCRIPTION_SELECTOR).first();
        String description = descriptionElement == null ? "" : descriptionElement.text();
        System.out.println("标题内容" + description);

        String mapString = name + "," + description;

        // Replies: gather the text of every matching block into a list.
        List<String> rs = new ArrayList<String>();
        Elements replys = doc.select(REPLY_SELECTOR);
        for (Element reply : replys) {
            String ry = reply.text();
            rs.add(ry);
            System.out.println("回复内容: " + ry);
        }

        // Store the question together with its replies.
        map.put(mapString, rs);
        return map;
        // NOTE: extracting the thread's category (the "wpfitCSS wpfilCSS"
        // anchor linking to "label?lid=...") is not implemented.
    }

    /**
     * Demo entry point: downloads one sample thread page and parses it,
     * printing the extracted title, description and replies.
     *
     * @param args unused
     * @throws Exception if downloading the page fails
     */
    public static void main(String[] args) throws Exception {
        // Alternative sample thread:
        // http://wenda.tianya.cn/wenda/thread?tid=15krbkptkho99qirlp0a5rf2dlrqk443dkhj7
        String url2 = "http://wenda.tianya.cn/wenda/thread?tid=15kra997o6kb9p17pvr5fpaj5j8n2ub65sgem";
        // FileDownLoader is declared elsewhere in the project; presumably it
        // fetches the page body decoded with the given charset (confirm).
        String content = FileDownLoader.doGet(url2, "UTF-8");
        parser(content);
    }
}
// Note: consult the jsoup API documentation for more in-depth page parsing.