htmlparser1.6
用它提取 tr 时似乎有些问题:直接用 CSS selector 提取出来的 tr 有冗余,有的 tr 里面还嵌套着 tr。
所以这里多做了一步处理:用 NotFilter + HasChildFilter 过滤掉内部还包含 tr 的那些 tr,只保留最内层的行。请看代码。
/**
 * Parses the document at {@code url} and collects the table rows matched by
 * the CSS selector {@code ".className tr"} into an ordered key/value map.
 *
 * <p>For each matched row, the plain text of the first {@code td} becomes the
 * key and the plain text of the second {@code td} becomes the value. Rows that
 * yield fewer than two {@code td} cells are skipped. If the same key occurs
 * more than once, the later value replaces the earlier one.
 *
 * <p>Parsing is best-effort: if a {@link ParserException} is thrown, its stack
 * trace is printed and the entries collected so far are returned.
 *
 * @param url resource location handed to {@code new Parser(String)}
 * @return map of first-cell text to second-cell text, in document order;
 *         never {@code null}, possibly empty
 */
public static Map<String, String> parseList(String url) {
    Map<String, String> rlt = new LinkedHashMap<String, String>();

    // The plain ".className tr" selector returns redundant rows (a tr that
    // contains another tr), so keep only rows that have no tr child.
    NodeFilter rowFilter = new AndFilter(
            new CssSelectorNodeFilter(".className tr"),
            new NotFilter(new HasChildFilter(new CssSelectorNodeFilter("tr"))));
    // Loop-invariant: the same cell filter is applied to every row.
    NodeFilter cellFilter = new CssSelectorNodeFilter("td");

    try {
        NodeList rows = new Parser(url).extractAllNodesThatMatch(rowFilter);
        for (int i = 0; i < rows.size(); i++) {
            Node tr = rows.elementAt(i);
            // Re-parse the row's own HTML so the cell lookup is scoped to this row.
            Parser rowParser = new Parser(tr.toHtml());
            NodeList tds = rowParser.extractAllNodesThatMatch(cellFilter);

            // Header, spacer or otherwise malformed rows do not have a
            // key cell and a value cell; skip them instead of failing on
            // a missing element.
            if (tds.size() < 2) {
                continue;
            }

            String key = tds.elementAt(0).toPlainTextString();
            String value = tds.elementAt(1).toPlainTextString();
            rlt.put(key, value);
        }
    } catch (ParserException e) {
        // Deliberate best-effort: report the failure and return what was collected.
        e.printStackTrace();
    }
    return rlt;
}
考虑一下