/**
 * Mutable value holder describing one parsed HTML element: its tag name, its
 * inner text, its attributes and a count.
 *
 * @param <T> type of the attribute values stored in {@link #getAttribute()}
 */
public class HTMLBean<T> {

    /** Element (tag) name. */
    private String eleName;

    /**
     * Count associated with the element. {@code HtmlUtil.getHTMLBean} stores
     * the number of attributes it parsed here.
     */
    private int eleCount;

    /** Element value, i.e. the text found between the opening and closing tag. */
    private String result;

    /** Element attributes, keyed by attribute name. */
    private Map<String, T> attribute;

    /** @return the element (tag) name, or {@code null} if none was set */
    public String getEleName() {
        return this.eleName;
    }

    /** @param eleName the element (tag) name to store */
    public void setEleName(String eleName) {
        this.eleName = eleName;
    }

    /** @return the stored count; {@code 0} until {@link #setEleCount(int)} is called */
    public int getEleCount() {
        return this.eleCount;
    }

    /** @param eleCount the count to store */
    public void setEleCount(int eleCount) {
        this.eleCount = eleCount;
    }

    /** @return the element value, or {@code null} if none was set */
    public String getResult() {
        return this.result;
    }

    /** @param result the element value to store */
    public void setResult(String result) {
        this.result = result;
    }

    /**
     * @return the very map instance passed to {@link #setAttribute(Map)} (no
     *         copy is made), or {@code null} if none was set
     */
    public Map<String, T> getAttribute() {
        return this.attribute;
    }

    /** @param attribute the attribute map to store; kept by reference */
    public void setAttribute(Map<String, T> attribute) {
        this.attribute = attribute;
    }
}
/**
 * Callback used to decide whether a parsed {@link HTMLBean} should be kept.
 *
 * @param <T> attribute value type of the beans being tested
 */
public interface ElementFilter<T> {

    /**
     * Tests a single parsed element.
     *
     * @param bean the element to test
     * @return the implementation's verdict; {@code HtmlUtil.getHTMLBeanList}
     *         keeps the bean only when this is {@code true}
     */
    boolean filter(HTMLBean<T> bean);
}
public class HtmlUtil { //patternString1 = "<(\\w+)\\s+?(type\\s*?=[^>]+)?\\s+?(src\\s*?=[^>]+)?>(.*?)</\\1>"; private static final String patternString = "<(\\w+)\\s+(\\w+\\s*=[^>]+)?>(.*?)</\\1>"; private static final Pattern pattern = Pattern.compile(patternString,Pattern.DOTALL); private static final String patternString1 = "(\\w+?)=[\"|\']?(.*?)[\"|\']?\\s+?"; private static final Pattern pattern1 = Pattern.compile(patternString1,Pattern.DOTALL); public static HTMLBean<String> getHTMLBean(String content){ HTMLBean<String> bean = null; Matcher matcher = pattern.matcher(content);; if(matcher!=null && matcher.find()) { bean = new HTMLBean<String>(); bean.setEleName(matcher.group(1)); String result = matcher.group(2); bean.setResult(matcher.group(3)); HashMap<String,String> attrMap = new HashMap<String, String>(); result = result + " "; Matcher m = pattern1.matcher(result); int len = 0; while(m!=null && m.find()) { attrMap.put(m.group(1),m.group(2)); len = len + 1; } //String[] attr = result.split("\\s+?\\w+?="); /* int len = attr.length; for (int i = 0; i < len; i++) { String temp = attr[i].replaceAll("\"|'", ""); int index = temp.indexOf("="); if (index > -1) { attrMap.put(temp.substring(0, index),temp.substring(index + 1, temp.length())); } else { attrMap.put(temp,""); } //String[] temp = attr[i].split("=\\*?[\"|']"); //System.out.println(attr[i]); //attrMap.put(temp[0],temp.length > 1 ? 
temp[1] : ""); }*/ bean.setAttribute(attrMap); bean.setEleCount(len); } return bean; } public static List<HTMLBean<String>> getHTMLBeanList(String path, ElementFilter<String> filter) throws IOException { LinkedList<HTMLBean<String>> link = new LinkedList<HTMLBean<String>>(); InputStream fs = new FileInputStream(path); InputStreamReader isr = new InputStreamReader(fs, "UTF-8"); BufferedReader br = new BufferedReader(isr); String r = null; while ((r = br.readLine()) != null) { HTMLBean<String> bean = getHTMLBean(r); if (bean != null && filter.filter(bean)) { link.add(bean); } } return link; } @Test public void getHTMLBeanContentList_test() throws IOException{ String p = "D:\\Users\\lewking\\Desktop\\test.html"; List<HTMLBean<String>> link = getHTMLBeanList(p,new ElementFilter<String>(){ @Override public boolean filter(HTMLBean<String> bean) { //过滤 A 标签 return "a".equals(bean.getEleName().toLowerCase()); } }); System.out.println("解析完成.............."); for(HTMLBean<String> bean : link){ System.out.println("< " + bean.getEleName() +" >"); System.out.println("%%%%%%%%: " + bean.getResult()); Map<String,String> m = bean.getAttribute(); for(Iterator<Map.Entry<String, String>> entry = m.entrySet().iterator();entry.hasNext();){ Entry<String, String> e = entry.next(); System.out.println(e.getKey() +" : "+ e.getValue()); } } } }