思路是抽取页面所有链接,根据网站host以及一些逻辑分析,剔除掉不必要的网址。计算每个xpath对应的链接数,取其中最大值。代码依赖于jsoup、httpclient
一、抽取网页所有链接并进行一些过滤
1 public static ArrayList<String> getList(String url, String html) { 2 3 ArrayList<String> list = new ArrayList<String>(); 4 String host = url.substring(url.indexOf("://") + 3, url.indexOf("/", url.indexOf("://") + 3)); 5 6 if (html.toLowerCase().contains("<recordset>")) { 7 StringBuffer bf = new StringBuffer("<body><ul>"); 8 Pattern pattern = Pattern.compile("<record[\\s\\S]*?(<a[\\s\\S]*?</a>)[\\s\\S]*?</record>", Pattern.CASE_INSENSITIVE); 9 Matcher match = pattern.matcher(html); 10 while(match.find()) { 11 bf.append("<li>" + match.group(1) + "</li>\n"); 12 } 13 bf.append("</ul></body>"); 14 Document document = Jsoup.parse(bf.toString()); 15 document.setBaseUri(url); 16 Elements a = document.getElementsByTag("a"); 17 for(Node e : a) { 18 list.add("<record[\\\\s\\\\S]*?(<a[\\\\s\\\\S]*?</a>)[\\\\s\\\\S]*?</record> - href: " + e.attr("abs:href")); 19 } 20 } else { 21 Document document = Jsoup.parse(Jsoup.parse(html).body().html()); 22 document.setBaseUri(url); 23 Elements a = document.getElementsByTag("a"); 24 for (Node e : a) { 25 boolean flag = true; 26 if (e.attr("abs:href") != "" && !e.attr("abs:href").endsWith("/") 27 && (e.attr("abs:href").contains(host) || e.attr("abs:href").contains(":80"))) { 28 String xpath = "/a[@href] - href: " + e.attr("abs:href"); 29 while (true) { 30 if (e.parentNode().nodeName() == "body") { 31 xpath = "//body" + xpath; 32 break; 33 } else { 34 e = e.parentNode(); 35 if (e.attr("class").contains("hide") || e.attr("style").contains("display:none") 36 || e.attr("class").contains("head")) { 37 flag = false; 38 } else { 39 if (e.nodeName().toLowerCase() == "div") { 40 xpath = "/" + e.nodeName() 41 + (e.attr("class") == "" ? "" : "[@class='" + e.attr("class") + "']") 42 + xpath; 43 } else { 44 xpath = "/" + e.nodeName() + xpath; 45 } 46 47 } 48 } 49 } 50 if (flag) { 51 list.add(xpath); 52 } 53 } 54 } 55 } 56 return list; 57 }
二、对抽取出的列表进一步优化
// Group extracted entries ("xpath - href: url") into map: xpath -> hrefs.
// Lookup-or-create per key, instead of relying on identical keys being
// adjacent in the list: the adjacency-based version silently replaced the
// accumulated list whenever a key reappeared non-consecutively.
for (int i = 0; i < list.size(); i++) {
    String[] arr = list.get(i).split(" - href: ");
    urls = map.get(arr[0]);
    if (urls == null) {
        urls = new ArrayList<String>();
        map.put(arr[0], urls);
    }
    urls.add(arr[1]);
}

// Prune xpath groups whose href lengths vary too much: links in a real list
// page are uniformly sized, so any href more than 10 chars longer than the
// group's shortest marks the whole group as noise.
for (String key : map.keySet()) {
    int minLength = Integer.MAX_VALUE; // shortest href in this group
    for (String link : map.get(key)) {
        minLength = Math.min(minLength, link.length());
    }
    for (String link : map.get(key)) {
        if (link.length() > minLength + 10) {
            removeList.add(key);
            break;
        }
    }
}
// Remove the noisy keys collected above (cannot remove while iterating).
for (String key : removeList) {
    map.remove(key);
}
三、分析xpath对应的链接数
1 // 按值排序map 2 for (String key : map.keySet()) { 3 if (sortMap.containsKey(map.get(key).size())) { 4 sortMap.get(map.get(key).size()).add(key); 5 } else { 6 ArrayList<String> valueList = new ArrayList<String>(); 7 valueList.add(key); 8 sortMap.put(map.get(key).size(), valueList); 9 } 10 } 11 12 for (Integer i : sortMap.keySet()) { 13 key_list.add(i); 14 } 15 Collections.sort(key_list); 16 17 // 取最大值的xpath 18 if (sortMap.get(key_list.get(key_list.size() - 1)).size() > 0) { 19 for (String str : sortMap.get(key_list.get(key_list.size() - 1))) { 20 xpath += str + "|"; 21 } 22 xpath = xpath.substring(0, xpath.length() - 1); 23 } else { 24 xpath = ""; 25 }
样本数据150条,经测试成功率在85%以上。