package com.shishike.susie.utility; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; public class JsoupTool { ArrayList<String> baseUrl = new ArrayList<>(); ArrayList<String> addedUrl = new ArrayList<>(); public ArrayList<String> getAllUrls(String urls) { urls="http://jira.shishike.com/secure/Dashboard.jspa?selectPageId=10700"; ArrayList<String> allUrls = new ArrayList<>(); ArrayList<String> baseUrls = getBaseUrls(urls); for (String baseUrl : baseUrls) { allUrls.addAll(getImportsUrl(baseUrl)); allUrls.addAll(getMediaUrl(baseUrl)); allUrls.addAll(getLinkUrl(baseUrl)); } return allUrls; } private ArrayList<String> getBaseUrls(String urls) { addedUrl.add(urls); ArrayList<String> allUrls = new ArrayList<>(); ArrayList<String> baseUrl = new ArrayList<>(); allUrls = getLinkUrl(urls); for (String url : allUrls) { if (url.contains(".html") || url.endsWith("/")) baseUrl.add(url); } for (int i = 0; i < baseUrl.size() - 1; i++) { for (int j = i + 1; j < baseUrl.size() - 1; j++) { if (baseUrl.get(j).equals(baseUrl.get(i))) baseUrl.remove(j); } if (!baseUrl.get(i).contains("xxx.com")) baseUrl.remove(i); } this.baseUrl.addAll(baseUrl); for (int i = 0; i < baseUrl.size() - 1; i++) { baseUrl.remove(0); String url = baseUrl.get(i); if (!addedUrl.contains(url) && url.contains("xxx.com")) { // System.out.println("begin debug"); // System.out.println(url); // System.out.println("end debug"); getBaseUrls(url); // addedUrl.add(url); } } for (int i = 0; i < this.baseUrl.size() - 1; i++) { for (int j = i + 1; j < this.baseUrl.size() - 1; j++) { if (this.baseUrl.get(j).equals(this.baseUrl.get(i))) { this.baseUrl.remove(j); } } if (!this.baseUrl.get(i).contains("xxx.com")) this.baseUrl.remove(i); } return this.baseUrl; } private ArrayList<String> getMediaUrl(String urls) { ArrayList<String> allUrls = new ArrayList<>(); Document doc; try { doc = Jsoup.connect(urls).get(); Elements media = doc.select("[src]"); for (Element src : media) { String url = src.attr("abs:src"); if (!url.isEmpty()) { allUrls.add(url); } } } catch (IOException e) { e.printStackTrace(); } return allUrls; } private ArrayList<String> getImportsUrl(String urls) { ArrayList<String> allUrls = new ArrayList<>(); Document doc; try { doc = Jsoup.connect(urls).get(); Elements imports = doc.select("link[href]"); for (Element src : imports) { String url = src.attr("abs:href"); if (!url.isEmpty()) { allUrls.add(url); } } } catch (IOException e) { e.printStackTrace(); } return allUrls; } private ArrayList<String> getLinkUrl(String urls) { ArrayList<String> allUrls = new ArrayList<>(); Document doc; try { doc = Jsoup.connect(urls).get(); Elements links = doc.select("a[href]"); for (Element src : links) { String url = src.attr("abs:href"); if (!url.isEmpty()) { allUrls.add(url); } } for (int i = 0; i < allUrls.size() - 1; i++) { for (int j = i + 1; j < allUrls.size() - 1; j++) { if (allUrls.get(j).equals(allUrls.get(i))) { allUrls.remove(j); } } } } catch (IOException e) { e.printStackTrace(); } return allUrls; } }
获取页面所有链接的方法
猜你喜欢
转载自blog.csdn.net/intelrain/article/details/80746398
今日推荐
周排行