HtmlUnit 采用 Rhino JavaScript 引擎来模拟 JS 的运行。相比 HttpClient + jsoup 的组合,HtmlUnit 的功能更强大。
pom.xml
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.26</version>
</dependency>
在 pom.xml 中加入以上依赖即可引入 HtmlUnit。
简单测试:
public class TestHtmlUnit { public static void main(String[] args) { Set<String> urls=getPageUrls(); if(null!=urls && urls.size() > 0){ for (String url : urls) { System.out.println(url); } } } /** * 获取uc-搞笑 最新推荐的视屏页面地址 * @return Set<String> */ public static Set<String> getPageUrls(){ Set<String> set = new HashSet<String>(); String base="https://news.uc.cn"; WebClient webClient = new WebClient(BrowserVersion.FIREFOX_52);//实例化客户端(火狐浏览器) String targurl="https://news.uc.cn/c_shipin/"; webClient.getOptions().setJavaScriptEnabled(false);//关闭javaScript //webClient.getOptions().setCssEnabled(false);//关闭css try { HtmlPage page=webClient.getPage(targurl); try { Thread.sleep(2000); } catch (InterruptedException e) { e.printStackTrace(); } List<HtmlDivision> divs=page.getByXPath("//div[@class='news-list']"); HtmlDivision div=divs.get(0); List<HtmlListItem> lis=div.getByXPath("//li[@class='news-item flag']"); if(null!=lis && lis.size()>0){ for (HtmlListItem li : lis) { DomNodeList<HtmlElement> as=li.getElementsByTagName("a"); if(null!=as && as.size() > 0){ HtmlElement a=as.get(0); String href=a.getAttribute("href"); if(null!=href && !href.equals("")){ href=base+href; System.out.println(href); set.add(href); } } } } } catch (FailingHttpStatusCodeException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ webClient.close(); } return set; } }