版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/fzuzhanghao1993/article/details/85072624
闲来使用正则表达式做了一个HTML页面的数据爬虫,主要是根据页面规则去匹配相应字段内容,记录一下。
利用 HttpGet 获取页面内容,再使用 Pattern(正则表达式)提取匹配的内容:
// Crawls pages 1..max (max is defined outside this snippet): fetches each page over HTTP and
// prints every "link:image" pair that the anchor/image regex finds in the returned HTML.
CloseableHttpClient client = HttpClientBuilder.create().build();
// Millisecond timestamp appended to every request URL after "tok=".
long t = System.currentTimeMillis();
// "(?)" marks the spot where the page number is inserted. The scheme separator is "//":
// with a third slash the URI would have an empty authority and therefore no host.
String uriTemplate = "http://xxx.com/xxx/PG(?)tok=" + t;
// Matches <a onclick="..." target="_blank" href='LINK'>, optional whitespace, then <img src="IMAGE.
// Group 1 is the link, group 2 the image source. Compiled once because it never changes.
Pattern pattern = Pattern.compile("<a onclick=\"[^\"]*\"\\starget=\"_blank\"\\shref='([^']*)'>[\\s]*<img src=\"([^\"]*)");
for (int i = 1; i <= max; i++) {
    // String.replace substitutes literal text. replaceAll would interpret "(?)" as a regular
    // expression, so the placeholder itself would never be matched as written.
    String uri = uriTemplate.replace("(?)", String.valueOf(i));
    HttpGet hget = new HttpGet(uri);
    hget.addHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    hget.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
    String bodyAsString;
    // try-with-resources closes the response after every page instead of leaking one per iteration.
    try (CloseableHttpResponse response = client.execute(hget)) {
        bodyAsString = EntityUtils.toString(response.getEntity());
    }
    // Nothing to scan on this page; move on to the next one.
    if (bodyAsString == null || bodyAsString.isEmpty()) {
        continue;
    }
    // Extract the link of each matching <a> tag together with the image that follows it.
    Matcher matcher = pattern.matcher(bodyAsString);
    while (matcher.find()) {
        String tmp = matcher.group(1);
        String msg = matcher.group(2);
        System.out.println(tmp + ":" + msg);
    }
}
}
try {
    // Pause for 30 seconds.
    Thread.sleep(30000);
} catch (InterruptedException e) {
    // Restore the interrupt status so code further up the call stack can still see that
    // this thread was asked to stop; merely printing the stack trace would lose it.
    Thread.currentThread().interrupt();
}
}
最后附上一个常用的正则:获取某个标题(标签)之后的内容。
// Captures the text that follows the "姓 名 :" label: after the label's closing </span>, skip
// to the end of the next tag ([^>]*>) and capture everything up to the following '<' as group 1.
// The explicit '>' matters: a greedy [^>]* alone stops right before '>', which would make
// group 1 start with that '>' character instead of the value.
// NOTE(review): assumes the value sits inside the tag right after the label - confirm against the page.
Pattern adminPricePattr = Pattern.compile("姓 名 :</span>[^>]*>([^<]*)");