通过HttpURLConnection和Jsoup,抓取需要登录验证的页面数据,简单记录一下,做个备忘。
【需求】1.待抓取的页面需要登录后才能访问(本文通过在请求头中携带登录后的Cookie来通过验证)
2.抓取页面中的用户姓名和用户手机号
实现步骤:
一、获取页面HTML数据
/**
 * Downloads the HTML of the login-protected page for the given id.
 *
 * <p>The request is a plain GET. Authentication is supplied by the caller-provided
 * session cookie sent in the {@code Cookie} request header.
 *
 * @param id value appended to the request URL as the {@code id} query parameter
 * @return the response body decoded as UTF-8 with its lines joined by {@code '\n'};
 *         an empty string when the server does not answer 200 or when reading the
 *         response fails (the failure is printed to standard error)
 * @throws IOException if the URL is malformed, the connection object cannot be
 *         created or configured, or closing the response stream fails
 */
private String getHTML(String id) throws IOException {
    // 1: build the target address
    URL url = new URL("****&id=" + id);
    // 2: open a connection object for that address (no network traffic yet)
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    // 3: configure the request; the method name must be upper case
    connection.setRequestMethod("GET");
    // Header kept from the original request setup; a GET carries no body.
    connection.setRequestProperty("content-type", "text/xml;charset=utf-8");
    connection.setRequestProperty("Connection", "keep-alive");
    // Session cookie of an already logged-in user; this is what passes the login check.
    connection.setRequestProperty("Cookie", "***");
    connection.setRequestProperty("Host", "***");
    // Deliberately no setDoOutput(true) / getOutputStream(): requesting an output
    // stream makes the standard JDK implementation send the request as POST even
    // though GET was configured. Reading needs no extra setup (doInput defaults to true).

    String returnData = "";
    InputStream is = null;
    BufferedReader br = null;
    try {
        // 4: send the request and read the response only on HTTP 200
        if (HttpURLConnection.HTTP_OK == connection.getResponseCode()) {
            is = connection.getInputStream();
            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                // readLine() strips the line terminator; put one back so that
                // markup split across lines is not fused into a single token.
                sb.append(line).append('\n');
            }
            returnData = sb.toString();
        }
    } catch (IOException e) {
        // Best effort, as before: report the failure and fall through to return "".
        e.printStackTrace();
    } finally {
        // Closing the outermost reader also closes the streams it wraps.
        if (br != null) {
            br.close();
        } else if (is != null) {
            is.close();
        }
    }
    return returnData;
}
二、解析HTML数据,获取用户姓名和用户手机号
/**
 * Parses the fetched HTML and prints the user's name and mobile number.
 *
 * <p>The values are read from the {@code value} attribute of the first
 * {@code <input>} in the first and second {@code <tr>} of the first
 * {@code <table>} in the document. When the document has no table, or the table
 * has fewer than two rows, {@code 没有结果} is printed instead of throwing.
 *
 * @param html HTML text of the page; {@code null} is treated as "no result"
 */
private static void parseHtml(String html) {
    if (html == null) {
        System.out.println("没有结果");
        return;
    }
    Document doc = Jsoup.parse(html);
    Elements tables = doc.select("table");
    if (tables.isEmpty()) {
        // Typical for an empty download or an error/login page without a table.
        System.out.println("没有结果");
        return;
    }
    Elements rows = tables.get(0).select("tr");
    if (rows.size() < 2) {
        // Both row 0 (name) and row 1 (phone) are required below.
        System.out.println("没有结果");
        return;
    }
    String name = firstInputValue(rows, 0);
    String phone = firstInputValue(rows, 1);
    System.out.println("/姓名:" + name + "//手机:" + phone + "//");
}

/**
 * Returns the {@code value} attribute of the first {@code <input>} found in the
 * row at {@code rowIndex}, or an empty string when that row contains no input.
 */
private static String firstInputValue(Elements rows, int rowIndex) {
    Elements inputs = rows.get(rowIndex).select("input");
    return inputs.isEmpty() ? "" : inputs.get(0).attr("value");
}