版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/QuietHRH/article/details/82794063
获取数据
1. 原生JDK
- 创建URL对象
- 获取连接
- 设置请求方式
- POST方式要先打开输出流,因为请求参数放在请求体中: conn.setDoOutput(true);
- 流的方式获取数据
package com.hrh.jdk;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
/**
 * Demonstrates sending an HTTP POST request using only the JDK's
 * {@link HttpURLConnection}.
 */
public class JdkPost {

    /**
     * Posts a small form-encoded body to a fixed URL and prints the response
     * body to standard output.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or any network I/O fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the URL object.
        URL url = new URL("http://www.itcast.cn");
        // 2. Open the connection.
        HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
        try {
            // 3. Set the request method (use "GET" here for a GET request).
            httpURLConnection.setRequestMethod("POST");
            // 4. POST parameters travel in the request body, so output must be
            //    enabled explicitly before the output stream can be obtained.
            httpURLConnection.setDoOutput(true);
            try (OutputStream out = httpURLConnection.getOutputStream()) {
                // Encode with an explicit charset instead of the platform default.
                out.write("username=zs&password=123".getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
            // 5. Read the response. The whole body is buffered before decoding so
            //    that a multi-byte character split across two reads is not corrupted
            //    and no artificial line breaks are inserted between chunks.
            java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
            try (InputStream in = httpURLConnection.getInputStream()) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    body.write(buffer, 0, len);
                }
            }
            // NOTE(review): assumes the server answers in UTF-8 — confirm against
            // the response's Content-Type header if other sites are targeted.
            System.out.println(new String(body.toByteArray(), java.nio.charset.StandardCharsets.UTF_8));
        } finally {
            httpURLConnection.disconnect();
        }
    }
}
2. httpClient
- 导入依赖
- 获取httpClient对象
- 设置请求方式
- 设置请求参数
- 发送请求,获取数据
package com.itheima.httpClient;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
/**
 * Demonstrates sending an HTTP POST request with Apache HttpClient and reading
 * the status code, a response header and the response body.
 */
public class HttpClientPost {

    /**
     * Posts two form fields to a fixed URL, then prints the response's
     * Content-Type header (when present) and its body.
     *
     * @param args unused
     * @throws IOException if sending the request or reading the response fails
     */
    public static void main(String[] args) throws IOException {
        // 1. Obtain the client; try-with-resources closes it when the request is done.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            // 2. Choose the request method (use HttpGet here for a GET request).
            HttpPost httpPost = new HttpPost("http://www.itcast.cn");

            // 3. Set the request parameters: for POST they go into the request body.
            List<BasicNameValuePair> list = new ArrayList<BasicNameValuePair>();
            list.add(new BasicNameValuePair("username", "zhangsan"));
            list.add(new BasicNameValuePair("age", "18"));
            HttpEntity entity = new UrlEncodedFormEntity(list);
            httpPost.setEntity(entity);

            // 4. Send the request; the response is closed by try-with-resources.
            try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                // 5.1 Status code (retrieved to show the API; not otherwise used here).
                int statusCode = response.getStatusLine().getStatusCode();

                // 5.2 Response header. The array is empty when the header is absent,
                //     so guard the index access instead of assuming it exists.
                Header[] headers = response.getHeaders("Content-Type");
                if (headers.length > 0) {
                    System.out.println(headers[0].getValue());
                }

                // 5.3 Response body.
                String html = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(html);
            }
        }
    }
}
解析数据
jsoup
- 导入依赖
- 获取dom对象
- Jsoup.parse( String html ); 常用
- Jsoup.connect( url ).get(); 指定url
- Jsoup.parse( File in, String charset); 指定file文件路径
- Jsoup.parseBodyFragment( String html ) 解析html片段
- 根据选择器获取Elements
package com.hrh.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Demonstrates fetching a page with jsoup and extracting elements with a CSS
 * selector.
 */
public class ItcastParseSelector {

    /**
     * Downloads a fixed page and prints the text of every anchor matched by
     * the navigation selector, one per line.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        // 1. Fetch the page and obtain its DOM.
        Document page = Jsoup.connect("http://www.itcast.cn").get();

        // 2. Select the anchors and print the text of each one.
        Elements navLinks = page.select(".nav_txt>ul>li>a");
        for (int i = 0; i < navLinks.size(); i++) {
            Element link = navLinks.get(i);
            System.out.println(link.text());
        }
    }
}
jsoup的常用方法:
- parse(String html); 用来得到dom对象
- select(选择器);
- text(); 获取指定元素的内容体(只能获取文本内容)
- html();获取指定元素的内容体(可以将html代码也一并获取到)
- attr(name); 根据指定的属性名称获取其对应属性的值
保存数据
当解析完数据以后,要将解析后的数据保存到对应的容器中(如MySQL、文件)。目前采用的是MySQL,后期可以使用Hadoop、HBase。
- 使用数据库来进行保存数据的操作有几种方式:
- JDBC: 七大步
- DbUtils: queryRunner
- mybatis
- spring中jdbcTemplate
案例 爬取起点中文网的榜单小说
package com.hrh.anli;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
/**
 * Example crawler: walks one ranking list on the qidian.com home page and
 * saves the chapters reachable from each listed book into a text file.
 */
public class QiDianDemo {

    /**
     * Downloads the given URL and parses the response body into a jsoup document.
     *
     * @param url the address to fetch
     * @return the parsed document
     * @throws Exception if the request fails or the body cannot be read
     */
    public static Document getDocument(String url) throws Exception {
        // Both the client and the response hold connections, so close them
        // once the body has been read fully into a string.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            String html = EntityUtils.toString(response.getEntity(), "utf-8");
            return Jsoup.parse(html);
        }
    }

    /**
     * Crawls the ranking list on the home page and stores every listed book.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or a file cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Home page of the site.
        Document home = getDocument("https://www.qidian.com/");
        // Anchors of one ranking list (per the original author: the signed
        // authors' new-book ranking).
        Elements bookLinks =
                home.select("[class=rank-list mr0] li a[href*=book.qidian.com][class!=link]");

        for (Element bookLink : bookLinks) {
            Document bookPage = getDocument(toAbsoluteUrl(bookLink.attr("href")));

            // Link of the "start reading" button; without it there is nothing to
            // crawl, so skip the book instead of requesting a bare "https:" URL.
            String readHref = bookPage.select("#readBtn").attr("href");
            if (readHref.isEmpty()) {
                continue;
            }

            String bookName = bookPage.select(".book-info h1 em").text();
            saveChapters(bookName, toAbsoluteUrl(readHref));
        }
    }

    /** Writes every chapter reachable from {@code firstChapterUrl} into one text file named after the book. */
    private static void saveChapters(String bookName, String firstChapterUrl) throws Exception {
        String fileName = "E://" + sanitizeFileName(bookName) + ".txt";
        // try-with-resources closes the file even when a page fetch fails midway;
        // the charset is explicit so the output does not depend on the platform default.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(fileName), java.nio.charset.StandardCharsets.UTF_8))) {
            String url = firstChapterUrl;
            while (true) {
                Document chapter = getDocument(url);

                // Chapter title.
                bw.write(chapter.select(".j_chapterName").text());
                bw.newLine();

                // Chapter text, one paragraph per line.
                for (Element paragraph : chapter.select("[class=read-content j_readContent] p")) {
                    bw.write(paragraph.text());
                    bw.newLine();
                }
                // Flush once per chapter so finished chapters survive a later failure.
                bw.flush();

                // Per the original author's note, a next-chapter link pointing at
                // read.qidian.com means another free chapter exists; when it is
                // missing, this book is finished.
                Elements nextChapter = chapter.select("#j_chapterNext[href*=read.qidian.com]");
                if (nextChapter.isEmpty()) {
                    break;
                }
                url = toAbsoluteUrl(nextChapter.attr("href"));
            }
        }
    }

    /** Returns hrefs that already carry an http(s) scheme unchanged; otherwise prefixes {@code "https:"}. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return "https:" + href;
    }

    /** Replaces characters that are not allowed in Windows file names with an underscore. */
    private static String sanitizeFileName(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}