1.之前看了一些论坛上有一个坛友用python写的抓取电影下载链接的,于是心血来潮的我也打算用java来写一个!其实并不是很难,下面附上代码
这是对电影天堂的电影的抓取的方法,(在此期间尝试设置代理,以及用线程池,但貌似均没有成功) 说明下主要的jar包主要有httpclient4.5以及jsoup1.7
1.
2.下面的飘花电影网的,其实可以看到爬取的过程是大同小异的,只是选择器有所差别而已
最后附上成功的截图
最后一张是在网页上的应用
这是对电影天堂的电影的抓取的方法,(在此期间尝试设置代理,以及用线程池,但貌似均没有成功) 说明下主要的jar包主要有httpclient4.5以及jsoup1.7
1.
[Java]
纯文本查看
复制代码
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
package
downloade;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.sun.corba.se.spi.orbutil.threadpool.ThreadPool;

import Pojo.DyUrl;
import dao.JDBCUtils;
/**
 * Crawler for the dytt (ygdy8.net) movie-list pages: walks list pages 1..49,
 * extracts each movie's title and detail-page link, resolves the download URL
 * from the detail page, and persists every entry through {@link JDBCUtils}.
 */
public class Dyttdownload {

    // Next database id to assign; incremented once per successfully stored movie.
    static int id = 1;

    // Kept public/static for backward compatibility; holds the most recent client.
    public static HttpClient client = null;

    public static void main(String[] args) {
        // Crawl pages in deterministic ascending order. The original buffered the
        // URLs in a HashMap and iterated values(), which guarantees no order.
        for (int i = 1; i < 50; i++) {
            getUrl("http://www.ygdy8.net/html/gndy/dyzz/list_23_" + i + ".html");
        }
    }

    /**
     * Fetches one list page, extracts every movie entry and inserts the resolved
     * download link into the database.
     *
     * @param uri absolute URL of a list page, e.g. .../list_23_2.html
     */
    public static void getUrl(String uri) {
        JDBCUtils utils = new JDBCUtils();
        // try-with-resources closes the connection pool; the original created a
        // fresh client per page and never closed any of them (resource leak).
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            client = httpClient;
            HttpResponse response = httpClient.execute(new HttpGet(uri));
            // The site serves GB2312-encoded pages.
            String result = EntityUtils.toString(response.getEntity(), "gb2312");
            Document doc = Jsoup.parse(result);
            // CSS selector: each movie entry sits in its own <table class="tbspan">.
            Elements elements = doc.select("table.tbspan ");
            for (Element element : elements) {
                // Base URI lets abs:href resolve the site's relative detail links.
                element.setBaseUri("http://www.ygdy8.net");
                // The second <tr> holds the title anchor; select it once instead of
                // running the identical selector chain twice as the original did.
                Elements link = element.select("tr").get(1).select("a");
                DyUrl dy = getDownloadUrl(link.text(), link.attr("abs:href"));
                dy.setId(id);
                utils.insert(dy);
                id++;
            }
        } catch (IOException e) {
            // Log and move on to the next page rather than aborting the crawl.
            e.printStackTrace();
        }
    }

    /**
     * Opens a movie's detail page and reads the download link out of the
     * {@code <div id="Zoom">} table.
     *
     * @param name  movie title taken from the list page
     * @param dyurl absolute URL of the detail page
     * @return a DyUrl carrying the name and download link; the link stays unset
     *         when the page has no download table or the request fails
     */
    public static DyUrl getDownloadUrl(String name, String dyurl) {
        DyUrl dy = new DyUrl();
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            client = httpClient;
            HttpResponse response = httpClient.execute(new HttpGet(dyurl));
            String result = EntityUtils.toString(response.getEntity(), "gb2312");
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("div#Zoom table tr td a ");
            dy.setDyname(name);
            // Guard: the original called elements.get(0) unconditionally, so a page
            // without a download table threw IndexOutOfBoundsException (uncaught —
            // the catch below only covers IOException).
            if (!elements.isEmpty()) {
                dy.setDyUrl(elements.get(0).text());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dy;
    }
}
|
2.下面的飘花电影网的,其实可以看到爬取的过程是大同小异的,只是选择器有所差别而已
[Java]
纯文本查看
复制代码
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
package
downloade;
import
java.io.IOException;
import
java.util.HashMap;
import
java.util.Map;
import
java.util.concurrent.ExecutorService;
import
java.util.concurrent.Executors;
import
org.apache.http.HttpHost;
import
org.apache.http.HttpResponse;
import
org.apache.http.client.HttpClient;
import
org.apache.http.client.methods.HttpGet;
import
org.apache.http.impl.client.CloseableHttpClient;
import
org.apache.http.impl.client.HttpClientBuilder;
import
org.apache.http.impl.client.HttpClients;
import
org.apache.http.util.EntityUtils;
import
org.jsoup.Jsoup;
import
org.jsoup.nodes.Document;
import
org.jsoup.nodes.Element;
import
org.jsoup.select.Elements;
import
Pojo.DyUrl;
import
dao.JDBCUtils;
/**
 * Crawler for piaohua.com action-movie list pages: walks pages 16..49, extracts
 * each movie's title and detail-page link, resolves the download URL from the
 * detail page, and persists every entry through {@link JDBCUtils}.
 */
public class piaohuadownload {

    // Next database id to assign; incremented once per successfully stored movie.
    static int id = 1;

    // Kept public/static for backward compatibility; holds the most recent client.
    public static HttpClient client = null;

    public static void main(String[] args) {
        // Crawl pages in deterministic ascending order. The original buffered the
        // URLs in a HashMap and iterated values(), which guarantees no order.
        for (int i = 16; i < 50; i++) {
            String string = "http://www.piaohua.com/html/dongzuo/list_" + i + ".html";
            System.out.println("正在爬这个" + string + "网页");
            getUrl(string);
        }
    }

    /**
     * Fetches one list page, extracts every movie entry and inserts the resolved
     * download link into the database.
     *
     * @param uri absolute URL of a list page, e.g. .../list_16.html
     */
    public static void getUrl(String uri) {
        JDBCUtils utils = new JDBCUtils();
        // try-with-resources closes the connection pool; the original created a
        // fresh client per page and never closed any of them (resource leak).
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            client = httpClient;
            HttpResponse response = httpClient.execute(new HttpGet(uri));
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            Document doc = Jsoup.parse(result);
            // Base URI lets absUrl("href") resolve the site's relative links.
            doc.setBaseUri("http://www.piaohua.com");
            // Each movie entry is a <dl> inside the #list container.
            Elements elements = doc.select("#list dl");
            for (Element element : elements) {
                Element nameEl = element.select("font").first();
                Element linkEl = element.select("a").first();
                // Guard: the original dereferenced first() unconditionally, so a
                // malformed entry threw NullPointerException (uncaught — the catch
                // below only covers IOException). Skip such entries instead.
                if (nameEl == null || linkEl == null) {
                    continue;
                }
                DyUrl dy = getDownloadUrl(nameEl.text(), linkEl.absUrl("href"));
                dy.setId(id);
                utils.insert(dy);
                id++;
            }
        } catch (IOException e) {
            // Log and move on to the next page rather than aborting the crawl.
            e.printStackTrace();
        }
    }

    /**
     * Opens a movie's detail page and reads the first download link out of the
     * {@code #showinfo} section.
     *
     * @param name  movie title taken from the list page
     * @param dyurl absolute URL of the detail page
     * @return a DyUrl carrying the name and download link; the link stays unset
     *         when the page has no matching anchor or the request fails
     */
    public static DyUrl getDownloadUrl(String name, String dyurl) {
        DyUrl dUrl = new DyUrl();
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            client = httpClient;
            HttpResponse response = httpClient.execute(new HttpGet(dyurl));
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            Document doc = Jsoup.parse(result);
            Elements elements = doc.select("#showinfo").select("a");
            dUrl.setDyname(name);
            Element first = elements.first();
            // Guard: elements.first() is null on pages without a download anchor;
            // the original would have thrown an uncaught NullPointerException.
            if (first != null) {
                dUrl.setDyUrl(first.text());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dUrl;
    }
}
|
最后附上成功的截图
最后一张是在网页上的应用