版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/funkstill/article/details/86107473
1.初始化项目
本文在之前项目的基础上进行,故不需要重新安装依赖,具体步骤请参考之前文章中的“初始化项目”一节。
2.动态 userAgent
每次爬取的时候,从下面的 userAgents 列表中随机选取一个作为请求头的 User-Agent。
//./src/userAgent.js
/**
 * Pool of browser User-Agent strings. One entry is picked at random for each
 * listing-page request so that successive requests do not all present the
 * same client signature.
 *
 * Every entry is unique: a duplicated string would be selected more often
 * than the others by a uniform random index.
 */
const userAgents = [
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
  'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
  'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];
3.具体代码
这里尝试使用标志量(一个记录未完成请求数的计数器)来判断所有异步请求是否已经执行完成,以应对 Node.js 的单线程异步特性。
var request = require("request");
var cheerio = require("cheerio");
var fs = require("fs");
// Candidate proxies collected by getProxys(); each entry is an object with
// `ip`, `port` and `type` string fields.
var proxys =[];
// Proxy URLs (href strings) for which the probe request in check() returned
// HTTP 200; this is what saveProxys() writes to disk.
var useful =[];
/**
 * Fetch one listing page of the proxy site, collect the fast proxies from
 * its table into the module-level `proxys` array, then start validation by
 * calling `check()`.
 *
 * A proxy is kept when the number in the `title` attribute of column 6
 * (speed) is <= 5 and the one in column 7 (connect time) is <= 1.
 * Rows whose `title` attribute is missing or not numeric are skipped
 * instead of crashing the whole crawl.
 *
 * `check()` is invoked whether or not the page could be fetched, so proxies
 * already present in `proxys` are still validated.
 *
 * @param {number|string} pageNum - Page number appended to the listing URL.
 * @returns {void}
 */
function getProxys(pageNum) {
  // Math.floor of [0, length) always yields a valid index.
  const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
  const url = "https://www.xicidaili.com/nn/" + pageNum;

  request({
    url: url,
    method: "GET",
    headers: {
      'User-Agent': userAgent
    }
  }, function (err, res, body) {
    if (err) {
      // Report the failure instead of silently dropping it.
      console.error("Failed to fetch " + url + ":", err.message);
    } else {
      const $ = cheerio.load(body);
      const trs = $("#ip_list tr");
      // Start at 1: the first row is skipped (presumably the table header).
      for (let i = 1; i < trs.length; i++) {
        const tds = trs.eq(i).children("td");
        const speed = parseLeadingNumber(tds.eq(6).children("div").attr("title"));
        const connectTime = parseLeadingNumber(tds.eq(7).children("div").attr("title"));
        // NaN compares false, so rows without usable timings are dropped.
        if (speed <= 5 && connectTime <= 1) {
          proxys.push({
            ip: tds.eq(1).text().trim(),
            port: tds.eq(2).text().trim(),
            type: tds.eq(5).text().trim()
          });
        }
      }
    }
    check();
  });
}

/**
 * Parse the numeric prefix of a table cell's `title` attribute, ignoring any
 * trailing unit suffix.
 *
 * @param {string|undefined} title - Raw attribute value; undefined when the
 *   cell has no matching element or attribute.
 * @returns {number} The leading number, or NaN when absent or non-numeric.
 */
function parseLeadingNumber(title) {
  return typeof title === "string" ? Number.parseFloat(title) : NaN;
}
/**
 * Check which collected proxies actually work.
 *
 * Issues one GET request per entry of the module-level `proxys` array,
 * routed through that proxy, against a fixed probe URL with a 20 s timeout.
 * Proxies that answer with HTTP 200 have their URL appended to `useful`.
 * Once every request has completed (success, non-200 or error),
 * `saveProxys()` is called exactly once.
 *
 * When there is nothing to check, `saveProxys()` is called immediately so
 * the completion step is never skipped.
 *
 * @returns {void}
 */
function check() {
  const url = "http://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js";
  // Snapshot the list so the pending counter and the loop agree on its size.
  const candidates = proxys.slice();
  // Number of requests still in flight; completion is detected at zero.
  let pending = candidates.length;

  if (pending === 0) {
    saveProxys();
    return;
  }

  candidates.forEach(function (proxy) {
    const proxyUrl = proxy['type'].toLowerCase() + "://" + proxy['ip'] + ":" + proxy['port'];
    request({
      url: url,
      proxy: proxyUrl,
      method: 'GET',
      timeout: 20000
    }, function (err, res, body) {
      if (err) {
        // No response object exists on error, so report the URL we built.
        console.log(proxyUrl, "failed", err.message);
      } else if (res.statusCode === 200) {
        useful.push(res.request['proxy']['href']);
        console.log(res.request['proxy']['href'], "useful");
      } else {
        console.log(res.request['proxy']['href'], "failed");
      }
      pending--;
      if (pending === 0) {
        saveProxys();
      }
    });
  });
}
/**
 * Persist the verified proxies.
 *
 * Serializes the module-level `useful` array as JSON, synchronously writes
 * it to "proxys.json" (resolved against the current working directory),
 * and then logs a confirmation line.
 *
 * @returns {void}
 */
function saveProxys() {
  const serialized = JSON.stringify(useful);
  const targetFile = "proxys.json";
  fs.writeFileSync(targetFile, serialized);
  console.log("Save finished!");
}
// Entry point: scrape listing page 1; getProxys() then chains into check(),
// which calls saveProxys() once all probe requests have completed.
getProxys(1);