设置随机请求头和ip代理池
middlewares.py代码如下:
import json
import random

import requests
from twisted.internet.defer import DeferredLock

from useragent_randomchange.models import ProxyModel


class UseragentRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that puts a randomly chosen User-Agent on each request."""

    # Pool of browser User-Agent strings; one is picked at random per request.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
    ]

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random pool entry.

        Args:
            request: the outgoing request; its ``headers`` mapping is mutated.
            spider: the spider issuing the request (unused).
        """
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)


class IPProxyRandomchangeDownloaderMiddleware(object):
    """Downloader middleware that routes requests through a proxy from PROXY_URL.

    One proxy (``self.current_proxy``) is shared by all requests. It is
    replaced when it is missing, about to expire, or has been blacklisted
    after a non-200 / captcha response.
    """

    PROXY_URL = 'xxxxxxxxx'  # URL of the proxy-provider API endpoint
    # Seconds to wait for the proxy API; the call is blocking, so an
    # unbounded wait would stall every in-flight request.
    PROXY_TIMEOUT = 10

    def __init__(self):
        super(IPProxyRandomchangeDownloaderMiddleware, self).__init__()
        self.current_proxy = None
        # Lock intended to serialise proxy refreshes.
        self.lock = DeferredLock()

    def process_request(self, request, spider):
        """Attach the current proxy to ``request.meta['proxy']``.

        A refresh is attempted first when there is no usable proxy yet, when
        the request carries no proxy, or when the current proxy is expiring
        or blacklisted.

        Args:
            request: the outgoing request; its ``meta`` dict is mutated.
            spider: the spider issuing the request (unused).

        Raises:
            RuntimeError: if no proxy could be obtained from the API.
        """
        proxy = self.current_proxy
        if (
            proxy is None
            or 'proxy' not in request.meta
            or proxy.is_expiring
            or proxy.blacked
        ):
            self.update_proxy()
        if self.current_proxy is None:
            # The API returned no usable entry; fail with a clear message
            # instead of an AttributeError on None.
            raise RuntimeError(
                'no proxy available from %s' % self.PROXY_URL
            )
        request.meta['proxy'] = self.current_proxy.proxy

    def process_response(self, request, response, spider):
        """Blacklist the proxy and retry when the response looks blocked.

        A response counts as blocked when its status is not 200 or its URL
        contains "captcha".

        Args:
            request: the request that produced ``response``.
            response: the downloaded response.
            spider: the spider issuing the request (unused).

        Returns:
            ``request`` (to be rescheduled) when the response looks blocked,
            otherwise ``response`` so it reaches the spider for parsing.
        """
        if response.status != 200 or "captcha" in response.url:
            proxy = self.current_proxy
            if proxy is not None:
                proxy.blacked = True
                print('%s这个代理被加入黑名单了' % proxy.ip)
            self.update_proxy()
            # Reaching this point means the site has identified the request
            # as a crawler, so no data was obtained for it. Hand the request
            # back so it is scheduled again instead of being lost.
            # dont_filter is required: the URL has already been seen, so the
            # scheduler's duplicate filter would otherwise drop the retry.
            request.dont_filter = True
            return request
        # Normal case: the response must be returned, otherwise it never
        # reaches the spider and is never parsed.
        return response

    def update_proxy(self):
        """Fetch a new proxy from PROXY_URL if the current one is unusable.

        Nothing is fetched when the current proxy exists and is neither
        expiring nor blacklisted. When the API returns an empty ``data``
        list the current proxy is left unchanged.
        """
        # NOTE(review): DeferredLock.acquire() returns a Deferred and does not
        # block; confirm it gives the mutual exclusion intended here.
        self.lock.acquire()
        try:
            proxy = self.current_proxy
            if proxy is None or proxy.is_expiring or proxy.blacked:
                response = requests.get(
                    self.PROXY_URL, timeout=self.PROXY_TIMEOUT
                )
                text = response.text
                print("重新获取了一个代理:", text)
                result = json.loads(text)
                candidates = result.get('data')
                if candidates:
                    self.current_proxy = ProxyModel(candidates[0])
        finally:
            # Always release, even if the HTTP call or JSON parsing raised.
            self.lock.release()
封装了一个models.py
from datetime import datetime, timedelta


class ProxyModel(object):
    """One proxy entry as returned by the proxy-provider API.

    Attributes are populated from the ``data`` mapping passed to the
    constructor, which must contain the keys ``ip``, ``port`` and
    ``expire_time``.
    """

    # Layout of the ``expire_time`` string: "YYYY-MM-DD HH:MM:SS".
    EXPIRE_FORMAT = "%Y-%m-%d %H:%M:%S"
    # A proxy with less than this much lifetime left counts as expiring.
    EXPIRE_MARGIN = timedelta(seconds=5)

    def __init__(self, data):
        """Build the model from one API record.

        Args:
            data: mapping with ``ip``, ``port`` and ``expire_time`` entries.

        Raises:
            KeyError: if a required key is missing.
            ValueError: if ``expire_time`` does not match EXPIRE_FORMAT.
        """
        self.ip = data['ip']
        self.port = data['port']
        self.expire_str = data['expire_time']
        # Set to True by the middleware once the proxy has been rejected.
        self.blacked = False
        self.expire_time = datetime.strptime(self.expire_str, self.EXPIRE_FORMAT)
        # Proxy URL in scheme://ip:port form; the scheme is always http.
        self.proxy = "http://{}:{}".format(self.ip, self.port)

    @property
    def is_expiring(self):
        """True when the proxy expires within EXPIRE_MARGIN or already has.

        NOTE(review): compares against naive local ``datetime.now()``; this
        presumes ``expire_time`` is in the local timezone — confirm.
        """
        return (self.expire_time - datetime.now()) < self.EXPIRE_MARGIN
还需在settings.py中设置
# Enable both custom downloader middlewares; the number is each
# middleware's order value in the downloader middleware chain.
DOWNLOADER_MIDDLEWARES = {
    'useragent_randomchange.middlewares.UseragentRandomchangeDownloaderMiddleware': 100,
    'useragent_randomchange.middlewares.IPProxyRandomchangeDownloaderMiddleware': 200,
}

# Delay between consecutive requests, in seconds.
DOWNLOAD_DELAY = 1