import base64
import random

from fake_useragent import UserAgent
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
# User-Agent rotation middleware.
class UserAgentMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    Uses ``fake_useragent`` to draw real-world browser UA strings.
    """

    def __init__(self, user_agent=''):
        # Pool of real-world User-Agent strings; .random draws one per access.
        self.ua = UserAgent()

    def process_request(self, request, spider):
        """Attach a randomly chosen User-Agent header to the request."""
        print('===UserAgentMiddleware process_request==')
        if self.ua:
            # Draw the UA exactly once: the original called .random twice,
            # so the UA it logged was NOT the UA actually sent, and the %s
            # placeholder in the log line was never filled.
            agent = self.ua.random
            print("********Current UserAgent:%s************" % agent)
            request.headers.setdefault('User-Agent', agent)
# Proxy-IP rotation middleware.
class RandomProxyMiddleware(object):
    """Downloader middleware that routes each request through a random proxy.

    Entries whose ``user_passwd`` is ``None`` are used unauthenticated;
    otherwise a ``Proxy-Authorization: Basic ...`` header is attached.
    """

    def __init__(self):
        # Each entry: 'ip_port' is "host:port", 'user_passwd' is
        # "user:pass" (or None for proxies without authentication).
        self.PROXIES = [
            {'ip_port': '180.118.240.179:61234', 'user_passwd': 'user1:pass1'},
            {'ip_port': '61.135.217.7:80', 'user_passwd': 'user2:pass2'},
            {'ip_port': '111.155.116.238:8123', 'user_passwd': 'user3:pass3'},
            {'ip_port': '125.121.112.107:808', 'user_passwd': 'user4:pass4'},
        ]

    def process_request(self, request, spider):
        """Pick a random proxy and configure the request to use it."""
        print('更换代理IP。。。')
        proxy = random.choice(self.PROXIES)
        if proxy['user_passwd'] is None:
            # Proxy without credentials: just point the request at it.
            request.meta['proxy'] = "http://" + proxy['ip_port']
        else:
            # base64.b64encode requires bytes and returns bytes in Python 3;
            # the original passed a str (TypeError) and concatenated bytes
            # onto 'Basic ' (another TypeError). Encode, b64, decode to str.
            creds = base64.b64encode(
                proxy['user_passwd'].encode('utf-8')).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + creds
            request.meta['proxy'] = "http://" + proxy['ip_port']
# Selenium rendering middleware for JavaScript-heavy (dynamic) pages.
class WebDriverMiddleware(object):
    """Downloader middleware that renders requests in a headless browser.

    ``process_request`` short-circuits Scrapy's normal download: it loads
    the URL in PhantomJS and returns the rendered page as an HtmlResponse.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy factory hook: build the middleware and subscribe
        # it to the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Render the requested URL with PhantomJS and return the HTML."""
        print('================process_request================')
        # NOTE(review): driver path is hard-coded to a Windows location,
        # and a fresh browser is launched per request — works, but slow.
        driver = webdriver.PhantomJS(
            executable_path=r'C:\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe')
        driver.get(request.url)          # navigate and let JS execute
        rendered = driver.page_source    # fully rendered page text
        body = rendered.encode('utf-8')
        driver.quit()
        # Returning a Response here tells Scrapy to skip the downloader.
        return HtmlResponse(request.url, body=body,
                            encoding='utf-8', request=request)

    def process_response(self, request, response, spider):
        # Responses pass through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets Scrapy continue with other exception handlers.
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# This middlewares module provides: custom User-Agent rotation, random
# proxy-IP switching, and Selenium-based rendering of dynamic pages.
# Adapted from: blog.csdn.net/lzz781699880/article/details/81413594