# Reference: https://blog.csdn.net/qq_37616069/article/details/80376807
# coding=utf-8
import scrapy
class DoubanLogin(scrapy.Spider):
    """Spider that logs in to Douban through the account login form.

    Flow: request the login page, check whether it shows a captcha, submit
    the login form (asking on stdin for the captcha text when one is shown),
    then print the account summary scraped from the page returned after
    login.
    """

    name = 'douban'
    # Scrapy documents ``start_urls`` as a list of URLs; a bare string only
    # worked here because ``start_requests`` is overridden.
    start_urls = ['https://www.douban.com/accounts/login']

    def start_requests(self):
        """Request the login page(s), tagging each request with a cookie jar.

        Yields:
            scrapy.Request: one request per entry in ``start_urls``, handled
            by :meth:`parse_link`.
        """
        for url in self.start_urls:
            # The 'cookiejar' meta key keeps the cookies of the login page so
            # the follow-up form submission can reuse them.
            yield scrapy.Request(url, callback=self.parse_link, meta={'cookiejar': 1})

    def parse_link(self, response):
        """Build the login form data and submit it.

        Args:
            response: the login page response.

        Yields:
            scrapy.FormRequest: the form submission, handled by
            :meth:`after_login`.
        """
        captcha_id = response.xpath('//div/input[@name="captcha-id"]/@value').extract_first()
        captcha_src = response.xpath('//*[@id="captcha_image"]/@src').extract_first()

        # Fields sent on every login attempt. The email/password values are
        # placeholders and must be replaced with real credentials.
        formdata = {
            'source': 'index_nav',
            'form_email': '********',
            'form_password': '********',
            'redir': 'https://www.douban.com/',
            'login': '登录',
        }

        # Only add the captcha fields when the page actually shows a captcha.
        if captcha_src is not None:
            # Show the captcha image URL so the operator can open and read it.
            print(captcha_src)
            # input() blocks this callback until the operator types the answer.
            formdata['captcha-solution'] = input('input capt: ')
            if captcha_id is not None:
                # Send a single string value rather than the whole list of
                # XPath matches.
                formdata['captcha-id'] = captcha_id

        # Reuse the cookie jar that was attached to the login-page request.
        yield scrapy.FormRequest.from_response(
            response,
            meta={'cookiejar': response.meta['cookiejar']},
            formdata=formdata,
            callback=self.after_login,
        )

    def after_login(self, response):
        """Print the account summary text found on the post-login page.

        Args:
            response: the response to the login form submission.
        """
        summary = response.xpath('//*[@class="nav-user-account"]/a/span[1]/text()').extract()
        print(summary)
# settings.py
# Default headers attached to every request.
DEFAULT_REQUEST_HEADERS = {
    # Adjacent string literals are joined at compile time. Unlike a backslash
    # continuation inside the quotes, re-indenting these lines cannot inject
    # stray whitespace into the header value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/66.0.3359.181 Safari/537.36'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Referer': 'https://www.douban.com/',
    # 'br' is deliberately not advertised: Scrapy can only decode Brotli
    # responses when the optional brotli package is installed, and an
    # undecoded body would leave the spider's XPath queries with nothing
    # to match.
    'Accept-Encoding': 'gzip, deflate',
}