1.用 Python 抓取网页时,很多页面需要登录后才能采集信息,而逐个分析登录请求和页面链接往往太费时间。因此这里引入 Selenium + PhantomJS 模拟浏览器登录,实现简单方便,登录成功后即可拿到 cookies。
2.实现思路分析
a)访问京东登录页面
https://passport.jd.com/new/login.aspx
b)输入用户名和密码(复杂的站点还会有各种形式的验证码,比如淘宝登录时的滑动验证)
c)登录成功后,将 cookies 持久化到本地
3.使用技术:Selenium + ChromeDriver(或 PhantomJS)
4.代码实现(模拟登录,并将 cookies 持久化到本地)
# -*- coding: utf-8 -*-
import random,time,os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from user_agents import agents
class LoginCookies(object):
    """Log in to JD.com through PhantomJS and cache the session cookies on disk.

    ``readcookies`` is the usual entry point: it fills ``self.cookies`` from
    ``cookies.txt`` when that file holds usable entries and otherwise runs
    ``login``, which drives the browser through the login form and writes a
    fresh ``cookies.txt``.

    Attributes:
        cookies: Mapping of cookie name to cookie value.
        agent: User-Agent string picked at random from ``agents``.
        headers: Per-instance copy of the class-level headers with the
            ``User-Agent`` entry replaced by ``agent``.
        driver: The PhantomJS session, started on first access.
    """

    loginurl = 'https://passport.jd.com/new/login.aspx'
    # Placeholder credentials; replace with a real account before running.
    username = 'username'
    pwd = 'pwd'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'passport.jd.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36',
    }

    # Files produced by this class, relative to the working directory.
    _cookie_path = 'cookies.txt'
    _page_dump_path = 'html.txt'
    _screenshot_dir = 'Screenshots'

    # Headers that are not pushed into PhantomJS ``customHeaders``:
    #  - User-Agent is configured through ``page.settings.userAgent`` instead;
    #  - customHeaders are attached to every request, sub-resources on other
    #    hosts included, so a fixed Host value would be wrong for those;
    #  - Accept-Encoding advertises encodings (e.g. ``br``) that PhantomJS may
    #    not be able to decode when the header is forced this way.
    _unforced_headers = ('User-Agent', 'Host', 'Accept-Encoding')

    # Class-level default so the lazy ``driver`` property also works on
    # instances that were created without running ``__init__``.
    _driver = None

    def __init__(self):
        self.cookies = {}
        self.agent = random.choice(agents)
        # Copy before overriding so the random User-Agent never leaks into the
        # dict shared by every instance of the class.
        self.headers = dict(self.headers)
        self.headers['User-Agent'] = self.agent
        self._driver = None

    @property
    def driver(self):
        """Return the browser session, starting PhantomJS on first access.

        Starting lazily means no browser process is spawned (and left
        running) when ``readcookies`` is satisfied by the cached file.
        """
        if self._driver is None:
            self._driver = self._start_driver()
        return self._driver

    @driver.setter
    def driver(self, value):
        # Keeps ``instance.driver = webdriver.Chrome(...)`` style overrides working.
        self._driver = value

    def _start_driver(self):
        """Create a PhantomJS session configured with this instance's headers."""
        # Capabilities are only read when the session is created, so they have
        # to be assembled before ``webdriver.PhantomJS`` is called. Work on a
        # copy to leave the library-wide default capabilities untouched.
        caps = dict(webdriver.DesiredCapabilities.PHANTOMJS)
        caps['phantomjs.page.settings.userAgent'] = self.agent
        for name, value in self.headers.items():
            if name not in self._unforced_headers:
                caps['phantomjs.page.customHeaders.{}'.format(name)] = value
        return webdriver.PhantomJS(desired_capabilities=caps)

    def login(self):
        """Submit the login form, then store and persist the session cookies.

        Side effects: writes the login page source to ``html.txt``, saves a
        screenshot under ``Screenshots/``, rewrites ``cookies.txt`` and prints
        the text of the ``J_searchbg`` element. The browser session is always
        closed afterwards, even when a step fails.

        Raises:
            selenium.common.exceptions.WebDriverException: If a page element
                cannot be located or the browser fails.
        """
        driver = self.driver
        try:
            driver.get(self.loginurl)
            time.sleep(1)
            driver.maximize_window()
            time.sleep(1)
            # Explicit encoding: the page contains Chinese text, which the
            # platform default encoding may be unable to represent.
            with open(self._page_dump_path, 'w', encoding='utf-8') as dump:
                dump.write(driver.page_source)

            # Switch to the account/password tab, then fill in and submit the
            # form. The fixed pauses give the page time to react between steps.
            driver.find_element_by_xpath("//a[text()='账户登录']").click()
            time.sleep(1)
            driver.find_element_by_name('loginname').send_keys(self.username)
            time.sleep(1)
            driver.find_element_by_name('nloginpwd').send_keys(self.pwd)
            time.sleep(1)
            driver.find_element_by_id("loginsubmit").click()
            time.sleep(5)

            # The screenshot can only be written into an existing directory.
            if not os.path.isdir(self._screenshot_dir):
                os.makedirs(self._screenshot_dir)
            driver.save_screenshot('Screenshots/logindjd.png')

            # get_cookies() returns one dict per cookie; keep every cookie as
            # a name -> value pair instead of the attributes of the first one.
            self.cookies = {
                cookie['name']: cookie['value']
                for cookie in driver.get_cookies()
            }
            self.savecookies(self.cookies)

            # NOTE(review): presumably this element only exists on the page
            # reached after a successful login -- confirm. Cookies are already
            # saved at this point, so a failed lookup does not lose them.
            print('search', driver.find_element_by_id('J_searchbg').text)
        finally:
            driver.quit()
            # Forget the closed session so a later access starts a fresh one.
            self._driver = None

    def savecookies(self, cookies):
        """Write ``cookies`` to ``cookies.txt`` as one ``name:value`` line each.

        Args:
            cookies: Mapping of cookie name to value. ``None`` or an empty
                mapping leaves an empty file behind.
        """
        with open(self._cookie_path, 'w', encoding='utf-8') as out:
            out.writelines(
                '{0}:{1}\n'.format(name, value)
                for name, value in (cookies or {}).items()
            )

    def readcookies(self):
        """Fill ``self.cookies`` from ``cookies.txt``, logging in if needed.

        ``login`` is run when the file is missing or contains no usable
        ``name:value`` line.
        """
        if not self._load_cookie_file():
            self.login()

    def _load_cookie_file(self):
        """Merge cached cookies into ``self.cookies``; return True if any were read."""
        if not os.path.exists(self._cookie_path):
            return False
        loaded = False
        with open(self._cookie_path, encoding='utf-8') as src:
            for line in src:
                # Split on the first ':' only, because cookie values may
                # themselves contain ':'; drop the line terminator as well.
                name, sep, value = line.rstrip('\r\n').partition(':')
                if sep and name:
                    self.cookies[name] = value
                    loaded = True
        return loaded
if __name__ == '__main__':
    # Script entry point: reuse cookies.txt when it is present, otherwise
    # perform a browser login, then print the cookies that were obtained.
    session = LoginCookies()
    session.readcookies()
    print('logindcookies:', session.cookies)