- 学习 Python 爬虫一段时间后会发现,许多网站如果不登录,数据是获取不到的,这时候就需要想其他办法了。在这里小博向大家介绍的实例,就是通过 selenium 模拟浏览器登录,截取验证码图片并上传到云打码平台进行识别,从而实现网站的登录。
- 首先我们需要引入以下模块包:
import time
from selenium import webdriver
from YDMHTTP import yanzheng #这里指的是云打码代码,下边会介绍到
from PIL import Image
from selenium.webdriver.support.ui import WebDriverWait
- 接下来就是爬虫代码
# Project Leader:刘
import time
from selenium import webdriver
from YDMHTTP import yanzheng
from PIL import Image
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By  # locator strategies for find_element(s)

# Log in to t.tangjiu.com with Selenium (solving the captcha through the
# yanzheng() helper), then walk the dealer-intent list page by page and append
# each contact's details to a text file.

# Launch Chrome.
driver = webdriver.Chrome()
try:
    # Open the login page.
    driver.get('http://t.tangjiu.com/Login?ReturnUrl=%2f')

    # Fill in the account name.
    username = WebDriverWait(driver, 10).until(
        lambda d: d.find_element(By.ID, 'user'))
    username.send_keys('dukanglaojiu')

    # Fill in the password (placeholder - replace with the real one).
    password = WebDriverWait(driver, 10).until(
        lambda d: d.find_element(By.ID, 'pass'))
    password.send_keys('密码')

    # Locate the captcha <img> so it can be cut out of a page screenshot.
    captcha = WebDriverWait(driver, 10).until(
        lambda d: d.find_element(By.ID, 'imgYanZhengMa'))

    # Screenshot the whole page.
    driver.save_screenshot('page.png')

    # Bounding box of the captcha inside the screenshot.
    # NOTE(review): element coordinates are CSS pixels; on a display with a
    # device-pixel ratio other than 1 (or a scrolled page) the crop will be
    # offset - confirm on the target machine.
    left = captcha.location['x']
    top = captcha.location['y']
    right = left + captcha.size['width']
    bottom = top + captcha.size['height']
    print('right', right)
    print('bottom', bottom)

    # Crop the captcha and save it locally; the context manager closes the
    # screenshot file once the crop has been written.
    with Image.open('page.png') as page_img:
        page_img.crop((left, top, right, bottom)).save('captcha.png')

    # Upload the cropped captcha for online recognition.
    text = yanzheng('captcha.png')
    print('识别结果:', text)

    # Type the recognised text into the captcha input box.
    captcha_input = WebDriverWait(driver, 10).until(
        lambda d: d.find_element(By.ID, 'vccode'))
    captcha_input.send_keys(text)

    # Click the login button.
    driver.find_element(By.ID, 'BtnLoadByPass').click()
    time.sleep(1)

    # Open the "dealer / agent intent" entry in the side menu.
    WebDriverWait(driver, 10).until(
        lambda d: d.find_element(
            By.XPATH, '//div[@class="pz_SideLayer"]/ul[2]/li[4]/a[1]')).click()

    # Running counter of processed rows; used to number the records on disk.
    x = 0
    for page in range(1, 100):
        # Give the list a moment to render before reading it.
        time.sleep(1)
        # Rows on the current page. An empty result keeps the wait polling and
        # ends in a TimeoutException, which stops the script.
        neis = WebDriverWait(driver, 10).until(
            lambda d: d.find_elements(
                By.XPATH, '//div[@class="manageTab"]/table/tbody/tr'))
        print(len(neis))
        print('正在爬取第{}页数据'.format(page))

        for row in neis:
            # Remember where we are so the list page can always be restored,
            # even when scraping this row fails half-way.
            list_window = driver.current_window_handle
            handles_before = set(driver.window_handles)
            try:
                x += 1
                # Click the "VIP view" link of this row.
                WebDriverWait(driver, 10).until(
                    lambda d: row.find_element(
                        By.XPATH, './/td[@width="135"]/a')).click()
                # The details open in a new window; wait until its handle
                # exists instead of reading window_handles immediately.
                new_handles = WebDriverWait(driver, 10).until(
                    lambda d: [h for h in d.window_handles
                               if h not in handles_before])
                driver.switch_to.window(new_handles[-1])

                # The phone number is taken from the <img> src query string.
                phone = WebDriverWait(driver, 10).until(
                    lambda d: d.find_element(By.XPATH, '//li/img'))
                name = driver.find_elements(By.XPATH, '//div/ul')
                # Index 8 is the <ul> the script reads the contact details
                # from; it depends on the current page layout.
                print('全部信息:', name[8].text)
                ph = phone.get_attribute('src').split('=')[-1]

                # Append the record to the output text file.
                with open('中国糖酒网.txt', 'a', encoding='utf-8') as f:
                    f.write('第{}位\n'.format(x))
                    f.write(name[8].text)
                    f.write('\n电话号码:{}'.format(ph))
                    f.write('\n\n')
                print('电话号码:', ph)
            except Exception as e:
                # Best effort: report the failure and carry on with the next row.
                print(e)
            finally:
                # Close whatever this row opened and go back to the list
                # window; otherwise the remaining rows cannot be accessed.
                for handle in driver.window_handles:
                    if handle not in handles_before:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(list_window)

        # Go to the next page.
        driver.find_element(By.ID, 'nextpaged').click()
finally:
    # Always shut the browser and driver process down, even after an error.
    driver.quit()
- 接下来就是上面提到的云打码代码(保存为 YDMHTTP.py;如果没有云打码账号,大家可以先申请一个)
import http.client, mimetypes, urllib, json, time, requests
class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every call POSTs the account credentials plus a ``method`` name to
    ``apiurl`` and parses the JSON reply.  Failures are reported through
    negative integer codes rather than exceptions: the server's own ``ret``
    value when it is negative, ``-9001`` when the reply is empty, and
    ``-3003`` when ``decode`` runs out of polling attempts.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account and software credentials (``appid`` is kept as str)."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every API call, plus ``extra``."""
        data = {'method': method, 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        data.update(extra)
        return data

    @staticmethod
    def _field_or_error(response, field):
        """Return ``response[field]``, or the error code describing the failure."""
        if not response:
            return -9001
        ret = response.get('ret')
        if ret and ret < 0:
            return ret
        return response[field]

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return the decoded JSON reply."""
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, or a negative error code."""
        return self._field_or_error(self.request(self._payload('balance')), 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        return self._field_or_error(self.request(self._payload('login')), 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at ``filename``; return its captcha id or a negative error code."""
        data = self._payload('upload', codetype=str(codetype), timeout=str(timeout))
        response = self.request(data, {'file': filename})
        return self._field_or_error(response, 'cid')

    def result(self, cid):
        """Return the recognised text for ``cid``, or '' when none is available."""
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        # A reply without a 'text' field (e.g. an error reply) counts as
        # "no result" instead of raising KeyError.
        return response.get('text') or ''

    def decode(self, filename, codetype, timeout):
        """Upload a captcha and poll once per second for its text.

        Returns ``(cid, text)``.  ``text`` is '' when the upload failed
        (``cid`` is then the negative error code) or when no result arrived
        within ``timeout`` polls (``cid`` is then ``-3003``).
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            result = self.result(cid)
            if result != '':
                return cid, result
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a report for ``cid`` with flag '0'; return the server's ``ret`` or ``-9001``."""
        response = self.request(self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST ``fields`` to ``url`` and return the response body as text.

        ``files`` maps form-field names to file paths.  The files are opened
        only for the duration of the request and the caller's mapping is left
        untouched.
        """
        opened = {}
        try:
            for key, path in (files or {}).items():
                opened[key] = open(path, 'rb')
            res = requests.post(url, files=opened, data=fields)
            return res.text
        finally:
            # Close every handle we opened, whether or not the request succeeded.
            for handle in opened.values():
                handle.close()
######################################################################
def yanzheng(filename, codetype=1004, timeout=60):
    """Recognise the captcha image at ``filename`` through Yundama.

    Args:
        filename: Path of the captcha image to upload.
        codetype: Captcha type id; 1004 means 4 alphanumeric characters.
            Types are priced differently and a wrong type lowers accuracy
            (full list: http://www.yundama.com/price.html).
        timeout: Recognition timeout in seconds.

    Returns:
        The recognised text, or '' when recognition failed or the account
        settings below are still the placeholders.
    """
    # Account name.
    username = '填写你的账号'
    # Account password.
    password = '(填写你的密码'
    # Software ID, required for developer revenue sharing; found under
    # "My software" in the developer console.
    appid = 5712
    # Software key, found in the same place as the software ID.
    appkey = '这里是软件密匙'

    # Refuse to call the service while the credentials are still placeholders.
    if username in ('username', '填写你的账号'):
        print('请设置好相关参数再测试')
        return ''

    # Create the client.
    yundama = YDMHttp(username, password, appid, appkey)
    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)
    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)
    # Recognise: image path, captcha type id, timeout (seconds) -> id and text.
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result
######################################################################
if __name__ == '__main__':
    # Manual smoke test: recognise captcha.png with the credentials set below.
    # Account name.
    username = '账号'
    # Account password.
    password = '密码'
    # Software ID, required for developer revenue sharing; found under
    # "My software" in the developer console.
    appid = 5712
    # Software key, found in the same place as the software ID.
    appkey = '软件密匙'
    # Image file to recognise.
    filename = 'captcha.png'
    # Captcha type; 1004 means 4 alphanumeric characters. Types are priced
    # differently and a wrong type lowers accuracy
    # (full list: http://www.yundama.com/price.html).
    codetype = 1004
    # Timeout in seconds.
    timeout = 60
    # Do not call the service while the credentials are still placeholders.
    if username in ('username', '账号'):
        print('请设置好相关参数再测试')
    else:
        # Create the client.
        yundama = YDMHttp(username, password, appid, appkey)
        # Log in to Yundama.
        uid = yundama.login()
        print('uid: %s' % uid)
        # Query the remaining balance.
        balance = yundama.balance()
        print('balance: %s' % balance)
        # Recognise: image path, captcha type id, timeout (seconds) -> id and text.
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
**通过以上代码,配合云打码识别验证码,就可以实现带验证码网站的登录**