Python selenium爬虫实例（列举中国糖酒网）

当我们学习 Python 爬虫一段时间后，会发现许多网站如果不登录，数据是获取不到的，这时候就需要想其他办法了。在这里小博向大家介绍的实例，就是通过 selenium 模拟浏览器登录，截取验证码图片并上传到云打码平台进行识别，从而实现网站的登录。

首先我们需要引入以下模块包：

import time
from selenium import webdriver
from YDMHTTP import yanzheng #这里指的是云打码代码，下边会介绍到
from PIL import Image
from selenium.webdriver.support.ui import WebDriverWait

接下来就是爬虫代码

# Project Leader：刘
import time
from selenium import webdriver
from YDMHTTP import yanzheng
from PIL import Image
from selenium.webdriver.support.ui import WebDriverWait
# Locator constants ship with selenium itself; imported here so this script
# section is self-contained. find_element(By.X, ...) works on both Selenium 3
# and Selenium 4, whereas the find_element_by_* helpers were removed in 4.3.
from selenium.webdriver.common.by import By

LOGIN_URL = 'http://t.tangjiu.com/Login?ReturnUrl=%2f'
OUTPUT_FILE = '中国糖酒网.txt'
WAIT_SECONDS = 10
ROWS_XPATH = '//div[@class="manageTab"]/table/tbody/tr'
DETAIL_LINK_XPATH = './/td[@width="135"]/a'


def _wait_for(driver, by, value):
    """Return the first element matching the locator.

    Polls through WebDriverWait for up to WAIT_SECONDS; WebDriverWait raises
    its timeout error if the element never shows up.
    """
    return WebDriverWait(driver, WAIT_SECONDS).until(
        lambda d: d.find_element(by, value))


def _scrape_detail(driver, row, record_no):
    """Open the detail page of one table row and append its data to OUTPUT_FILE.

    The detail link opens in a new window. Whatever happens while scraping,
    that window is closed and focus returns to the list window, so a failure
    on one row cannot derail the rows that follow.
    """
    list_window = driver.current_window_handle
    detail_opened = False
    try:
        # Click the "VIP view" link of this row.
        WebDriverWait(driver, WAIT_SECONDS).until(
            lambda d: row.find_element(By.XPATH, DETAIL_LINK_XPATH)).click()

        # Move focus from the list window to the newly opened window.
        for handle in driver.window_handles:
            if handle != list_window:
                driver.switch_to.window(handle)
                detail_opened = True

        # The phone number is served as an image; its value is the last
        # '='-separated part of the image URL.
        phone_img = _wait_for(driver, By.XPATH, '//li/img')
        info_text = driver.find_elements(By.XPATH, '//div/ul')[8].text
        print('全部信息：', info_text)
        phone = phone_img.get_attribute('src').split('=')[-1]

        # Append the record to the text file.
        with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
            f.write('第{}位\n'.format(record_no))
            f.write(info_text)
            f.write('\n电话号码:{}'.format(phone))
            f.write('\n\n')
        print('电话号码：', phone)
    finally:
        # Only close when a detail window actually received focus; otherwise
        # close() would shut the list window itself.
        if detail_opened:
            driver.close()
        # After closing the detail window, focus must go back to the list
        # window or its content can no longer be accessed.
        driver.switch_to.window(list_window)


# Launch Chrome.
driver = webdriver.Chrome()
try:
    # Open the login page.
    driver.get(LOGIN_URL)
    # Fill in the account name.
    _wait_for(driver, By.ID, 'user').send_keys('dukanglaojiu')
    # Fill in the password.
    _wait_for(driver, By.ID, 'pass').send_keys('密码')

    # Locate the captcha image, then screenshot the whole page.
    captcha = _wait_for(driver, By.ID, 'imgYanZhengMa')
    driver.save_screenshot('page.png')

    # Bounding box of the captcha: x/y position plus its width and height.
    location = captcha.location
    size = captcha.size
    left = location['x']
    top = location['y']
    right = left + size['width']
    bottom = top + size['height']
    print('right', right)
    print('bottom', bottom)

    # Crop the captcha out of the page screenshot and save it locally; the
    # context manager closes the screenshot file afterwards.
    with Image.open('page.png') as page_img:
        page_img.crop((left, top, right, bottom)).save('captcha.png')

    # Upload the cropped captcha to Yundama for online recognition.
    text = yanzheng('captcha.png')
    print('识别结果：', text)

    # Type the recognised text into the captcha input box.
    _wait_for(driver, By.ID, 'vccode').send_keys(text)

    # Click the login button.
    driver.find_element(By.ID, 'BtnLoadByPass').click()
    time.sleep(1)
    # Open the "dealer agency intention" list.
    _wait_for(
        driver, By.XPATH,
        '//div[@class="pz_SideLayer"]/ul[2]/li[4]/a[1]').click()

    record_no = 0
    for page in range(1, 100):
        # Give the page a moment to render before reading the table.
        time.sleep(1)
        # Rows currently shown in the table.
        rows = WebDriverWait(driver, WAIT_SECONDS).until(
            lambda d: d.find_elements(By.XPATH, ROWS_XPATH))
        print(len(rows))
        print('正在爬取第{}页数据'.format(page))
        for row in rows:
            record_no += 1
            try:
                _scrape_detail(driver, row, record_no)
            except Exception as e:
                # Best effort: report the failing row and carry on.
                print(e)

        # Go to the next page; stop when the button is absent.
        # NOTE(review): assumes the last page has no 'nextpaged' element —
        # confirm against the live site.
        next_buttons = driver.find_elements(By.ID, 'nextpaged')
        if not next_buttons:
            break
        next_buttons[0].click()
finally:
    # Always shut the browser down, even when the crawl aborts.
    driver.quit()

接下来就是上面提到的云打码代码（如果没有账号，大家可以申请一个）：

import http.client, mimetypes, urllib, json, time, requests

class YDMHttp:
    """Small client for the Yundama captcha-recognition HTTP API.

    Every call POSTs a form to ``apiurl`` and parses the reply as JSON.
    Methods return the server's negative ``ret`` code on failure and
    ``-9001`` when the parsed reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account and application credentials sent with each call."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _fields(self, method, **extra):
        """Build the form fields for ``method``: credentials plus ``extra``."""
        fields = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        fields.update(extra)
        return fields

    @staticmethod
    def _value_or_error(response, key):
        """Return ``response[key]``, the negative ``ret`` code, or -9001."""
        if not response:
            return -9001
        ret = response['ret']
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return the parsed JSON.

        ``files`` maps a form field name to a local file path.
        """
        return json.loads(self.post_url(self.apiurl, fields, files))

    def balance(self):
        """Return the account balance, or a negative error code."""
        return self._value_or_error(
            self.request(self._fields('balance')), 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        return self._value_or_error(self.request(self._fields('login')), 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at ``filename``; return its cid or an error code."""
        data = self._fields(
            'upload', codetype=str(codetype), timeout=str(timeout))
        response = self.request(data, {'file': filename})
        return self._value_or_error(response, 'cid')

    def result(self, cid):
        """Return the recognised text for ``cid``, or '' if none is available."""
        response = self.request(self._fields('result', cid=str(cid)))
        if not response:
            return ''
        # A reply without a 'text' field counts as "no result yet" instead of
        # raising KeyError.
        return response.get('text') or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll once per second for its result.

        Returns ``(cid, text)`` on success, ``(-3003, '')`` when no text
        arrived within ``timeout`` polls, or ``(error_code, '')`` when the
        upload itself failed.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            text = self.result(cid)
            if text != '':
                return cid, text
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a report for ``cid`` with flag '0'; return ``ret`` or -9001."""
        response = self.request(self._fields('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST ``fields`` plus the files named in ``files``; return the body text.

        ``files`` maps a form field name to a local path. The paths are opened
        in binary mode for the upload and always closed afterwards; the
        caller's mapping is left untouched.
        """
        opened = {}
        try:
            for key, path in (files or {}).items():
                opened[key] = open(path, 'rb')
            res = requests.post(url, files=opened or None, data=fields)
            return res.text
        finally:
            for handle in opened.values():
                handle.close()

######################################################################
def yanzheng(filename):
    """Recognise the captcha image at ``filename`` through Yundama.

    Logs in with the credentials configured below, prints the uid and the
    balance, then uploads the image and waits for the result.

    Returns the recognised text. Returns '' when the credentials are still
    the placeholders, or when ``decode`` yields no text.
    """
    # Account name.
    username = '填写你的账号'

    # Account password.
    password = '（填写你的密码'

    # Software ID, required for the developer revenue share; obtained from
    # "My software" in the developer console.
    appid = 5712

    # Software key, obtained from the same place.
    appkey = '这里是软件密匙'

    # Captcha type, e.g. 1004 = 4 alphanumeric characters. Types are priced
    # differently and a wrong type hurts accuracy; the full list is at
    # http://www.yundama.com/price.html
    codetype = 1004

    # Timeout in seconds.
    timeout = 60

    # Refuse to call the service while the account name is still a
    # placeholder (the vendor sample's 'username' or the one used above).
    if username in ('username', '填写你的账号'):
        print('请设置好相关参数再测试')
        return ''

    # Initialise the client.
    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Recognise the image the caller asked for, rather than a hard-coded path.
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result
######################################################################
if __name__ == '__main__':
    # Manual self-test: recognise 'captcha.png' from the working directory.
    # yanzheng() logs in, prints the uid, the balance and the cid/result, so
    # the same steps are not duplicated here. The account settings live
    # inside yanzheng().
    yanzheng('captcha.png')

**通过以上代码，就可以实现验证码登录网站**

Python selenium爬虫实例（列举中国糖酒网）

猜你喜欢