selenium模拟登录淘宝网并且将‘衣服’相关信息下载储存在MySQL数据库

import re
import pymysql
from lxml import etree
from selenium import webdriver
#以下三行用于等待判断页面是否加载完毕
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import TimeoutException

# Start a Chrome session driven by Selenium; the functions in this file share
# this single driver object.
brower = webdriver.Chrome()

# Open the MySQL connection. The charset is set explicitly because the stored
# titles, cities and shop names are Chinese text, and the connection encoding
# should not depend on whatever the driver/server default happens to be.
con = pymysql.connect(
    host='localhost',
    user='root',
    password='',
    db='taobao',
    port=3306,
    charset='utf8mb4',
)
# Cursor used for every statement issued by this script.
cur = con.cursor()
# Create the result table only when it is missing, so the script can be run
# more than once without failing on "table already exists".
# The price column keeps its original name ("prince") because the INSERT in
# this file refers to it; its type is widened from FLOAT(4,2), which cannot
# hold values above 99.99, to a fixed-point DECIMAL(10,2).
cur.execute(
    "CREATE TABLE IF NOT EXISTS yifu ("
    "id int(4) NOT NULL auto_increment PRIMARY KEY ,"
    "title VARCHAR(60),"
    "prince DECIMAL(10,2),"
    "people int(10),"
    "city VARCHAR(10),"
    "shop VARCHAR(20),"
    "img VARCHAR(200))"
)



def search(max_retries=None):
    """Open Taobao, search for the keyword and scrape the first result page.

    Steps: load the home page, wait for the search box and the search button,
    type the keyword, click the button, wait for the "total pages" label,
    then call ``get_products()`` for the first result page.

    :param max_retries: how many times to start over after a
        ``TimeoutException``. ``None`` (the default) retries without limit.
    :return: the text of the "total pages" label; the caller extracts the
        page count from it.
    :raises TimeoutException: only when ``max_retries`` is set and exhausted.
    """
    attempts = 0
    # Retry in a loop instead of by self-recursion, so a page that keeps
    # timing out cannot end in a RecursionError.
    while True:
        try:
            brower.get('https://www.taobao.com')  # open the Taobao home page

            # Explicit wait: block (up to 10 s) until the search box exists.
            in_put = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            # Wait until the search button can actually be clicked.
            submit = WebDriverWait(brower, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )

            in_put.send_keys('衣服')  # type the search keyword
            submit.click()

            # Wait for the label that shows the total number of pages.
            total_page = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            total_text = total_page.text
            get_products()
            return total_text
        except TimeoutException:
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise


def next_page(page_num, max_retries=None):
    """Jump to result page ``page_num`` via the pager form and scrape it.

    Types the page number into the pager input, clicks the pager's submit
    button, waits until the highlighted (active) pager item shows that
    number, then calls ``get_products()``.

    :param page_num: page number to jump to.
    :param max_retries: how many times to start over after a
        ``TimeoutException``. ``None`` (the default) retries without limit.
    :return: None
    :raises TimeoutException: only when ``max_retries`` is set and exhausted.
    """
    attempts = 0
    # Retry in a loop instead of by self-recursion, so a page that keeps
    # timing out cannot end in a RecursionError.
    while True:
        try:
            in_put = WebDriverWait(brower, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = WebDriverWait(brower, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            in_put.clear()
            in_put.send_keys(str(page_num))
            submit.click()

            # text_to_be_present_in_element is truthy once the given text
            # appears inside the located element; here it confirms that the
            # active pager item shows the page number we just requested.
            active = WebDriverWait(brower, 10).until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_num),
                )
            )
            print(active)
            get_products()
            return
        except TimeoutException:
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise


# First run of digits in a string; compiled once instead of once per item.
_DIGITS_RE = re.compile(r'(\d+)')


def _first(nodes):
    """Return the first XPath result, or None when the query matched nothing."""
    return nodes[0] if nodes else None


def _parse_product(item):
    """Build the product dict for one listing node.

    Returns None when any expected field is absent or cannot be converted,
    so the caller can skip that node instead of aborting the whole page.
    """
    img = _first(item.xpath('./div/div/div/a/img/@data-src'))
    price_text = _first(item.xpath('./div[2]/div/div/strong/text()'))
    people_text = _first(item.xpath('./div[2]/div[1]/div[2]/text()'))
    title = _first(item.xpath('./div/div/div/a/img/@alt'))
    city = _first(item.xpath('./div[2]/div[3]/div[2]/text()'))
    shop = _first(item.xpath('./div[2]/div[3]/div/a/span[2]/text()'))

    fields = (img, price_text, people_text, title, city, shop)
    if any(value is None for value in fields):
        return None

    people_match = _DIGITS_RE.search(people_text)
    if people_match is None:
        return None
    try:
        price = float(price_text)
    except ValueError:
        return None

    return {
        'img': img,
        'prince': price,
        'people': int(people_match.group(1)),
        'title': title,
        'city': city,
        'shop': shop,
    }


def get_products():
    """Scrape every product on the current result page into the ``yifu`` table.

    Waits for the item list to be present, parses the page source with lxml,
    prints and inserts one row per product, and commits once at the end.
    Listing nodes that lack an expected field are reported and skipped.
    A ``TimeoutException`` from the wait is not caught here.
    """
    # Wait until the product list nodes exist in the page.
    WebDriverWait(brower, 10).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist > div > div > div:nth-child(1) > div')
        )
    )

    html = etree.HTML(brower.page_source)
    items = html.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')
    for item in items:
        product = _parse_product(item)
        if product is None:
            print('skipped a listing item with missing or unparseable fields')
            continue
        print(product)
        cur.execute(
            "INSERT INTO yifu (title,prince,people,city,shop,img) VALUES (%s,%s,%s,%s,%s,%s)",
            (
                product['title'],
                product['prince'],
                product['people'],
                product['city'],
                product['shop'],
                product['img'],
            ),
        )
    con.commit()  # persist all rows of this page




def main():
    """Run the whole scrape: search, then walk result pages 2..total.

    ``search()`` handles page 1 and returns the "total pages" label text;
    the page count is the first run of digits in it. The database connection
    and the browser are released even when scraping fails part-way.

    :raises ValueError: if the label text contains no digits.
    """
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in %r' % (total_text,))
        total = int(match.group(1))
        print(total)
        for page in range(2, total + 1):
            next_page(page)
    finally:
        try:
            con.close()  # close the database connection
        finally:
            # quit() ends the whole driver session; close() would only
            # close the current window.
            brower.quit()


if __name__ == '__main__':
    main()
selenium模拟登录淘宝网并且将‘衣服’相关信息下载储存在MySQL数据库

猜你喜欢