import re
import pymysql
from lxml import etree
from selenium import webdriver
#一下三行用于等待判断页面是否加载完毕
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# Start a Chrome browser controlled through Selenium.
brower = webdriver.Chrome()
# Open the MySQL connection to the local `taobao` database.
# The charset is requested explicitly because the scraped titles, city and
# shop names are Chinese text; this avoids depending on the driver's default
# connection charset (NOTE(review): the default differs between PyMySQL
# releases -- confirm for the installed version).
con = pymysql.connect(
    host='localhost',
    user='root',
    password='',
    db='taobao',
    port=3306,
    charset='utf8mb4',
)
# Cursor shared by the functions in this script.
cur = con.cursor()
# Create the destination table only if it is missing: a plain CREATE TABLE
# raises "table already exists" on every run after the first one.
# `prince` is DECIMAL(10,2) rather than FLOAT(4,2), because FLOAT(4,2) cannot
# represent prices of 100.00 or more. A table created by an earlier run keeps
# its old column definition and has to be altered by hand.
cur.execute(
    "CREATE TABLE IF NOT EXISTS yifu ("
    "id int(4) NOT NULL auto_increment PRIMARY KEY,"
    "title VARCHAR(60),"
    "prince DECIMAL(10,2),"
    "people int(10),"
    "city VARCHAR(10),"
    "shop VARCHAR(20),"
    "img VARCHAR(200)"
    ") DEFAULT CHARSET=utf8mb4"
)
def search(max_retries=3):
    """Open Taobao, search for the keyword and scrape the first result page.

    Steps performed on each attempt:
      1. load the Taobao home page;
      2. wait for the search box and the search button;
      3. type the keyword into the box and click the button;
      4. wait for the pager element that shows the total page count;
      5. store the products of the first result page via get_products().

    :param max_retries: how many attempts are made before giving up when the
        page keeps timing out (must be >= 1).
    :return: the text of the "total pages" pager element; the caller extracts
        the number from it.
    :raises ValueError: if max_retries is smaller than 1.
    :raises TimeoutException: if every attempt timed out.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    # A bounded loop replaces the former unconditional self-recursion, which
    # retried forever when the page never loaded.
    for attempt in range(1, max_retries + 1):
        try:
            brower.get('https://www.taobao.com')  # open the Taobao home page
            # Explicit wait: each until() blocks for up to 10 seconds until
            # the element is present in the DOM.
            wait = WebDriverWait(brower, 10)
            in_put = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )
            # Type the search keyword and submit the form.
            in_put.send_keys('衣服')
            submit.click()
            # Wait for the pager element that displays the total page count.
            total_page = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total_page.text
        except TimeoutException:
            # Retry from the home page; re-raise once the budget is used up so
            # the caller sees the real failure.
            if attempt == max_retries:
                raise
def next_page(page_num, max_retries=3):
    """Jump to result page *page_num* and store the products shown on it.

    The page number is typed into the pager's input box and submitted; once
    the pager highlights that number as the active page, get_products() is
    called for the page.

    :param page_num: number of the result page to open.
    :param max_retries: how many attempts are made before giving up when the
        page keeps timing out (must be >= 1).
    :return: None
    :raises ValueError: if max_retries is smaller than 1.
    :raises TimeoutException: if every attempt timed out.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    # A bounded loop replaces the former unconditional self-recursion, which
    # retried forever when the pager never appeared.
    for attempt in range(1, max_retries + 1):
        try:
            wait = WebDriverWait(brower, 10)
            in_put = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            in_put.clear()
            in_put.send_keys(str(page_num))
            submit.click()
            # text_to_be_present_in_element becomes true once the active pager
            # item contains the requested page number, i.e. the jump is done.
            active = wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_num),
                )
            )
            print(active)
            get_products()
            return
        except TimeoutException:
            # Retry the same page; re-raise once the budget is used up.
            if attempt == max_retries:
                raise
# Matches the first run of digits in a text node (used for the buyer count).
_DIGITS_RE = re.compile(r'(\d+)')

_INSERT_PRODUCT_SQL = (
    "INSERT INTO yifu (title,prince,people,city,shop,img) "
    "VALUES (%s,%s,%s,%s,%s,%s)"
)


def _first_match(node, path):
    """Return the first result of XPath *path* on *node*, stripped, or None."""
    found = node.xpath(path)
    return found[0].strip() if found else None


def _parse_product(item):
    """Build the product dict for one item card; missing fields become None."""
    # Fall back to @src when the card carries no @data-src attribute.
    img = _first_match(item, './div/div/div/a/img/@data-src')
    if img is None:
        img = _first_match(item, './div/div/div/a/img/@src')

    price_text = _first_match(item, './div[2]/div/div/strong/text()')
    try:
        price = float(price_text) if price_text is not None else None
    except ValueError:
        price = None

    people_text = _first_match(item, './div[2]/div[1]/div[2]/text()')
    people_match = _DIGITS_RE.search(people_text) if people_text else None
    people = int(people_match.group(1)) if people_match else None

    return {
        'img': img,
        'prince': price,
        'people': people,
        'title': _first_match(item, './div/div/div/a/img/@alt'),
        'city': _first_match(item, './div[2]/div[3]/div[2]/text()'),
        'shop': _first_match(item, './div[2]/div[3]/div/a/span[2]/text()'),
    }


def get_products():
    """Parse the current result page and insert its products into `yifu`.

    Waits until the item list is present, parses the page source with lxml,
    prints every product dict, and inserts all products of the page followed
    by a single commit.

    Cards without a title are skipped. Any other field that cannot be found
    or parsed is stored as NULL instead of aborting the whole run with an
    IndexError/AttributeError, as the unguarded ``[0]`` / ``.group(1)``
    accesses used to do.

    :return: None
    :raises TimeoutException: if the item list does not appear in 10 seconds.
    """
    # Block until the product cards are present in the DOM.
    WebDriverWait(brower, 10).until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist > div > div > div:nth-child(1) > div')
        )
    )
    tree = etree.HTML(brower.page_source)
    items = tree.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')

    rows = []
    for item in items:
        product = _parse_product(item)
        if product['title'] is None:
            # No title found under this node: skip it rather than store an
            # empty row.
            continue
        print(product)
        rows.append((
            product['title'],
            product['prince'],
            product['people'],
            product['city'],
            product['shop'],
            product['img'],
        ))

    if rows:
        cur.executemany(_INSERT_PRODUCT_SQL, rows)
        con.commit()  # one commit per result page
def main():
    """Run the whole scrape: search, then walk every remaining result page.

    search() handles the first result page and returns the pager text holding
    the total page count; pages 2..total are then visited with next_page().
    The database connection and the browser are released even when scraping
    fails part-way.

    :return: None
    :raises ValueError: if no page count can be found in the pager text.
    """
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'could not find the total page count in %r' % (total_text,)
            )
        total = int(match.group(1))
        print(total)
        # Page 1 was already stored by search().
        for page in range(2, total + 1):
            next_page(page)
    finally:
        try:
            con.close()  # close the database connection
        finally:
            # quit() ends the whole WebDriver session; close() would only
            # close the current window.
            brower.quit()


if __name__ == '__main__':
    main()
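# Selenium script that drives the Taobao site and stores the search results
# for the keyword '衣服' (clothes) in a MySQL database.
# (Web-page residue: "You may also like".)
# Reposted from blog.csdn.net/a_lazy_zhu/article/details/80858079
# (Web-page residue: "Today's recommendations".)
# (Web-page residue: "Weekly ranking".)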