List page
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree
import time
import csv
import pymongo
# Open a browser instance
# def openBrower(brower_type):
#     if brower_type == 'chrome':
#         option = webdriver.ChromeOptions()
#         option.add_argument('--headless')
#         return webdriver.Chrome(options=option, executable_path='chromedriver.exe')
#     else:
#         print('Browser launcher received an unsupported browser type!')
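The commented-out helper above only ever returns a headless Chrome, while the live code below launches Firefox directly. A minimal sketch of a launcher that supports both engines, assuming chromedriver/geckodriver are on PATH; the function name open_browser is mine, not from the original script:

from selenium import webdriver

def open_browser(browser_type='firefox', headless=True):
    """Return a WebDriver for the requested engine (a sketch, not the original helper)."""
    if browser_type == 'chrome':
        opts = webdriver.ChromeOptions()
        if headless:
            opts.add_argument('--headless')
        return webdriver.Chrome(options=opts)
    elif browser_type == 'firefox':
        opts = webdriver.FirefoxOptions()
        if headless:
            opts.add_argument('--headless')
        return webdriver.Firefox(options=opts)
    raise ValueError('Unsupported browser type: %s' % browser_type)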
def parse_website(ky, start_page, end_page):
    # browser = openBrower('chrome')  # open Chrome via the helper above
    options = webdriver.FirefoxOptions()
    options.add_argument('--disable-gpu')  # work around rendering bugs in headless mode
    options.add_argument('--hide-scrollbars')  # hide scrollbars on pages with unusual layouts
    options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading for speed (a Blink/Chromium flag; Firefox ignores it)
    options.add_argument('--headless')  # no visible window; required on servers without a display
    browser = webdriver.Firefox(options=options)
    wait = WebDriverWait(browser, 10)  # explicit wait, up to 10 seconds
    browser.get("https://www.jd.com")  # open the JD home page
    # Wait until the JD search box is present in the DOM
    input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#key")))
    input.send_keys(ky)  # type the keyword into the search box
    # Wait until the search button is clickable
    submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.button')))
    submit.send_keys(Keys.ENTER)  # simulate pressing the search button
    # Optional: filter results by price range
    # low_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_selectorPrice > div.f-price-set > div:nth-child(1) > input")))  # minimum price
    # low_input.clear()  # clear the current contents of the input box
    # low_input.send_keys(low_price)
    # high_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_selectorPrice > div.f-price-set > div:nth-child(3) > input")))  # maximum price
    # high_input.clear()  # clear the current contents of the input box
    # high_input.send_keys(high_price)
    # price_submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.item2')))  # confirm button
    # price_submit.send_keys(Keys.ENTER)  # simulate clicking the confirm button
    for i in range(start_page, end_page + 1):
        time.sleep(1)
        # Wait until the "go to page [..]" input at the bottom of the results is present
        input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")))
        input.clear()  # clear the current contents of the input box
        input.send_keys(i)  # enter the target page number
        # Wait until the confirm button next to the page-number input is clickable
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
        submit.send_keys(Keys.ENTER)  # simulate clicking the confirm button
        time.sleep(3)
        browser.refresh()  # refresh the page
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")  # scroll to the bottom to trigger lazy loading (a stepwise-scroll sketch appears after this script)
        # Each page lists 60 products; make sure all 60 have been loaded.
        try:
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))
        except:
            pass
        html = browser.page_source  # grab the HTML once the page has rendered
        time.sleep(1)
        print('Parsing page %d' % i)
        parseHtml = etree.HTML(html)
        li_list = parseHtml.xpath('//*[@class="gl-item"]')
        count = 0
        for item in li_list:
            count += 1
            print('Parsing item %d' % count)
            try:
                id = item.xpath('./@data-sku')[0]
            except:
                id = 'NaN'
            print(id)
            try:
                img_url = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/img/@data-lazy-img')[0]
                img_url = f'https:{img_url}'
                # print(img_url)
            except:
                img_url = 'NaN'
            try:
                detail_url = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@href')[0]
                if detail_url[:4] != 'http':
                    detail_url = f'https:{detail_url}'
                # print(detail_url)
            except:
                detail_url = 'NaN'
            try:
                price = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')[0]
            except:
                price = 'NaN'
            try:
                title = ''.join(item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-name p-name-type-2"]/a/em//text()'))
            except:
                title = 'NaN'
            # Mark items whose title starts with "京东超市" (JD supermarket / self-operated) as verified
            if '京东超市' == title[:4]:
                verify = True
            else:
                verify = False
            try:
                comment_num = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-commit"]/strong/a/text()')[0]
            except:
                comment_num = 'NaN'
            try:
                shop_url = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-shop"]/span/a/@href')[0]
            except:
                shop_url = 'NaN'
            try:
                shop_name = item.xpath('.//div[@class="gl-i-wrap"]/div[@class="p-shop"]/span/a/text()')[0]
            except:
                shop_name = 'NaN'
            try:
                info = {'name': ky, 'good_id': id, 'title': title, 'price': price, 'img_url': img_url,
                        'detail_url': detail_url, 'verify': verify, 'comment_num': comment_num,
                        'shop_name': shop_name, 'shop_url': shop_url}
                save(info)
            except:
                pass
            # with open('京东详情链接.txt', 'a') as f:
            #     f.write(detail_url)
            #
            # with open('京东详情链接.txt', 'r') as fp:
            #     for line in fp:
            #         line = line.strip('\n')
            #         # print(line)
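Each field above repeats the same "try the XPath, take index 0, fall back to 'NaN'" pattern. A minimal sketch of a helper that could replace those blocks; first_or_default is my name, not part of the original script:

def first_or_default(item, xpath, default='NaN'):
    """Return the first result of an XPath query on an lxml element, or a default."""
    result = item.xpath(xpath)
    return result[0] if result else default

# Example usage with the same expressions as above:
# good_id = first_or_default(item, './@data-sku')
# price = first_or_default(item, './/div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')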
# MongoDB storage
def save(result):
    client = pymongo.MongoClient('localhost', 27017)
    dbname = client['spider']
    MONGO_TABLE = dbname['jdlb']
    MONGO_TABLE.insert_one(result)  # insert_one replaces the deprecated Collection.insert
# Alternative: store locally as CSV
# def save(list):
#     print(list)
#     csvFile = open(fr'{ky}.csv', 'a+', newline='', encoding='utf-8-sig')  # set newline='' to avoid blank lines between rows
#     writer = csv.writer(csvFile)
#     writer.writerow(list)
#     csvFile.close()
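save() opens a fresh MongoClient for every record and never closes it, which adds up when each page yields 60 items. A sketch of the usual pattern of one module-level client reused by every call, pointing at the same 'spider' database and 'jdlb' collection as above; this is an assumed restructuring, not the original code:

import pymongo

_client = pymongo.MongoClient('localhost', 27017)  # one connection pool for the whole run
_collection = _client['spider']['jdlb']            # same database/collection as save() above

def save_shared(result):
    """Insert one listing record using the shared client."""
    _collection.insert_one(result)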
if __name__ == "__main__":
    ky = input('Enter the product keyword to crawl: ')
    start = time.time()
    start_page = int(input('Enter the start page (1-80): '))
    end_page = int(input('Enter the end page (1-80): '))
    # low_price = input('Enter the minimum price of the search range: ')
    # high_price = input('Enter the maximum price of the search range: ')
    # save(['product ID', 'title', 'price', 'image', 'detail page', 'self-operated', 'comment count', 'shop name', 'shop URL'])
    # main(ky)
    parse_website(ky, start_page, end_page)
    end = time.time()
    print('Elapsed: %.1f seconds' % (end - start))
    print('Crawl finished, thanks for using!')
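The single window.scrollTo call in parse_website jumps straight to the bottom of the page, and JD's listing lazy-loads the second half of each page as you scroll, which is why the li:nth-child(60) wait sometimes times out. A sketch of scrolling down in steps instead; the function name, step count, and pause are mine and can be tuned:

import time

def scroll_to_bottom(browser, steps=10, pause=0.5):
    """Scroll the page down in increments so lazy-loaded items have time to appear."""
    height = browser.execute_script("return document.body.scrollHeight")
    for n in range(1, steps + 1):
        browser.execute_script("window.scrollTo(0, arguments[0])", height * n / steps)
        time.sleep(pause)

# Usage: call scroll_to_bottom(browser) in place of the single scrollTo, before waiting for the 60th item.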
Detail page
import csv
import time
from selenium import webdriver
import pymongo
# ky (a ';'-separated string of detail-page URLs) is read from user input in __main__ below
def get_detail(ky):
    # ky = input("Enter the product detail links: ")
    urls = ky.split(';')  # split the combined string into individual URLs on ';'
    # print(urls)
    for i in urls:  # visit every URL in turn
        url = i  # one detail-page URL
        # print(url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}  # note: Selenium never uses this dict; it has no effect here
        options = webdriver.FirefoxOptions()
        options.add_argument('--disable-gpu')  # work around rendering bugs in headless mode
        options.add_argument('--hide-scrollbars')  # hide scrollbars on pages with unusual layouts
        options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading for speed (a Blink/Chromium flag; Firefox ignores it)
        options.add_argument('--headless')  # no visible window; required on servers without a display
        browser = webdriver.Firefox(options=options)  # a new browser per URL; see the reuse sketch after this function
        # browser = webdriver.Chrome()  # or use the Chrome engine
        # browser = webdriver.Firefox()  # or a non-headless Firefox
        try:
            browser.get(url)  # load the detail page
        except:
            browser.quit()  # release the instance before skipping a URL that failed to load
            continue
        # browser.maximize_window()  # maximize the window
        try:
            button = browser.find_element_by_id('sufei-dialog-close')  # close the login dialog if it appears
            button.click()
        except:
            pass
        # time.sleep(1)  # wait for dynamic content to load
        # wait = WebDriverWait(browser, 5)  # wait for dynamic content to load
        # Thumbnail images 1-5 from the product's #spec-list strip
        # (li:nth-child indexing assumes the classic item.jd.com thumbnail layout)
        imgs = []
        for n in range(1, 6):
            try:
                src = browser.find_element_by_css_selector(
                    '#spec-list > ul > li:nth-child(%d) > img' % n).get_attribute('src')
                # src = src.replace('60x60', '430x430')  # optionally request the larger image size
            except:
                src = ' '
            imgs.append(src)
        img1, img2, img3, img4, img5 = imgs
        # Detail-page URL
        print('Detail URL:', url)
        # Product title
        try:
            title = browser.find_element_by_class_name('sku-name').text
        except:
            title = ' '
        # print('Product title:', title)
        # Shop name
        try:
            set_meal = browser.find_element_by_xpath('//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[1]/div/a').text
        except:
            set_meal = ' '
        # print('Shop name:', set_meal)
        # Shipping location
        try:
            address = browser.find_element_by_id('summary-service').text
        except:
            address = ' '
        # print('Shipping location:', address)
        # Product price
        try:
            price = browser.find_element_by_class_name("p-price").text
        except:
            price = ' '
        # print('Product price:', price)
        # Number of reviews
        try:
            evaluation = browser.find_element_by_css_selector('#comment-count > a').text
            # evaluation = evaluation.replace('累计评价', '')
        except:
            evaluation = ' '
        # print('Number of reviews:', evaluation)
        # Option/combo attributes
        try:
            combo = browser.find_element_by_id('choose-attrs').text
            combo = combo.split('\n')
            combo = combo[0:-1:1]  # drop the trailing line
        except:
            combo = ' '
        # print('Option/combo attributes:', combo)
        # Product specification parameters
        try:
            details = browser.find_element_by_xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[@class="parameter2 p-parameter-list"]').text
            details = details.split('\n')
        except:
            details = ' '
        # print('Product details:', details)
        # try:
        #     info = [url, img1, img2, img3, img4, img5, title, set_meal, address, price, evaluation, combo, details]
        #     save(info)
        # except:
        #     pass
        try:
            info = {'detail_url': url, 'img1': img1, 'img2': img2, 'img3': img3, 'img4': img4, 'img5': img5,
                    'title': title, 'set_meal': set_meal, 'address': address, 'price': price,
                    'evaluation': evaluation, 'combo': combo, 'details': details}
            save(info)
        except:
            pass
        # with open('京东商品详情信息.csv', 'a', newline='', encoding='utf-8-sig') as fp:
        #     csv_writer = csv.writer(fp, delimiter=',')
        #     csv_writer.writerow([
        #         url, img1, img2, img3, img4, img5, title, set_meal, address, price, evaluation, combo, details
        #     ])
        # Close the headless browser for this URL
        browser.close()
        browser.quit()
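get_detail launches and tears down a full headless Firefox for every URL, which dominates the runtime when the link list is long. A sketch of moving the driver outside the loop so one instance serves all URLs; this is a restructuring suggestion rather than the original flow, and build_options is a hypothetical helper standing in for the option setup above:

from selenium import webdriver

def get_detail_reuse(ky, build_options):
    """Visit every URL with a single shared browser instead of one per URL (sketch)."""
    browser = webdriver.Firefox(options=build_options())
    try:
        for url in ky.split(';'):
            try:
                browser.get(url)
            except:
                continue  # skip URLs that fail to load
            # ... same field extraction and save(info) as in get_detail ...
    finally:
        browser.quit()  # always release the browser, even on errors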
# Alternative: store the JD detail data as CSV
# def save(list):
#     csvFile = open('京东商品详情信息.csv', 'a', newline='', encoding='utf-8-sig')  # set newline='' to avoid blank lines between rows
#     writer = csv.writer(csvFile)
#     writer.writerow(list)
#     csvFile.close()
# MongoDB storage
def save(result):
    client = pymongo.MongoClient('localhost', 27017)
    dbname = client['spider']
    MONGO_TABLE = dbname['jdxq']
    MONGO_TABLE.insert_one(result)  # insert_one replaces the deprecated Collection.insert
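The details field saved above is just the raw text of JD's parameter list split on newlines, where each line typically reads like "品牌:XXX". A sketch of turning those lines into a key/value dict before saving, assuming that colon-separated format; the helper name is mine:

def parse_parameters(lines):
    """Turn 'key:value' parameter lines into a dict; lines without a colon are skipped."""
    params = {}
    for line in lines:
        line = line.replace('：', ':')  # normalise full-width colons
        if ':' in line:
            key, value = line.split(':', 1)
            params[key.strip()] = value.strip()
    return params

# Example: parse_parameters(['品牌:Apple', '型号:iPhone']) -> {'品牌': 'Apple', '型号': 'iPhone'}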
if __name__ == '__main__':
    # CSV header row
    # headers = ['detail URL', 'image 1', 'image 2', 'image 3', 'image 4', 'image 5', 'title', 'shop name', 'shipping location', 'price', 'review count', 'combo attributes', 'product details']
    # save(headers)
    ky = input('Enter the product detail links, separated by ";": ')
    get_detail(ky)
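The list-page script stores each product's detail_url in the spider/jdlb collection, while get_detail expects one ';'-separated string of URLs. A sketch of wiring the two scripts together by reading the stored links back out of MongoDB; a rough connector under those assumptions, not part of the original code:

import pymongo

def detail_urls_from_mongo(limit=50):
    """Join detail_url values saved by the list-page crawler into the string get_detail expects."""
    client = pymongo.MongoClient('localhost', 27017)
    docs = client['spider']['jdlb'].find({}, {'detail_url': 1}).limit(limit)
    urls = [d['detail_url'] for d in docs if d.get('detail_url', 'NaN') != 'NaN']
    client.close()
    return ';'.join(urls)

# Example:
# get_detail(detail_urls_from_mongo())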