import time
from selenium import webdriver
from bs4 import BeautifulSoup
# Scrape product name and price from a JD.com phone-search results page.
# NOTE(review): needs a local Chrome + chromedriver; selenium and bs4 are
# third-party dependencies imported at the top of the file.
url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=1&s=1&click=0"
driver = webdriver.Chrome()
driver.implicitly_wait(3)
try:
    driver.get(url)
    # Scroll to the bottom a few times so lazily-loaded listings render.
    for _ in range(1, 5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
    # Hand the fully loaded page source to BeautifulSoup for parsing.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # Each listing is a .gl-item; pull out the title and the price text.
    for info in soup.select(".gl-item"):
        title = info.select(".p-name.p-name-type-2 a")[0].text.strip()
        price = info.select(".p-price")[0].text.strip()
        print(title)
        print(price)
finally:
    # Bug fix: close() only closes the current window and can leave the
    # chromedriver process running; quit() tears down the whole session.
    # try/finally guarantees cleanup even if scraping raises.
    driver.quit()
import requests
from bs4 import BeautifulSoup
import json
def check(items):
    """Return *items* unchanged, or a placeholder string when it is empty.

    Used to substitute a readable marker when a listing has no
    publisher/shop entry.
    """
    # Truthiness covers empty str/list as well as len() == 0.
    return items if items else "No Public House"
def got_html(url):
    """Fetch *url* with a browser User-Agent and return the decoded HTML.

    Bug fix: the original reassigned the ``url`` parameter to a
    hard-coded first-page search URL, so the paginated URLs built in
    main() were ignored and page 1 was fetched every time.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/69.0.3497.100 Safari/537.36'}
    response = requests.get(url, headers=headers)
    # Decode the raw bytes ourselves rather than trusting response.text's
    # guessed encoding (matches the original behavior).
    return response.content.decode()
def parse_html(html):
    """Extract price, title, comment count and publisher for every listing.

    Returns a flat list in item order:
    ``[price1, name1, comments1, publisher1, price2, ...]``.

    Bug fixes vs. the original (indentation-mangled) version:
    - ``return result`` now happens after the loop so ALL listings are
      collected, not just the first.
    - a listing with no publisher/shop element no longer raises
      IndexError; check() supplies the "No Public House" placeholder,
      which the original code path could never actually reach.
    """
    soup = BeautifulSoup(html, 'lxml')
    item_list = soup.select('ul[class="gl-warp clearfix"] li')
    print(len(item_list))
    result = []
    for item in item_list:
        sku = item.attrs['data-sku']
        # Price: the element's class is derived from the item's SKU.
        data_sku = '.J_' + sku + ' i'
        result.append(item.select(data_sku)[0].get_text())
        # Book title.
        name = item.select('div[class="p-name p-name-type-2"]')[0].get_text().strip()
        result.append(name)
        # Comment count: the element id is derived from the SKU.
        data_id = '#J_comment_' + sku
        result.append(item.select(data_id)[0].get_text().strip())
        # Publisher/shop — may be absent for some listings.
        shop = item.select('div > div.p-shop > span > a')
        public = check(shop[0].get_text().strip() if shop else "")
        result.append(public)
    return result
def save_data(data):
    """Append *data* to 'Java_book.json' as one JSON document per line.

    Bug fix: the original appended documents back-to-back with no
    separator, producing concatenated JSON that no parser could read
    back. Writing a trailing newline yields valid JSON Lines.
    """
    with open('Java_book.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False) + '\n')
def main():
    """Scrape 10 pages of JD 'Java' book search results and persist them.

    Builds each paginated search URL, fetches it, parses the listings,
    and appends the extracted fields to 'Java_book.json'.
    """
    url_start = 'https://search.jd.com/Search?keyword=Java&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&page='
    url_end = '&s=58&click=0'
    for page in range(1, 11):
        url = url_start + str(page) + url_end
        html = got_html(url)
        details = parse_html(html)
        save_data(details)


# Bug fix: main() was defined but never invoked anywhere in the file,
# so the whole requests-based scraper was dead code.
if __name__ == "__main__":
    main()