# Focus: mastering BeautifulSoup — locating elements with select/select_one by class or id — and trying out regular expressions.
from bs4 import BeautifulSoup
import requests
import time
import json
import re
# import phantomjs
# Request headers: a desktop Chrome User-Agent plus a jd.com referer so the
# endpoints treat us as a normal browser visit.
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36','referer': 'https://www.jd.com/'}
# Raw Cookie header copied from a logged-in browser session.
cookiestr='shshshfpa=509a4836-a95f-a00b-da6f-a2cee3bdc012-1573951043; shshshfpb=xei61TmhyHUJmvGIu%2FBoS3w%3D%3D; __jdu=787291882; user-key=27687cb1-4237-49c1-be50-1389469ccb2a; cn=0; ipLoc-djd=1-72-4137-0; areaId=1; PCSYCityID=CN_330000_0_0; __jdc=122270672; 3AB9D23F7A4B3C9B=DCJUYZT25TVN4JGXFQIH5WNSLDSVHW4ZJE4YXXJEHTQW7CSAAWIXEJA5SY6KYZWKQQNRQEW5GIBRUEYWYHZTRPD5IU; _gcl_au=1.1.1358316623.1582685147; shshshfp=14c88871408acf96dfa7675a8c41baa8; __jda=122270672.787291882.1573690002.1582682640.1582851083.29; __jdv=122270672|direct|-|none|-|1582851083348; __jdb=122270672.3.787291882|29.1582851083'
cookies={} # build a cookies dict to get past anti-scraping checks
for pair in cookiestr.split(';'):
    # strip() removes the space left after '; '; partition('=') splits on the
    # FIRST '=' only, so values that themselves contain '=' are kept intact
    # (the original split('=')[1] truncated such values and left keys with a
    # leading space).
    name, _, value = pair.strip().partition('=')
    cookies[name] = value
def get_price(skuid):
    """Fetch the original and promotional price for a JD SKU.

    Queries JD's price endpoint, which returns JSONP of the form
    ``jQuery7409665([{...}])``; the embedded JSON object is extracted
    with a regex and parsed once.

    :param skuid: JD product SKU id (str or int).
    :return: tuple ``(original_price, promotion_price)`` — string values
        from the ``'op'`` and ``'p'`` fields of the response.
    """
    url='https://p.3.cn/prices/mgets?callback=jQuery7409665&ext=11101100&pin=&type=1&area=1_72_4137_0&skuIds=J_%s'%skuid
    html = requests.get(url,headers=headers,cookies=cookies) # cookies appear unnecessary here
    pattern = re.compile(r'{.+}')
    # Parse the response exactly once (the original regex-scanned and
    # json.loads'ed the same text twice, once per field).
    data = json.loads(pattern.search(html.text).group(0))
    return data['op'], data['p'] # (original price, promotion price)
def get_comments(skuid):
    """Fetch the comment summary for a JD SKU.

    :param skuid: JD product SKU id (str or int).
    :return: the decoded JSON response as a dict.
    """
    summary_url = 'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=%s'%skuid
    response = requests.get(summary_url, headers=headers, cookies=cookies)
    return json.loads(response.text)
def get_html(url):
    """Download one listing page and hand it to ``parse_html``.

    On a 200 response the body is parsed (and its products printed);
    otherwise an error message is printed. In both cases the raw
    response text is returned.

    NOTE(review): a selenium + PhantomJS fallback used to live here for
    when JD blocks plain requests; it was dead (commented-out) code and
    has been removed.

    :param url: listing-page URL to fetch.
    :return: the response body text.
    """
    html=requests.get(url,headers=headers)
    if html.status_code==200:
        parse_html(html.text)
    else:
        print('Error:',html.text)
    return html.text
def parse_html(html):
    """Parse a JD goods-list page and print one line per product.

    For each ``<li>`` product card, prints shop name, SKU, product name,
    link, image URL, promotion price and comment statistics; finally
    prints the total number of cards found and the number successfully
    parsed.

    :param html: raw HTML text of a JD list page.
    """
    soup=BeautifulSoup(html,'lxml')
    # The '>' must not be omitted, or duplicate items are matched
    # ('#' selects by id, '.' by class).
    products=soup.select('#J_goodsList > ul > li')
    n=0
    for item in products:
        try:
            shopname = item.select_one('div > div.p-shop').get('data-shop_name')
            sku = item.get('data-sku')
            comments = get_comments(sku)
            price = get_price(sku)[1]
            productname = item.select_one('div > div.p-name > a > em').text.strip()
            productlink = 'http:' + item.select_one('div > div.p-img > a')['href']
            img_tag = item.select_one('div > div.p-img > a > img')
            # Lazy-loaded images carry their URL in 'data-lazy-img'
            # instead of 'src'; handle both without exception control flow
            # (the original used a bare except whose finally block printed
            # names that could be undefined or stale — a NameError risk).
            img = 'http:' + (img_tag.get('src') or img_tag.get('data-lazy-img'))
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            # Skip items missing an expected element/attribute.
            print('Skipping malformed item:', e)
            continue
        n += 1
        print(shopname, sku, productname, productlink, img, '价格:', price, '评论数:',
              comments['CommentsCount'][0]['CommentCount'], '好评率:', comments['CommentsCount'][0]['GoodRate'])
    print(len(products))
    print(n)
if __name__=='__main__':
    # Crawl each listing page with a 1-second politeness delay and
    # report the total elapsed time.
    start = time.time()
    urls = ['https://list.jd.com/list.html?cat=9847,9850&page=%s'%str(i) for i in range(1,2)]
    for page_no, page_url in enumerate(urls, start=1):
        time.sleep(1)
        print('Crawling Page No.:', page_no)
        get_html(page_url)
    print('Time used:', time.time() - start) # ~4.05s for 5 pages when not blocked