安装 MongoDB(默认连接 localhost:27017);在 goods.txt 中填写需要爬取的商品 id,直接执行 tm_goods_rate_max.py,爬取到的评价会在桌面生成 Excel 文件
码云:https://gitee.com/zhuchaocc/tmall_rate.git
import json
import re
from pymongo import MongoClient
import httpUtils
import Model
import threading
import time
import tm_goods_rate_max
import openpyxl
import winreg
import os
# MongoDB connection used as a work queue / result store.
# NOTE(review): pymongo connects lazily, so no I/O happens at import time.
conn = MongoClient('localhost', 27017)
db = conn.rate_url

# Product detail page; the goods id is appended to this prefix.
good_detail = "https://detail.tmall.com/item.htm?id="

# Review-list API template, filled with (itemId, spuId, sellerId); the page
# number is appended after "currentPage=".
# BUG FIX: the original ended in "¤tPage=" — the "&curren" of
# "&currentPage" had been mangled into the HTML entity for the currency
# sign (U+00A4), so every request dropped the currentPage parameter.
good_rate_temp = ("https://rate.tmall.com/list_detail_rate.htm"
                  "?order=3&callback=json&itemId=%s&spuId=%s&sellerId=%s&currentPage=")

proxy_list = []  # populated at startup via httpUtils.get_proxy()
flag_num = 0     # bumped by each worker thread once the queue stays empty
def rate_url(goods_id_list):
    """Build the review-list URL prefix for each goods id.

    Fetches every product's detail page, scrapes its spuId and sellerId
    out of the inline JSON, and returns {goods_id: url_prefix} where the
    prefix ends in "currentPage=" awaiting a page number.
    """
    url_by_goods = {}
    spu_pattern = re.compile(r'"spuId":\d+')
    seller_pattern = re.compile(r'"sellerId":\d+')
    for goods_id in goods_id_list:
        page_html = str(httpUtils.open(good_detail + goods_id, proxy_list))
        # Keep the LAST occurrence found on the page, matching the
        # original scraping behavior; 0 is the fallback when absent.
        spu_id = 0
        for hit in spu_pattern.finditer(page_html):
            spu_id = hit.group().split(":")[1]
        seller_id = 0
        for hit in seller_pattern.finditer(page_html):
            seller_id = hit.group().split(":")[1]
        url_by_goods[str(goods_id)] = good_rate_temp % (goods_id, spu_id, seller_id)
    return url_by_goods
def last_page(rate_url):
    """Return the total number of review pages for a review-URL prefix.

    Requests page 1 and reads paginator.lastPage out of the JSONP
    response (the "json(" prefix and trailing ")" are sliced off before
    parsing).
    """
    body = httpUtils.open(rate_url + "1", proxy_list)
    payload = json.loads(body[9:-1])
    return payload["rateDetail"]["paginator"]["lastPage"]
def url_mongdb_cache(rate_url_dict):
    """Expand each goods' review-URL prefix into one queue document per page.

    For every goods id, asks `last_page` for the page count and inserts one
    MongoDB document per page ({goodsId, page, url, type:"1"}) that worker
    threads later pop and fetch.
    """
    for goods_id, url_prefix in rate_url_dict.items():
        # BUG FIX: the original called last_page(value + "1"), but
        # last_page already appends "1" itself, so the page count was
        # being read from page 11 instead of page 1.
        end = last_page(url_prefix)
        for page in range(1, end + 1):
            db.col.insert({"goodsId": goods_id, "page": str(page),
                           "url": url_prefix + str(page), "type": "1"})
            print({"goodsId": goods_id, "page": str(page),
                   "url": url_prefix + str(page)})
def rate(rate_url):
    """Fetch one review page and store each review in MongoDB.

    `rate_url` is a queue document ({goodsId, url, ...}); the JSONP body
    is trimmed and parsed, then every review is normalized into a
    Model.Rate and inserted into db.col.
    """
    body = httpUtils.open(rate_url["url"], proxy_list)
    reviews = json.loads(body[9:-1])["rateDetail"]["rateList"]
    for review in reviews:
        # "颜色:红;尺码:M" -> "红/M" (value after each colon, "/"-joined).
        sku_pairs = review['auctionSku'].split(';')
        sku = "/".join(pair.split(":")[1] for pair in sku_pairs)
        membership = '超级会员' if review['goldUser'] else ''
        record = Model.Rate(review['rateContent'], review['rateDate'],
                            review['displayUserNick'], sku, membership,
                            review['reply'], rate_url["goodsId"])
        print(record.__dict__)
        db.col.insert(record.__dict__)
def write_excel():
    """Dump all collected reviews to rate_pro.xlsx on the user's desktop.

    Creates one worksheet per goods id (reads the module-level
    `goods_id_list`), writes a header row, appends one row per stored
    review, then clears the MongoDB collection.
    """
    wb = openpyxl.Workbook()
    for goods_id in goods_id_list:
        # NOTE(review): queries "goods_id" while the queue docs use
        # "goodsId" — this relies on Model.Rate storing the snake_case
        # key; confirm against Model.py.
        goods_rate_list = db.col.find({"goods_id": goods_id})
        wb.create_sheet(goods_id, index=0)
        # FIX: Workbook.get_sheet_by_name() is deprecated (removed in
        # modern openpyxl); index the workbook by sheet title instead.
        sheet = wb[goods_id]
        sheet['A1'] = '评价'
        sheet['B1'] = '评价日期'
        sheet['C1'] = '评价人'
        sheet['D1'] = 'sku'
        sheet['E1'] = '超级会员'
        sheet['F1'] = '卖家回复'
        for goods_rate in goods_rate_list:
            sheet.append([goods_rate['rate_content'],
                          goods_rate['rate_date'],
                          goods_rate['user_nick'],
                          goods_rate['sku'],
                          goods_rate['gold_user'],
                          goods_rate['reply']])
    wb.save(get_desktop() + '/rate_pro.xlsx')
    # Clear the collection so the next run starts fresh.
    db.col.remove()
def get_desktop():
    """Return the current user's Desktop path from the Windows registry.

    Reads the "Desktop" value under the Shell Folders key of
    HKEY_CURRENT_USER. Windows-only (uses winreg).
    """
    path = r'Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders'
    # FIX: the original leaked the registry handle; PyHKEY supports the
    # context-manager protocol, which closes the key on exit.
    with winreg.OpenKey(winreg.HKEY_CURRENT_USER, path) as key:
        return winreg.QueryValueEx(key, "Desktop")[0]
def one():
    """Producer thread: resolve review URLs and queue every page in MongoDB."""
    url_mapping = rate_url(goods_id_list)
    url_mongdb_cache(url_mapping)
def two():
    """Worker thread: pop queued page URLs from MongoDB and scrape them.

    Retries on an empty queue every 5 seconds; after 5 consecutive empty
    polls it signals completion by bumping tm_goods_rate_max.flag_num and
    exits.
    """
    empty_polls = 0
    while True:
        if empty_polls >= 5:
            # NOTE(review): += on a module attribute is not atomic across
            # threads; kept as-is since the original relies on it only as
            # a rough completion counter.
            tm_goods_rate_max.flag_num += 1
            return
        rate_url_dic = db.col.find_one_and_delete({"type": "1"})
        # FIX: idiomatic identity check (was `== None`).
        if rate_url_dic is None:
            time.sleep(5)
            empty_polls += 1
        else:
            empty_polls = 0
            rate(rate_url_dic)
def three(num):
    """Finalizer thread: wait until all `num` workers finish, then export.

    Polls tm_goods_rate_max.flag_num every 5 seconds; once every worker
    has signalled completion, writes the Excel report.
    """
    while tm_goods_rate_max.flag_num < num:
        time.sleep(5)
    write_excel()
if __name__ == '__main__':
    # FIX: portable path join (was string concat with a raw backslash)
    # and a context manager so the file is always closed.
    path = os.path.join(os.getcwd(), "goods.txt")
    with open(path) as f:
        # FIX: strip the trailing newline — readline() keeps it, which
        # previously left "\n" glued onto the last goods id and broke
        # every URL built from it.
        goods_id_str = f.readline().strip()
    goods_id_list = goods_id_str.split(",")
    show_str = "\n".join(goods_id_list) + "\n"
    flag = input("需要爬取的商品id为:\n" + show_str + "\n确定输入yes否则按任意键退出!\n")
    if flag == "yes":
        # 默认开启十个线程爬取
        thread_num = 10
        if len(goods_id_list) > 30:
            thread_num = 20
        # 获取代理
        proxy_list = httpUtils.get_proxy(httpUtils.xici_url)
        # 清空数据
        db.col.remove()
        threads = []
        threads.append(threading.Thread(target=one))
        # The finalizer waits for `thread_num` workers to signal done.
        threads.append(threading.Thread(target=three, args=[thread_num]))
        for _ in range(thread_num):
            threads.append(threading.Thread(target=two))
        for t in threads:
            t.start()