# Scrape Douban reviews for the movie "Better Days" (少年的你).
# Collected fields: username, review title, content, publish time, like count.
# The code below carries inline comments throughout.
import json
import re
import threading
import time
from queue import Queue, Empty

import lxml.html
import requests
# Shared shutdown flags; main() flips them once the corresponding queue drains.
CRAWL_EXIT = False
PARSE_EXIT = False
# Alias lxml's etree module for the XPath parsing below.
etree = lxml.html.etree
class ThreadCrawls(threading.Thread):
    """Crawler thread.

    Pulls 1-based page numbers from ``pageQueue``, fetches the matching
    Douban review-list page, strips tab/newline whitespace from the HTML
    and pushes the result onto ``dataQueue`` for the parser threads.
    Exits once the module-level ``CRAWL_EXIT`` flag is set.
    """

    def __init__(self, threadName, pageQueue, dataQueue):
        threading.Thread.__init__(self)
        self.threadName = threadName  # human-readable name used in log prints
        self.pageQueue = pageQueue    # Queue[int]: page numbers still to fetch
        self.dataQueue = dataQueue    # Queue[str]: raw HTML handed to parsers
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

    def run(self):
        print("开启采集线程" + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Non-blocking get: raises Empty when the queue is drained,
                # which loops back to re-check CRAWL_EXIT instead of blocking.
                page = self.pageQueue.get(False)
            except Empty:
                continue
            # Douban paginates reviews 20 per page via the `start` offset.
            url = "https://movie.douban.com/subject/30166972/reviews?start=" + str((page - 1) * 20)
            try:
                # timeout added so a stalled connection cannot hang the thread.
                response = requests.get(url, headers=self.headers, timeout=10)
                response.encoding = "utf-8"
                content = re.sub("[\t\r\n]", "", response.text)
                self.dataQueue.put(content)
            except requests.RequestException as e:
                # Report and drop the page instead of the original bare
                # `except: pass`, which silently hid every HTTP failure.
                print("抓取失败 page=%d: %s" % (page, e))
        print("结束采集线程" + self.threadName)
class ThreadParses(threading.Thread):
    """Parser thread.

    Pulls raw page HTML from ``dataQueue``, extracts the review fields via
    XPath and appends each review as one JSON line to the shared output
    file, serialized by ``lock``. Exits once the module-level ``PARSE_EXIT``
    flag is set.
    """

    def __init__(self, threadName, dataQueue, localFile, lock):
        threading.Thread.__init__(self)
        self.threadName = threadName  # name used in log prints
        self.dataQueue = dataQueue    # Queue[str] of raw page HTML
        self.localFile = localFile    # shared output file handle
        self.lock = lock              # guards writes to localFile

    def run(self):
        print("开启解析线程" + self.threadName)
        while not PARSE_EXIT:
            try:
                # Non-blocking get so the exit flag is re-checked promptly.
                html = self.dataQueue.get(False)
            except Empty:
                continue
            try:
                self.parse(html)
            except Exception as e:
                # A malformed page must not kill the thread, but the error
                # should be visible (original bare `except: pass` hid it).
                print("解析失败: %s" % e)
        print("结束解析线程" + self.threadName)

    def parse(self, html):
        """Extract every review on the page and write one JSON object per line."""
        text = etree.HTML(html)
        node_list = text.xpath(".//div[@class='main review-item']")
        for node in node_list:
            username = node.xpath(".//a[@class='name']")[0].text
            # Renamed from `time` to avoid shadowing the stdlib time module.
            pub_time = node.xpath(".//span[@class='main-meta']")[0].text
            title = node.xpath(".//div[@class='main-bd']/h2/a")[0].text
            content = node.xpath(".//div[@class='short-content']")[0].text
            zan = node.xpath(".//div[@class='action']/a/span")[0].text
            items = {
                "username": username,
                "title": title,
                "content": content,
                "time": pub_time,
                "zan": zan,
            }
            with self.lock:
                self.localFile.write(json.dumps(items, ensure_ascii=False) + "\n")
def main():
    """Fan out 20 review pages across crawler/parser threads, write JSON lines.

    Starts three crawler and three parser threads, waits for both queues to
    drain, signals shutdown via the module-level flags, joins all threads,
    then closes the output file.
    """
    # Pages 1..20 of the review listing.
    pageQueue = Queue(20)
    for i in range(1, 21):
        pageQueue.put(i)
    dataQueue = Queue(20)
    localFile = open("dbduanping.json", "a", encoding="utf-8")
    lock = threading.Lock()

    crawlList = ['采集1号线程', '采集2号线程', '采集3号线程']
    threadCrawls = []
    for threadName in crawlList:
        thread = ThreadCrawls(threadName, pageQueue, dataQueue)
        thread.start()
        threadCrawls.append(thread)

    parseList = ['解析1号线程', '解析2号线程', '解析3号线程']
    threadParses = []
    for threadName in parseList:
        thread = ThreadParses(threadName, dataQueue, localFile, lock)
        thread.start()
        threadParses.append(thread)

    # Poll with a short sleep instead of the original `while ...: pass`
    # busy-wait, which pegged a CPU core doing nothing.
    while not pageQueue.empty():
        time.sleep(0.1)
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("pageQueue 为空")
    for thread in threadCrawls:
        thread.join()

    while not dataQueue.empty():
        time.sleep(0.1)
    print("dataQueue 为空")
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadParses:
        thread.join()

    # All parser threads have joined, so nothing else can hold the lock
    # or write to the file; close it.
    with lock:
        localFile.close()
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()