import requests
import threading
from queue import Queue
from lxml import etree
# Scrape jokes from qiushibaike.com
# Multi-threaded: fetch / parse / save stages connected by queues
class QiubaiSpider:
    """Multi-threaded scraper for joke pages on qiushibaike.com.

    Work flows through three queues:
    url_q (page URLs) -> html_q (raw HTML) -> content_q (parsed items),
    each stage serviced by daemon worker threads started in run().
    """

    def __init__(self):
        # Page-number placeholder is filled in by getUrl_list().
        self.temp_url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 QQBrowser/4.4.108.400'
        }
        self.url_q = Queue()      # URLs waiting to be fetched
        self.html_q = Queue()     # fetched HTML waiting to be parsed
        self.content_q = Queue()  # parsed item lists waiting to be saved

    def getUrl_list(self):
        """Enqueue the URLs of pages 1-13 onto url_q."""
        for i in range(1, 14):
            self.url_q.put(self.temp_url.format(i))

    def parse_url(self):
        """Worker loop: fetch each URL from url_q and push its HTML to html_q."""
        while True:
            url = self.url_q.get()
            print(url)
            resp = requests.get(url, headers=self.headers)
            self.html_q.put(resp.content.decode())
            self.url_q.task_done()

    def get_content_list(self):
        """Worker loop: parse each HTML page from html_q into item dicts on content_q."""
        while True:
            html_str = self.html_q.get()
            html = etree.HTML(html_str)
            # One <div> per joke inside the main content column.
            div_list = html.xpath('//div[@id="content-left"]/div')
            content_list = []
            for div in div_list:
                # NOTE(review): xpath() returns lists of strings; items keep
                # the raw lists, matching the original output format.
                item = {
                    'author': div.xpath('.//h2/text()'),
                    'text': div.xpath('.//div[@class="content"]/span/text()'),
                }
                content_list.append(item)
            self.content_q.put(content_list)
            self.html_q.task_done()

    def save_content_list(self):
        """Worker loop: "save" parsed items (prints them) and keep a running count."""
        current = 0
        while True:
            content_list = self.content_q.get()
            for content in content_list:
                current += 1
                print(content)
            print(current)
            self.content_q.task_done()

    def run(self):
        """Fill the URL queue, start daemon workers, and wait for all queues to drain."""
        thread_list = []
        self.getUrl_list()
        # Three fetcher threads: fetching is I/O-bound, so overlapping requests helps.
        for _ in range(3):
            thread_list.append(threading.Thread(target=self.parse_url))
        thread_list.append(threading.Thread(target=self.get_content_list))
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # daemon=True lets the process exit after the queues join even
            # though the workers loop forever.  Thread.setDaemon() is
            # deprecated since Python 3.10 — assign the attribute instead.
            t.daemon = True
            t.start()
        for q in [self.url_q, self.html_q, self.content_q]:
            q.join()
        print('爬取完成。。。')
if __name__ == '__main__':
    # Entry point: build the spider and run the full crawl pipeline.
    spider = QiubaiSpider()
    spider.run()
# 糗事百科 multi-threaded demo (2)
# Source: blog.csdn.net/zzw19951261/article/details/81075987