import queue
import threading
from fake_useragent import UserAgent
import time
import requests
from requests.exceptions import RequestException
from lxml import etree
import json
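
# Pipeline overview: main() fills spider_queue with page URLs; SpiderThread
# workers fetch each page and push its HTML onto data_queue; ParseThread
# workers extract the posts with XPath and append one JSON line per page
# to qiushi.json.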
# Two queues: one of URLs to fetch, one of fetched pages to parse
# Two thread classes: a spider and a parser
# Exit flags that tell the worker threads to stop
spider_exit_flag = False
parse_exit_flag = False
lock = threading.Lock()
ua = UserAgent()  # random User-Agent for each request


# Spider thread
class SpiderThread(threading.Thread):
    def __init__(self, spider_queue, data_queue, id, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.spider_queue = spider_queue
        self.data_queue = data_queue
        self.id = id

    def run(self):
        # Keep pulling URLs from spider_queue, fetch each page, and put
        # the page source onto data_queue
        while True:
            if spider_exit_flag:
                break
            try:
                url = self.spider_queue.get(block=False)
                # Fetch the page, retrying up to 3 times
                times = 3
                while times > 0:
                    try:
                        response = requests.get(
                            url, headers={'User-Agent': ua.random}, timeout=10)
                        self.data_queue.put(response.text)
                        print(f'Spider thread {self.id} fetched {url}')
                        # Slow the crawler down a little
                        time.sleep(1)
                        # One success is enough; stop retrying
                        break
                    except RequestException:
                        # The request failed; try again
                        times -= 1
                # Mark the URL done whether or not the fetch succeeded,
                # otherwise spider_queue.join() would block forever after
                # a permanent failure
                self.spider_queue.task_done()
            except queue.Empty:
                pass


# Parser thread
class ParseThread(threading.Thread):
    def __init__(self, data_queue, id, fp, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data_queue = data_queue
        self.id = id
        self.fp = fp

    def run(self):
        # Keep pulling page source from data_queue
        while True:
            if parse_exit_flag:
                break
            try:
                data = self.data_queue.get(block=False)
                # Extract the data with XPath; the extraction logic is
                # wrapped in the parse() method
                self.parse(data)
                print(f'Parser thread {self.id} parsed one page')
                # Only signal the queue once all the work is done
                self.data_queue.task_done()
            except queue.Empty:
                pass

    def parse(self, data):
        html = etree.HTML(data)
        div_list = html.xpath('//div[contains(@id, "qiushi_tag_")]')
        # Walk over div_list and pull the fields out of each post
        items = []
        for div in div_list:
            img = div.xpath('.//img[1]/@src')
            name = div.xpath('.//h2')[0].text.strip()
            content = div.xpath('.//div[@class="content"]/span')[0].text.strip()
            item = {
                'img': img[0] if img else None,  # xpath() returns a list
                'name': name,
                'content': content,
            }
            items.append(item)
        result = {
            'status': 'ok',
            'code': 200,
            'data': items
        }
        # Write one JSON object per line; the lock keeps concurrent
        # writers from interleaving their output
        with lock:
            self.fp.write(json.dumps(result, ensure_ascii=False) + '\n')


def main():
    # Create the queues
    spider_queue = queue.Queue(12)
    data_queue = queue.Queue(12)
    base_url = 'https://www.qiushibaike.com/text/page/%d/'
    for i in range(1, 13):
        spider_queue.put(base_url % i)
    # Create the spider threads
    for i in range(3):
        SpiderThread(spider_queue, data_queue, i).start()
    fp = open('./qiushi.json', 'a', encoding='utf-8')
    # Create the parser threads
    for i in range(3):
        ParseThread(data_queue, i, fp).start()
    # Wait until every URL has been fetched, then signal the spider
    # threads to exit
    spider_queue.join()
    global spider_exit_flag
    spider_exit_flag = True
    # Wait until every page has been parsed, then signal the parser
    # threads to exit
    data_queue.join()
    global parse_exit_flag
    parse_exit_flag = True
    # Close the output file
    fp.close()


if __name__ == '__main__':
    main()
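
The shutdown scheme above relies on global exit flags plus non-blocking get() calls, so each worker spins in a busy loop whenever its queue is empty. A common alternative is the sentinel ("poison pill") pattern: workers block on get() and exit when a special marker arrives, which removes both the flags and the polling. Below is a minimal sketch of that pattern, not part of the original script; the SENTINEL object and worker function are illustrative names.

import queue
import threading

SENTINEL = object()  # unique marker; one is enqueued per worker

def worker(q):
    while True:
        item = q.get()  # blocks instead of busy-waiting
        if item is SENTINEL:
            q.task_done()
            break  # poison pill received: exit the thread
        # ... process item here ...
        q.task_done()

q = queue.Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()
for item in ('a', 'b', 'c'):
    q.put(item)
for _ in threads:  # one sentinel per worker so every thread wakes up
    q.put(SENTINEL)
for t in threads:
    t.join()  # no exit flags or polling needed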