Crawler 09: Multithreaded Crawler

import threading
from queue import Queue, Empty
import requests
from lxml import etree
import json
import os
# Worker thread that downloads page data
class ThreadCrawl(threading.Thread):
    def __init__(self, thread_id):
        threading.Thread.__init__(self)
        self.threadID = thread_id
        # Request headers sent with every fetch
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
    # Override run(); start() invokes it automatically, so it is never called directly
    def run(self):
        print('Starting', self.threadID)
        self.qiushi_spider()
        print('Exiting', self.threadID)
    # Crawl pages until the page queue is empty
    def qiushi_spider(self):
        while True:
            # get_nowait() avoids the race between a separate empty() check
            # and a blocking get() when several crawl threads share the queue
            try:
                page = page_queue.get_nowait()
            except Empty:
                break
            url = 'http://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            print('crawler id:', self.threadID, ', page number:', str(page))
            # Retry a failed request up to 4 times
            retries = 4
            while retries > 0:
                retries -= 1
                try:
                    content = requests.get(url, headers=self.headers, timeout=0.5)
                    data_queue.put(content.text)
                    break
                except Exception as e:
                    print('qiushi_spider', e)
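# Note: requests' timeout is a connect/read limit, not a deadline for the whole
# download. If the two limits should differ, requests also accepts a
# (connect, read) tuple, e.g.:
#   requests.get(url, headers=self.headers, timeout=(0.5, 2.0))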
# Worker thread that parses the downloaded page data
class ThreadParser(threading.Thread):
    # thread_id distinguishes the parser threads; file is the shared output file
    def __init__(self, thread_id, file):
        threading.Thread.__init__(self)
        self.threadID = thread_id
        self.file = file
    def run(self):
        print('Starting', self.threadID)
        while not exitFlag_Parser:
            try:
                '''
                Queue.get() removes and returns the item at the head of the
                queue. The optional block argument defaults to True: if the
                queue is empty and block is True, get() suspends the calling
                thread until an item becomes available; if the queue is empty
                and block is False, get() raises the Empty exception instead.
                '''
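                # A quick illustration of the non-blocking form (a REPL
                # session, not part of this script):
                #   >>> q = Queue()
                #   >>> q.put('page html')
                #   >>> q.get(False)
                #   'page html'
                #   >>> q.get(False)  # queue now empty -> raises queue.Empty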
                item = data_queue.get(False)
                if item:
                    self.parse_data(item)
                # task_done() lets data_queue.join() know this item is finished
                data_queue.task_done()
            except Empty:
                pass
        print('-' * 160)
        print('Exiting', self.threadID)
    # Parse one page of data; item is the full page source
    def parse_data(self, item):
        try:
            html = etree.HTML(item)
            result = html.xpath('//div[contains(@id,"qiushi_tag")]')
            for site in result:
                try:
                    img_url = site.xpath('.//img/@src')[0]
                    print('*' * 30)
                    title = site.xpath('.//h2')[0].text.strip()
                    content = site.xpath('.//div[@class="content"]/span')[0].text.strip()
                    vote = ''
                    comments = ''
                    try:
                        vote = site.xpath('.//i')[0].text
                        comments = site.xpath('.//i')[1].text
                    except Exception:
                        pass
                    data = {
                        'imgUrl': img_url,
                        'title': title,
                        'content': content,
                        'vote': vote,
                        'comments': comments,
                    }
                    # Serialize writes to the shared file across parser threads
                    with mutex:
                        data = json.dumps(data, ensure_ascii=False)
                        print('saving...', data)
                        self.file.write(data + '\n')
                except Exception as e:
                    print('site in result', e)
        except Exception as e:
            print('parse_data', e)
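# The "with mutex:" block above is equivalent to a manual acquire()/release()
# pair wrapped in try/finally, and it releases the lock even if write() raises,
# so a failed write cannot leave the lock held forever:
#   mutex.acquire()
#   try:
#       self.file.write(data + '\n')
#   finally:
#       mutex.release()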
def main():
    # Create the data directory if it does not already exist
    if not os.path.exists('data'):
        os.mkdir('data')
    # Open the shared output file
    output = open('data/qiushibaike.json', 'a', encoding='utf-8')
    # Enqueue the page numbers to crawl
    for page in range(1, 11):
        page_queue.put(page)
    # Start the crawl threads
    crawl_threads = []
    crawl_list = ['crawl-1', 'crawl-2', 'crawl-3']
    for thread_id in crawl_list:
        thread = ThreadCrawl(thread_id)
        thread.start()
        crawl_threads.append(thread)
    # Start the parser threads
    parse_threads = []
    parse_list = ['parse-1', 'parse-2', 'parse-3']
    for thread_id in parse_list:
        thread = ThreadParser(thread_id, output)
        thread.start()
        parse_threads.append(thread)
    # join() blocks until each crawl thread has drained page_queue and exited
    for t in crawl_threads:
        t.join()
    # Block until every item put into data_queue has been marked task_done()
    data_queue.join()
    global exitFlag_Parser
    print('-' * 80)
    # All data has been parsed; signal the parser threads to stop
    exitFlag_Parser = True
    for t in parse_threads:
        t.join()
    print('Exiting Main Thread')
    # All parser threads have exited, so the file can be closed without the lock
    output.close()
if __name__ == '__main__':
    # Queue of downloaded page data awaiting parsing
    data_queue = Queue()
    # Queue of page numbers that still need to be fetched
    page_queue = Queue(50)
    # Boolean flag telling the parser threads when to exit
    exitFlag_Parser = False
    # Lock guarding writes to the shared output file
    mutex = threading.Lock()
    main()
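The queue-based pipeline the script uses can also be seen in isolation in the following minimal sketch. It is an illustration rather than part of the original program: the network fetch is replaced by a hypothetical fake_fetch() stub so it runs offline, and a threading.Event stands in for the exitFlag_Parser global.

import threading
from queue import Queue, Empty

page_q = Queue()
data_q = Queue()
stop_parsing = threading.Event()

def fake_fetch(page):
    # Stand-in for requests.get(url).text; returns fake "HTML" for one page
    return '<html>page %d</html>' % page

def crawler():
    # Drain page_q; get_nowait() raises Empty once no pages remain
    while True:
        try:
            page = page_q.get_nowait()
        except Empty:
            break
        data_q.put(fake_fetch(page))

def parser():
    # Parse until the main thread sets the stop event
    while not stop_parsing.is_set():
        try:
            item = data_q.get(timeout=0.1)
        except Empty:
            continue
        print(threading.current_thread().name, 'parsed', item)
        data_q.task_done()

for page in range(1, 6):
    page_q.put(page)
crawlers = [threading.Thread(target=crawler) for _ in range(2)]
parsers = [threading.Thread(target=parser, name='parser-%d' % i) for i in range(2)]
for t in crawlers + parsers:
    t.start()
for t in crawlers:
    t.join()            # crawlers exit on their own once page_q is drained
data_q.join()           # block until every fetched page has been parsed
stop_parsing.set()      # then tell the parser threads to exit
for t in parsers:
    t.join()

Using get(timeout=0.1) keeps the parser loop from spinning at full speed while it waits for data, and the task_done()/join() pairing gives the main thread an exact signal that every fetched page has been processed.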
Reposted from blog.csdn.net/qwerLoL123456/article/details/82532807