from queue import Empty, Queue
from threading import Thread

import requests
from fake_useragent import UserAgent
from lxml import etree
# Crawler thread: pulls page URLs off url_queue, fetches each page, and
# pushes the HTML of successful (HTTP 200) responses onto html_queue.
class CrawInfo(Thread):
    def __init__(self, url_queue, html_queue):
        """url_queue: Queue of URLs to fetch; html_queue: Queue receiving page HTML."""
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # Randomized User-Agent makes the requests look less like a bot.
        headers = {
            "User-Agent": UserAgent().random
        }
        while True:
            # NOTE: the original `while not q.empty(): q.get()` pattern races
            # between crawler threads — another thread can drain the queue
            # after empty() returns False, making the blocking get() hang
            # forever. get_nowait() + Empty is race-free.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # Timeout so a stalled server cannot hang the thread indefinitely.
            res = requests.get(url, headers=headers, timeout=10)
            if res.status_code == 200:
                self.html_queue.put(res.text)
# Parser thread: pops raw HTML from html_queue and prints the text content
# of the first <span> inside each div.content element.
class ParseInfo(Thread):
    def __init__(self, html_queue):
        """html_queue: Queue of HTML documents produced by the crawler threads."""
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while True:
            # Non-blocking get avoids the empty()/get() race among the
            # multiple parser threads sharing this queue (empty() can return
            # False and then a sibling thread takes the last item, leaving
            # a blocking get() stuck forever).
            try:
                html = self.html_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(html)
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            for span in span_contents:
                info = span.xpath('string(.)')
                print(info)
if __name__ == '__main__':
    # Queue holding the page URLs to crawl.
    url_queue = Queue()
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    # Queue holding the fetched HTML documents.
    html_queue = Queue()
    for i in range(1, 14):
        url_queue.put(base_url.format(i))

    # Start 3 crawler threads; url_queue and html_queue are shared by all.
    crawl_list = []
    for i in range(3):
        crawler = CrawInfo(url_queue, html_queue)
        crawl_list.append(crawler)
        crawler.start()
    # Join first, then report: the original printed "ends" before join(),
    # announcing a thread as finished while it was still running.
    for c in crawl_list:
        c.join()
        print(c, '--------->ends')

    # Start 4 parser threads sharing html_queue, and join them so the
    # program only exits after every item has been parsed and printed
    # (the original never joined the parser threads).
    parse_list = []
    for i in range(4):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for p in parse_list:
        p.join()
# A simple multi-threaded crawler.
# Adapted from: blog.csdn.net/czw0723/article/details/87075536