from queue import Empty, Queue
from threading import Thread

import requests
from fake_useragent import UserAgent
from lxml import etree
# Crawler thread: pulls page URLs off url_queue, fetches each page, and
# pushes the HTML of successful (HTTP 200) responses onto html_queue.
class CrawInfo(Thread):
    def __init__(self, url_queue, html_queue):
        """url_queue: Queue of URLs to fetch; html_queue: Queue receiving page HTML."""
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # Randomized User-Agent makes the requests look less like a bot.
        headers = {
            "User-Agent": UserAgent().random
        }
        while True:
            # NOTE: the original `while not q.empty(): q.get()` pattern races
            # between crawler threads — another thread can drain the queue
            # after empty() returns False, making the blocking get() hang
            # forever. get_nowait() + Empty is race-free.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # Timeout so a stalled server cannot hang the thread indefinitely.
            res = requests.get(url, headers=headers, timeout=10)
            if res.status_code == 200:
                self.html_queue.put(res.text)
# Parser thread: pops raw HTML from html_queue and prints the text content
# of the first <span> inside each div.content element.
class ParseInfo(Thread):
    def __init__(self, html_queue):
        """html_queue: Queue of HTML documents produced by the crawler threads."""
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while True:
            # Non-blocking get avoids the empty()/get() race among the
            # multiple parser threads sharing this queue (empty() can return
            # False and then a sibling thread takes the last item, leaving
            # a blocking get() stuck forever).
            try:
                html = self.html_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(html)
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            for span in span_contents:
                info = span.xpath('string(.)')
                print(info)
if __name__ == '__main__':
    # Queue holding the page URLs to crawl.
    url_queue = Queue()
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    # Queue holding the fetched HTML documents.
    html_queue = Queue()
    for i in range(1, 14):
        url_queue.put(base_url.format(i))

    # Start 3 crawler threads; url_queue and html_queue are shared by all.
    crawl_list = []
    for i in range(3):
        crawler = CrawInfo(url_queue, html_queue)
        crawl_list.append(crawler)
        crawler.start()
    # Join first, then report: the original printed "ends" before join(),
    # announcing a thread as finished while it was still running.
    for c in crawl_list:
        c.join()
        print(c, '--------->ends')

    # Start 4 parser threads sharing html_queue, and join them so the
    # program only exits after every item has been parsed and printed
    # (the original never joined the parser threads).
    parse_list = []
    for i in range(4):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for p in parse_list:
        p.join()
# A simple multi-threaded crawler.
# Adapted from: blog.csdn.net/czw0723/article/details/87075536