糗事百科 multi-threaded demo (2)

import requests
import threading
from queue import Queue
from lxml import etree



# Scrape 糗事百科 (Qiushibaike)

# Multi-threaded version

class QiubaiSpider:
    def __init__(self):
        self.temp_url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 QQBrowser/4.4.108.400'
        }
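        # Three queues form the pipeline:
        # url_q -> parse_url -> html_q -> get_content_list -> content_q -> save_content_list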
        self.url_q = Queue()
        self.html_q = Queue()
        self.content_q = Queue()



    def getUrl_list(self):
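        # Fill url_q with the list-page URLs (pages 1 through 13).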
        url_list = [self.temp_url.format(i) for i in range(1, 14)]
        for url in url_list:
            self.url_q.put(url)


    def parse_url(self):
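        # Fetch worker: take a URL from url_q, download the page, and push the decoded HTML onto html_q.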
        while True:
            url = self.url_q.get()
            print(url)
            resp = requests.get(url, headers=self.headers)
            self.html_q.put(resp.content.decode())
            self.url_q.task_done()


    def get_content_list(self):
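        # Parse worker: extract the author and text of every post on a page and push the result list onto content_q.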
        while True:
            html_str = self.html_q.get()
            html = etree.HTML(html_str)

            div_list = html.xpath('//div[@id="content-left"]/div')
            content_list = []

            for div in div_list:
                item = {}
                text = div.xpath('.//div[@class="content"]/span/text()')
                author = div.xpath('.//h2/text()')
                item['author'] = author
                item['text'] = text
                content_list.append(item)
            self.content_q.put(content_list)
            self.html_q.task_done()


    def save_content_list(self):
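        # Save worker: consume parsed items; here each item is just printed along with a running count.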
        current = 0
        while True:
            content_list = self.content_q.get()
            for content in content_list:
                current += 1
                print(content)
                print(current)

            self.content_q.task_done()



    def run(self):
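        # Start 3 fetch threads, 1 parse thread and 1 save thread as daemons, then block until all queues are drained.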
        thread_list = []

        self.getUrl_list()

        for i in range(3):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)

        t_content = threading.Thread(target=self.get_content_list)
        thread_list.append(t_content)

        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)

        for t in thread_list:
            t.daemon = True  # daemonize workers so their infinite loops don't block interpreter exit
            t.start()

        for q in [self.url_q, self.html_q, self.content_q]:
            q.join()  # blocks until every queued item has been marked task_done()

        print('Crawl finished...')


if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()
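The coordination in run() hinges on two things: every worker loops forever but pairs each get() with a task_done(), and the main thread waits only on the queues (via join()) rather than on the threads; because the workers are daemons, the process can exit as soon as every queue is drained. A stripped-down, self-contained sketch of that same daemon-thread plus Queue.join() pattern (the names task_q and worker are hypothetical, not part of the demo above) looks like this:

import threading
from queue import Queue

task_q = Queue()

def worker():
    while True:
        n = task_q.get()        # blocks until an item is available
        print(n * n)            # stand-in for "download / parse / save"
        task_q.task_done()      # exactly one task_done() per get()

for i in range(10):
    task_q.put(i)

for _ in range(3):
    t = threading.Thread(target=worker)
    t.daemon = True             # daemon threads die with the main thread
    t.start()

task_q.join()                   # returns once every put() has a matching task_done()
print('all tasks done')

The same idea carries over to the three-queue pipeline above: each stage pairs one get() with one task_done(), so joining url_q, html_q and content_q in order is enough to know the whole pipeline has drained.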


Reposted from blog.csdn.net/zzw19951261/article/details/81075987