Downloading Attack on Titan with a Multithreaded Crawler, v.1

This crawler still has quite a few bugs: some images time out while downloading and never get saved, so every chapter ends up a few pages short o(╥﹏╥)o

For now I'm posting version 1 here; a crawler 2.0 upgrade is planned.
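Before the full script, here is a minimal sketch of one way 2.0 could deal with the timeouts: retry a failed download a few times before giving up, instead of losing the page. The helper name download_with_retry, the retry count, and the backoff are my own assumptions, not part of the original code.

import socket
import time
from urllib import request, error

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}

def download_with_retry(url, path, retries=3, timeout=5):
    # Hypothetical helper for a future version: retry timed-out downloads
    # instead of dropping the page.
    for attempt in range(1, retries + 1):
        try:
            req = request.Request(url, headers=HEADERS)
            with request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
            with open(path, 'wb') as f:
                f.write(data)
            return True
        except (error.URLError, socket.timeout):
            time.sleep(attempt)  # back off a little before the next attempt
    return False

A consumer thread would then re-queue a page only after download_with_retry has given up. The original version-1 script follows.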

import requests
import threading
import urllib
from urllib import request, error
import os
import re
from queue import Queue
import traceback
import socket
import time

root = 'https://manhua.fzdm.com/39/'

num_thread = 0  # running counter used to give each thread an id

# Producer: fetch each page, pull the image URL out of the HTML, and queue (img_url, save_path)
class Producer(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, url_queue, img_queue, *args,**kwargs):
        super(Producer, self).__init__(*args,**kwargs)
        global num_thread
        self.url_queue = url_queue
        self.img_queue = img_queue
        self.id = num_thread
        num_thread += 1
 
    def run(self):
        while True:
            if self.url_queue.empty():
                print('Producer No.{}\t bye'.format(self.id))
                break
            
            url = self.url_queue.get()
            try:
                resp = requests.get(url, headers=self.headers)
                raw = resp.text
                # the page embeds the image path in an mhurl="..." attribute
                img = re.findall('mhurl="(.*?jpg)"', raw)

                if len(img) > 0:
                    # images dated before 2016 are served from a different host
                    prefix = 'http://p1.manhuapan.com/'
                    if int(img[0].split('/')[0]) < 2016:
                        prefix = 'http://p5.manhuapan.com/'
                    img = prefix + img[0]
                    path = os.path.join('巨人', url.split('/')[-2], url.split('.')[-2].split('_')[-1] + '.jpg')
                    self.img_queue.put((img, path))

            except Exception:
                traceback.print_exc()
                print('Producer No.{}\t bye'.format(self.id))
                break
                


# Consumer: take (img_url, save_path) pairs off the queue and download the images
class Consumer(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, url_queue, *args,**kwargs):
        super(Consumer, self).__init__(*args,**kwargs)
        global num_thread
        self.url_queue = url_queue
        self.id = num_thread
        num_thread += 1
 
    def run(self):
        while True:
            if self.url_queue.empty():
                print('Consumer thread \t{}\t bye'.format(self.id))
                break
            print('remaining:', self.url_queue.qsize())

            url, path = self.url_queue.get()
            try:
                req = request.Request(url, headers=self.headers)
                response = request.urlopen(req, timeout=3)
                with open(path, 'wb') as f_save:
                    f_save.write(response.read())
            except urllib.error.URLError as err:
                if isinstance(err.reason, socket.timeout):
                    # put the page back so another thread can retry it, then exit
                    print('socket timed out - URL:', url)
                    self.url_queue.put((url, path))
                    print('Consumer thread \t{}\t bye'.format(self.id))
                    break
                else:
                    traceback.print_exc()

            time.sleep(0.5)
   
            
                

# grab the chapter index page and pull out the chapter links
html = requests.get(root).text
chapters = re.findall('<li class="pure-u-1-2 pure-u-lg-1-4"><a href="(.*?)[/]+" title="进击的巨人', html)

urls = []
for i in chapters:
    # assume at most 100 pages per chapter: index_0.html ... index_99.html
    for j in range(100):
        u = root + i + '/index_' + str(j) + '.html'
        urls.append(u)

# one output directory per chapter
for i in chapters:
    diry = '巨人/' + i
    if not os.path.exists(diry):
        os.makedirs(diry)
        
N = len(urls)
N_threads = 1000
url_queue = Queue(N)
img_queue = Queue(N)
for u in urls:
    url_queue.put(u)

producers = []
for x in range(N_threads):
    t = Producer(url_queue, img_queue)
    producers.append(t)
    t.start()

consumers = []
for x in range(N_threads):
    t = Consumer(img_queue)
    consumers.append(t)
    t.start()
    

for w in producers:
    w.join()

for w in consumers:
    w.join()

print('finish')
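One more known weak spot in this version: a Consumer exits as soon as img_queue happens to be empty, which can happen before the Producers have put anything into it. A common fix that 2.0 could adopt (a minimal generic sketch under my own assumptions, not the original design) is a sentinel value, so workers only stop when the main thread tells them to:

import threading
from queue import Queue

SENTINEL = None  # hypothetical marker that tells a worker to stop

def worker(q):
    while True:
        item = q.get()
        if item is SENTINEL:
            break  # stop only when the main thread says so
        # ... download the (img_url, path) pair here ...

q = Queue()
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
for t in threads:
    t.start()

for item in range(10):  # stand-in for the real (img_url, path) pairs
    q.put(item)

for _ in threads:  # one sentinel per worker, queued after all real work
    q.put(SENTINEL)
for t in threads:
    t.join()

With this pattern the emptiness check in run() goes away entirely, so a slow Producer can no longer starve the Consumers into quitting early.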


Reprinted from blog.csdn.net/itnerd/article/details/108751427