作者:IT小样
如果以单线程来爬取所有页的豆瓣书名,运行时间会非常久,因此考虑使用多线程来增加并发,减少爬取时长。实现方案如下:
import requests
import threading
from bs4 import BeautifulSoup
import queue
import random,time
count_crawel = 3
count_parse = 3
class Thread_Crawel(threading.Thread):
    """Crawler thread.

    Takes URLs off ``url_queue``, fetches each page with ``requests``, and
    puts the raw HTML text onto ``outcome_queue`` for the parser threads.
    """

    def __init__(self, url_queue, outcome_queue, number):
        threading.Thread.__init__(self)
        self.url_queue = url_queue          # queue.Queue of URLs still to fetch
        self.outcome_queue = outcome_queue  # queue.Queue receiving page HTML
        self.number = number                # thread id, used only in log output
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'}

    def run(self):
        print("启动采集线程", self.number)
        while True:
            # get_nowait() closes the race between qsize() and get():
            # another crawler may drain the queue between the two calls,
            # which would leave the original `while qsize()>0: get()` hung.
            try:
                url = self.url_queue.get_nowait()
            except queue.Empty:
                break
            print(self.number, "线程采集url:", url)
            # Sleep 0.5-1.5s between requests to be polite to the server.
            time.sleep(random.randint(5, 15) / 10)
            response = requests.get(url, headers=self.header, verify=False)
            if response.status_code == 200:
                # BUG FIX: the original did `self.outcome_queue = response.text`,
                # rebinding the shared queue to a string so parsers never saw
                # any data. The page text must be put() onto the queue.
                self.outcome_queue.put(response.text)
class Thread_Parse(threading.Thread):
    """Parser thread.

    Pulls page HTML off ``outcome_queue``, extracts (title, author/pub)
    pairs with BeautifulSoup and appends them to ``result.txt``. The thread
    exits once every crawler thread has finished AND the queue is empty.
    """

    def __init__(self, number, outcome_queue, req_thread):
        threading.Thread.__init__(self)
        self.number = number                # thread id, used only in log output
        self.outcome_queue = outcome_queue  # queue.Queue of fetched HTML pages
        self.req_thread = req_thread        # crawler threads to watch for liveness
        self.is_parse = True                # False => no more pages will arrive

    def run(self):
        print("启动解析线程", self.number)
        while True:
            # for/else: the else branch runs only when no crawler is still
            # alive; once crawlers are done and the queue is drained we can
            # safely stop — no more pages will ever arrive.
            for t in self.req_thread:
                if t.is_alive():
                    break
            else:
                if self.outcome_queue.qsize() == 0:
                    self.is_parse = False
            if self.is_parse:
                try:
                    # Timeout so we periodically re-check crawler liveness
                    # instead of blocking forever on an empty queue.
                    data = self.outcome_queue.get(timeout=3)
                except queue.Empty:  # narrowed from bare `Exception`
                    data = None
                if data is not None:
                    self.parse(data)
            else:
                break
        print("退出解析线程", self.number)

    def parse(self, data):
        """Extract book title/author pairs from one page of HTML and append
        them to result.txt, one book per line."""
        text = []
        # Explicit parser avoids bs4's "no parser specified" warning and
        # keeps results consistent across environments.
        soup = BeautifulSoup(data, "html.parser")
        ul_soup = soup.find(attrs={"class": "subject-list"})
        # BUG FIX: the original searched the whole soup and never used
        # ul_soup; restrict the search to the list container when present.
        li_soup = (ul_soup or soup).find_all("li", attrs={"class": "subject-item"})
        for li in li_soup:
            result_list = []
            # NOTE(review): the first .replace looks like a no-op (space ->
            # space); it may originally have replaced a non-breaking space
            # lost in transcription — confirm against the source article.
            title = li.h2.get_text().replace(' ', ' ').replace('\n', ' ')
            author = li.find("div", attrs={"class": "pub"}).get_text().replace(' ', '').replace('\n', '')
            result_list.append(title)
            result_list.append(author)
            text.append(result_list)
        # BUG FIX: the original wrote open(result.txt, ...) — an unquoted
        # name that raises NameError; the filename must be a string literal.
        with open("result.txt", "a+", encoding='utf-8') as f:
            for book in text:
                book_author = ' '.join(book)
                f.write(book_author)
                f.write('\n')
def main():
    """Fill the URL work queue, launch crawler and parser threads, and
    block until every thread has finished."""
    url_queue = queue.Queue()
    outcome_queue = queue.Queue()
    offset = 20
    url_temp = "https://book.douban.com/tag/%E6%BC%AB%E7%94%BB?start={}&type=T"
    # 10 listing pages, `offset` books per page.
    for page in range(10):
        url_queue.put(url_temp.format(offset * page))
    # Crawler threads consume url_queue and feed outcome_queue.
    req_thread = []
    for idx in range(1, count_crawel + 1):
        worker = Thread_Crawel(url_queue, outcome_queue, idx)
        worker.start()
        req_thread.append(worker)
    # Parser threads watch the crawler list so they know when to stop.
    parse_thread = []
    for idx in range(1, count_parse + 1):
        worker = Thread_Parse(idx, outcome_queue, req_thread)
        worker.start()
        parse_thread.append(worker)
    # Wait for every thread, crawlers first, then parsers.
    for worker in req_thread + parse_thread:
        worker.join()


if __name__ == "__main__":
    main()