#coding:utf-8
import time
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
class spider_qiushi_baike(object):
    """Scraper for hot jokes ("duanzi") from qiushibaike.com listing pages."""

    def __init__(self):
        print('Welcome to spider_qiushi_baike')

    def get_hot_duanzi(self, url):
        """Fetch one listing page and return its jokes.

        Parameters:
            url: full page URL, e.g. https://www.qiushibaike.com/8hr/page/1/

        Returns:
            list of {'author': <author name>, 'duanzi': <joke text>} dicts,
            one per item found on the page (empty list if nothing matched).

        Raises:
            requests.RequestException on network failure or timeout.
        """
        duanzi_list = []
        # Browser-like User-Agent: the site rejects default client UAs.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400'}
        # timeout prevents a request (and a pool worker) from hanging forever
        # on a stalled connection; the original had none.
        res = requests.get(url, headers=headers, timeout=10)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # Plural names: the original bound these to `author`/`duanzi` and then
        # shadowed both with the loop variables below.
        authors = soup.select('.author > a > h2')
        contents = soup.select('.content > span')
        # zip stops at the shorter list, so mismatched counts cannot IndexError.
        for author, content in zip(authors, contents):
            duanzi_list.append({
                'author': author.text.strip(),
                'duanzi': content.text.strip(),
            })
        return duanzi_list
if __name__ == '__main__':
    # Keep prompting until the user enters a valid integer page count.
    while True:
        try:
            page_num = int(input('please input how many pages want to get: '))
            break
        except ValueError:
            # Only a non-numeric entry should be retried; the original's broad
            # `except Exception` also swallowed KeyboardInterrupt-adjacent errors.
            print('请输入数字: ')

    urls = [r'https://www.qiushibaike.com/8hr/page/%s/' % (i + 1) for i in range(page_num)]
    spider = spider_qiushi_baike()

    # Single process: scrape the pages sequentially as a baseline timing.
    start_time = time.time()
    for url in urls:
        duanzi_list = spider.get_hot_duanzi(url)
    print('单进程耗时:%s' % (time.time() - start_time))

    # Two worker processes. The with-statement closes and joins the pool;
    # the original never released either pool.
    start_time = time.time()
    with Pool(processes=2) as pool:
        # map(func, iterable): fan the URLs out over the workers.
        pool.map(spider.get_hot_duanzi, urls)
    print('双进程耗时:%s' % (time.time() - start_time))

    # Four worker processes.
    start_time = time.time()
    with Pool(processes=4) as pool:
        pool.map(spider.get_hot_duanzi, urls)
    # Label fixed: the original printed '双进程耗时' (two-process) here too.
    print('四进程耗时:%s' % (time.time() - start_time))
# 运行结果 (sample run output — commented out so the file stays valid Python):
# please input how many pages want to get: 10
# Welcome to spider_qiushi_baike
# 单进程耗时:3.4249415397644043
# 双进程耗时:2.0948898792266846
# 双进程耗时:1.0953943729400635