import requests
import os
import gevent
from gevent import monkey
import random
import re
from lxml import etree
from bs4 import BeautifulSoup
monkey.patch_all(select=False)
from urllib import parse
import time
# Proxy pool. NOTE: requests selects a proxy by matching the dict key against
# the (lowercased) scheme of the request URL, so the keys and schemes must be
# lowercase 'https' — the original uppercase 'HTTPS' keys never matched and
# the proxies were silently ignored.
IPs = [{'https': 'https://182.114.221.180:61202'},
       {'https': 'https://60.162.73.45:61202'},
       {'https': 'https://113.13.36.227:61202'},
       {'https': 'https://1.197.88.101:61202'}]
# Default request headers: desktop Chrome UA plus the site session cookie.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'UM_distinctid=1638b54c8f3279-0003db1d70474a-39614807-384000-1638b54c8f4843; CNZZDATA1261736110=613318700-1527048008-null%7C1530014624; Hm_lvt_5ee23c2731c7127c7ad800272fdd85ba=1530014621,1530014629,1530014706,1530015295; bookid=34778; bcolor=; font=; size=; fontcolor=; width=; chapterid=1896093; chaptername=%25u7B2C1%25u7AE0%2520%25u65B0%25u4E16%25u754C%25u548C%25u65B0%25u8EAB%25u4EFD; Hm_lpvt_5ee23c2731c7127c7ad800272fdd85ba=1530016490'
}
def setDir():
    """Ensure the ./Noval output directory exists.

    Uses os.makedirs(exist_ok=True) instead of the original
    listdir()+mkdir() pair, which was race-prone and would crash with
    FileExistsError if another process created the directory in between
    (or if 'Noval' existed as something other than the expected entry).
    """
    os.makedirs('./Noval', exist_ok=True)
def getNoval(url, id, data, faillist):
    """Fetch one chapter page and store 'title\\ncontent' in data[id].

    Parameters:
        url      -- chapter page URL
        id       -- chapter index, used as the key into `data`
        data     -- shared dict of fetched chapters (mutated in place)
        faillist -- shared list; id is appended here on any failure so a
                    later retry pass (getNoval2) can pick it up
    """
    try:
        headers = HEADERS
        IP = random.choice(IPs)
        res = requests.get(url, headers=headers, proxies=IP, timeout=5)
        res.encoding = res.apparent_encoding
        # Replace non-breaking spaces (U+00A0, the rendered form of &nbsp;)
        # with plain spaces. The original replace(' ', ' ') was a no-op:
        # the intended \xa0 had been normalized to an ordinary space.
        html = res.text.replace('\xa0', ' ')
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', attrs={'id': 'content'})
        name = soup.find('div', attrs={'class': 'bookname'}).h1.text
        if name:
            data[id] = name + '\n' + content.text
    except Exception:
        # Network error, dead proxy, or missing page elements: record the
        # chapter for the retry pass rather than crashing the greenlet.
        faillist.append(id)
def getNoval2(url, id, data, max_retries=20):
    """Retry-fetch one chapter page and store 'title\\ncontent' in data[id].

    Used as the second pass for chapters that failed in getNoval. Retries
    on any exception; the original `while True` loop retried forever and
    could hang the whole run when every proxy was dead, so attempts are now
    capped by `max_retries` (backward-compatible keyword parameter).
    """
    for _ in range(max_retries):
        try:
            headers = HEADERS
            IP = random.choice(IPs)
            res = requests.get(url, headers=headers, proxies=IP)
            res.encoding = res.apparent_encoding
            # Strip non-breaking spaces (&nbsp; -> U+00A0); the original
            # replace(' ', ' ') was a no-op.
            html = res.text.replace('\xa0', ' ')
            soup = BeautifulSoup(html, 'lxml')
            content = soup.find('div', attrs={'id': 'content'})
            name = soup.find('div', attrs={'class': 'bookname'}).h1
            if name:
                data[id] = name.text + '\n' + content.text
        except Exception:
            continue  # transient failure: try again with another proxy
        else:
            break
def getContentFile2(url):
    """Fetch the book's table-of-contents page.

    Returns (data, bookname) where `data` is the list of absolute chapter
    URLs (in page order) and `bookname` is the title from the #info block.
    """
    headers = HEADERS
    IP = random.choice(IPs)
    # timeout added for consistency with getNoval: a dead proxy should not
    # hang the whole run before any chapter is even fetched.
    res = requests.get(url, headers=headers, proxies=IP, timeout=5)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    info = soup.find('div', attrs={'id': 'info'})
    bookname = info.h1.text
    datalist = soup.find('div', attrs={'id': 'list'})
    # Raw string for the regex: '\.' in a plain literal is an invalid
    # escape sequence (DeprecationWarning on modern Python).
    data = list(
        map(lambda x: 'https://www.qu.la' + x['href'],
            datalist.find_all(attrs={'href': re.compile(r'/book.*?\.html')})))
    return data, bookname
def BuildGevent(baseurl):
    """Download a whole book concurrently and write it to ./Noval/<name>.txt.

    Chapters are fetched in batches of `steps` greenlets. Chapters that
    fail in the first pass are retried with getNoval2, then everything is
    written out in reading order.

    Fixes over the original:
      * first-pass condition was `(count - 1) * steps < length`, which ran
        one extra empty batch;
      * the retry loop never incremented `count`, so it spun forever
        whenever `faillist` was non-empty;
      * chapters are joined by sorted id — greenlets finish out of order,
        so dict insertion order does not match reading order;
      * removed the unused `beginIndex` variable.
    """
    content, bookname = getContentFile2(baseurl)  # chapter URLs + title
    steps = 2  # greenlets per batch
    length = len(content)
    name = "%s.txt" % bookname
    data = {}      # chapter id -> 'title\ncontent', shared across greenlets
    faillist = []  # chapter ids that failed in the first pass

    # First pass: fetch all chapters in fixed-size batches.
    count = 0
    while count * steps < length:
        batch = [gevent.spawn(getNoval, content[i + count * steps],
                              i + count * steps, data, faillist)
                 for i in range(steps) if i + count * steps < length]
        gevent.joinall(batch)
        print(count)
        count += 1

    print("HE")

    # Second pass: retry the failures with the persistent fetcher.
    count = 0
    faillistlen = len(faillist)
    while count * steps < faillistlen:
        batch = [gevent.spawn(getNoval2,
                              content[faillist[i + count * steps]],
                              faillist[i + count * steps], data)
                 for i in range(steps) if i + count * steps < faillistlen]
        gevent.joinall(batch)
        count += 1

    # Write chapters in id order, not completion order.
    String = '\n'.join(data[i] for i in sorted(data))
    with open('./Noval/' + name, 'w', encoding='gb18030', errors='ignore') as ff:
        ff.write(String)
if __name__ == '__main__':
    # Script entry point: prepare the output directory, scrape one book,
    # and report the total wall-clock time.
    begin = time.time()
    setDir()
    book_url = 'https://www.qu.la/book/34778/'
    BuildGevent(book_url)
    elapsed = time.time() - begin
    print("Total use time: %.6f" % elapsed)
# 笔趣阁 (www.qu.la) novel scraper (Python implementation)
# Adapted from: blog.csdn.net/a19990412/article/details/80822140