# 笔趣阁（www.qu.la）小说爬取代码（Python 实现） / Biquge (www.qu.la) novel scraper, implemented in Python

import requests
import os
import gevent
from gevent import monkey
import random
import re
from lxml import etree
from bs4 import BeautifulSoup

# Apply gevent's monkey patches; select=False leaves the `select` module
# unpatched.
# NOTE(review): `requests` is imported above, before this call. gevent's
# documentation recommends patching as early as possible, ahead of other
# imports -- confirm the current import order behaves as intended.
monkey.patch_all(select=False)
from urllib import parse
import time

# Candidate proxy mappings. Each fetch picks one entry with random.choice(IPs)
# and passes it to requests as `proxies=`.
# NOTE(review): requests documents proxy keys as lowercase URL schemes
# ('http' / 'https'); these upper-case 'HTTPS' keys may never be matched, in
# which case requests would go out without a proxy -- confirm.
IPs = [{'HTTPS': 'HTTPS://182.114.221.180:61202'},
       {'HTTPS': 'HTTPS://60.162.73.45:61202'},
       {'HTTPS': 'HTTPS://113.13.36.227:61202'},
       {'HTTPS': 'HTTPS://1.197.88.101:61202'}]
# Headers sent with every request: a desktop Chrome User-Agent, Accept /
# Accept-Language values and a hard-coded Cookie string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'UM_distinctid=1638b54c8f3279-0003db1d70474a-39614807-384000-1638b54c8f4843; CNZZDATA1261736110=613318700-1527048008-null%7C1530014624; Hm_lvt_5ee23c2731c7127c7ad800272fdd85ba=1530014621,1530014629,1530014706,1530015295; bookid=34778; bcolor=; font=; size=; fontcolor=; width=; chapterid=1896093; chaptername=%25u7B2C1%25u7AE0%2520%25u65B0%25u4E16%25u754C%25u548C%25u65B0%25u8EAB%25u4EFD; Hm_lpvt_5ee23c2731c7127c7ad800272fdd85ba=1530016490'
}


def setDir():
    """Ensure the ``./Noval`` output directory exists.

    ``os.makedirs(..., exist_ok=True)`` makes the call idempotent and removes
    the gap between checking for the directory and creating it (the previous
    ``os.listdir`` lookup could race with another process and scanned the
    whole working directory just to test one name).

    Raises:
        FileExistsError: if ``./Noval`` exists but is not a directory.
    """
    os.makedirs('./Noval', exist_ok=True)


def getNoval(url, id, data, faillist):
    """Fetch one chapter page and store its text in ``data`` under ``id``.

    The page is requested with the shared ``HEADERS``, a randomly chosen entry
    of ``IPs`` as ``proxies`` and a 5 second timeout. The title is the text of
    the ``<h1>`` inside ``div.bookname`` and the body is the text of
    ``div#content``; they are stored as title, newline, body.

    Args:
        url: Address of the chapter page.
        id: Key under which the chapter is stored, and the value reported on
            failure.
        data: Dict that receives the chapter text; mutated in place.
        faillist: List that receives ``id`` whenever anything in the fetch or
            parse raises; mutated in place.

    Returns:
        None. A page whose title text is empty is skipped: it is neither
        stored nor added to ``faillist``.
    """
    try:
        proxy = random.choice(IPs)
        response = requests.get(url, headers=HEADERS, proxies=proxy, timeout=5)
        response.encoding = response.apparent_encoding
        # Swap the &nbsp; entity for a plain space before parsing.
        page = BeautifulSoup(response.text.replace('&nbsp;', ' '), 'lxml')
        body = page.find('div', attrs={'id': 'content'})
        title = page.find('div', attrs={'class': 'bookname'}).h1.text
        if not title:
            return
        data[id] = title + '\n' + body.text
    except Exception:
        # Any failure (network, missing element, ...) queues the chapter for
        # the retry pass instead of propagating.
        faillist.append(id)


def getNoval2(url, id, data, max_retries=10, timeout=5, retry_delay=0.5):
    """Retry fetching a chapter until it succeeds or the retry budget runs out.

    Each attempt requests ``url`` with the shared ``HEADERS`` and a randomly
    chosen entry of ``IPs`` as ``proxies``. On success the chapter is stored
    in ``data[id]`` as title, newline, body, where the title is the ``<h1>``
    under ``div.bookname`` and the body is the text of ``div#content``.
    If the page parses but has no such ``<h1>``, nothing is stored and the
    function returns without retrying.

    Unlike an unbounded ``while True`` loop, a page that can never be fetched
    or parsed no longer hangs the caller: every request has a timeout and the
    number of attempts is capped.

    Args:
        url: Address of the chapter page.
        id: Key under which the chapter is stored.
        data: Dict that receives the chapter text; mutated in place.
        max_retries: Maximum number of attempts. ``None`` retries without
            limit; a value <= 0 makes no attempt at all.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.
        retry_delay: Seconds to sleep between failed attempts; ``0`` disables
            the pause.

    Returns:
        None. When every attempt fails a message is printed and ``data`` is
        left without an entry for ``id``.
    """
    attempt = 0
    while max_retries is None or attempt < max_retries:
        attempt += 1
        try:
            proxy = random.choice(IPs)
            res = requests.get(url, headers=HEADERS, proxies=proxy, timeout=timeout)
            res.encoding = res.apparent_encoding
            # Swap the &nbsp; entity for a plain space before parsing.
            html = res.text.replace('&nbsp;', ' ')
            soup = BeautifulSoup(html, 'lxml')
            content = soup.find('div', attrs={'id': 'content'})
            name = soup.find('div', attrs={'class': 'bookname'}).h1
            if name:
                data[id] = name.text + '\n' + content.text
        except Exception:
            # Best-effort retry: any failure counts as a failed attempt.
            more_attempts_left = max_retries is None or attempt < max_retries
            if retry_delay and more_attempts_left:
                time.sleep(retry_delay)
            continue
        return
    print("getNoval2: giving up on chapter %s after %d attempts (%s)" % (id, attempt, url))


def getContentFile2(url, timeout=10):
    """Fetch a book's index page and return its chapter URLs and title.

    The title is the text of the ``<h1>`` inside ``div#info``. Chapter links
    are every element under ``div#list`` whose ``href`` matches
    ``/book.*?\\.html``; each href is resolved against ``url`` so the result
    holds absolute URLs on the same host as the index page.

    Args:
        url: Address of the book's index page.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a stalled
            connection cannot block forever.

    Returns:
        A ``(chapter_urls, bookname)`` tuple: a list of absolute chapter URLs
        in page order, and the book title string.

    Raises:
        requests.RequestException: if the request fails or times out.
        AttributeError: if the page lacks ``div#info``, its ``<h1>`` or
            ``div#list``.
    """
    proxy = random.choice(IPs)
    res = requests.get(url, headers=HEADERS, proxies=proxy, timeout=timeout)
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    bookname = soup.find('div', attrs={'id': 'info'}).h1.text
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    chapter_href = re.compile(r'/book.*?\.html')
    links = soup.find('div', attrs={'id': 'list'}).find_all(attrs={'href': chapter_href})
    data = [parse.urljoin(url, link['href']) for link in links]
    return data, bookname


def BuildGevent(baseurl):
    """Download every chapter of a book and write them to ``./Noval/<title>.txt``.

    Chapters are fetched in batches of ``steps`` greenlets with ``getNoval``.
    Chapters that fail on this first pass are then retried, again in batches,
    with ``getNoval2``. The collected texts are joined with newlines in
    chapter-index order and written with ``gb18030`` encoding, ignoring
    characters that cannot be encoded.

    Args:
        baseurl: Address of the book's index page, passed to
            ``getContentFile2``.

    Returns:
        None. The ``./Noval`` directory must already exist.
    """
    content, bookname = getContentFile2(baseurl)  # version2
    steps = 2  # number of chapters fetched concurrently per batch
    length = len(content)
    name = "%s.txt" % bookname
    data = {}
    faillist = []

    # First pass: one batch per `steps` chapters, with no trailing empty batch.
    for count, start in enumerate(range(0, length, steps)):
        batch = [gevent.spawn(getNoval, content[i], i, data, faillist)
                 for i in range(start, min(start + steps, length))]
        gevent.joinall(batch)
        print(count)

    print("HE")

    # Second pass: walk the failed indices batch by batch. Iterating a range
    # guarantees progress; the earlier loop never advanced its counter and so
    # re-fetched the first batch forever.
    retry_ids = list(faillist)
    for start in range(0, len(retry_ids), steps):
        batch = [gevent.spawn(getNoval2, content[i], i, data)
                 for i in retry_ids[start:start + steps]]
        gevent.joinall(batch)

    # Greenlets finish in arbitrary order and retried chapters are inserted
    # last, so order by chapter index rather than dict insertion order.
    String = '\n'.join(data[i] for i in sorted(data))
    with open('./Noval/' + name, 'w', encoding='gb18030', errors='ignore') as ff:
        ff.write(String)


if __name__ == '__main__':
    # Script entry point: make sure the output directory exists, scrape the
    # hard-coded book and report the wall-clock time taken.
    started_at = time.time()
    setDir()
    BuildGevent('https://www.qu.la/book/34778/')
    elapsed = time.time() - started_at
    print("Total use time: %.6f" % elapsed)
# 笔趣阁（www.qu.la）小说爬取代码（Python 实现）

# 猜你喜欢