Scraping the 全书网 novel site (quanshuwang.com) with Python

import requests
import re
import threading
from bs4 import BeautifulSoup
from urllib import request

# semaphore to cap concurrent downloads at 10 (wired up in the threading sketch at the end)
thread_lock = threading.BoundedSemaphore(value=10)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"}
# sample proxy from the original post; it has almost certainly gone stale
proxies = {"https": "https://125.118.151.214:6666"}
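Since a dead proxy makes every request below fail, it is worth checking the proxy before starting a crawl. A minimal sketch, using httpbin.org purely as an echo endpoint (not part of the original post):

def proxy_alive(proxies, timeout=5):
    # returns True if the proxy can reach an external echo service
    try:
        r = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=timeout)
        return r.ok
    except requests.RequestException:
        return False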

Parse the homepage

def get_title_url(url):
    try:
        r = requests.get(url, headers=headers, proxies=proxies)
        soup = BeautifulSoup(r.content, 'lxml')
        # category links in the site's top navigation bar
        title_list = soup.select("nav > ul > li > a")
        return title_list
    except Exception as e:
        print("Request failed:", e)
        return []  # return an empty list so callers can still iterate

Get the list of category URLs

def get_chapter():
    chapter_list = []
    # collect the href of every category link (avoid shadowing the list builtin)
    for title in get_title_url(url):
        chapter_list.append(title.get('href'))
    return chapter_list
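If the hrefs pulled from the navigation bar turn out to be relative paths, they need to be resolved against the site root before they can be opened. A sketch with urllib.parse.urljoin, which passes already-absolute URLs through unchanged:

from urllib.parse import urljoin

def absolutize(hrefs, base='http://www.quanshuwang.com'):
    # urljoin resolves relative hrefs against base and leaves
    # absolute URLs untouched, so it is safe to apply to every href
    return [urljoin(base, h) for h in hrefs]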

Get each book's URL

def get_book_url():
    book_url_list = []
    for book_url in get_chapter():
        # urlopen must be given the Request object, or the headers are dropped
        req = request.Request(book_url, headers=headers)
        response = request.urlopen(req).read().decode("gbk")
        # each match is a (title, link) pair for one book
        pattern = re.compile(r'<a target="_blank" title="(.*?)" href="(.*?)" class="clearfix stitle">', re.S)
        items = re.findall(pattern, response)
        book_url_list.extend(items)
    return book_url_list
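The regex above breaks as soon as the site reorders the anchor's attributes. An equivalent, more tolerant extraction with BeautifulSoup, assuming the stitle class is still present on the anchor:

def extract_books(html):
    # returns (title, link) pairs, matching what the regex produces;
    # class_='stitle' matches any anchor whose class list contains "stitle"
    soup = BeautifulSoup(html, 'lxml')
    return [(a.get('title'), a.get('href'))
            for a in soup.find_all('a', class_='stitle')]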

Follow each book's reader link, fetch every chapter, and save it to disk

def get_chapter_read_url():
    for title, book_url in get_book_url():
        # book detail page -> "start reading" link
        r = requests.get(book_url, headers=headers, proxies=proxies)
        r.encoding = "gbk"
        soup = BeautifulSoup(r.text, 'lxml')
        read_link = soup.find('a', class_='reader')
        if read_link is None:
            print("No reader link found for {}".format(title))
            continue
        # process only this book's reader link; the original re-iterated the
        # whole accumulated list here, re-downloading earlier books each pass
        r = requests.get(read_link.get("href"), headers=headers)
        r.encoding = "gbk"
        soup = BeautifulSoup(r.text, 'lxml')
        # chapter index page -> one <a> per chapter
        chapters = soup.find('div', class_="clearfix dirconone").find_all('a')
        n = 1
        for chapter in chapters:
            reg = requests.get(chapter.get("href"), headers=headers)
            reg.encoding = "gbk"
            tag = BeautifulSoup(reg.text, 'lxml').find('div', class_="mainContenr")
            try:
                text = tag.get_text()
                print("Downloading {}, chapter {}".format(title, n))
            except Exception:
                # tag is None when the content div is missing; skip the chapter
                print("Decode error")
                n = n + 1
                continue
            n = n + 1
            try:
                # one .txt file per book, chapters appended in order
                with open(title + '.txt', "a", encoding="utf-8") as f:
                    f.write(text)
            except Exception:
                print("Write error")
url = 'http://www.quanshuwang.com'
get_chapter_read_url()  # writes files as it goes and returns nothing, so there is nothing to print
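The BoundedSemaphore defined at the top is what would keep a threaded version of this crawler to ten simultaneous downloads. A sketch of how it could be wired up, assuming the per-book body of the loop above were factored into a hypothetical download_book(title, book_url) helper:

def worker(title, book_url):
    with thread_lock:  # at most 10 threads run the body at once
        download_book(title, book_url)  # hypothetical helper, not defined above

threads = [threading.Thread(target=worker, args=item) for item in get_book_url()]
for t in threads:
    t.start()
for t in threads:
    t.join()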

Reposted from blog.csdn.net/qq_39001049/article/details/81412668