爬虫第四天：爬取我要看的小说

前言：平时看小说都是在手机上找个免费软件看，但是现在的APP要么只给你几天免费，之后就要收钱；要么虽然免费，却不断弹出广告，忍无可忍！PC端倒是能找到免费小说，但一来必须坐到电脑前，不方便；二来页面总是会自动跳转到广告，用手机浏览器上网看小说也是一样。
所以决定还是自己写个程序把要看的小说爬下来吧~
不多说了，代码如下：

11月10日更新：增加了selenium的代码，可以通过输入书名和最近看的章节名字，自动爬取需要的小说章节
注：使用selenium必须安装相应的浏览器驱动，比如我这里用的是Chrome，所以首先必须安装Chrome浏览器，然后还要下载与浏览器版本对应的chromedriver

import requests
from bs4 import BeautifulSoup
import re
import datetime
from selenium import webdriver
import time

# Request headers copied from Chrome: a User-Agent string and a Cookie.
# Sent with every requests.get() call in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
    'Cookie': '__cdnuid=5c8a2b4ff4e03d4abf3810d89332b919',
}
# Use selenium to find the URL path segment (book id) that belongs to a title
def get_bookurl(book_name):
    """Search biquge.com.tw for ``book_name`` and return the book's id.

    Opens the site's home page in Chrome, types the title into the search
    box, clicks the search button, switches to the newly opened window and
    reads its URL. The id is the first path segment of that URL.

    Args:
        book_name: Title to type into the site's search box.

    Returns:
        The first path segment of the URL the browser ended up on.

    Raises:
        ValueError: If that URL does not look like
            ``http://www.biquge.com.tw/<id>/...``.
    """
    browser = webdriver.Chrome()
    try:
        browser.get("http://www.biquge.com.tw/")
        # find_element("xpath", ...) is accepted by both Selenium 3 and 4;
        # the find_element_by_* helpers were removed in Selenium 4.
        search_box = browser.find_element("xpath", '//*[@id="wd"]')  # search input
        search_box.send_keys(book_name)
        search_button = browser.find_element("xpath", '//*[@id="sss"]')  # search button
        search_button.click()
        time.sleep(1)
        # The site opens the result in a new window, so switch to it before
        # reading the URL.
        original_handle = browser.current_window_handle
        for handle in browser.window_handles:
            if handle != original_handle:
                browser.switch_to.window(handle)
        current_url = browser.current_url
    finally:
        # Always close the browser, even when a lookup step above fails.
        browser.quit()
    match = re.search(r'^http://www\.biquge\.com\.tw/(.*?)/', current_url)
    if match is None:
        raise ValueError(
            'could not extract a book id from URL: {!r}'.format(current_url)
        )
    return match.group(1)

# Download one chapter page and return its title and body text
def get_novel(href, book_id, timeout=10):
    """Fetch a chapter page and extract its title and text.

    Args:
        href: Chapter file name without the ``.html`` suffix.
        book_id: Book path segment as returned by ``get_bookurl``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        A ``(novel_title, novel_content)`` tuple of strings in which
        non-breaking spaces and U+FFFD replacement characters have been
        turned into plain spaces.

    Raises:
        ValueError: If the page has no ``div.bookname > h1`` or no
            ``div#content`` element.
    """
    url = r"http://www.biquge.com.tw/" + book_id + "/" + href + ".html"
    r = requests.get(url=url, headers=headers, timeout=timeout)
    r.encoding = 'gbk'
    soup = BeautifulSoup(r.text, 'lxml')

    title_box = soup.find(name='div', attrs={'class': 'bookname'})
    content_box = soup.find(name='div', attrs={'id': 'content'})
    if title_box is None or title_box.h1 is None or content_box is None:
        raise ValueError('unexpected chapter page layout: {}'.format(url))

    # One pass instead of two chained .replace() calls per string.
    to_space = {0xa0: u' ', 0xfffd: u' '}
    novel_title = title_box.h1.text.translate(to_space)
    novel_content = content_box.text.translate(to_space)
    return (novel_title, novel_content)

# Append one chapter to the novel's text file
def make_novel(novel_title, novel_content, book_name):
    """Append a chapter to ``<book_name>.txt`` (GBK-encoded).

    Writes the title, a CRLF, the content, and two more CRLFs, so chapters
    are separated by a blank line. The file is created if it does not exist.

    Args:
        novel_title: Chapter title.
        novel_content: Chapter body text.
        book_name: Output file name without the ``.txt`` suffix.
    """
    # newline='' turns off newline translation. Without it, text mode on
    # Windows rewrites the '\n' of every explicit '\r\n' and the file ends up
    # with '\r\r\n' sequences.
    with open(book_name + '.txt', 'a', encoding='gbk', newline='') as file:
        file.writelines((novel_title, '\r\n', novel_content, '\r\n', '\r\n'))

# Read the book's chapter list and crawl every chapter after the last-read one
def main():
    """Interactively download the unread chapters of a novel.

    Asks for a book title and the name of the last chapter read, resolves the
    book id with ``get_bookurl``, loads the book's catalogue page, and then
    fetches every chapter linked from a ``<dd>`` element that follows the
    last-read chapter's link. Each chapter is appended to
    ``<book_name>.txt`` via ``make_novel``. Prints progress and the total
    elapsed time.
    """
    starttime = datetime.datetime.now()
    book_name = input("请输入书名：")
    last_chapter = input('请输入最近看的一章的名称：')
    book_id = get_bookurl(book_name)
    url = r"http://www.biquge.com.tw/" + book_id + "/"
    r = requests.get(url=url, headers=headers, timeout=10)
    r.encoding = 'gbk'
    soup = BeautifulSoup(r.text, 'lxml')

    # Start from the middle of the book: locate the last chapter already read.
    # The name is escaped so characters such as '(' or '?' in a chapter title
    # are matched literally instead of being treated as regex syntax.
    last_link = soup.find(name='a', text=re.compile(re.escape(last_chapter)))
    if last_link is None:
        print('未找到章节：{}'.format(last_chapter))
        return

    href_pattern = re.compile(r'^/.*?/(.*?)\.html$')
    for entry in last_link.find_all_next(name='dd'):
        link = entry.a
        match = href_pattern.search(link.get('href') or '') if link else None
        if match is None:
            # Not a chapter link (e.g. an empty <dd>); nothing to download.
            continue
        # Fetch each chapter once and unpack both parts of the result.
        novel_title, novel_content = get_novel(match.group(1), book_id)
        make_novel(novel_title, novel_content, book_name)
        print('已爬取小说的{}'.format(novel_title))

    endtime = datetime.datetime.now()
    # total_seconds() also counts whole days, which .seconds would drop.
    total_time = int((endtime - starttime).total_seconds())
    print("小说爬取完毕，总共耗时{}秒".format(total_time))


if __name__ == '__main__':
    main()

下载完成后会得到一个txt文件，导入到手机上好用的阅读类APP中即可！

爬虫第四天：爬取我要看的小说

猜你喜欢