这次带来的是:爬取一个网站多个页面上的小说,并将每本小说单独写入一个 txt 文档。
- 获取网站网址
- 爬取小说的链接
- 爬取目录的链接
- 爬取各章小说的目录和内容
1.网站网址
http://www.biquge.com.tw/
2.爬取小说的链接
先爬取首页上各本小说的链接;这些链接随后作为获取各自目录链接的入口。
# Fetch the site's front page and collect every novel link found under #main.
url1 = 'http://www.biquge.com.tw/'
html = requests.get(url1).content
soup = BeautifulSoup(html, 'html.parser')
article = soup.find(id="main")

texts = []
# Absolute hrefs that start with the site root are novel links.
for novel in article.find_all(href=re.compile('http://www.biquge.com.tw/')):
    nt = novel.get('href')  # one novel's URL
    texts.append(nt)
    # print nt  # uncomment to inspect

# De-duplicate while preserving first-seen order; the seen-set makes the
# membership test O(1) instead of scanning the result list each time.
new_text = []
seen = set()
for text in texts:
    if text not in seen:
        seen.add(text)
        new_text.append(text)
3.爬取目录链接
目录链接作为获取每章内容的入口
# url2 is a placeholder: substitute one novel URL from the previous step.
url2 = 小说链接
html = requests.get(url2).content
soup = BeautifulSoup(html, 'html.parser')

a = []  # chapter URLs collected from the catalogue
# Each novel page carries one #list element holding the chapter catalogue.
for catalogue in soup.find_all(id="list"):
    timu = soup.find(id="maininfo")       # book info block
    name1 = timu.find('h1').get_text()    # book title (later used as the .txt file name)
    tm = timu.get_text()                  # full book-info text
    e_cat = catalogue.get_text('\n')      # catalogue text, one entry per line
    # Chapter hrefs are site-relative ".html" paths; escape the dot so the
    # pattern matches a literal ".html" rather than any character + "html".
    for link in catalogue.find_all(href=re.compile(r"\.html")):
        lianjie = 'http://www.biquge.com.tw/' + link.get('href')
        a.append(lianjie)
4.爬取各章小说的目录和内容
目录链接作为爬取各章目录和内容的入口
finallyurl = 目录链接
html = requests.get(finallyurl).content
soup = BeautifulSoup(html, 'html.parser')
tit = soup.find('div', attrs={'class': 'bookname'})
title = tit.h1
content = soup.find(id='content').get_text()
print title.get_text()
print content
5.完整代码
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
#解决出现的写入错误
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
#可以获取多本文章
MAX_RETRIES = 20
url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
session.mount('https://', adapter)
session.mount('http://', adapter)
r = session.get(url)
#爬取首页各小说链接,并写入列表
url1 = 'http://www.biquge.com.tw/'
html = requests.get(url1).content
soup = BeautifulSoup(html,'html.parser')
article = soup.find(id="main")
texts = []
for novel in article.find_all(href=re.compile('http://www.biquge.com.tw/')):
#小说链接
nt = novel.get('href')
texts.append(nt)
#print nt #可供检验
new_text = []
for text in texts:
if text not in new_text:
new_text.append(text)
#将刚刚的列表写入一个新列表,以供遍历,获取各个链接
h = []
h.append(new_text)
l = 0
for n in h:
while l<=len(n)-1:
#爬取小说的相关信息及目录和目录链接
url2 = n[l]
html = requests.get(url2).content
soup = BeautifulSoup(html, 'html.parser')
a = []
#爬取相关信息及目录
for catalogue in soup.find_all(id="list"):
timu = soup.find(id="maininfo")
name1 = timu.find('h1').get_text()
tm = timu.get_text()
e_cat = catalogue.get_text('\n')
print name1
print tm
print e_cat
end1 = u'%s%s%s%s' % (tm, '\n', e_cat, '\n')
# 写入文档
one1 = end1.encode('utf-8')
fo = open(name1+'.txt', 'a')
fo.write(one1 + '\n')
fo.close()
#爬取各章链接
for link in catalogue.find_all(href=re.compile(".html")):
lianjie = 'http://www.biquge.com.tw/' + link.get('href')
a.append(lianjie)
#将各章的链接列表写入一个新列表,以供遍历,获取各章的列表
k = []
k.append(a)
j = 0
for i in k:
while j <= len(i) - 1:
#爬取各章小说内容
url = 'http://www.biquge.com.tw/14_14055/9194140.html'
finallyurl = i[j]
html = requests.get(finallyurl).content
soup = BeautifulSoup(html, 'html.parser')
tit = soup.find('div', attrs={'class': 'bookname'})
title = tit.h1
content = soup.find(id='content').get_text()
print title.get_text()
print content
j += 1
end2 = u'%s%s%s%s' % (title , '\n' , content , '\n')
#写入文档
one2 = end2.encode('utf-8')
fo = open(name1 + ".txt", 'a')
fo.write(one2 + '\n')
fo.close()
l+=1
结果展示(有点多,就截了一点儿)