这次带来的是:爬取一个网站多个页面上的小说,并将每本小说单独写入一个 txt 文档。
- 获取网站网址
- 爬取小说的链接
- 爬取目录的链接
- 爬取各章小说的目录和内容
1.网站网址
http://www.biquge.com.tw/
2.爬取小说的链接
先爬取首页上各本小说的链接;这些链接随后作为获取各自目录链接的入口。
# Fetch the site's front page and collect every novel link found under #main.
url1 = 'http://www.biquge.com.tw/'
html = requests.get(url1).content
soup = BeautifulSoup(html, 'html.parser')
article = soup.find(id="main")

texts = []
# Absolute hrefs that start with the site root are novel links.
for novel in article.find_all(href=re.compile('http://www.biquge.com.tw/')):
    nt = novel.get('href')  # one novel's URL
    texts.append(nt)
    # print nt  # uncomment to inspect

# De-duplicate while preserving first-seen order; the seen-set makes the
# membership test O(1) instead of scanning the result list each time.
new_text = []
seen = set()
for text in texts:
    if text not in seen:
        seen.add(text)
        new_text.append(text)
3.爬取目录链接
目录链接作为获取每章内容的入口
# url2 is a placeholder: substitute one novel URL from the previous step.
url2 = 小说链接
html = requests.get(url2).content
soup = BeautifulSoup(html, 'html.parser')

a = []  # chapter URLs collected from the catalogue
# Each novel page carries one #list element holding the chapter catalogue.
for catalogue in soup.find_all(id="list"):
    timu = soup.find(id="maininfo")       # book info block
    name1 = timu.find('h1').get_text()    # book title (later used as the .txt file name)
    tm = timu.get_text()                  # full book-info text
    e_cat = catalogue.get_text('\n')      # catalogue text, one entry per line
    # Chapter hrefs are site-relative ".html" paths; escape the dot so the
    # pattern matches a literal ".html" rather than any character + "html".
    for link in catalogue.find_all(href=re.compile(r"\.html")):
        lianjie = 'http://www.biquge.com.tw/' + link.get('href')
        a.append(lianjie)
4.爬取各章小说的目录和内容
目录链接作为爬取各章目录和内容的入口
finallyurl = 目录链接
html = requests.get(finallyurl).content
soup = BeautifulSoup(html, 'html.parser')
tit = soup.find('div', attrs={'class': 'bookname'})
title = tit.h1
content = soup.find(id='content').get_text()
print title.get_text()
print content
5.完整代码
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
#解决出现的写入错误
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
#可以获取多本文章
MAX_RETRIES = 20
url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
session.mount('https://', adapter)
session.mount('http://', adapter)
r = session.get(url)
#爬取首页各小说链接,并写入列表
url1 = 'http://www.biquge.com.tw/'
html = requests.get(url1).content
soup = BeautifulSoup(html,'html.parser')
article = soup.find(id="main")
texts = []
for novel in article.find_all(href=re.compile('http://www.biquge.com.tw/')):
#小说链接
nt = novel.get('href')
texts.append(nt)
#print nt #可供检验
new_text = []
for text in texts:
if text not in new_text:
new_text.append(text)
#将刚刚的列表写入一个新列表,以供遍历,获取各个链接
h = []
h.append(new_text)
l = 0
for n in h:
while l<=len(n)-1:
#爬取小说的相关信息及目录和目录链接
url2 = n[l]
html = requests.get(url2).content
soup = BeautifulSoup(html, 'html.parser')
a = []
#爬取相关信息及目录
for catalogue in soup.find_all(id="list"):
timu = soup.find(id="maininfo")
name1 = timu.find('h1').get_text()
tm = timu.get_text()
e_cat = catalogue.get_text('\n')
print name1
print tm
print e_cat
end1 = u'%s%s%s%s' % (tm, '\n', e_cat, '\n')
# 写入文档
one1 = end1.encode('utf-8')
fo = open(name1+'.txt', 'a')
fo.write(one1 + '\n')
fo.close()
#爬取各章链接
for link in catalogue.find_all(href=re.compile(".html")):
lianjie = 'http://www.biquge.com.tw/' + link.get('href')
a.append(lianjie)
#将各章的链接列表写入一个新列表,以供遍历,获取各章的列表
k = []
k.append(a)
j = 0
for i in k:
while j <= len(i) - 1:
#爬取各章小说内容
url = 'http://www.biquge.com.tw/14_14055/9194140.html'
finallyurl = i[j]
html = requests.get(finallyurl).content
soup = BeautifulSoup(html, 'html.parser')
tit = soup.find('div', attrs={'class': 'bookname'})
title = tit.h1
content = soup.find(id='content').get_text()
print title.get_text()
print content
j += 1
end2 = u'%s%s%s%s' % (title , '\n' , content , '\n')
#写入文档
one2 = end2.encode('utf-8')
fo = open(name1 + ".txt", 'a')
fo.write(one2 + '\n')
fo.close()
l+=1
结果展示(有点多,就截了一点儿)