核心代码如何构建请参考谢乾坤所著《Python爬虫开发:从入门到实战》第四章 4.4 节。这里只补充剩下的代码。
import os  # operating-system interface wrappers (standard library)
import re  # regular expressions (standard library)
from urllib.parse import urljoin  # resolves chapter links against the book URL

import requests
# Table-of-contents page of the book; get_toc prefixes chapter links with it.
start_url = 'https://www.kanunu8.com/book3/6879/'
# Request headers passed to requests.get so the scraper presents a browser User-Agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.138 Safari/537.36'
    ),
}
def get_html(url, timeout=10):
    """Download a page and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Stop on 4xx/5xx here instead of handing an error page to the parsers.
    response.raise_for_status()
    return response.content.decode('gbk')
def get_toc(html, base_url=None):
    """Extract the absolute chapter URLs from a table-of-contents page.

    The chapter list is taken to be the text between the first ``正文``
    marker and the next ``</tbody>``; every ``href="..."`` value inside that
    span is treated as a chapter link.

    Args:
        html: Source of the table-of-contents page.
        base_url: URL the links are resolved against. Defaults to the
            module-level ``start_url``.

    Returns:
        A list of absolute chapter URLs in page order, or an empty list when
        the page contains no chapter table.
    """
    if base_url is None:
        base_url = start_url
    toc_match = re.search('正文(.*?)</tbody>', html, re.S)
    if toc_match is None:
        # No chapter table on this page: report "no chapters" rather than
        # failing with an IndexError.
        return []
    hrefs = re.findall('href="(.*?)"', toc_match.group(1), re.S)
    # urljoin copes with relative, root-relative and absolute hrefs, whereas
    # plain concatenation is only right for bare file names.
    return [urljoin(base_url, href) for href in hrefs]
def get_article(html):
    """Extract the chapter title and chapter text from a chapter page.

    The title is the text following the first ``size="4">`` up to the next
    ``<``; the text is the content of the first ``<p>...</p>`` element with
    every ``<br />`` tag removed.

    Args:
        html: Source of a chapter page.

    Returns:
        A ``(chapter_name, text_block)`` tuple of strings.

    Raises:
        ValueError: If the title or the text cannot be located in ``html``.
    """
    title_match = re.search('size="4">(.*?)<', html, re.S)
    if title_match is None:
        raise ValueError('chapter title not found in page')
    body_match = re.search('<p>(.*?)</p>', html, re.S)
    if body_match is None:
        raise ValueError('chapter text not found in page')
    # Line breaks are already present as newlines in the markup, so the
    # <br /> tags are simply dropped.
    text_block = body_match.group(1).replace('<br />', '')
    return title_match.group(1), text_block
def save(chapter, article, directory='动物农场'):
    """Write one chapter to ``<directory>/<chapter>.txt`` as UTF-8 text.

    Args:
        chapter: Chapter title, used as the file name (without extension).
        article: Chapter text to write.
        directory: Folder the file is written to; created if it does not
            exist yet.
    """
    # Path separators and characters Windows rejects in file names are
    # replaced, so a title such as "A/B?" cannot break open() or point
    # outside the target directory.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', chapter.strip())
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, safe_name + '.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.write(article)
def main():
    """Download every chapter linked from the contents page and save each one.

    Fetches ``start_url``, collects the chapter URLs with ``get_toc``, then
    downloads, parses and writes the chapters one at a time.
    """
    toc_html = get_html(start_url)
    for chapter_url in get_toc(toc_html):
        chapter_name, text = get_article(get_html(chapter_url))
        # Save inside the loop so every chapter is written, not just the
        # last one fetched.
        save(chapter_name, text)


# Run the download only when executed as a script, so importing this module
# performs no network requests.
if __name__ == '__main__':
    main()
运行结果如图: