# 版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# 批量下载晋江城的小说
import requests
from bs4 import BeautifulSoup
import re
import os
import pandas as pd
# 0. Fetch a page and parse it
def get_html(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the page with the ``lxml``
        parser.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Request headers: present the client as a desktop Firefox browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:70.0)"
        + "Gecko/20100101 Firefox/70.0"
    }
    # A timeout keeps a stalled connection from hanging the download forever.
    r = requests.get(url, headers=headers, timeout=30)
    # Decode the raw bytes as GBK directly.  Round-tripping through
    # ``r.text.encode(r.encoding)`` fails when the server declares no
    # encoding (``r.encoding`` is None).  Undecodable bytes are replaced so
    # one bad character does not abort the whole page.
    html = r.content.decode("GBK", errors="replace")
    # The parsed document must be returned: callers use the result directly.
    return BeautifulSoup(html, "lxml")
# 1. Get the novel's title
def get_article_title(main_url):
    """Return the novel title found on the page at *main_url*.

    The title is the text of the ``<span itemprop="articleSection">``
    element of the page fetched through ``get_html``.
    """
    page = get_html(main_url)
    title_span = page.find("span", itemprop="articleSection")
    return title_span.text
# 2. Collect the address of every chapter
def get_allurl(url):
    """Return the ``href`` of every ``<a itemprop="url">`` link on *url*.

    The links are returned in document order as a list of strings.
    """
    page = get_html(url)
    chapter_urls = []
    for link in page.findAll("a", itemprop="url"):
        chapter_urls.append(link.attrs["href"])
    return chapter_urls
# 3. Download a single chapter
def _clean_chapter_text(content, title):
    """Strip layout characters and page boilerplate from raw chapter text."""
    # Drop line breaks, ideographic spaces and non-breaking spaces.
    content = re.sub(r"[\r\n\u3000\xa0]", "", content)
    # Fixed boilerplate strings taken from the page chrome.
    content = content.replace("插入书签", "")
    content = content.replace(
        "电子书下载TXT下载举报色情反动举报刷分其他文章收藏 为收藏文章分类定制收藏类别查看收藏列表",
        "",
    )
    # Non-greedy matches: remove each bracketed / parenthesised span on its
    # own.  A greedy ``.*`` would delete everything between the first "(" and
    # the last ")" of the chapter, because the text is a single line here.
    content = re.sub(r"\[.*?\]", "", content)
    content = re.sub(r"\(.*?\)", "", content)
    # Plain substring removal: the title is data, not a regex pattern.
    content = content.replace(title, "")
    content = content.replace(" ", "")
    content = content.replace("displayyrt", "")
    content = content.replace(";", " ")
    return content


def _safe_filename(name):
    """Replace characters that are invalid in file names with ``_``."""
    return re.sub(r'[\\/:*?"<>|\r\n\t]', "_", name).strip()


def chapter_Download(file_savePath, url, article_title, number):
    """Fetch one chapter page, clean its text and save it to disk.

    The chapter is written to
    ``<file_savePath>/《<article_title>》/<number>.<chapter title>.doc`` as
    UTF-8 text: the chapter title on the first line, then the cleaned body.

    Args:
        file_savePath: Root directory under which the novel folder is created.
        url: Address of the chapter page.
        article_title: Novel title, used for the folder name.
        number: Chapter number, used as the file-name prefix.

    Raises:
        AttributeError: If the page lacks the expected title or text element.
        OSError: If the folder or file cannot be created.
    """
    # 1. Fetch the page
    soup = get_html(url)
    # 2. Extract and clean the data
    title = soup.find("div", align="center").h2.text.strip()
    content = soup.find("div", class_="noveltext").text
    content = _clean_chapter_text(content, title)
    # 3. Save the chapter
    filedir = os.path.join(file_savePath, "《" + _safe_filename(article_title) + "》")
    # makedirs also creates missing parent directories and tolerates an
    # already existing folder.
    os.makedirs(filedir, exist_ok=True)
    filename = "{}.{}.doc".format(number, _safe_filename(title))
    with open(os.path.join(filedir, filename), mode="w", encoding="utf-8") as f:
        f.write(title + "\n" + content)
# 4. Download a whole novel
def novel_Download(index, file_savePath=r"E:\小说"):
    """Download every chapter of the novel with id *index*.

    Progress and failures are reported on standard output; the function does
    not raise for ordinary errors and always prints a closing message.

    Args:
        index: Novel id (an int, or a string convertible to int).
        file_savePath: Root directory for the downloaded novel.  Defaults to
            the location the script has always used.

    Returns:
        None.
    """
    try:
        # 1. Build the address of the novel's main page
        index = int(index)
        base = "http://www.jjwxc.net/onebook.php?novelid="
        main_url = base + str(index)
        # 2. Preparation: chapter addresses and the novel title
        allurl = get_allurl(main_url)
        article_title = get_article_title(main_url)
        # 3. Download each chapter, numbering from 1
        for number, url in enumerate(allurl, start=1):
            chapter_Download(file_savePath, url, article_title, number)
        print("下载完成,存放路径为", file_savePath)
    except Exception as err:
        # ``Exception`` (not a bare ``except``) lets Ctrl-C and SystemExit
        # propagate; the reason is printed so failures can be diagnosed.
        print("下载失败", err)
    finally:
        print("谢谢使用")
if __name__ == "__main__":
    # Prompt only when run as a script, so that importing this module for its
    # functions does not block on input or start a download.
    x = input("请输入书的id:")
    novel_Download(x)