import requests
from bs4 import BeautifulSoup
import time
import os
print("爬虫对应小说网站,新笔趣阁,地址:http://www.xbiquge.la")
novel_Code = input("请输入小说代码(格式:xx-xx ,例:0-69)(地址栏的后面的数字):")
url = "http://www.xbiquge.la/%s/%s/" % (novel_Code.split("-")[0], novel_Code.split("-")[1])
print("正在爬取,请稍等")
html_code = requests.get(url)
html_code.encoding = "utf-8"  # the site serves UTF-8 pages
soup_1 = BeautifulSoup(html_code.text, "html.parser")
novel_Name = soup_1.find(id="info").find("h1").get_text()  # novel title from the info box
chapter_Name = []
chapter_Link = []
# Collect every chapter title and its absolute URL from the index page.
for cN in soup_1.find(id="list").find_all("a"):
    chapter_Name.append(cN.get_text())
    chapter_Link.append("http://www.xbiquge.la" + cN.get("href"))
path = "./%s" % (novel_Name)
if os.path.exists(path):
pass
else:
os.makedirs(path)
# Promotional blurb the site appends to every chapter body; kept verbatim in
# Chinese so it matches the scraped text exactly and can be stripped below.
adver = "亲,点击进去,给个好评呗,分数越高更新越快,据说给新笔趣阁打满分的最后都找到了漂亮的老婆哦!手机站全新改版升级地址:http://m.xbiquge.la,数据和书签与电脑站同步,无广告清新阅读!"
for i, cL in enumerate(chapter_Link, start=1):
    print("%s - chapter: %s, URL: %s" % (i, chapter_Name[i - 1], cL))
    novel_Content_code = requests.get(cL)
    novel_Content_code.encoding = "utf-8"
    novel_Content_1 = BeautifulSoup(novel_Content_code.text, "html.parser")
    # Strip the ad blurb and the non-breaking spaces left over from &nbsp; indentation.
    novel_Content_2 = novel_Content_1.find(id="content").get_text().replace(adver, "").replace("\xa0", "")
    # Note: chapter names containing characters such as "?" would need sanitizing on Windows.
    with open(path + "/" + str(i) + "-" + chapter_Name[i - 1] + ".txt", "w", encoding="utf-8") as f:
        f.write(novel_Content_2)
    time.sleep(1)  # throttle: one request per second to avoid the site's anti-scraping
Sample run screenshots: (images not reproduced here)
Note: this novel site has anti-scraping measures, so the script sleeps for one second between chapter downloads.
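If the fixed one-second delay is not enough on its own, a common hardening step is to send a browser-like User-Agent header and to retry failed requests with a growing pause. The sketch below is only an illustration of that idea; the header value, retry count, and back-off times are my own assumed choices, not anything the site documents.

import requests
import time

# Example browser-like header; the exact User-Agent string is an arbitrary choice.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def fetch(url, retries=3, delay=1):
    # GET a page with the header above, retrying with a growing pause on failure.
    # retries/delay are assumed defaults, not values the site specifies.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            resp.encoding = "utf-8"
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay * (attempt + 1))  # back off: 1s, 2s, ...

# Usage: swap it in for the bare requests.get() calls above, e.g.
# novel_Content_code = fetch(cL)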