import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent so the site serves normal pages to the scraper.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
# Output file for the scraped chapters. Fix: pin encoding to UTF-8 — without
# it, open() uses the Windows locale codec (e.g. gbk), which raises
# UnicodeEncodeError for many characters decoded from the site's UTF-8 pages
# (the likely reason f.write was wrapped in a bare try/except downstream).
f = open('E:/HFTX.txt','a+', encoding='utf-8')
# Chapter-page URLs, filled by get_urls() and consumed by the __main__ loop.
url_list=[]
def get_urls(start_url):
    """Collect chapter-page URLs from the book's table-of-contents page.

    Appends each absolute chapter URL to the module-level ``url_list``.

    Fixes over the original:
    - ``timeout=10`` so a stalled server cannot hang the scraper forever;
    - skip non-200 responses instead of parsing an error page;
    - ``link.get('href')`` may return None for anchors without an href,
      which crashed the old string concatenation — now skipped;
    - ``urljoin`` builds the URL correctly for both relative and
      absolute-path hrefs (the old ``'base' + href`` produced ``//`` for
      hrefs starting with ``/``).
    """
    wb_data = requests.get(start_url, headers=headers, timeout=10)
    if wb_data.status_code != 200:
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('#readlist > ul > li > a'):
        href = link.get('href')
        if href:
            url_list.append(urljoin('http://www.quanshuwu.com/', href))
def get_info(url):
    """Fetch one chapter page and append its <p>...</p> paragraphs to ``f``.

    Silently returns on non-200 responses (same as the original's
    ``else: pass``). Fixes over the original:
    - ``timeout=10`` so a stalled request cannot hang the run;
    - raw string for the regex pattern;
    - the bare ``except:`` (which also swallowed KeyboardInterrupt) is
      narrowed to the write/encode failures it was actually papering over.
    """
    res = requests.get(url, headers=headers, timeout=10)
    if res.status_code != 200:
        return
    html = res.content.decode('utf-8', errors='ignore')
    for content in re.findall(r'<p>(.*?)</p>', html, re.S):
        try:
            f.write(content + '\n')
        except (OSError, UnicodeError):
            # Best-effort: keep scraping even if one paragraph fails to write.
            print('error')
if __name__ == '__main__':
    # Seed the crawl from the book's table-of-contents page.
    get_urls('http://www.quanshuwu.com/book/2039.aspx')
    # Visit every collected chapter, throttled to one request per second.
    for chapter_url in url_list:
        get_info(chapter_url)
        time.sleep(1)
    f.close()
# 小说爬取 (novel scraping)
# --- page residue from the blog this script was copied from ---
# 猜你喜欢 ("you may also like")
# Source: blog.csdn.net/qq_42052864/article/details/80737990
# 今日推荐 / 周排行 ("today's picks" / "weekly ranking")