I was pretty annoyed today: I was searching for some answers and the search led me into Baidu Wenku, but once inside I still couldn't see the answer. Out of frustration I wrote this crawler.
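The trick is that a Wenku document page embeds the URLs of its per-page content in an inline script block, in a variable called WkInfo.htmlUrls. So instead of fighting the viewer, the script below pulls those pageLoadUrl entries out with regular expressions, fetches each one, and collects the text from the "c" fields of the returned data, appending everything to a local txt file.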
import ast  # used below as a safer stand-in for eval
import re

import requests
from lxml import etree

def get_text(url):
    page_headers = {
        "User-Agent": "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
        "Referer": "https://wk.baidu.com/?pcf=2",
        "Accept-Encoding": "gzip, deflate, br",
    }
    html_code = requests.get(url=url, headers=page_headers)
    html_code.encoding = "gb2312"
    html_etree = etree.HTML(html_code.text)
    # The per-page content URLs are embedded in an inline <script> block.
    info = html_etree.xpath('//script[@type="text/javascript"]/text()')
    # Cut the "WkInfo.htmlUrls = ...;" assignment out of the script text.
    find_need_infos_reg = re.compile('WkInfo.htmlUrls(.*)WkInfo.verify_user_info')
    find_need_infos = find_need_infos_reg.search(str(info))
    if not find_need_infos:
        print("Could not find WkInfo.htmlUrls on the page")
        return
    # str.strip() removes a *set* of characters from both ends; the two
    # chained strips peel the JS wrapper off the data, and the replaces
    # undo the escaping so the pageLoadUrl regex below can match.
    need_infos = (find_need_infos.group()
                  .strip("WkInfo.htmlUrls = ")
                  .strip(r";\n WkInfo.verify_user_i")
                  .replace(r"\\x22", "")
                  .replace(r"\\\\\\/", "\\")
                  .replace("'", '"'))
    url_find_reg = re.compile(r'pageLoadUrl:(.+?)}')
    url_lists = url_find_reg.findall(need_infos)
    # The "c" fields in each fetched chunk hold the actual text fragments.
    text_find_reg = re.compile(r',{"c":(.*?),')
    for info_url in url_lists:
        info_url = info_url.replace("\\", "/")
        text_html = requests.get(url=info_url, headers=page_headers)
        text_lists = text_find_reg.findall(text_html.text)
        for text in text_lists:
            try:
                # Each fragment is a quoted string literal; literal_eval
                # decodes it safely (a bare eval() works too, but is risky).
                text_str = ast.literal_eval(text)
                if text_str != "\n":
                    text_str = text_str.replace('\n', '')
                    DownWrite(text_str)
            except Exception:
                print("Failed to parse a text fragment, skipping it")

def DownWrite(text):
    # Append one line of extracted text to the output file.
    global down_times
    down_times += 1
    with open('百度文库.txt', 'a+', encoding='utf8') as f:
        f.write(text + "\n")
    print("Record #%d written to txt successfully!" % down_times)

if __name__ == '__main__':
    url = 'https://wenku.baidu.com/view/0cfd025e0875f46527d3240c844769eae109a34a.html'
    down_times = 0
    get_text(url)
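To try it, save the script as, say, wenku_spider.py (the filename is up to you) and run python wenku_spider.py; the extracted text gets appended to 百度文库.txt in the working directory. One caveat: everything here depends on how Baidu Wenku rendered its pages at the time of writing, so if the inline WkInfo variables or the "c" fields change, the regexes will need updating.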