# 用 requests 爬取糗事百科:由于糗事百科是静态页面,用简单的 requests 即可,代码如下。
import os

import lxml.html
import requests
class Qiu:
    """Crawl paginated listing pages and extract the text of each post.

    For every page the crawler fetches ``url_base/<page>/``, saves the raw
    HTML under ``qiushi/`` and writes the extracted post text under ``text/``.
    """

    def __init__(self, name_, url_base, pages=2):
        """Set up crawl parameters.

        :param name_: human-readable site name, used in output file names
        :param url_base: base URL without the trailing page-number segment
        :param pages: number of listing pages to fetch (default 2, matching
            the original behavior of downloading two pages)
        """
        self.name_ = name_
        # "{}" is filled with the 1-based page number in make_url_lists().
        self.url_base = url_base + "/{}/"
        self.pages = pages
        # Present a desktop-browser UA; many sites reject the default one.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
        }

    def make_url_lists(self):
        """Return the list of page URLs to download.

        Bug fix: the original used ``range(2)``, producing page numbers
        0 and 1; listing pages are 1-based, so generate 1..pages instead.

        :return: list of fully formatted page URLs
        """
        return [self.url_base.format(i) for i in range(1, self.pages + 1)]

    def download_url(self, down_url):
        """Fetch one page and return its raw body as bytes.

        :param down_url: absolute URL of the page to fetch
        :return: response body as ``bytes``
        :raises requests.HTTPError: if the server answers with an error
            status (fail loudly instead of silently saving an error page)
        """
        result = requests.get(down_url, headers=self.headers)
        result.raise_for_status()
        return result.content

    def save_result(self, result, page_num):
        """Write one page's raw HTML bytes to ``qiushi/<name>第<page>页.html``.

        :param result: raw HTML page content as bytes
        :param page_num: 1-based page number, used in the file name
        """
        # The original crashed with FileNotFoundError if the dir was missing.
        os.makedirs("qiushi", exist_ok=True)
        file_path = "qiushi/{}第{}页.html".format(self.name_, page_num)
        with open(file_path, "wb") as f:
            f.write(result)

    def run(self):
        """Crawl every page: save the raw HTML, then extract the post texts."""
        # enumerate() replaces the original O(n^2) list.index() lookup.
        for pg_num, url in enumerate(self.make_url_lists(), start=1):
            content = self.download_url(url)
            self.save_result(content, pg_num)
            # Parse the page and collect the <span> of each post body.
            tree = lxml.html.fromstring(content.decode('utf-8'))
            html_data = tree.xpath("//div[@class='content']/span")
            os.makedirs("text", exist_ok=True)
            # Use self.name_ for consistency with save_result() instead of
            # the hard-coded site name the original had here.
            txt = 'text/{}第{}页.txt'.format(self.name_, pg_num)
            with open(txt, 'wb') as f:
                for node in html_data:
                    # node.text is None for elements with no direct text;
                    # the original raised AttributeError on such nodes.
                    if node.text:
                        f.write(node.text.encode('utf-8'))
if __name__ == "__main__":
    # Crawl the first listing pages and store both the raw HTML and the
    # extracted post text on disk.
    crawler = Qiu("糗事百科", "https://www.qiushibaike.com/8hr/page")
    crawler.run()