# Crawl Baidu Tieba forum pages and save them to local files.
"""
@author: Administrator
@file: tieba_test.py
@time: 2020/09/15
@desc:
"""
import requests
import os
class tiebaSpider():
    """Download listing pages of a Baidu Tieba forum and save them locally.

    Each page is fetched over HTTP and its HTML is written to a separate
    file inside the ``贴吧目录`` directory under the current working
    directory.
    """

    # Step of the ``pn`` query parameter between consecutive pages.
    PAGE_SIZE = 50
    # Number of pages fetched when the caller does not request another count.
    DEFAULT_PAGE_COUNT = 10
    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # Directory (relative to the working directory) that receives the pages.
    OUTPUT_DIR = '贴吧目录'

    def __init__(self, tieba_name: str, page_count: int = DEFAULT_PAGE_COUNT):
        """Store the forum name and prepare the request settings.

        Args:
            tieba_name: Forum name, inserted into the ``kw`` query parameter.
            page_count: How many listing pages to fetch; defaults to 10.
        """
        self.tieba_name = tieba_name
        self.page_count = page_count
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
        }
        # The trailing ``{}`` is filled with the ``pn`` offset of each page.
        self.base_url = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"

    def get_url_list(self) -> list:
        """Return the URLs of the pages to fetch, in page order.

        Page ``i`` (0-based) uses the offset ``i * PAGE_SIZE``.
        """
        return [
            self.base_url.format(page_index * self.PAGE_SIZE)
            for page_index in range(self.page_count)
        ]

    def parse_url(self, url: str) -> str:
        """Fetch ``url`` and return the response body decoded as UTF-8.

        The URL is printed before the request is sent.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                exceeds ``REQUEST_TIMEOUT`` seconds.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        print(url)
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode()

    def save_url(self, url_html: str, page_num: int) -> None:
        """Write ``url_html`` to the file for page ``page_num``.

        The output directory is created on first use; an existing file for
        the same page is overwritten.
        """
        # exist_ok avoids the race of listing the directory before creating it.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        file_name = '{}贴吧-第{}页'.format(self.tieba_name, page_num)
        path = os.path.join(self.OUTPUT_DIR, file_name)
        with open(path, 'w', encoding='utf8') as f:
            f.write(url_html)

    def run(self) -> None:
        """Fetch every page in order and save each one under its page number."""
        # enumerate yields the 1-based page number directly, so it stays
        # correct even if the same URL occurs more than once in the list.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            url_html = self.parse_url(url)
            self.save_url(url_html, page_num)
if __name__ == '__main__':
    # Script entry point: crawl the 'lol' forum and save its pages.
    lol_spider = tiebaSpider('lol')
    lol_spider.run()