#coding=utf-8
import requests
class TiebaSpider:
    """Download the listing pages of one Baidu Tieba forum to local HTML files.

    Attributes:
        tiebaname: Forum name exactly as passed to the constructor.
        init_url: URL template with a single ``{}`` placeholder for the
            ``pn`` query parameter.
        header: HTTP headers sent with every request.
    """

    # Step of the ``pn`` query parameter between consecutive pages
    # (presumably the number of threads shown per listing page).
    PAGE_STEP = 50
    # Seconds to wait on the server before giving up, so that a stalled
    # connection cannot block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, tiebaname):
        """Store the forum name and prepare the URL template and headers.

        Args:
            tiebaname: Name of the forum to crawl (the ``kw`` query value).
        """
        self.tiebaname = tiebaname
        # Double any braces in the name so the later ``str.format`` call only
        # substitutes the ``pn`` placeholder.
        safe_name = tiebaname.replace("{", "{{").replace("}", "}}")
        # Each query parameter must be separated by "&".
        self.init_url = (
            "https://tieba.baidu.com/f?kw=" + safe_name + "&ie=utf-8&pn={}"
        )
        # Headers imitating a desktop browser.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
        }

    def create_urllist(self, pages=100):
        """Return the URLs of the first ``pages`` listing pages.

        Args:
            pages: Number of pages to generate URLs for (default 100).

        Returns:
            A list of URL strings whose ``pn`` value starts at 0 and grows by
            ``PAGE_STEP`` per page.
        """
        return [self.init_url.format(i * self.PAGE_STEP) for i in range(pages)]

    def get_content(self, url):
        """Fetch ``url`` with the spider's headers and return the body bytes.

        Args:
            url: Address to request.

        Returns:
            The raw response body (``bytes``).

        Raises:
            requests.exceptions.RequestException: On connection failure or
                when the server does not answer within ``REQUEST_TIMEOUT``.
        """
        response = requests.get(
            url, headers=self.header, timeout=self.REQUEST_TIMEOUT
        )
        return response.content

    def save_content(self, content, yema):
        """Write one downloaded page to an HTML file in the working directory.

        Args:
            content: Page body as bytes.
            yema: Zero-based page index; the file name uses ``yema + 1``.
        """
        file = '{}吧第{}页.html'.format(self.tiebaname, yema + 1)
        # Binary mode: the bytes are stored exactly as received.
        with open(file, 'wb') as f:
            f.write(content)

    def run(self, pages=100):
        """Download and save the first ``pages`` listing pages in order.

        Args:
            pages: Number of pages to crawl (default 100).
        """
        # enumerate yields the page index directly instead of searching the
        # list for every URL.
        for yema, url in enumerate(self.create_urllist(pages)):
            # Send the request and get the data.
            content = self.get_content(url)
            # Save it locally.
            self.save_content(content, yema)
if __name__ == '__main__':
    # Script entry point: crawl the forum named below with default settings.
    TiebaSpider('李毅').run()
爬虫(四):简单爬取贴吧
猜你喜欢
转载自blog.csdn.net/chengmo123/article/details/84712130
今日推荐
周排行