版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/yefengzhichen/article/details/53424071
最近用 Python 处理了不少数据,也自学爬取了一些数据,主要用到 requests 和 BeautifulSoup。下面的例子爬取糗事百科的纯文字内容,每条记录的字段以制表符分隔,依次为:user_name、user_picture、qiushi、[good_cmt],其中 good_cmt(最佳评论)可能不存在。
代码如下:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: yefeng
"""
import requests
from bs4 import BeautifulSoup
import re
def _extract_author(cont):
    """Return ``(user_name, user_picture)`` for one post element.

    Looks up the element with class ``author`` and the ``<a>`` tags inside
    it: the first is expected to wrap the avatar ``<img>``, the second the
    ``<h2>`` holding the nickname. Any piece that is missing is reported
    as the string ``"null"`` instead of raising.
    """
    user_name = "null"
    user_picture = "null"
    author = cont.find(class_='author')
    if author is None:
        return user_name, user_picture
    user_info = author.find_all("a")
    if len(user_info) >= 2:
        avatar = user_info[0].find("img")
        if avatar is not None:
            user_picture = avatar.get("src", "null")  # avatar link
        nickname = user_info[1].find("h2")
        if nickname is not None:
            user_name = nickname.text  # nickname
    return user_name, user_picture


def _extract_best_comment(cont):
    """Return the best-comment text of a post, or ``None`` if it has none.

    The comment is read from the ``main-text`` element inside the element
    with class ``indexGodCmt``; its text fragments are stripped and joined
    with ``"|||"``.
    """
    good_cmt = cont.find(class_="indexGodCmt")
    if good_cmt is None:
        return None
    main_text = good_cmt.find(class_="main-text")
    if main_text is None:
        return None
    return main_text.get_text("|||", strip=True)


if __name__ == "__main__":
    # URL prefix of the site's text-only listing pages.
    root = "http://www.qiushibaike.com/text/page/"
    # The original note mentions pages 1-20; this run fetches pages 1-3.
    # Adjust the range to crawl more or fewer pages.
    url_list = [root + str(i) for i in range(1, 4)]
    # Every post container on a page has an id of the form qiushi_tag_<digits>.
    post_id = re.compile(r'qiushi_tag_\d+')
    cnt = 0  # running count of records written across all pages
    # ``with`` guarantees the file is closed even if a request or parse step
    # raises; the explicit encoding keeps non-ASCII text writable regardless
    # of the platform's default encoding.
    with open("qiushibaike_data.txt", "w", encoding="utf-8") as fout:
        for url in url_list:
            # A timeout keeps a stalled server from hanging the crawl forever.
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            for cont in soup.find_all(id=post_id):
                user_name, user_picture = _extract_author(cont)
                # The post text is taken from the first <span> in the post.
                span = cont.find("span")
                qiushi = span.text if span is not None else "null"
                # NOTE(review): fields are written verbatim; embedded tabs or
                # newlines in the text would break the tab-separated layout.
                good_cmt = _extract_best_comment(cont)
                if good_cmt is not None:
                    fout.write("%s\t%s\t%s\t%s\n" % (user_name, user_picture, qiushi, good_cmt))
                else:
                    fout.write("%s\t%s\t%s\n" % (user_name, user_picture, qiushi))
                cnt = cnt + 1
            # Report progress and push this page's records to disk.
            print(url, cnt)
            fout.flush()