Recently our research supervisor wanted to analyze Weibo data, starting with repost relationships, which of course means we first need repost-relationship data. I looked through a lot of material (there is mature code on GitHub), but still decided to write my own from scratch around my own requirements.
The crawl is based on m.weibo.cn, which needs neither cookies nor a logged-in account, but the responses have to be parsed as JSON. The whole process is fairly mechanical and tedious, but at least there is little worry about getting banned, haha.
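As a quick sanity check of the "no login, just JSON" claim, the snippet below hits the same getIndex endpoint that get_wbid.py uses later with a plain anonymous request (the user id is only a placeholder, and the exact fields in the response may change over time):

```python
import requests

# One anonymous request to the m.weibo.cn container API: no cookies, no session.
url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=1288739185'
data = requests.get(url).json()           # the response body is JSON, not an HTML page
print(list(data['data'].keys()))          # inspect which fields are available
print(data['data'].get('follow_scheme'))  # the field that get_user_containerid() parses below
```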
The details are in the code; the core of it is obtaining the various ids.
A brief outline of the approach:
- Manually feed a few big-V user ids into get_wbid.py to collect all weibo post ids under each user id.
- relationship.py then reads the crawled weibo ids, visits the page of each post, and scrapes the data we need.
- Optimization: use relationship.py to also scrape the user ids of people who liked/commented/reposted a post, and feed those back into get_wbid.py to grow its input.
- Limitation: this is only a minimal version, intended to be called as a tool interface. To keep it crawling continuously in the background you would still need timestamp checks, deduplication, and so on.
- Note: Weibo pages mainly come in two types, user profile pages (handled as in get_wbid.py) and post pages (handled as in relationship.py). You can reuse the same ideas to scrape other Weibo pages, since the principle is the same. A proxy pool is wired into the requests and can be removed; see the sketch after this list.
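The proxy pool mentioned in the last note is assumed to be a local service that hands out one proxy address per request at http://127.0.0.1:5555/random, which is exactly how both scripts below call it. A minimal helper in that spirit (drop it, and pass proxies=None, if you crawl without proxies):

```python
import requests

PROXY_POOL_URL = 'http://127.0.0.1:5555/random'  # local proxy-pool service (assumed to be running)

def get_proxies():
    """Fetch one random proxy from the local pool; fall back to a direct connection if the pool is down."""
    try:
        proxy = requests.get(PROXY_POOL_URL, timeout=5).text.strip()
        return {'http': 'http://' + proxy}
    except requests.RequestException:
        return None

# usage: resp = requests.get(url, proxies=get_proxies())
```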
get_wbid.py
# get_wbid.py: use a user id to collect the ids of all of that user's weibo posts
# input:  user id
# output: weibo post ids (written to user+screen_name+bw.csv)
import csv
import requests
import re
import time
import json
# The containerid is not the same as the usid: viewing a user's follow list needs the
# containerid, while the usid is used to fetch the user's profile page info.
def get_user_containerid(user_id):
    url = 'http://m.weibo.cn/api/container/getIndex?type=uid&value={user_id}'.format(user_id=user_id)
    resp = requests.get(url)
    jsondata = resp.json()
    jsondata = jsondata['data']
    fans_id = jsondata.get('follow_scheme')
    items = re.findall(r"&lfid=(\w+)", fans_id, re.M)
    for i in items:
        return i
def get_luicode_lfid(sheader):
    url = sheader
    proxypool_url = 'http://127.0.0.1:5555/random'
    proxies = {
        'http': 'http://' + requests.get(proxypool_url).text.strip()}
    response = requests.get(url, proxies=proxies)
    html = json.loads(response.content.decode('utf-8'))
    s = html.get('data').get('scheme')
    luicode = s[s.find('luicode=') + 8:s.find('&lfid=')]
    lfid = s[s.find('&lfid=') + 6:]
    for i in html.get('data').get('tabsInfo').get('tabs'):
        if i.get('tabKey') == 'weibo':
            containerid = i.get('containerid')
            return [luicode, lfid, containerid]
# Collect the weibo post ids (bw_id) of one user
def get_bw_id(user_id, sheader):  # user id and the profile-page URL prefix
    b = True
    n = 0
    sid = '1'
    url = sheader
    error = {}
    while b:
        try:
            n += 1
            print('Processing profile page --->', url)
            proxypool_url = 'http://127.0.0.1:5555/random'
            proxies = {
                'http': 'http://' + requests.get(proxypool_url).text.strip()}
            response = requests.get(url, proxies=proxies)
            html = json.loads(response.content.decode('utf-8'))
            if 'data' in html.keys():
                # paging stops once since_id no longer advances
                if 'since_id' in html.get('data').get('cardlistInfo'):
                    if html.get('data').get('cardlistInfo').get('since_id') == sid:
                        break
                    elif sid == '':
                        break
                    else:
                        sid = html.get('data').get('cardlistInfo').get('since_id')
                else:
                    break
                if 'cards' in html.get('data'):
                    for i in html.get('data').get('cards'):
                        if i.get('mblog', -1) != -1:
                            screen_name = i['mblog'].get('user').get('screen_name')
                            content = [user_id, screen_name, i['mblog'].get('id')]
                            write_file(content)
            else:
                break
            time.sleep(1)
        except Exception as e:
            print('Error while requesting profile page --->', url)
            if str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == -1:
                # first non-JSON response for this url (e.g. rate limited): retry once after a pause
                error[url] = 1
                n -= 1
                print('Retrying profile page --->', url)
                time.sleep(5)
            elif str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == 1:
                time.sleep(5)
            else:
                b = False
                print('Error message:\n', e)
        url = sheader + '&since_id=' + str(sid)
    print('Profile pages processed:', n)
def write_file(content):
    # append one row; newline='' avoids blank lines in the csv on Windows
    with open('user+screen_name+bw.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(content)
if __name__ == '__main__':
    result_headers = [
        'user_id',
        'screen_name',
        'id',
    ]
    with open('user+screen_name+bw.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(result_headers)
    # Read '各界大v用户id.txt' here and loop over the user ids in it (see the sketch below)
    user_id = 1288739185
    containerid = get_user_containerid(str(user_id))
    sheader = 'https://m.weibo.cn/api/container/getIndex?uid=' \
              '' + str(user_id) + '&type=uid&value=' + str(user_id) + \
              '&containerid=' + str(containerid)
    l = get_luicode_lfid(sheader)
    sheader = 'https://m.weibo.cn/api/container/getIndex?uid=' \
              '' + str(user_id) + '&luicode=' + str(l[0]) + '&lfid=' + str(l[1]) + \
              '&type=uid&value=' + str(user_id) + '&containerid=' + str(l[2])
    get_bw_id(user_id, sheader)
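The __main__ block above crawls a single hard-coded user_id; as its comment suggests, the intended use is to loop over a file of big-V user ids. A minimal sketch of that loop, assuming '各界大v用户id.txt' contains one numeric user id per line (the file name comes from the comment in the script; the one-id-per-line format is my assumption):

```python
# Hypothetical replacement for the single hard-coded user_id in __main__ of get_wbid.py.
with open('各界大v用户id.txt', 'r', encoding='utf-8') as f:
    user_ids = [line.strip() for line in f if line.strip()]

for user_id in user_ids:
    containerid = get_user_containerid(str(user_id))
    sheader = ('https://m.weibo.cn/api/container/getIndex?uid=' + str(user_id) +
               '&type=uid&value=' + str(user_id) + '&containerid=' + str(containerid))
    l = get_luicode_lfid(sheader)
    sheader = ('https://m.weibo.cn/api/container/getIndex?uid=' + str(user_id) +
               '&luicode=' + str(l[0]) + '&lfid=' + str(l[1]) +
               '&type=uid&value=' + str(user_id) + '&containerid=' + str(l[2]))
    get_bw_id(user_id, sheader)
```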
relationship.py
# relationship.py: build repost relationships from weibo post ids
# input: user id, screen name and weibo post id (from user+screen_name+bw.csv)
import csv
import json
import requests
import time
# Crawl the repost list of a single weibo post
def get_fs_info(u_id, u_screen_name, bw_id):
    b = True
    n = 0
    error = {}
    while b:
        try:
            n += 1
            url = 'https://m.weibo.cn/api/statuses/repostTimeline?id=' + str(bw_id) + '&page=' + str(n)
            print('Processing -->', url)
            proxypool_url = 'http://127.0.0.1:5555/random'
            proxies = {
                'http': 'http://' + requests.get(proxypool_url).text.strip()}
            response = requests.get(url, proxies=proxies)
            html = json.loads(response.content.decode('utf-8'))
            if 'data' in html.keys():
                if 'data' in html.get('data').keys():
                    for i in html.get('data').get('data'):
                        fs_id = i.get('user').get('id')
                        fsbw_id = i.get('id')
                        screen_name = i.get('user').get('screen_name')
                        write_csv([u_id, u_screen_name, bw_id, fs_id, screen_name, fsbw_id])
            else:
                # no more repost pages (or the post id is invalid): stop
                b = False
        except Exception as e:
            if str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == -1:
                # first non-JSON response for this url (e.g. rate limited): retry once after a pause
                error[url] = 1
                n -= 1
                time.sleep(5)
                print('Retrying -->', url)
            elif str(e) == 'Expecting value: line 1 column 1 (char 0)' and error.get(url, -1) == 1:
                time.sleep(5)
            else:
                b = False
                print('Error:\n', e)
        time.sleep(1)
def write_csv(result_data):  # result_data: one output row (six fields)
    """Append one crawled record to the csv file."""
    try:
        # plain utf-8 here: the header written in __main__ already carries the BOM,
        # and 'utf-8-sig' in append mode would insert a new BOM on every write
        with open('relationship.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(result_data)
    except Exception as e:
        print('Error: ', e)
if __name__ == '__main__':
    result_headers = [
        'user_id',
        'screen_name',
        'id',
        'fs_user_id',
        'fs_screen_name',
        'fs_id',
    ]
    with open('relationship.csv', 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(result_headers)
    # Read 'user+screen_name+bw.csv' here and loop over every post id in it
    with open('user+screen_name+bw.csv', 'r', encoding='utf-8') as f:
        next(f)  # skip the header row written by get_wbid.py
        for line in f:
            l = line.replace('\n', '').replace('\r', '').split(',')
            if l != ['']:
                u_id = l[0]
                screen_name = l[1]
                bw_id = l[2]
                get_fs_info(u_id, screen_name, bw_id)
    # or crawl a single post directly:
    # u_id = 1288739185
    # screen_name = '关晓彤'
    # bw_id = 4503001732052045
    # get_fs_info(u_id, screen_name, bw_id)
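The deduplication mentioned in the outline is not implemented yet: if the same post id appears more than once in user+screen_name+bw.csv (for example after re-running get_wbid.py), relationship.csv will contain duplicate rows. A minimal post-processing sketch using only the standard library (the output file name relationship_dedup.csv is my choice):

```python
import csv

# Drop exact duplicate rows from relationship.csv; a stand-in for the missing dedup step.
seen = set()
with open('relationship.csv', 'r', encoding='utf-8-sig', newline='') as src, \
        open('relationship_dedup.csv', 'w', encoding='utf-8-sig', newline='') as dst:
    writer = csv.writer(dst)
    for row in csv.reader(src):
        key = tuple(row)
        if row and key not in seen:
            seen.add(key)
            writer.writerow(row)
```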