小白逆袭大神之综合大作业
这里是三岁,转眼间已经到了大作业了,成功与否最后一搏,最后给小白一点建议吧!加油!等到全部结束后回过头再看一遍效果特别好,会发现忽然间恍然大悟!
综合大作业
一:先爬取评论
二:数据处理,分词,清洗
三:词频统计
四:词云生成
其他具体的在文章里面已经很齐全了
不需要多说什么了
小白专属嘛,怕大家没有爬取过评论,没有做过词云,在这里把前段时间做的B站up主评论爬取和中文词云制作模板给大家做以参考
'''
B站爬取
https://api.bilibili.com/x/v1/dm/list.so?oid=837806779 弹幕api
https://api.bilibili.com/x/v2/reply?type=1&oid=837806779&&pn=1 评论api
弹幕只能够用oid,目前抓包未找到oid集中出现的地址
bug:部分网站没有那么严格按照['data']['replies']['content']['message']的顺序来
'''
#导入库
import requests
from bs4 import BeautifulSoup
import re
import json
def Gethtml(url, timeout=10):
    """Fetch a web page and return its decoded text.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server before giving up. Added
            (with a backward-compatible default) because the original
            call had no timeout and could hang the crawler forever.

    Returns:
        The response body as a str.
    """
    # Desktop Chrome User-Agent so Bilibili serves the normal page
    # instead of blocking the default python-requests agent.
    kv = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"}
    r = requests.get(url, headers=kv, timeout=timeout)
    # The HTTP header's charset is not always right; trust the encoding
    # sniffed from the body instead so Chinese text decodes correctly.
    r.encoding = r.apparent_encoding
    return r.text
#获取up的uid
def u_id(uid_html):
    """Extract the uploader's uid from a Bilibili search-result page.

    Args:
        uid_html: HTML text of https://search.bilibili.com/all?keyword=...

    Returns:
        The numeric uid as a string (taken from the first result's
        profile link, e.g. //space.bilibili.com/390461123?from=search).

    Raises:
        AttributeError: if no space.bilibili.com link is found.
    """
    html = BeautifulSoup(uid_html, 'lxml')
    # The first search hit's title anchor links to the uploader's space.
    anchor = html.find(name='a', attrs={"class": "title"})
    # Capture just the digits of the uid. The old pattern
    # r'<a.*?href="//space.bilibili.com/(.+)f.*' required a literal 'f'
    # after the id and then sliced off the trailing '?', which broke
    # whenever the href query string changed.
    match = re.search(r'space\.bilibili\.com/(\d+)', str(anchor))
    uid = match.group(1)
    print('已获取up主的id为', uid)  # fixed typo: 以获取 -> 已获取
    return uid
def a_id(aid_html):
    """Collect the aid (numeric video id) of every video in one page of
    the uploader's video-list API response.

    Args:
        aid_html: JSON text returned by the space/arc/search API.

    Returns:
        A list of aid values in page order (possibly empty).
    """
    payload = json.loads(aid_html)
    # Each entry of data.list.vlist describes one uploaded video.
    aid_list = [entry['aid'] for entry in payload['data']['list']['vlist']]
    print('已获得视频id长度为:', len(aid_list))
    return aid_list
def comment_save(name, comment_html):
    """Append every comment message from one reply-API response to
    ``<name>.txt``.

    Args:
        name: Uploader name, used as the output file stem.
        comment_html: A requests.Response whose body is the JSON of
            https://api.bilibili.com/x/v2/reply?... .
    """
    videos = json.loads(comment_html.text)
    # Responses for videos with no comments carry "replies": null (the
    # header note in this file calls this out as a known crash); use
    # .get() chains instead of indexing so that case is handled.
    replies = (videos.get('data') or {}).get('replies') or []
    messages = [reply['content']['message'] for reply in replies]
    if messages:
        print(messages)
        # Open the output file once per response, not once per comment
        # as the original loop did.
        with open(f'{name}.txt', 'a+', encoding='utf-8') as f:
            f.writelines(messages)
    print('提取完毕!')
def main(name):
    """Crawl and save the comments of every video of one uploader.

    Args:
        name: The uploader's display name; used both as the search
            keyword and as the output file stem.
    """
    # Step 1: resolve the uploader's uid from the search page.
    url_uid = f"https://search.bilibili.com/all?keyword={name}"
    uid_html = Gethtml(url_uid)
    uid = u_id(uid_html)
    # Step 2: walk up to 9 pages of the uploader's video list (later
    # pages may simply come back empty).
    for page in range(1, 10):
        aid_url = f"https://api.bilibili.com/x/space/arc/search?mid={uid}&ps=30&tid=0&pn={page}&keyword=&order=pubdate&jsonp=jsonp"
        aid_html = Gethtml(aid_url)
        aid_list = a_id(aid_html)
        # Step 3: fetch up to 14 comment pages per video. Distinct loop
        # names (page / reply_page) replace the original's reuse of `i`
        # for both the outer and inner loops.
        for aid in aid_list:
            for reply_page in range(1, 15):
                # Single '&' before pn= — the original URL had a doubled
                # '&&' typo that the API merely tolerated.
                comment_url = f"https://api.bilibili.com/x/v2/reply?type=1&oid={aid}&pn={reply_page}"
                comment_html = requests.get(comment_url)
                comment_save(name, comment_html)
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main('敬汉卿')  # swap in any uploader name below to crawl them instead
    # 贤宝宝baby
    # 老师好我叫何同学
    # 大祥哥来了
    # 女胖胖
    # 记录生活的蛋黄派
词云生成器(自己修改)
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# --- word-cloud generation script (adapt the paths to your own files) ---
d = path.dirname(__file__)

# Read the crawled comment text. The original opened the file with
# encoding=' utf-8' (stray leading space — only worked because Python
# normalizes codec names) and never closed it; 'with' fixes the leak.
with open("D:\python3.7.4\爬虫\女胖胖.txt", 'r', encoding='utf-8') as f:
    t = f.read()

# Segment the Chinese text with jieba, then re-join with spaces so that
# WordCloud (which tokenizes on whitespace) sees individual words.
ls = jieba.lcut(t)
text = ' '.join(ls)

# Mask image: should be a white-background PNG for a clean silhouette.
alice_coloring = np.array(Image.open(path.join(d, "手绘美女.png")))

# STOPWORDS ships with English stop words only; add Chinese stop words
# here yourself (lists are easy to find online). The bare-space entry
# keeps whitespace tokens out of the cloud.
stopwords = set(STOPWORDS)
stopwords.add(" ")

# mask= gives the cloud the shape of the image; a CJK-capable font
# (msyh.ttc, Microsoft YaHei) is required or Chinese glyphs render as
# empty boxes. random_state pins the layout for reproducible output.
wc = WordCloud(background_color="white", font_path="msyh.ttc", max_words=2000,
               mask=alice_coloring, stopwords=stopwords, max_font_size=40,
               random_state=42)
wc.generate(text)

# Color sampler that picks word colors from the mask image.
image_colors = ImageColorGenerator(alice_coloring)

# Figure 1: the cloud in default colors, shaped by the mask.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.figure()

# Figure 2: the cloud recolored to follow the mask image's colors
# (color_func=image_colors could also be passed to the constructor).
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.figure()

# Figure 3: the mask image itself in grayscale, for comparison.
plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()
李子柒最后的词云图,因为数据处理太麻烦,没有好好处理,大家多多包涵!!!
大家加油!时间不多,还得多多努力!!!奥利给!