import jieba
from lxml import etree
import urllib
import urllib.request
from wordcloud import WordCloud
import pandas as pd
from imageio import imread
import matplotlib.pyplot as plt
def getpage(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    A browser User-Agent header is sent because s.weibo.com rejects the
    default urllib UA.  *timeout* (seconds, new keyword with a safe
    default) prevents the request from hanging forever; the response is
    closed via the context manager instead of being leaked.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read().decode('utf-8')
def getdata(data):
    """Parse the hot-search page HTML and return the list of entry titles.

    Titles live in the text of ``<a href=...>`` links inside the
    ``td-02`` table cells of the Weibo summary page.
    """
    tree = etree.HTML(data)
    return tree.xpath('//td[@class="td-02"]/a[@href]/text()')
def cut_words(top_search):
    """Tokenize every hot-search title with jieba (exact mode).

    Returns one flat list containing the words of all titles, in order.
    """
    # jieba.cut yields an iterator of tokens; flatten across all titles
    return [word for title in top_search for word in jieba.cut(title)]
if __name__=="__main__":
    url="https://s.weibo.com/top/summary?cate=realtimehot"
    # Fetch the hot-search page, extract the titles, and tokenize them.
    all_words = cut_words(getdata(getpage(url)))
    # Stop words: common Chinese function words (plus a bare space) that
    # would otherwise dominate the frequency counts.
    stop = ['的','你','了','将','为','例',' ','多','再','有','是','等','天','次']
    words_cut = [word for word in all_words if word not in stop]
    # Word -> occurrence count, sorted by frequency.
    word_count = pd.Series(words_cut).value_counts()
    # NOTE(review): hard-coded local paths below — assumes this image and
    # font exist on the machine running the script.
    back_ground = imread("E:\\python\\flower.jpg")
    wc = WordCloud(
        font_path="C:\\Windows\\Fonts\\simhei.ttf",  # CJK-capable font (SimHei)
        background_color="white",                    # canvas colour
        max_words=1000,                              # cap on rendered words
        mask=back_ground,                            # cloud takes the image's shape
        max_font_size=200,                           # largest glyph size
        random_state=50                              # fixed seed for the colour layout
    )
    # Build the cloud directly from the frequency table.
    wc1 = wc.fit_words(word_count)
    plt.figure()
    plt.imshow(wc1)
    plt.axis("off")
    plt.show()
    wc.to_file("ciyun.png")
# Adapted from: blog.csdn.net/qq_38883271/article/details/104497482
# "Scraping the Weibo hot-search list with Python to build a word cloud"