用基础语法实现:《哈姆雷特》词频统计
'''
Text word frequency statistics
use wordcloud
'''
import wordcloud
import time
def file_change(path="../use_data/hamlet.txt", encoding=None):
    """Read a text file and normalise it for word counting.

    The text is lower-cased and every character in the punctuation set
    below is replaced by a single space, so that a later ``str.split()``
    yields bare words.  Apostrophes and ``@`` are not in the set and are
    therefore left in place.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding passed to ``open``.  ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        The normalised text as one string.
    """
    # The backslash is escaped explicitly; the set of characters is unchanged.
    punctuation = '!"#$%^&*()_+-=~`:;{}[]\\|<>,.?/'
    with open(path, encoding=encoding) as source:
        text = source.read().lower()  # fold everything to lower case
    # One translate() pass instead of one replace() call per character.
    return text.translate(str.maketrans(punctuation, " " * len(punctuation)))
def main():
    """Print the ten most frequent words, counted with ``list.count``.

    This is the naive baseline: ``list.count`` rescans the whole word list
    for every token, so the work grows quadratically with the number of
    words.  ``main1`` does the same job in a single pass.

    Each output line has the form ``word:count``.  If the text contains
    fewer than ten distinct words, only those are printed.
    """
    hamlet_txt = file_change()
    # split() with no argument splits on any run of whitespace
    # (spaces, \n, \t, \r); split(" ") would split on single spaces only.
    hamlet_list = hamlet_txt.split()
    dic = {}
    for word in hamlet_list:
        # Full rescan of the list per token: this is the expensive step.
        dic[word] = hamlet_list.count(word)
    ham_list = sorted(dic.items(), key=lambda s: s[1], reverse=True)
    # Slicing instead of range(10) avoids an IndexError on short texts.
    for word, count in ham_list[:10]:
        print("{}:{}".format(word, count))
def main1():
    """Print the ten most frequent words, tallied with ``dict.get``.

    The word list is walked once and each word's counter is incremented
    in a dictionary.  Each output line shows the word left-aligned in a
    10-character column followed by its count right-aligned in 5
    characters.  If the text contains fewer than ten distinct words, only
    those are printed.
    """
    ham_ls = file_change()
    counts = {}
    for word in ham_ls.split():
        counts[word] = counts.get(word, 0) + 1
    items = sorted(counts.items(), key=lambda s: s[1], reverse=True)
    # Slicing instead of range(10) avoids an IndexError on short texts.
    for word, count in items[:10]:
        print("{0:<10}{1:>5}".format(word, count))
if __name__ == '__main__':
    # Time only the script run; nothing is measured or printed on import.
    start = time.perf_counter()
    main()
    print("spend {} second".format(time.perf_counter() - start))
在上面的程序中,main() 使用 list.count 统计词频,耗时约 13 秒;
而 main1() 使用字典的 get 方法,只需约 0.11 秒。
使用 wordcloud 库显示词频
'''
Text word frequency statistics
use wordcloud
'''
import wordcloud
import time
def file_change(path="../use_data/hamlet.txt", encoding=None):
    """Read a text file and normalise it for word counting.

    The text is lower-cased and every character in the punctuation set
    below is replaced by a single space.  Apostrophes and ``@`` are not in
    the set and are therefore left in place.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding passed to ``open``.  ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        The normalised text as one string.
    """
    # The backslash is escaped explicitly; the set of characters is unchanged.
    punctuation = '!"#$%^&*()_+-=~`:;{}[]\\|<>,.?/'
    with open(path, encoding=encoding) as source:
        text = source.read().lower()  # fold everything to lower case
    # One translate() pass instead of one replace() call per character.
    return text.translate(str.maketrans(punctuation, " " * len(punctuation)))
# Render the text as a word cloud with the wordcloud library.
start = time.perf_counter()
text = file_change()

# Create the WordCloud object: a 500x500 canvas, at most 20 words,
# and a minimum font size of 10.
words = wordcloud.WordCloud(
    width=500,
    height=500,
    max_words=20,
    min_font_size=10,
)

# Build the cloud from the normalised text.
words.generate(text)

# Write the rendered image to disk.
words.to_file("../use_data/hamlet_words.jpg")

print("spend {} second".format(time.perf_counter() - start))
扫描二维码关注公众号,回复:
1811220 查看本文章