A text data analysis program I wrote for someone today.
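It reads a Chinese novel from a local file and runs several small analyses: character and punctuation counts, jieba word segmentation and average word length, a POS-tag frequency bar chart, TF-IDF keyword extraction, NLTK concordance / dispersion / similarity lookups, and a toy naive Bayes sentiment pass. It depends on jieba, nltk, numpy, and matplotlib (pip install jieba nltk numpy matplotlib), plus a Chinese font such as SimHei for the plot labels.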
import re

import jieba
import jieba.analyse
import jieba.posseg
import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.classify import NaiveBayesClassifier
from nltk.text import ContextIndex

# Path to the novel analysed throughout; every function reads the same file.
TEXT_PATH = "D:/数据/圣女的救济.txt"
def count_word():
    # Count Chinese characters vs. everything else. The original counted
    # bytes in the repr() of each raw line, which only worked in Python 2;
    # counting Unicode ranges directly is equivalent and portable.
    chc_sum = 0
    no_chc_sum = 0
    with open(TEXT_PATH, encoding="utf-8") as file:
        for line in file:
            # CJK Unified Ideographs range covers the Chinese characters.
            chc_sum += len(re.findall(r'[\u4e00-\u9fa5]', line))
            # Whatever remains after dropping Chinese characters and
            # whitespace is counted as punctuation.
            no_chc_sum += len(re.sub(r'[\u4e00-\u9fa5]|\s', '', line))
    print('The text contains %d Chinese characters' % chc_sum)
    print('The text contains %d punctuation marks' % no_chc_sum)
    print('The text contains %d characters in total' % (chc_sum + no_chc_sum))
    return chc_sum
def terms_count():
    # Count words: keep only the Chinese characters, then segment with jieba.
    word_count = 0
    with open(TEXT_PATH, encoding="utf-8") as textopen:
        for line in textopen:
            clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', line))
            word_count += len(jieba.lcut(clean_data))
    print("The text contains {} words".format(word_count))
    return word_count
def average_word_length():
    # Average word length = total Chinese characters / total words.
    avg = count_word() / terms_count()
    print("The average word length is {:.2f}".format(avg))

average_word_length()
def word_character_frequency():
    # Tally selected jieba POS tags: adjective, adverb, interjection,
    # noun, person name, place name, preposition, pronoun.
    number = {tag: 0 for tag in ('a', 'd', 'e', 'n', 'nr', 'ns', 'p', 'r')}
    with open(TEXT_PATH, encoding="utf-8") as textopen:
        for line in textopen:
            clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', line))
            for word in jieba.posseg.cut(clean_data):
                if word.flag in number:
                    number[word.flag] += 1
    plt.style.use('ggplot')
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese glyphs
    plt.rcParams['axes.unicode_minus'] = False
    x = np.array(['adjective', 'adverb', 'interjection', 'noun',
                  'person name', 'place name', 'preposition', 'pronoun'])
    y = np.array(list(number.values()))  # dict order matches the labels above
    plt.bar(x, y, width=0.5, align='center', label='count', color='b')
    plt.title("POS tag frequency", color='k')
    for a, b in zip(x, y):
        # Annotate each bar with its count.
        plt.text(a, b, b, ha='center', va='bottom', fontsize=11, color='k')
    plt.xlabel("POS tag")
    plt.ylabel("count")
    plt.legend()
    plt.show()
    print(number)
def word_frequency():
    # Top 50 keywords ranked by TF-IDF.
    with open(TEXT_PATH, encoding='utf-8') as f:
        data = f.read()
    print(jieba.analyse.extract_tags(data, topK=50))
def look_up():
    # Concordance lines for '绫音' and shared contexts of two characters.
    with open(TEXT_PATH, encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    text = nltk.Text(jieba.lcut(clean_data))
    text.concordance('绫音', width=20, lines=10)
    text.common_contexts(['绫音', '若山宏美'])
def word():
    # How often '下毒' occurs, and where key words appear across the text.
    with open(TEXT_PATH, encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    text = nltk.Text(jieba.lcut(clean_data))
    print(text.count('下毒'))  # the original computed but never printed this
    words = ['凶手', '砒霜', '刑警', '感觉']
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    text.dispersion_plot(words)
def similarity():
    # Words distributionally similar to '下毒', scored by shared contexts.
    with open(TEXT_PATH, encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    context_index = ContextIndex(jieba.lcut(clean_data))
    similarity_scores = context_index.word_similarity_dict(word='下毒')
    for key, value in similarity_scores.items():
        if value > 0.02:
            print(key, value)
def feel_analyse():
    # A toy naive Bayes sentiment estimate trained on tiny hand-picked
    # positive / negative / neutral vocabularies.
    def word_feats(words):
        # Iterating a string yields single characters, so the features are
        # character-level; this is consistent between training and use.
        return dict([(word, True) for word in words])

    positive_vocab = ['美丽', '可爱', '自信', '大方', '勇敢', '希望', '包容', '贡献', '诚实', '健康']
    negative_vocab = ['暗淡', '暗示', '傲慢', '懊恼', '罢工', '杀人', '白费', '霸占', '昂贵', '伤心']
    neutral_vocab = ['音乐', '电影', '是', '的', '行动', '做', '词语', '说', '你', '听']
    positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
    negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
    neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
    train_set = negative_features + positive_features + neutral_features
    classifier = NaiveBayesClassifier.train(train_set)

    neg = 0
    pos = 0
    with open(TEXT_PATH, encoding='utf-8') as f:
        data = f.read()
    clean_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', data))
    wordlist = jieba.lcut(clean_data)
    for word in wordlist:
        class_result = classifier.classify(word_feats(word))
        if class_result == 'neg':
            neg += 1
        elif class_result == 'pos':
            pos += 1
    print('Positive: ' + str(pos / len(wordlist)))
    print('Negative: ' + str(neg / len(wordlist)))

feel_analyse()
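As written, the script only runs average_word_length() and feel_analyse() at import time; the remaining functions are defined but never called. A minimal sketch of a driver that exercises them as well (assuming the input file exists at the path above):

if __name__ == "__main__":
    word_character_frequency()  # POS-tag frequency bar chart
    word_frequency()            # top-50 TF-IDF keywords
    look_up()                   # concordance and common contexts for character names
    word()                      # occurrence count plus dispersion plot
    similarity()                # words used in contexts similar to '下毒'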