NLP English Data Analysis
1. Complete English Preprocessing Code
# English sentence preprocessing module
from nltk.corpus import stopwords as pw

cacheStopWords = pw.words("english")

def English_processing(sentence):
    if sentence:
        sentence = sentence.lower()  # convert to lowercase
        for ch in "“”!?.;'',()<>{}/-1234567890$&#%~":
            sentence = sentence.replace(ch, " ")  # strip punctuation and digits
        # remove stop words
        sentence = ''.join([word + " " for word in sentence.split() if word not in cacheStopWords])
        # remove specified special tokens (e.g. leftover HTML fragments)
        sentence = ''.join([word + " " for word in sentence.split() if word not in ['br', 'w', 'b', 'bc']])
        return sentence
    return ""
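A quick sanity check of the function on a made-up review fragment (the sample text below is purely illustrative):
# quick check on a made-up sample (illustrative only)
raw = "The movie was GREAT!!! I watched it 3 times <br> and loved it."
print(English_processing(raw))
# expected output, roughly: "movie great watched times loved"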
2. Word Frequency Statistics + Word Cloud Analysis
Word frequency statistics
def concat_sentence(sen_list):  # concatenate all sentences into one text
    all_sen = ""
    for i in sen_list:
        all_sen += ' '
        all_sen += str(i)
    return all_sen

def compute_word_fre(sentence):  # preprocess an English text and count word frequencies
    if sentence:
        word_fre = {}
        sentence = English_processing(sentence)
        words = sentence.strip().split()  # count word frequencies
        for word in words:
            word_fre[word] = word_fre.get(word, 0) + 1
        return word_fre

def output(word_fre):  # take the frequency dict and return it sorted by count, descending
    if word_fre:
        sort_word = sorted(word_fre.items(), key=lambda s: s[1], reverse=True)
        return sort_word
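A minimal usage sketch with made-up sentences (the exact counts depend on the NLTK stop-word list):
# minimal usage sketch (hypothetical sentences)
sentences_list = ["I love this movie", "This movie is terrible", "I love it"]
word_fre = compute_word_fre(concat_sentence(sentences_list))
print(output(word_fre))  # e.g. [('love', 2), ('movie', 2), ('terrible', 1)]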
Word cloud analysis
sentences_list is the input list of sentences.
concat_sentence joins the sentences into one document, on which the word frequencies are then computed.
If the input is already a single document, the first line of the code below can be skipped.
all_sen = concat_sentence(sentences_list)  # note: only the input needs adjusting
word_fre = compute_word_fre(all_sen)

import matplotlib.pyplot as plt
from imageio import imread
from wordcloud import WordCloud

wordcloud = WordCloud(background_color='white', collocations=False,
                      mask=imread('cloud.png', pilmode="RGB"),
                      max_words=30, random_state=2021,
                      width=1200, height=800).fit_words(word_fre)
# draw the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
wordcloud.to_file("wordcloud.png")
# plt.savefig("other_wordcloud.png", dpi=600)  # alternative way to save the figure
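If no mask image is available, the mask argument can simply be dropped and the cloud is rendered on a plain rectangle (a minimal variant, assuming 'cloud.png' is not present):
# minimal variant without a mask image (assumes no 'cloud.png' is available)
wordcloud = WordCloud(background_color='white', collocations=False,
                      max_words=30, width=1200, height=800).fit_words(word_fre)
wordcloud.to_file("wordcloud_plain.png")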
3. Sentiment Analysis
NLTK
NLTK, short for Natural Language Toolkit, is a Python library widely used in NLP research. It was originally developed on top of Python by Steven Bird and Edward Loper at the University of Pennsylvania and by now contains well over one hundred thousand lines of code. It is an open-source project that ships with datasets, Python modules, tutorials, and more.
Sentiment analysis in practice
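On a fresh NLTK installation, the VADER lexicon has to be downloaded once before the analyzer below can be instantiated:
import nltk
nltk.download('vader_lexicon')  # one-time download of the VADER sentiment lexicon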
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
sentences = ['Hello, world. I am terrible']
for sentence in sentences:
    print(sentence)
    point = sia.polarity_scores(sentence)
    print(point)
    for k in sorted(point):
        print('{0}: {1}, '.format(k, point[k]), end='')
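polarity_scores returns a dict with neg, neu, pos, and compound entries; compound is a normalized score in [-1, 1]. A common convention (suggested in the VADER documentation) is to treat compound >= 0.05 as positive and compound <= -0.05 as negative, roughly as in this hypothetical helper:
# hypothetical helper: map the compound score to a coarse label
def label_sentiment(compound, threshold=0.05):
    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'

print(label_sentiment(sia.polarity_scores('I am terrible')['compound']))  # expected: negative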
4. Similarity Analysis (LDA, LSI, TF-IDF)
Complete code
# English sentence preprocessing module (same as in section 1)
from nltk.corpus import stopwords as pw

cacheStopWords = pw.words("english")

def English_processing(sentence):
    if sentence:
        sentence = sentence.lower()  # convert to lowercase
        for ch in "“”!?.;'',()<>{}/-1234567890$&#%~":
            sentence = sentence.replace(ch, " ")  # strip punctuation and digits
        # remove stop words
        sentence = ''.join([word + " " for word in sentence.split() if word not in cacheStopWords])
        # remove specified special tokens
        sentence = ''.join([word + " " for word in sentence.split() if word not in ['br', 'w', 'b', 'bc']])
        return sentence
    return ""

def concat_sentence(sen_list):  # concatenate all sentences into one text
    all_sen = ""
    for i in sen_list:
        all_sen += ' '
        all_sen += str(i)
    return all_sen
import time
import numpy as np
from collections import defaultdict
from gensim import corpora, models, similarities

class SentenceSimilarity():
    def __init__(self, sentences, min_frequency=1):
        self.sentences = []
        for i in range(0, len(sentences)):
            self.sentences.append(English_processing(sentences[i]))
        self.sentences_num = len(self.sentences)
        self.min_frequency = min_frequency

    # get the tokenized sentences
    def get_cuted_sentences(self):
        cuted_sentences = []
        for sentence in self.sentences:
            cuted_sentences.append(sentence.strip().split())
        return cuted_sentences

    # basic model (dictionary + bag-of-words corpus) required by the more complex models
    def simple_model(self):
        self.texts = self.get_cuted_sentences()
        # drop low-frequency words
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        self.texts = [[token for token in text if frequency[token] > self.min_frequency] for text in self.texts]
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]

    # TF-IDF model
    def TfidfModel(self):
        self.simple_model()
        # transformation model
        self.model = models.TfidfModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LSI model
    def LsiModel(self):
        self.simple_model()
        # transformation model
        self.model = models.LsiModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LDA model
    def LdaModel(self):
        self.simple_model()
        # transformation model
        self.model = models.LdaModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        # build the similarity index
        self.index = similarities.MatrixSimilarity(self.corpus)

    # preprocess a new (query) sentence and map it into the model space
    def sentence2vec(self, sentence):
        sentence = English_processing(sentence)
        vec_bow = self.dictionary.doc2bow(sentence.strip().split())
        return self.model[vec_bow]

    def bow2vec(self):
        vec = []
        length = max(self.dictionary) + 1
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for co in content:
                sentence_vectors[co[0]] = co[1]  # place each word's weight (e.g. tf-idf) into the dense vector
            vec.append(sentence_vectors)
        return vec

    # find the most similar sentence
    # input: a query sentence
    def similarity(self, sentence):
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]
        sim = max(enumerate(sims), key=lambda item: item[1])
        index = sim[0]
        score = sim[1]
        return index, score  # index and similarity score of the most similar sentence

    # find the top-k most similar sentences
    def similarity_k(self, sentence, k):
        sentence_vec = self.sentence2vec(sentence)
        t1 = time.time()
        sims = self.index[sentence_vec]
        t2 = time.time()
        print('Similarity search took {:.4f} ms over {} sentences'.format((t2 - t1) * 1000, self.sentences_num))
        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]
        indexs = [i[0] for i in sim_k]
        scores = [i[1] for i in sim_k]
        return indexs, scores
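A minimal usage sketch (the corpus and query below are made-up examples; min_frequency=0 keeps words that occur only once, which the default of 1 would drop in such a tiny corpus):
# minimal usage sketch (hypothetical corpus and query)
corpus_sentences = ["I love this movie", "The plot was terrible", "Great acting and a great story"]
ss = SentenceSimilarity(corpus_sentences, min_frequency=0)
ss.TfidfModel()  # or ss.LsiModel() / ss.LdaModel()
indexs, scores = ss.similarity_k("a great movie", k=2)
for idx, score in zip(indexs, scores):
    print(corpus_sentences[idx], score)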
5. Hands-on Practice: 2021 MCM/ICM Problem C
To be updated once the contest results are announced.