参考资料
提取词干:
https://www.cnblogs.com/no-tears-girl/p/6964910.html
准备
# Setup: read the English corpus and open the output file that every
# intermediate result below is written to via print(..., file=f).
import nltk
import re
import string
f = open("out.txt", "w")
# errors='ignore' silently drops any bytes that fail UTF-8 decoding.
text_en = open(u'./data/text_en.txt',encoding='utf-8',errors='ignore').read()
...
# NOTE(review): f.close() must run only after all print(..., file=f) calls
# later in the script — the '...' placeholder above marks where they belong.
f.close()
分词
words = nltk.word_tokenize(text_en)
提取词干
# Stemming: reduce every token to its Lancaster stem and log the result.
from nltk.stem import LancasterStemmer

lancaster = LancasterStemmer()
temp = []
for token in words:
    temp.append(lancaster.stem(token))
print(temp, file=f)
去停用词
from nltk.corpus import stopwords
# Stop-word removal: drop every token whose lowercase form appears in the
# custom stop-word file.  The stop list is stored as a set so each
# membership test is O(1) instead of O(n) on a list.
text_stop_words = open(u'./data/stop_words.txt',encoding='utf-8',errors='ignore').read()
stop_words = set(nltk.word_tokenize(text_stop_words))
#stops=set(stopwords.words('english'))
temp = [word for word in words if word.lower() not in stop_words]
print(temp, file=f)
标点符号过滤
def filter_punctuation(words):
    """Strip punctuation characters from every token.

    Bug fix: the original ``def`` line was split mid-identifier across two
    lines (``filter_punctuati`` / ``on(words):``), a syntax error.

    Args:
        words: iterable of token strings.

    Returns:
        list[str]: tokens with all punctuation characters removed; tokens
        that consist solely of punctuation (and become empty) are dropped.
    """
    new_words = []
    # string.punctuation plus typographic dashes/brackets/quotes; the
    # duplicates with string.punctuation are harmless inside a char class.
    illegal_char = string.punctuation + '.?!,:;-–—()[]{}"\''
    pattern = re.compile('[%s]' % re.escape(illegal_char))
    for word in words:
        new_word = pattern.sub(u'', word)
        if new_word:  # discard tokens that were pure punctuation
            new_words.append(new_word)
    return new_words
# Apply punctuation filtering to the full token list and log the result.
words_no_punc = filter_punctuation(words)
print(words_no_punc, file=f)
低频词过滤(n <= threshold)
# Low-frequency filter: keep only the words occurring more than 20 times.
fdist = nltk.probability.FreqDist(words)
temp = [word for word in fdist if fdist[word] > 20]
print(temp, file=f)
对前 20 个有意义的高频词,绘制频率分布图
# Frequency distribution over the punctuation-free tokens; plot the
# 20 most common words (requires matplotlib for the display).
fdist = nltk.probability.FreqDist(words_no_punc)
fdist.plot(20)
绘制离散图,查看指定单词(Elizabeth, Darcy, Wickham, Bingley, Jane)在文中的分布位置
# Dispersion plot: where do the five named characters appear in the text?
spe_words = ["Elizabeth", "Darcy", "Wickham", "Bingley", "Jane"]
# BUG FIX: the Text must wrap the full token stream (`words`), not the
# query words themselves — otherwise every query word appears exactly once
# and the plot shows nothing about positions in the corpus.
text = nltk.text.Text(words)
text.dispersion_plot(spe_words)