# This file records basic NLP practice exercises; see the companion blog post for context:
# https://blog.csdn.net/weixin_40924580/article/details/82962200
from nltk.corpus import brown
import nltk
# print(brown.categories())
# print(len(brown.words()))
# print(len(brown.sents()))
# sentence = 'hello,world'
# tokens = nltk.word_tokenize(sentence)
# print(tokens)
import jieba
# seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
# print("Full mode:","/".join(seg_list)) #全模式,输出为“Full mode: 我/来到/北京/清华/清华大学/华大/大学”
# seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
# print("Defalt mode:","/".join(seg_list)) #精确模式,输出为“Defalt mode: 我/来到/北京/清华大学”
# seg_list = jieba.cut("他来到了网易杭研大厦") #默认是精确模式,带有新词识别:杭研
# print("/".join(seg_list)) #输出为“他/来到/了/网易/杭研/大厦”
# seg_list = jieba.cut_for_search("小明毕业于中国科学院计算机所,后在日本京都大学深造")#搜索引擎模式
# print(",".join(seg_list)) #输出为“小明,毕业,于,中国,科学,学院,科学院,中国科学院,计算,算机,计算机,计算机所,,
# # ,后,在,日本,京都,大学,日本京都大学,深造”
# tweet = 'RT @ baby:love you baby! :D http://ah.love #168cm'
# print(nltk.word_tokenize(tweet)) # :D原为表情
# #输出为:['RT', '@', 'baby:love', 'you', 'baby', '!', ':', 'D', 'http', ':', '//ah.love', '#', '168cm']
#!!!!! http://www.regexlab.com/zh/regref.htm 正则
#!!!!! https://www.ranks.nl/stopwords 英文停止词
from nltk.stem import SnowballStemmer
# Snowball_Stemmer = SnowballStemmer('english') #词干提取
# print(Snowball_Stemmer.stem('hardness')) #输出为'hard'
from nltk.stem import WordNetLemmatizer
# Lemmatizer backed by WordNet: normalizes inflected forms to their dictionary form.
wordnet_lemmatizer = WordNetLemmatizer()
# print(wordnet_lemmatizer.lemmatize('churches')) # lemmatization; prints 'church'
# In English, "went" may be the past tense of "go" or a proper name. Disambiguate with a
# POS tag: without pos= the word is treated as a noun, with pos='v' as a verb.
# print(wordnet_lemmatizer.lemmatize('are')) # prints 'are' (treated as a noun)
# print(wordnet_lemmatizer.lemmatize('are',pos='v')) # prints 'be'
# text = nltk.word_tokenize('what does the fox say')
# print(nltk.pos_tag(text)) # prints "[('what', 'WDT'), ('does', 'VBZ'), ('the', 'DT'), ('fox', 'NNS'), ('say', 'VBP')]"
from nltk.corpus import stopwords
# filtered_words = [word for word in word_list if word not in stopwords.words('english')] #去除停止词
'''
Typical preprocessing pipeline:
Raw text
-> Tokenize
-> POS tagging
-> Lemmatization / stemming
-> Stop-word removal
-> word list
'''
# Word-frequency statistics
from nltk import FreqDist #借用FreqDist进行词频统计
# Build a small corpus and count token frequencies with nltk's FreqDist.
corpus = ('This is my sentence '
          'This is my life '
          'This is the day')
tokens = nltk.word_tokenize(corpus)
# tokens -> ['This', 'is', 'my', 'sentence', 'This', 'is', 'my', 'life', 'This', 'is', 'the', 'day']
fdist = FreqDist(tokens)
# fdist['my'] -> 2
standard_freq_vector = fdist.most_common(50)  # top-50 (word, count) pairs, most frequent first
size = len(standard_freq_vector)
# size -> 7 distinct tokens in this corpus
# standard_freq_vector -> [('This', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]
# func: record each word's position, ordered by descending frequency
def position_lookup(v):
    """Map each word to its rank position in a frequency vector.

    Args:
        v: iterable of (word, count) pairs ordered by descending frequency,
           e.g. the output of FreqDist.most_common().

    Returns:
        dict mapping word -> its index in v (0 = most frequent word).
    """
    # enumerate replaces the original manual counter variable
    return {pair[0]: idx for idx, pair in enumerate(v)}
# Position index for the standard vocabulary (word -> slot in the frequency vector).
standard_position_dict = position_lookup(standard_freq_vector)
# -> {'This': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6}
# A new sentence to vectorize against the standard vocabulary (used in the demo below)
sentence = 'This is cool'
#创建一个与标准vector 大小相同的向量
# freq_vector = [0]*size
# #进行简单的preprocessing
# tokens = nltk.word_tokenize(sentence)
# for word in tokens:
# try:
# freq_vector[standard_position_dict[word]] += 1 #若词库中有,则在标准向量对应位置加1
# except:
# continue #若出现新词,则pass
# print(freq_vector) #输出为[1, 1, 0, 0, 0, 0, 0]
#NLTK实现TF-IDF
from nltk.text import TextCollection #该类能自动帮助断句,做统计和计算
# TextCollection tokenizes, counts, and computes tf-idf over the documents.
corpus = TextCollection([
    'this is sentence one',
    'this is sentence two',
    'this is sentence three',
])
# corpus.tf_idf('this', 'this is sentence four') -> 0.0 ('this' occurs in every document,
# so its idf is zero)
# Scoring a new sentence against a vocabulary would look like:
# new_sentence = 'this is sentence five'
# for word in standard_vocab:  # iterate every vocabulary word
#     print(corpus.tf_idf(word, new_sentence))