# This file records basic NLP practice exercises; see the companion blog post for context:
# https://blog.csdn.net/weixin_40924580/article/details/82962200
from nltk.corpus import brown
import nltk
# print(brown.categories())
# print(len(brown.words()))
# print(len(brown.sents()))
# sentence = 'hello,world'
# tokens = nltk.word_tokenize(sentence)
# print(tokens)
import jieba
# seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
# print("Full mode:","/".join(seg_list)) #全模式,输出为“Full mode: 我/来到/北京/清华/清华大学/华大/大学”
# seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
# print("Defalt mode:","/".join(seg_list)) #精确模式,输出为“Defalt mode: 我/来到/北京/清华大学”
# seg_list = jieba.cut("他来到了网易杭研大厦") #默认是精确模式,带有新词识别:杭研
# print("/".join(seg_list)) #输出为“他/来到/了/网易/杭研/大厦”
# seg_list = jieba.cut_for_search("小明毕业于中国科学院计算机所,后在日本京都大学深造")#搜索引擎模式
# print(",".join(seg_list)) #输出为“小明,毕业,于,中国,科学,学院,科学院,中国科学院,计算,算机,计算机,计算机所,,
# # ,后,在,日本,京都,大学,日本京都大学,深造”
# tweet = 'RT @ baby:love you baby! :D http://ah.love #168cm'
# print(nltk.word_tokenize(tweet)) # :D原为表情
# #输出为:['RT', '@', 'baby:love', 'you', 'baby', '!', ':', 'D', 'http', ':', '//ah.love', '#', '168cm']
#!!!!! http://www.regexlab.com/zh/regref.htm 正则
#!!!!! https://www.ranks.nl/stopwords 英文停止词
from nltk.stem import SnowballStemmer
# Snowball_Stemmer = SnowballStemmer('english') #词干提取
# print(Snowball_Stemmer.stem('hardness')) #输出为'hard'
from nltk.stem import WordNetLemmatizer
# Lemmatizer backed by WordNet: normalizes inflected forms to their dictionary form.
wordnet_lemmatizer = WordNetLemmatizer()
# print(wordnet_lemmatizer.lemmatize('churches')) # lemmatization; prints 'church'
# In English, "went" may be the past tense of "go" or a proper name. Disambiguate with a
# POS tag: without pos= the word is treated as a noun, with pos='v' as a verb.
# print(wordnet_lemmatizer.lemmatize('are')) # prints 'are' (treated as a noun)
# print(wordnet_lemmatizer.lemmatize('are',pos='v')) # prints 'be'
# text = nltk.word_tokenize('what does the fox say')
# print(nltk.pos_tag(text)) # prints "[('what', 'WDT'), ('does', 'VBZ'), ('the', 'DT'), ('fox', 'NNS'), ('say', 'VBP')]"
from nltk.corpus import stopwords
# filtered_words = [word for word in word_list if word not in stopwords.words('english')] #去除停止词
'''
Typical preprocessing pipeline:
Raw text
-> Tokenize
-> POS tagging
-> Lemmatization / stemming
-> Stop-word removal
-> word list
'''
# Word-frequency statistics
from nltk import FreqDist #借用FreqDist进行词频统计
# Build a small corpus and count token frequencies with nltk's FreqDist.
corpus = ('This is my sentence '
          'This is my life '
          'This is the day')
tokens = nltk.word_tokenize(corpus)
# tokens -> ['This', 'is', 'my', 'sentence', 'This', 'is', 'my', 'life', 'This', 'is', 'the', 'day']
fdist = FreqDist(tokens)
# fdist['my'] -> 2
standard_freq_vector = fdist.most_common(50)  # top-50 (word, count) pairs, most frequent first
size = len(standard_freq_vector)
# size -> 7 distinct tokens in this corpus
# standard_freq_vector -> [('This', 3), ('is', 3), ('my', 2), ('sentence', 1), ('life', 1), ('the', 1), ('day', 1)]
# func: record each word's position, ordered by descending frequency
def position_lookup(v):
    """Map each word to its rank position in a frequency vector.

    Args:
        v: iterable of (word, count) pairs ordered by descending frequency,
           e.g. the output of FreqDist.most_common().

    Returns:
        dict mapping word -> its index in v (0 = most frequent word).
    """
    # enumerate replaces the original manual counter variable
    return {pair[0]: idx for idx, pair in enumerate(v)}
# Position index for the standard vocabulary (word -> slot in the frequency vector).
standard_position_dict = position_lookup(standard_freq_vector)
# -> {'This': 0, 'is': 1, 'my': 2, 'sentence': 3, 'life': 4, 'the': 5, 'day': 6}
# A new sentence to vectorize against the standard vocabulary (used in the demo below)
sentence = 'This is cool'
#创建一个与标准vector 大小相同的向量
# freq_vector = [0]*size
# #进行简单的preprocessing
# tokens = nltk.word_tokenize(sentence)
# for word in tokens:
# try:
# freq_vector[standard_position_dict[word]] += 1 #若词库中有,则在标准向量对应位置加1
# except:
# continue #若出现新词,则pass
# print(freq_vector) #输出为[1, 1, 0, 0, 0, 0, 0]
#NLTK实现TF-IDF
from nltk.text import TextCollection #该类能自动帮助断句,做统计和计算
# TextCollection tokenizes, counts, and computes tf-idf over the documents.
corpus = TextCollection([
    'this is sentence one',
    'this is sentence two',
    'this is sentence three',
])
# corpus.tf_idf('this', 'this is sentence four') -> 0.0 ('this' occurs in every document,
# so its idf is zero)
# Scoring a new sentence against a vocabulary would look like:
# new_sentence = 'this is sentence five'
# for word in standard_vocab:  # iterate every vocabulary word
#     print(corpus.tf_idf(word, new_sentence))