# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/zlp_zky/article/details/82996485
# -*- coding:utf-8 -*-
# __author__ = 'lipzhang'
#
# NLTK corpus tour: basic per-file statistics and conditional frequency
# distributions over several corpora bundled with NLTK.
# NOTE(review): requires the corpus data to be installed locally first
# (run nltk.download()) — confirm before executing.
import nltk
from nltk.corpus import gutenberg   # Project Gutenberg e-texts
from nltk.corpus import webtext     # web/chat text samples
from nltk.corpus import nps_chat    # NPS chat-room transcripts
from nltk.corpus import brown       # Brown corpus (categorized)
from nltk.corpus import reuters     # Reuters newswire corpus
from nltk.corpus import inaugural   # US presidential inaugural addresses
from nltk.corpus import udhr        # Universal Declaration of Human Rights

# --- Gutenberg: per-file statistics --------------------------------------
print(gutenberg.fileids())
emma = gutenberg.words('austen-emma.txt')  # example: word list for one file
for fileid in gutenberg.fileids():
    # raw() returns the text with no linguistic processing at all, so
    # num_chars counts every character, including whitespace.
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    # sents() splits the text into sentences, each a list of words.
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len({w.lower() for w in gutenberg.words(fileid)})
    # Three statistics: average word length, average sentence length,
    # and average number of occurrences of each vocabulary item.
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

# --- Webtext / NPS chat --------------------------------------------------
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])

# --- Brown corpus: modal verbs by genre ----------------------------------
print(brown.categories())
print(brown.words(categories='news'))
print(brown.words(fileids=['cg22']))
print(brown.sents(categories=['news', 'editorial', 'reviews']))
news_text = brown.words(categories='news')
# Count how often each modal verb appears in the 'news' category.
fdist = nltk.FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m])
# Conditional frequency distribution: modal-verb counts per genre.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
cfd.tabulate(conditions=genres, samples=modals)

# --- Reuters (examples, left disabled in the original) -------------------
# print(reuters.fileids())
# print(reuters.categories())
# print(reuters.categories('training/9865'))
# print(reuters.fileids(['barley', 'corn']))
# print(reuters.words('training/9865')[:14])

# --- Inaugural addresses -------------------------------------------------
print(inaugural.fileids())
print([fileid[:4] for fileid in inaugural.fileids()])  # the year prefix
# Plot, per year, counts of words starting with 'america' or 'citizen'.
# Each address is counted separately, showing how usage evolves over
# time; counts are NOT normalized by document length.
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()

# --- UDHR: cumulative word-length distribution by language ---------------
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
# Cumulative word-length plot across six translations of the UDHR:
# words of 5 or fewer letters cover roughly 80% of the Ibibio text,
# 60% of the German text and 25% of the Inuktitut text.
cfd.plot(cumulative=True)