Complete Code Examples for NLTK String and Text Preprocessing

from __future__ import print_function
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import BlanklineTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords
from replacers import RegexpReplacer
from replacers import RepeatReplacer
from replacers import WordReplacer
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib
import matplotlib.pyplot as plt

from nltk.metrics import (accuracy, precision, recall, f_measure,
                          edit_distance, jaccard_distance, masi_distance)


# nltk.download('gutenberg')
# Tokenize the text into sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = "Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
print(tokenizer.tokenize(text))
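# Added note: nltk.sent_tokenize wraps the same Punkt sentence tokenizer, so
# this convenience call should give the same sentence list.
print(nltk.sent_tokenize(text))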

# Tokenizing text in other languages
french_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
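# The French model is used in exactly the same way; the sample sentence below
# is an illustrative addition, not from the original text.
print(french_tokenizer.tokenize("Bonjour à tous. J'espère que vous allez bien."))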
# Tokenize a sentence into words
text = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 29 .")
print(text)

# Tokenization with TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting"))
text = nltk.word_tokenize(" Don't hesitate to ask questions")
print(text)
# ['Do', "n't", 'hesitate', 'to', 'ask', 'questions']
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize(" Don't hesitate to ask questions"))
# ['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions']

# Tokenization using regular expressions
tokenizer = RegexpTokenizer(r"[\w]+")
print(tokenizer.tokenize("Don't hesitate to ask questions"))
#  ['Don', 't', 'hesitate', 'to', 'ask', 'questions']


sent = "Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern=r'\w+|\$[\d.]+|\S+'))
# ['Don', "'t", 'hesitate', 'to', 'ask', 'questions']

tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize("Don't hesitate to ask questions"))
# ["Don't", 'hesitate', 'to', 'ask', 'questions']

sent = " She secured 90.56 % in class X . She is a meritorious student"
capt = RegexpTokenizer(r'[A-Z]\w+')
print(capt.tokenize(sent))
# ['She', 'She']
sent = " She secured 90.56 % in class X . She is a meritorious student"
print(BlanklineTokenizer().tokenize(sent))
# [' She secured 90.56 % in class X . She is a meritorious student']
sent = " She secured 90.56 % in class X . She is a meritorious student"
print(WhitespaceTokenizer().tokenize(sent))
# ['She', 'secured', '90.56', '%', 'in', 'class', 'X', '.', 'She', 'is', 'a', 'meritorious', 'student']
# Token offsets within the sentence

sent = " She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(WhitespaceTokenizer().span_tokenize(sent)))
# [(1, 4), (5, 12), (13, 18), (19, 20), (21, 23), (24, 29), (30, 31), (33, 34),
# (35, 38), (39, 41), (42, 43), (44, 55), (56, 63)]
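# The (start, end) spans can be used to slice the tokens back out of the
# original string, e.g. the first span (1, 4) recovers 'She'.
spans = list(WhitespaceTokenizer().span_tokenize(sent))
print([sent[start:end] for start, end in spans])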

# Eliminating punctuation

text = [" It is a pleasant evening.", "Guests, who came from US arrived at the venue", "Food was tasty."]
tokenized_docs = [word_tokenize(doc) for doc in text]
print(tokenized_docs)
# [['It', 'is', 'a', 'pleasant', 'evening', '.'], ['Guests', ',', 'who', 'came',
# 'from', 'US', 'arrived', 'at', 'the', 'venue'], ['Food', 'was', 'tasty', '.']]

# Remove punctuation from the tokenized text.
text = [" It is a pleasant evening.", "Guests, who came from US arrived at the venue", "Food was tasty."]
tokenized_docs = [word_tokenize(doc) for doc in text]
x = re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = x.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)
# [['It', 'is', 'a', 'pleasant', 'evening'],
# ['Guests', 'who', 'came', 'from', 'US', 'arrived', 'at', 'the', 'venue'],
# ['Food', 'was', 'tasty']]

# Converting to lowercase and uppercase
text = 'HARdWork IS KEy to SUCCESS'
print(text.lower())
print(text.upper())

# Dealing with stop words
stops = set(stopwords.words('english'))
words = ["Don't", 'hesitate', 'to', 'ask', 'questions']
print([word for word in words if word not in stops])
# ["Don't", 'hesitate', 'ask', 'questions']
print(stopwords.fileids())
# ['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian',
# 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese',
# 'romanian', 'russian', 'spanish', 'swedish', 'turkish']
# Listing the stop words for English
print(stopwords.words('english'))

# ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
# 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
# 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
# "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
# 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
# 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
# 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
# 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
# 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
# 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
# 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
# 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
# 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
# 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
# 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
# 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll',
# 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
# 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
# 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
# "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
# 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# Substituting and correcting tokens
# Example of replacing one text pattern with another
replacer = RegexpReplacer()
print(replacer.replace("Don't hesitate to ask questions"))
print(replacer.replace("She must've gone to the market but she didn't go"))
# Do not hesitate to ask questions
# She must have gone to the market but she did not go

# Performing substitution before tokenization
replacer = RegexpReplacer()
print(word_tokenize("Don't hesitate to ask questions"))
print(word_tokenize(replacer.replace("Don't hesitate to ask questions")))
# ['Do', "n't", 'hesitate', 'to', 'ask', 'questions']
# ['Do', 'not', 'hesitate', 'to', 'ask', 'questions']

# Dealing with repeating characters
# Example of removing repeated characters
replacer = RepeatReplacer()
print(replacer.replace('lotttt'))
# lot
print(replacer.replace('ooohhhhh'))
# ooh
print(replacer.replace('happy'))
# happy

# Replacing a word with its synonym
replacer = WordReplacer({'congrats': 'congratulations'})
print(replacer.replace('congrats'))
# congratulations

# Applying Zipf's law to text
# Zipf's law states that, in a text, the frequency of a token is inversely
# proportional to its rank, i.e. its position in the frequency-sorted list.
# The law describes how tokens are distributed in a language: a few tokens
# occur very frequently, some moderately often, and many occur rarely.

# matplotlib.use('TkAgg')
# fd = FreqDist()
# for text in gutenberg.fileids():
#     for word in gutenberg.words(text):
#         fd[word] += 1  # FreqDist.inc() was removed in NLTK 3; use item assignment
# ranks = []
# freqs = []
# for rank, (word, count) in enumerate(fd.most_common()):  # iterate in frequency order
#     ranks.append(rank + 1)
#     freqs.append(count)
#
# plt.loglog(ranks, freqs)
# plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
# plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
# plt.grid(True)
# plt.show()

# Similarity measures
training = 'PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing = 'PERSON OTHER OTHER OTHER OTHER OTHER'.split()
print(accuracy(training, testing))
# 0.6666666666666666
trainset = set(training)
testset = set(testing)
print(trainset)
# {'ORGANIZATION', 'OTHER', 'PERSON'}
print(testset)
# {'OTHER', 'PERSON'}
print(precision(trainset, testset))
# 1.0
print(recall(trainset, testset))
# 0.6666666666666666
print(f_measure(trainset, testset))
# 0.8
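# Sanity check: f_measure here is the harmonic mean of precision and recall,
# so with P = 1.0 and R = 2/3 we expect 2 * P * R / (P + R) = 0.8.
P, R = precision(trainset, testset), recall(trainset, testset)
print(2 * P * R / (P + R))
# ≈ 0.8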

# Applying a similarity measure using the edit distance algorithm
print(edit_distance("relate", "relation"))
# 3
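# The distance of 3 corresponds to one substitution ('e' -> 'i') plus two
# insertions ('o', 'n') needed to turn "relate" into "relation"; with the
# default unit costs the measure is symmetric.
print(edit_distance("relation", "relate"))
# 3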

# Applying a similarity measure using the Jaccard coefficient
# The Jaccard (or Tanimoto) coefficient is defined as the similarity between
# two sets X and Y; NLTK's jaccard_distance returns 1 minus that coefficient.
X = set([10, 20, 30, 40])
Y = set([20, 30, 60])
print(jaccard_distance(X, Y))
# 0.6
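# Worked check: |X ∩ Y| = 2 and |X ∪ Y| = 5, so the Jaccard coefficient is
# 2/5 = 0.4 and the distance is 1 - 0.4 = 0.6, matching the value above.
print(len(X & Y) / len(X | Y))
# 0.4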

# Applying a similarity measure using the MASI distance
# Computing the MASI distance with NLTK
X = set([10, 20, 30, 40])
Y = set([30, 50, 70])
print(masi_distance(X, Y))
# 0.945
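# Rough check (assuming NLTK's MASI weighting of 0.33 for a partial overlap):
# |X ∩ Y| = 1 and |X ∪ Y| = 6, so the distance is 1 - (1/6) * 0.33 = 0.945.
print(1 - (len(X & Y) / len(X | Y)) * 0.33)
# ≈ 0.945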
# Contents of replacers.py, the helper module imported at the top of the script
import re
from nltk.corpus import wordnet
replacement_patterns = [
 (r'won\'t', 'will not'),
 (r'can\'t', 'cannot'),
 (r'i\'m', 'i am'),
 (r'ain\'t', 'is not'),
 (r'(\w+)\'ll', '\\g<1> will'),
 (r'(\w+)n\'t', '\\g<1> not'),
 (r'(\w+)\'ve', '\\g<1> have'),
 (r'(\w+)\'s', '\\g<1> is'),
 (r'(\w+)\'re', '\\g<1> are'),
 (r'(\w+)\'d', '\\g<1> would')
]


# Substituting and correcting tokens
class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl)
                         in
                         patterns]

    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s


# Dealing with repeating characters
class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word


# Replacing a word with its synonym
class WordReplacer(object):
    def __init__(self, word_map):
        self.word_map = word_map

    def replace(self, word):
        return self.word_map.get(word, word)


Reposted from blog.csdn.net/qq_29678299/article/details/91046263