# Example arguments: ('seq.in', './vocab/in_vocab') -- seq.in holds the input sentences, in_vocab is the vocabulary (dictionary) file.
def createVocabulary(input_path: str, output_path: str,
                     no_pad: bool = False, no_unk: bool = False) -> None:
    """Write a frequency-ordered vocabulary file for a whitespace-tokenised text file.

    Every line of ``input_path`` is split on whitespace and the occurrences of
    each token are counted. Tokens made up only of digits are all counted as
    the single token ``'0'``. The output file contains one token per line:
    first the reserved tokens (``'_PAD'`` then ``'_UNK'``, unless disabled),
    then the counted tokens from most to least frequent. Tokens with equal
    counts keep the order in which they first appeared in the input.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 vocabulary file to create or overwrite.
        no_pad: When true, do not emit the leading ``'_PAD'`` entry.
        no_unk: When true, do not emit the leading ``'_UNK'`` entry.

    Raises:
        TypeError: If ``input_path`` or ``output_path`` is not a ``str``.
    """
    if not isinstance(input_path, str):
        raise TypeError('input_path should be string')
    if not isinstance(output_path, str):
        raise TypeError('output_path should be string')

    # Read the whole input before touching the output file, so a failed read
    # never leaves a truncated vocabulary behind and the input is not wiped
    # when both paths point at the same file.
    counts = {}
    with open(input_path, 'r', encoding='UTF-8') as fd:
        for line in fd:
            # split() with no arguments already discards the trailing
            # '\r\n' together with any other surrounding whitespace.
            for w in line.split():
                if w == '_UNK':
                    # NOTE(review): the body of this branch was missing in the
                    # source as received. `break` (ignore the rest of the line
                    # once the reserved token is seen) is assumed here --
                    # confirm against the original implementation.
                    break
                if w.isdigit():
                    # Collapse every all-digit token onto one shared entry.
                    w = '0'
                counts[w] = counts.get(w, 0) + 1

    reserved = []
    if not no_pad:
        reserved.append('_PAD')
    if not no_unk:
        reserved.append('_UNK')

    # sorted() is stable even with reverse=True, so ties stay in
    # first-appearance order.
    ordered = reserved + sorted(counts, key=counts.get, reverse=True)

    with open(output_path, 'w', encoding='UTF-8') as out:
        out.writelines(token + '\n' for token in ordered)
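# Counting the distinct words in a dataset (txt document)
# You may also like
# Reposted from blog.csdn.net/tailonh/article/details/105038660
# Today's recommendations
# Weekly ranking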