wordvec_词的相似度

import gensim
from gensim.models import word2vec
import logging
import jieba
import os
import numpy as np


def cut_txt(old_file):
    """Segment a UTF-8 text file with jieba and write the tokens to a new file.

    The whole input is read, tokenised with jieba in accurate mode
    (``cut_all=False``), joined with single spaces, stripped of a fixed set
    of punctuation characters and written to ``old_file + '_cut.txt'``.

    Args:
        old_file: Path of the UTF-8 encoded text file to segment.

    Returns:
        The path of the segmented output file. The same path is also stored
        in the module-level global ``cut_file``, which the script entry point
        reads.

    Raises:
        OSError: If ``old_file`` cannot be opened or the output file cannot
            be written. The error is propagated instead of being printed and
            swallowed, because nothing useful can be done without the text.
    """
    global cut_file  # name of the file holding the segmented text
    cut_file = old_file + '_cut.txt'

    # Punctuation removed from the segmented text (same set as before).
    # NOTE(review): only the ASCII forms of , ? ! : ( ) are listed here;
    # full-width variants would be kept -- confirm that is intended.
    punctuation = ',。?!“”:…()—《》、‘’'

    # Context managers guarantee both handles are closed (and the output is
    # flushed) even when reading, segmenting or writing fails.
    with open(old_file, 'r', encoding='utf-8') as fi:
        text = fi.read()

    import jieba  # imported only once there is text to segment

    tokens = jieba.cut(text, cut_all=False)  # accurate mode
    # One translate() pass replaces the long chain of single-char replace().
    str_out = ' '.join(tokens).translate(str.maketrans('', '', punctuation))

    with open(cut_file, 'w', encoding='utf-8') as fo:
        fo.write(str_out)

    return cut_file
def model_train(train_file_name, save_model_file):
    """Train a Word2Vec model on a segmented corpus and save it to disk.

    Args:
        train_file_name: Path of the training corpus; it is loaded with
            ``word2vec.Text8Corpus``.
        save_model_file: Path the trained model is saved to. The word vectors
            are additionally exported to ``save_model_file + ".bin"`` in
            binary word2vec format so they can be reused.
    """
    from gensim.models import word2vec
    import gensim
    import logging

    # Emit gensim's training progress messages.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    sentences = word2vec.Text8Corpus(train_file_name)  # load the corpus

    # 200-dimensional vectors, every word kept (min_count=1). `sg` and
    # `window` are left at gensim's defaults (CBOW, window=5).
    # NOTE: `size` is the gensim 3.x keyword; gensim 4 renamed it to
    # `vector_size`.
    model = gensim.models.Word2Vec(sentences, size=200, min_count=1)

    model.save(save_model_file)
    # Use the function's own parameter for the export path: the previous code
    # referenced `save_model_name`, which only exists in the __main__ script.
    model.wv.save_word2vec_format(save_model_file + ".bin", binary=True)

if __name__ == '__main__':
    # Script entry point: segment the corpus, train (or reuse) the model,
    # then print word-similarity results.

    # The input file must have been saved as UTF-8 beforehand.
    cut_txt('fenci.txt')

    save_model_name = 'fenci.model'
    if not os.path.exists(save_model_name):  # train only when no model exists yet
        # cut_txt() publishes the segmented file's path in the global cut_file.
        model_train(cut_file, save_model_name)
    else:
        print('此训练模型已经存在,不用再次训练')

    # Load the trained model.
    model = word2vec.Word2Vec.load(save_model_name)

    # Query through model.wv (the KeyedVectors): the same-named methods on
    # the model object itself are deprecated and removed in gensim 4.
    y1 = model.wv.similarity("何太冲", "张无忌")
    print("何太冲和张无忌的相似度为:", y1)

    y2 = model.wv.most_similar("张无忌", topn=10)  # the 10 most related words
    print("和张无忌最相关的词有:\n")
    for word, score in y2:
        print(word, score)



猜你喜欢

转载自www.cnblogs.com/hapyygril/p/9982307.html