# Method 1: query similar words with the `word2vec` package (NOT gensim).
import word2vec

# Expected text format of the vector file: the first line holds the
# vocabulary size and the vector dimension, and every following line is a
# word followed by its vector components, e.g.
#
#     20000 128
#     我 0.001 -0.002 0.004
#     你 0.125 -0.215 0.112
#     ...
#
# i.e. 20000 words in total, each with a 128-dimensional vector (the sample
# rows above are abbreviated).
dataPath = "vector.txt"

word = "他"
model = word2vec.load(dataPath)
# similar() returns an (indices, metrics) pair; only the indices are needed.
indices = model.similar(word, n=10)[0]
# Index the vocabulary array in one step and convert to plain Python strings
# so the printed list does not show NumPy scalar wrappers.
words = model.vocab[indices].tolist()
print(words)  # ordered by similarity, most similar first
# 参考文档: https://radimrehurek.com/gensim/models/keyedvectors.html
# Method 2: query similar words with gensim's KeyedVectors.
import gensim
from gensim.models import KeyedVectors

# Text-format vector file that includes the header line, e.g. "20000 128".
dataPath = "vector.txt"

# NOTE: binary=False is essential here because the file is plain text.
# unicode_errors="ignore" skips undecodable bytes instead of raising.
model = KeyedVectors.load_word2vec_format(
    dataPath, binary=False, unicode_errors="ignore"
)
# print(model)
word = "他"
# Ask for the 10 entries most similar to `word`.
res = model.most_similar(word, topn=10)
print(res)
# 小例子:利用腾讯词向量,返回相似词语
# Small example: interactive similar-word lookup on top of a pre-trained
# embedding file. Type a word to see its 20 nearest neighbours; type "q!"
# to quit.
from tqdm import tqdm
import word2vec

if __name__ == '__main__':
    dataPath = 'tencent_pre_processed_with_200.txt'
    # dataPath = 'tencent_unigram.txt'
    model = word2vec.load(dataPath)
    while True:
        print('请输入查询词:')
        word = input()
        if word == 'q!':
            break
        try:
            # similar() returns (indices, metrics); the scores are not shown.
            indices, _metrics = model.similar(word, n=20)
        except KeyError:
            # The word2vec package raises KeyError for a word that is not in
            # the vocabulary; report it and keep the session alive instead
            # of crashing the loop.
            print('词表中没有该词,请重新输入')
            continue
        # Index the vocabulary array with the whole index array at once.
        words = model.vocab[indices]
        print(words)