版权声明:本文为博主原创文章,未经博主允许不得转载(pan_jinquan) https://blog.csdn.net/guyuealian/article/details/83861888
NLP学习笔记
gensim-word2vec
训练
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
def create_wordVectors(sentences, embedding_size = 128, window = 5, min_count = 5, word2vec_path = None):
    """Train a Word2Vec model on tokenized sentences and optionally persist it.

    Args:
        sentences: iterable of token lists (e.g. gensim ``LineSentence``).
        embedding_size: dimensionality of the learned word vectors.
        window: maximum distance between the current and predicted word.
        min_count: ignore words whose total frequency is below this value.
        word2vec_path: if given, path where the trained model is saved.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # NOTE: `size=` is the gensim 3.x keyword; gensim 4.x renamed it to `vector_size=`.
    w2vModel = Word2Vec(sentences, size=embedding_size, window=window,
                        min_count=min_count, workers=multiprocessing.cpu_count())
    # Original called save() unconditionally, which raises when the default
    # word2vec_path=None is used; only persist when a path was supplied.
    if word2vec_path is not None:
        w2vModel.save(word2vec_path)
    return w2vModel
载入
def load_wordVectors(word2vec_path):
    """Load and return a previously saved Word2Vec model from *word2vec_path*."""
    return Word2Vec.load(word2vec_path)
映射
def embedding_lookup(w2vModel, sentences):
    """Map each word in *sentences* to its embedding vector.

    Args:
        w2vModel: a trained Word2Vec model (must expose ``vector_size``
            and a ``wv`` keyed-vectors attribute).
        sentences: iterable of sentences, each a sequence of word tokens.

    Returns:
        A nested list: one list per sentence, containing one vector per
        word. Words absent from the model vocabulary map to an all-zero
        vector of length ``vector_size``.
    """
    wv = w2vModel.wv
    embedding_dim = w2vModel.vector_size
    # Shared zero vector for out-of-vocabulary words (original behavior:
    # the same list object is reused for every unknown word).
    unknown_vector = [0] * embedding_dim
    # `word in wv` / `wv[word]` work in gensim 3.x and 4.x alike, unlike
    # the deprecated `w2vModel[word]` / `wv.vocab` access the original used.
    return [[wv[word] if word in wv else unknown_vector for word in sentence]
            for sentence in sentences]
获得单词下标和词向量
# Example: look up a word's vocabulary index and its vector.
# NOTE(review): `word2vec_path` must be defined earlier in the script — confirm.
w2vModel = Word2Vec.load(word2vec_path)
word='你'
index=w2vModel.wv.vocab[word].index # vocabulary index of `word` (gensim 3.x API; `wv.vocab` was removed in 4.x)
vector1 = w2vModel.wv.vectors[index] # word vector looked up by index
vector2 = w2vModel[word] # word vector looked up directly by word (deprecated direct model indexing)