Embedding + Word2Vec (w2v)

Embedding and Word2Vec are the foundation of sequence models. Their code is summarized below.
Word2Vec model definition, saving, and loading:

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

path = get_tmpfile("word2vec.model")  # temporary file path for the saved model
# train on the toy corpus; note that in gensim >= 4.0 the `size` argument is named `vector_size`
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
model.save(path)
model = Word2Vec.load(path)
# continue training the loaded model on additional sentences
model.train([["hello", "world"]], total_examples=1, epochs=1)
vector = model.wv['computer']  # numpy vector of a word
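
If only the trained word vectors are needed later, gensim can also persist the KeyedVectors on their own. A minimal sketch (the file name here is an assumption, not from the original post):

from gensim.models import KeyedVectors
model.wv.save("word2vec.wordvectors")  # save just the word vectors, not the full model state
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')  # memory-mapped, read-only load
vector = wv['computer']  # same lookup as model.wv['computer']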

Word2Vec training:

from gensim.models import Word2Vec

# sg=1 selects the skip-gram architecture (sg=0 is CBOW).
# sample: words with a frequency above this threshold are randomly downsampled.
# hs=1: hierarchical softmax is used; with the default hs=0 and negative != 0, negative sampling is used instead.
# workers: number of parallel training threads; only effective if Cython is installed, otherwise a single core is used.
model = Word2Vec(sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# (a corpus must be supplied, or build_vocab()/train() called, before the queries below will work)
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man']))
# output: [('queen', 0.50882536), ...]
print(model.wv.doesnt_match("breakfast cereal dinner lunch".split()))
# output: 'cereal'
print(model.wv.similarity('woman', 'man'))
# output: 0.73723527
print(model.wv['computer'])  # raw numpy vector of a word
# output: array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32)
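
Because the model above is constructed without a corpus, the vocabulary must be built and training run explicitly before those queries work. A minimal sketch (the toy sentences and the lowered min_count=1 are assumptions for illustration only):

sentences = [["the", "king", "rules"], ["the", "woman", "speaks"]]  # hypothetical toy corpus
model = Word2Vec(sg=1, size=100, window=5, min_count=1, negative=3, workers=4)  # min_count lowered so the toy corpus is not filtered out
model.build_vocab(sentences)  # scan the corpus and build the vocabulary
model.train(sentences, total_examples=model.corpus_count, epochs=5)  # run the actual skip-gram training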

Embedding (Keras):

from keras.layers import Dense, Embedding, Flatten, Input
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
import numpy as np
# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels (1 = positive, 0 = negative); Keras expects an array here
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
vocab_size = 50
# hash each document into a sequence of integer word indices
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad every sequence to the same length
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='pre')
print(padded_docs)
# define the model
inputs = Input(shape=(max_length, ))
x = Embedding(vocab_size, 8, input_length=max_length)(inputs)    # this layer has 50*8 = 400 parameters
x = Flatten()(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=x)
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())	# print the model architecture
model.fit(padded_docs, labels, epochs=100, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy * 100))
# test the model
test = one_hot('good', vocab_size)
padded_test = pad_sequences([test], maxlen=max_length, padding='pre')  # use the same padding as in training
print(model.predict(padded_test))
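
After training, the learned embedding matrix can be read back from the Embedding layer. A short sketch (the layer index assumes the functional model built above, where layer 0 is the Input layer):

embedding_weights = model.layers[1].get_weights()[0]  # weight matrix of the Embedding layer
print(embedding_weights.shape)  # (50, 8): one 8-dimensional vector per vocabulary index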

Using the embedding lookup function (tf.nn.embedding_lookup):

import numpy as np
import tensorflow as tf  # TF 1.x style (placeholders / sessions); under TF 2.x use tf.compat.v1 and disable eager execution

input_id = tf.placeholder(dtype=tf.int32, shape=None)
embedding = tf.Variable(np.identity(5, dtype=np.int32))  # 5x5 identity matrix as a toy embedding table
input_embedding = tf.nn.embedding_lookup(embedding, input_id)  # gather the rows of `embedding` indexed by input_id
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
print(embedding.eval())
print(sess.run(input_embedding, feed_dict={input_id: [1, 2, 3, 0, 3, 2, 1]}))
# output:
# [[1 0 0 0 0]
#  [0 1 0 0 0]
#  [0 0 1 0 0]
#  [0 0 0 1 0]
#  [0 0 0 0 1]]
# [[0 1 0 0 0]
#  [0 0 1 0 0]
#  [0 0 0 1 0]
#  [1 0 0 0 0]
#  [0 0 0 1 0]
#  [0 0 1 0 0]
#  [0 1 0 0 0]]
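
The same lookup also works eagerly in TensorFlow 2.x, without placeholders or sessions; a minimal sketch:

import numpy as np
import tensorflow as tf

embedding = tf.Variable(np.identity(5, dtype=np.int32))  # toy 5x5 identity embedding table
ids = tf.constant([1, 2, 3, 0, 3, 2, 1])
print(tf.nn.embedding_lookup(embedding, ids).numpy())  # gathers the same rows, evaluated eagerly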



Reposted from blog.csdn.net/Time_traveler233/article/details/106326711