对word2vec的skip-gram模型进行训练(用中心词估计滑窗内的周边词)。数据预处理:实现了数据下载,使用collections.Counter进行词频统计,
再构建训练数据(center-word和target-word)。skip-gram模型:实现了数据占位符、投影矩阵、损失函数、优化器
的选取,以及对loss可视化的summary。训练模块:主要是可视化与模型保存的处理。
所有定义的常量
import os
import random
import zipfile
# Replaces the broken `import counter` (no such module); the code needs
# collections.Counter for word-frequency counting.
from collections import Counter
from urllib.request import urlretrieve

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
VOCAB_SIZE = 50000 #稠密词的个数
#注意:词嵌入的(词典)的维度和批处理的维度一样
BATCH_SIZE = 128 #批处理的大小
EMBED_SIZE = 128 # 词嵌入的大小
SKIP_WINDOW = 1 # 滑窗的大小
NUM_SAMPLED = 64 # 下采样多少负样本(没有使用huffman数来层次化,而是用负例的下采样来进行优化的)
LEARNING_RATE = 1.0 #学习率
NUM_TRAIN_STEPS = 100000 #训练的迭代次数
init_iterator_step = 55000
WEIGHTS_FLD = 'processed/'
SKIP_STEP = 1000 #保存模型,计算损失的step!
#保存模型的路径
MODEL_PATH = './model'
MODEL_NAME = "model.ckpt"
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = 'data/'
FILE_NAME = 'text8.zip'
1.数据处理模块
# 下载数据
def download(file_name, expected_bytes):
file_path = DATA_FOLDER + file_name
if os.path.exists(file_path):
print("Dataset ready")
return file_path
file_name, _ = urlretrieve(DOWNLOAD_URL + file_name, file_path)
file_stat = os.stat(file_path)
if file_stat.st_size == expected_bytes:
print('Successfully downloaded the file', file_name)
else:
raise Exception('File ' + file_name +
' might be corrupted. You should try downloading it with a browser.')
return file_path
# 读取数据
def read_data(file_path):
with zipfile.ZipFile(file_path) as f:
words = tf.compat.as_str(f.read(f.namelist()[0])).split()
return words
# 构建字典
def build_vocab(words, vocab_size):
dictionary = dict()
count = []
count.extend(counter.Counter(words).most_common(vocab_size - 1))
index = 0
make_dir('processed')
with open('processed/vocab_1000.tsv', "w") as f:
for _,word in enumerate(count):
dictionary[word] = index
if index < 1000:
f.write(word + "\n")
index += 1
index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return dictionary, index_dictionary
# 词和下标的映射转换
def convert_words_to_index(words, dictionary):
return [dictionary[word] if word in dictionary else 0 for word in words]
# 按照skip-gram重新组织数据,利用中心词去估计周边词
def generate_sample(index_words, context_window_size):
#index当成中心词的索引,center是中心词!
for index, center in enumerate(index_words):
context = random.randint(1, context_window_size) #这属于随机窗口。。。。
# get a random target before the center word
for target in index_words[max(0, index - context): index]:
yield center, target
# get a random target after the center wrod
for target in index_words[index + 1: index + context + 1]:
yield center, target
# 把一个batch数据处理成numpy格式返回
def get_batch(iterator, batch_size):
while True:
center_batch = np.zeros(batch_size, dtype=np.int32)
target_batch = np.zeros([batch_size, 1])
for index in range(batch_size):
center_batch[index], target_batch[index] = next(iterator)
yield center_batch, target_batch
# 如果文件夹不存在,创建文件夹
def make_dir(path):
try:
os.mkdir(path)
except OSError:
pass
# 数据处理
def process_data(vocab_size, batch_size, skip_window):
#下载数据
file_path = download(FILE_NAME, EXPECTED_BYTES)
#读取数据
words = read_data(file_path)
#构建词表
dictionary, _ = build_vocab(words, vocab_size)
#将词进行下标编码
index_words = convert_words_to_index(words, dictionary)
del words # 节省空间
#产出skip_gram样本
single_gen = generate_sample(index_words, skip_window)
return get_batch(single_gen, batch_size)
2.skip-gram模型
#词模型
class SkipModel:
def __init__(self,vocab_size,batch_size,embed_size,num_sample,learning_rate):
self.vocab_size = vocab_size
self.embed_size = embed_size
self.batch_size = batch_size
self.num_sampled = num_sample
self.learning_rate = learning_rate
#用于可视化,记录每次的迭代,trainable=False不需要进行训练
self.global_step = tf.Variable(0,dtype=tf.int32,trainable=False,name="global_step")
def _create_placeholders(self):
"""定义占位符存储数据"""
with tf.name_scope("data"): #注意格式:中心词和目标词
self.center_words = tf.placeholder(tf.int32,[self.batch_size],name="center_words")
self.target_words = tf.placeholder(tf.int32,[self.batch_size,1],name="target_words")
def _create_embedding(self):
"""定义一个投影矩阵,行代表vocab_size稠密词,列代表embed_size嵌入词"""
with tf.device("/cpu:0"):
with tf.name_scope("embed"):
#均匀分布
self.embed_matrix = tf.Variable(tf.random_uniform([self.vocab_size,self.embed_size],
minval=-1,maxval=1,name="embed_matrix"))
def _create_loss(self):
"""定义word2vec的结果,同时定义自带的损失函数"""
with tf.device("/cpu:0"):
with tf.name_scope("loss"):
#投影矩阵对中心词进行映射查找
# Returns:A `Tensor` with the same type as the tensors in `params`.
embed = tf.nn.embedding_lookup(self.embed_matrix,self.center_words,name="embed")
#定义损失函数,通常词表很大,如果平铺开来进行全连接会有问题,可以使用huffman树层次化或者负例采样
#权重是投影矩阵的shape(self.vocab_size,self.embed_size预测的类别)与向量全连接进行输出,一定注意!!!
# 偏置是稠密词shape(softmax的类别数)
nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size,self.embed_size],stddev=1/(self.embed_size**0.5))
,name="nce_weight")
nce_biase = tf.Variable(tf.zeros([self.vocab_size]),name="nce_biase")
self.loss = tf.reduce_mean(tf.nn.nce_loss(
weights=nce_weight,biases=nce_biase,labels=self.target_words,inputs=embed,
num_sampled=self.num_sampled,num_classes=self.vocab_size,name="loss"
))
return
def _create_optimizer(self):
"""定义一个优化器"""
with tf.device("/cpu:0"):
self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
return
def _create_summary(self):
"""定义summary,以便在tensorboard里进行可视化"""
with tf.name_scope("summary"):
tf.summary.scalar("loss",self.loss)
tf.summary.histogram("histogram",self.loss)
#进行合并summary
self.summary_op = tf.summary.merge_all()
return
def build_graph(self):
"""构建整个定义的图graph"""
self._create_placeholders()
self._create_embedding()
self._create_loss()
self._create_optimizer()
self._create_summary()
3.训练模块
#训练词模型
def train(model,batch_data,num_train_step):
saver = tf.train.Saver()
init = tf.global_variables_initializer()
with tf.Session() as sess:
if os.path.exists(MODEL_PATH):
path = os.path.join(MODEL_PATH,MODEL_NAME)+"-"+str(init_iterator_step)
saver.restore(sess,path)
else:
os.mkdir(MODEL_PATH)
sess.run(init)
writer = tf.summary.FileWriter("./graph",sess.graph)
loss_total = 0
for i in range(init_iterator_step,num_train_step):
centers,targers = next(batch_data)
feed_dict = {model.center_words:centers,model.target_words:targers}
_,l,summary = sess.run([model.train,model.loss,model.summary_op],feed_dict=feed_dict)
loss_total+=l
writer.add_summary(summary,global_step=i)
if i%SKIP_STEP==0 and i!=1:
print("Iteration:{},loss:{}".format(i,loss_total/SKIP_STEP))
loss_total = 0
saver.save(sess,os.path.join(MODEL_PATH,MODEL_NAME),global_step=i)
writer.close()
return
4.启动代码
if __name__ == '__main__':
model = SkipModel(VOCAB_SIZE,BATCH_SIZE,EMBED_SIZE,NUM_SAMPLED,LEARNING_RATE)
model.build_graph()
data = process_data(VOCAB_SIZE,BATCH_SIZE,SKIP_WINDOW)
train(model,data,NUM_TRAIN_STEPS)