This brings us to the last small assignment of the course. Having already trained a word2vec model, Assignment 6 trains an LSTM language model on the Text8.zip corpus and measures the quality of the resulting model with perplexity; lower is better.
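For reference, perplexity is the exponential of the average per-character negative log-probability. The logprob helper used later in this post is roughly how the assignment's starter notebook defines it (reproduced from memory, so treat it as a sketch rather than the exact code):

import numpy as np

def logprob(predictions, labels):
  """Average negative log-probability of the true (one-hot) labels under the
  predicted distributions. Perplexity is np.exp(logprob(predictions, labels))."""
  predictions[predictions < 1e-10] = 1e-10  # clip to avoid log(0)
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]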
LSTM
Problem 1
num_nodes = 64
graph = tf.Graph()
with graph.as_default():
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ib = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  fb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  cb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ob = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Change: concatenate the tensors of the same kind so all four gates share one matmul.
  tmp_x = tf.concat([ix, fx, cx, ox], 1)  # 27*(64+64+64+64) = 27*256
  tmp_m = tf.concat([im, fm, cm, om], 1)  # 64*(64+64+64+64) = 64*256
  tmp_b = tf.concat([ib, fb, cb, ob], 1)  # 1*(64+64+64+64) = 1*256
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # # Original definition of the cell computation.
  # def lstm_cell(i, o, state):
  #   """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  #   Note that in this formulation, we omit the various connections between the
  #   previous state and the gates."""
  #   input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
  #   forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
  #   update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
  #   state = forget_gate * state + input_gate * tf.tanh(update)
  #   output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
  #   return output_gate * tf.tanh(state), state

  # Modified cell computation using the concatenated parameters.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    print(i.shape, '*', tmp_x.shape, '+', o.shape, '*', tmp_m.shape, '+', tmp_b.shape)
    smatmul = tf.matmul(i, tmp_x) + tf.matmul(o, tmp_m) + tmp_b
    smatmul_input, smatmul_forget, update, smatmul_output = tf.split(smatmul, 4, 1)
    input_gate = tf.sigmoid(smatmul_input)
    forget_gate = tf.sigmoid(smatmul_forget)
    output_gate = tf.sigmoid(smatmul_output)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
Output:
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(1, 27) * (27, 256) + (1, 64) * (64, 256) + (1, 256)
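The printed shapes show the effect of the concatenation trick: each call to lstm_cell now does one (batch, 27) x (27, 256) and one (batch, 64) x (64, 256) multiplication instead of four separate per-gate multiplications, and tf.split recovers the four 64-wide gate pre-activations. This works because concatenating the weight matrices along the column axis is just block matrix multiplication. A toy numpy check (hypothetical small sizes, not part of the assignment) illustrates the equivalence:

import numpy as np

rng = np.random.RandomState(0)
i = rng.randn(2, 5)                                    # toy "input batch", 2 x 5
ix, fx, cx, ox = (rng.randn(5, 3) for _ in range(4))   # toy per-gate weights, 5 x 3 each

fused = i @ np.concatenate([ix, fx, cx, ox], axis=1)   # one 2 x 12 matmul
parts = np.split(fused, 4, axis=1)                     # split back into four 2 x 3 blocks

for part, w_ in zip(parts, [ix, fx, cx, ox]):
  assert np.allclose(part, i @ w_)                     # identical to the separate matmuls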
Run the training:
num_steps = 7001
summary_frequency = 100
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))
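This loop relies on several helpers defined earlier in the assignment notebook: train_batches / valid_batches (the batch generators), logprob (sketched above), and the sampling utilities sample, random_distribution, and characters. For completeness, the sampling utilities look roughly like this in the starter code (reproduced from memory; id2char maps a character id back to a character and is also defined in the notebook):

import random
import numpy as np

def sample_distribution(distribution):
  """Sample one index from a probability distribution given as an array."""
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a prediction (row of probabilities) into a one-hot encoded sample."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float32)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random row of probabilities, used to seed the samples."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b / np.sum(b, 1)[:, None]

def characters(probabilities):
  """Turn a one-hot encoding or a probability distribution back into characters."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]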
Problem 2
Introduce an embedding layer and feed the LSTM embeddings instead of one-hot inputs.
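The one-hot character batches are kept as placeholders, but each one-hot row is converted to a dense embedding_size-dimensional vector before it enters the cell, so the ix/fx/cx/ox weights now have embedding_size rows instead of vocabulary_size. A minimal, self-contained sketch of the lookup step (toy sizes chosen to match the assignment; not the author's exact code):

import tensorflow as tf

vocabulary_size, embedding_size, batch_size = 27, 100, 64

embeddings = tf.Variable(
  tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
one_hot_batch = tf.placeholder(tf.float32, [batch_size, vocabulary_size])

# Recover the character id from each one-hot row, then look up its embedding row;
# the result is a dense (batch_size, embedding_size) tensor to feed to the LSTM cell.
char_ids = tf.argmax(one_hot_batch, axis=1)
dense_batch = tf.nn.embedding_lookup(embeddings, char_ids)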
# 1. introduce embedding lookup on input
num_nodes = 64
embedding_size = 100 # dimensionality of the embedding layer
graph = tf.Graph()
with graph.as_default():
  # Parameters:
  vocabulary_embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))  # added: the embedding matrix
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ib = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  fb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  cb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ob = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Change: concatenate the tensors of the same kind so all four gates share one matmul.
  tmp_x = tf.concat([ix, fx, cx, ox], 1)  # 100*(64+64+64+64) = 100*256
  tmp_m = tf.concat([im, fm, cm, om], 1)  # 64*(64+64+64+64) = 64*256
  tmp_b = tf.concat([ib, fb, cb, ob], 1)  # 1*(64+64+64+64) = 1*256
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # # Original definition of the cell computation.
  # def lstm_cell(i, o, state):
  #   """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  #   Note that in this formulation, we omit the various connections between the
  #   previous state and the gates."""
  #   input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
  #   forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
  #   update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
  #   state = forget_gate * state + input_gate * tf.tanh(update)
  #   output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
  #   return output_gate * tf.tanh(state), state

  # Modified cell computation using the concatenated parameters.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    print(i.shape, '*', tmp_x.shape, '+', o.shape, '*', tmp_m.shape, '+', tmp_b.shape)
    smatmul = tf.matmul(i, tmp_x) + tf.matmul(o, tmp_m) + tmp_b
    smatmul_input, smatmul_forget, update, smatmul_output = tf.split(smatmul, 4, 1)
    input_gate = tf.sigmoid(smatmul_input)
    forget_gate = tf.sigmoid(smatmul_forget)
    output_gate = tf.sigmoid(smatmul_output)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    # output, state = lstm_cell(i, output, state)
    # outputs.append(output)
    i_embed = tf.nn.embedding_lookup(vocabulary_embeddings, tf.argmax(i, axis=1))  # added: embedding lookup replaces the one-hot input
    output, state = lstm_cell(i_embed, output, state)  # run the LSTM cell on the embeddings
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  sample_input_embedding = tf.nn.embedding_lookup(
    vocabulary_embeddings, tf.argmax(sample_input, axis=1))  # added: embedding lookup for the sample input
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input_embedding, saved_sample_output, saved_sample_state)  # feed the embedded sample input
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
Reposted from: http://blog.csdn.net/draco_mystack/article/details/77478021
Source code: https://github.com/Zerof007/uda_deeplearning_z