This brings us to the last small assignment of the course. Having already trained a word2vec model, Assignment 6 trains an LSTM language model on the Text8.zip corpus and measures the quality of the resulting model with perplexity; lower is better.
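For reference, perplexity is the exponential of the average per-character negative log-probability. The logprob helper used later in this post is roughly how the assignment's starter notebook defines it (reproduced from memory, so treat it as a sketch rather than the exact code):

import numpy as np

def logprob(predictions, labels):
  """Average negative log-probability of the true (one-hot) labels under the
  predicted distributions. Perplexity is np.exp(logprob(predictions, labels))."""
  predictions[predictions < 1e-10] = 1e-10  # clip to avoid log(0)
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]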
LSTM
Problem 1
num_nodes = 64
graph = tf.Graph()
with graph.as_default():
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ib = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  fb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  cb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))  # 27*64
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ob = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Change: concatenate the tensors of the same kind so all four gates share one matmul.
  tmp_x = tf.concat([ix, fx, cx, ox], 1)  # 27*(64+64+64+64) = 27*256
  tmp_m = tf.concat([im, fm, cm, om], 1)  # 64*(64+64+64+64) = 64*256
  tmp_b = tf.concat([ib, fb, cb, ob], 1)  # 1*(64+64+64+64) = 1*256
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # # Original definition of the cell computation.
  # def lstm_cell(i, o, state):
  #   """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  #   Note that in this formulation, we omit the various connections between the
  #   previous state and the gates."""
  #   input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
  #   forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
  #   update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
  #   state = forget_gate * state + input_gate * tf.tanh(update)
  #   output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
  #   return output_gate * tf.tanh(state), state

  # Modified cell computation using the concatenated parameters.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    print(i.shape, '*', tmp_x.shape, '+', o.shape, '*', tmp_m.shape, '+', tmp_b.shape)
    smatmul = tf.matmul(i, tmp_x) + tf.matmul(o, tmp_m) + tmp_b
    smatmul_input, smatmul_forget, update, smatmul_output = tf.split(smatmul, 4, 1)
    input_gate = tf.sigmoid(smatmul_input)
    forget_gate = tf.sigmoid(smatmul_forget)
    output_gate = tf.sigmoid(smatmul_output)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
Output:
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(64, 27) * (27, 256) + (64, 64) * (64, 256) + (1, 256)
(1, 27) * (27, 256) + (1, 64) * (64, 256) + (1, 256)
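The printed shapes show the effect of the concatenation trick: each call to lstm_cell now does one (batch, 27) x (27, 256) and one (batch, 64) x (64, 256) multiplication instead of four separate per-gate multiplications, and tf.split recovers the four 64-wide gate pre-activations. This works because concatenating the weight matrices along the column axis is just block matrix multiplication. A toy numpy check (hypothetical small sizes, not part of the assignment) illustrates the equivalence:

import numpy as np

rng = np.random.RandomState(0)
i = rng.randn(2, 5)                                    # toy "input batch", 2 x 5
ix, fx, cx, ox = (rng.randn(5, 3) for _ in range(4))   # toy per-gate weights, 5 x 3 each

fused = i @ np.concatenate([ix, fx, cx, ox], axis=1)   # one 2 x 12 matmul
parts = np.split(fused, 4, axis=1)                     # split back into four 2 x 3 blocks

for part, w_ in zip(parts, [ix, fx, cx, ox]):
  assert np.allclose(part, i @ w_)                     # identical to the separate matmuls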
Run the training:
num_steps = 7001
summary_frequency = 100
with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))
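This loop relies on several helpers defined earlier in the assignment notebook: train_batches / valid_batches (the batch generators), logprob (sketched above), and the sampling utilities sample, random_distribution, and characters. For completeness, the sampling utilities look roughly like this in the starter code (reproduced from memory; id2char maps a character id back to a character and is also defined in the notebook):

import random
import numpy as np

def sample_distribution(distribution):
  """Sample one index from a probability distribution given as an array."""
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a prediction (row of probabilities) into a one-hot encoded sample."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float32)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random row of probabilities, used to seed the samples."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b / np.sum(b, 1)[:, None]

def characters(probabilities):
  """Turn a one-hot encoding or a probability distribution back into characters."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]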
Problem 2
Introduce an embedding layer and feed the LSTM embeddings instead of one-hot inputs.
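The one-hot character batches are kept as placeholders, but each one-hot row is converted to a dense embedding_size-dimensional vector before it enters the cell, so the ix/fx/cx/ox weights now have embedding_size rows instead of vocabulary_size. A minimal, self-contained sketch of the lookup step (toy sizes chosen to match the assignment; not the author's exact code):

import tensorflow as tf

vocabulary_size, embedding_size, batch_size = 27, 100, 64

embeddings = tf.Variable(
  tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
one_hot_batch = tf.placeholder(tf.float32, [batch_size, vocabulary_size])

# Recover the character id from each one-hot row, then look up its embedding row;
# the result is a dense (batch_size, embedding_size) tensor to feed to the LSTM cell.
char_ids = tf.argmax(one_hot_batch, axis=1)
dense_batch = tf.nn.embedding_lookup(embeddings, char_ids)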
# 1. introduce embedding lookup on input
num_nodes = 64
embedding_size = 100 # dimensionality of the embedding layer
graph = tf.Graph()
with graph.as_default():
  # Parameters:
  vocabulary_embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))  # added: the embedding matrix
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ib = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  fb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  cb = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))  # input weights changed to 100*64
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))  # 64*64
  ob = tf.Variable(tf.zeros([1, num_nodes]))  # 1*64
  # Change: concatenate the tensors of the same kind so all four gates share one matmul.
  tmp_x = tf.concat([ix, fx, cx, ox], 1)  # 100*(64+64+64+64) = 100*256
  tmp_m = tf.concat([im, fm, cm, om], 1)  # 64*(64+64+64+64) = 64*256
  tmp_b = tf.concat([ib, fb, cb, ob], 1)  # 1*(64+64+64+64) = 1*256
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # # Original definition of the cell computation.
  # def lstm_cell(i, o, state):
  #   """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
  #   Note that in this formulation, we omit the various connections between the
  #   previous state and the gates."""
  #   input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
  #   forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
  #   update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
  #   state = forget_gate * state + input_gate * tf.tanh(update)
  #   output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
  #   return output_gate * tf.tanh(state), state

  # Modified cell computation using the concatenated parameters.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    print(i.shape, '*', tmp_x.shape, '+', o.shape, '*', tmp_m.shape, '+', tmp_b.shape)
    smatmul = tf.matmul(i, tmp_x) + tf.matmul(o, tmp_m) + tmp_b
    smatmul_input, smatmul_forget, update, smatmul_output = tf.split(smatmul, 4, 1)
    input_gate = tf.sigmoid(smatmul_input)
    forget_gate = tf.sigmoid(smatmul_forget)
    output_gate = tf.sigmoid(smatmul_output)
    state = forget_gate * state + input_gate * tf.tanh(update)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    # output, state = lstm_cell(i, output, state)
    # outputs.append(output)
    i_embed = tf.nn.embedding_lookup(vocabulary_embeddings, tf.argmax(i, axis=1))  # added: embedding lookup replaces the one-hot input
    output, state = lstm_cell(i_embed, output, state)  # run the LSTM cell on the embeddings
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  sample_input_embedding = tf.nn.embedding_lookup(
    vocabulary_embeddings, tf.argmax(sample_input, axis=1))  # added: embedding lookup for the sample input
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input_embedding, saved_sample_output, saved_sample_state)  # feed the embedded sample input
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
Reposted from: http://blog.csdn.net/draco_mystack/article/details/77478021
Source code: https://github.com/Zerof007/uda_deeplearning_z