Encoder

class Encoder(tf.keras.Model):
    """Sequence encoder: token embedding followed by a recurrent layer.

    The recurrent layer is produced by the module-level ``gru`` helper, which
    is defined outside this block; it is expected to return both the
    per-step outputs and the final state (``call`` unpacks two values).
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding table and the recurrent layer.

        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            enc_units: Unit count handed to the ``gru`` helper.
            batch_sz: Batch size; only stored on the instance here.
        """
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # Layers are created in the same order as before so that Keras
        # auto-generated layer names stay unchanged.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(enc_units)

    def call(self, x, hidden):
        """Embed ``x`` and run it through the recurrent layer.

        Args:
            x: Batch of token ids to encode.
            hidden: State used as the recurrent layer's ``initial_state``.

        Returns:
            A ``(output, state)`` tuple as produced by the recurrent layer.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

LSTM

在这里插入图片描述

$block\_input$ 输入 $z^{t}$
$output\_gate$ 决定 $c^{t}$ 中哪些部分对 $block\_output$ 输出 $y^{t}$ 有用
$input\_gate$ 控制当前 $x^t$ 的信息融入细胞状态 $c^t$ ，判断当前词对全局的重要性
$forget\_gate$ 控制上一时刻的细胞状态 $c^{t-1}$ 的信息融入细胞状态 $c^{t}$ ，判断先前词的重要性

GRU

在这里插入图片描述
没有peephole连接
重置门 $r_t$ 控制前一时刻隐藏单元 $h_{t-1}$ 对当前词的影响
更新门 $z_t$ 决定是否忽略当前词而产生res短路连接

Decoder

class Decoder(tf.keras.Model):
    """Single-step decoder with additive (Bahdanau-style) attention.

    Each call scores the encoder outputs against the given hidden state,
    builds a context vector, concatenates it with the embedded input token
    and feeds the result through the recurrent layer and a vocabulary-sized
    dense projection. The recurrent layer comes from the module-level
    ``gru`` helper, which is defined outside this block.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the embedding, recurrent, projection and attention layers.

        Args:
            vocab_size: Embedding table size and width of the output layer.
            embedding_dim: Size of each embedding vector.
            dec_units: Unit count for the recurrent and attention layers.
            batch_sz: Batch size; only stored on the instance here.
        """
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        # Layer creation order is kept as-is so Keras auto-naming is stable.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Attention parameters: score = V(tanh(W1(enc_output) + W2(hidden))).
        self.W1 = tf.keras.layers.Dense(dec_units)
        self.W2 = tf.keras.layers.Dense(dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Input token ids for this step; (batch_size, 1) per the
                shape notes below.
            hidden: Hidden state used as the attention query,
                (batch_size, hidden_size).
            enc_output: Encoder outputs,
                (batch_size, max_length, hidden_size).

        Returns:
            A ``(logits, state, attention_weights)`` tuple: logits of shape
            (batch_size * 1, vocab), the recurrent layer's returned state,
            and attention weights of shape (batch_size, max_length, 1).
        """
        # (batch_size, hidden_size) -> (batch_size, 1, hidden_size) so the
        # query broadcasts against every encoder time step in the sum below.
        query = tf.expand_dims(hidden, 1)

        # Additive score, one scalar per source position:
        # (batch_size, max_length, 1).
        combined = self.W1(enc_output) + self.W2(query)
        score = self.V(tf.nn.tanh(combined))

        # Normalise over the source-time axis.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of encoder outputs -> (batch_size, hidden_size).
        context_vector = tf.reduce_sum(attention_weights * enc_output, axis=1)

        # (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # (batch_size, 1, embedding_dim + hidden_size)
        rnn_input = tf.concat(
            [tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # NOTE(review): `hidden` is only used as the attention query; the
        # recurrent layer is called without an explicit `initial_state`.
        # Confirm this is intended.
        output, state = self.gru(rnn_input)

        # Collapse the length-1 time axis: (batch_size * 1, hidden_size).
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        # (batch_size * 1, vocab)
        logits = self.fc(flat_output)

        return logits, state, attention_weights

Attention模型架构

在这里插入图片描述

$\alpha_{ts}=\frac{exp(score(h_t, \bar{h}_s))}{\sum_{s^{'}=1}^{S}exp(score(h_t, \bar{h}_{s^{'}}))} \hspace{2cm} \text{[Attention Weight], shape: [batch\_size, seq\_len, 1]}$
$c_{t}=\sum_{s}\alpha_{ts}\bar{h}_s \hspace{4.6cm} \text{[Context vector], shape: [batch\_size, hidden\_size]}$
$score(h_t,\bar{h}_s)= \begin{cases} h^T_tW\bar{h}_s & \hspace{0.3cm} \text{[Luong's multiplicative style], shape: [batch\_size, seq\_len]} \\ v_a^Ttanh(W_1h_t+W_2\bar{h}_s) & \hspace{.3cm} \text{[Bahdanau's additive style], shape: [batch\_size, seq\_len, 1]} \end{cases}$

GNMT model

gnmt采用如下模型架构：

encoder端是8层的LSTM单元，其中只有第一层采用bi-directional的LSTM模块，其他层都是单向的，出于并行化的考虑。
data parallelism：Downpour SGD。
model parallelism：encoder端每层运行在不同的gpu上，不需要等待前一层运行完之后再运行。decoder端的softmax层进行了partition。
res net使深度能支持到8层
decoder的第一层作为attention输入，attention的输出送入其他层产生softmax的输入。
word piece: 解决生僻词问题，将词细分为词根和后缀。
Quantizable Model and Quantized Inference：优化效率
BLEU: The main problem with this objective is that it does not reflect the task reward function as measured by the BLEU score in translation. Further, this objective does not explicitly encourage a ranking among incorrect output sequences – where outputs with higher BLEU scores should still obtain higher probabilities under the model – since incorrect outputs are never observed during training. In other words, using maximum-likelihood training only, the model will not learn to be robust to errors made during decoding since they are never observed, which is quite a mismatch between the training and testing procedure. GNMT在最大似然目标的基础上加入了以GLEU分数作为reward的强化学习目标。

创建decoder网络：

  def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                          source_sequence_length):
    # Build the GNMT decoder cell and its initial state.
    #
    # Steps visible in this excerpt: create the attention mechanism, build
    # the list of decoder cells, wrap only the first (bottom) cell with
    # AttentionWrapper, and combine everything in GNMTAttentionMultiCell.
    # Returns the tuple (cell, decoder_initial_state); raises ValueError for
    # an unknown hparams.attention_architecture.
    #
    # NOTE(review): `memory`, `batch_size` and `dtype` are not defined in the
    # visible lines; presumably they are set in the part elided by `...`.
    # GNMT attention
    attention_option = hparams.attention
    attention_architecture = hparams.attention_architecture
    num_units = hparams.num_units
    infer_mode = hparams.infer_mode
	...
	# Either tf.contrib.seq2seq.LuongAttention or tf.contrib.seq2seq.BahdanauAttention
    attention_mechanism = self.attention_mechanism_fn(
        attention_option, num_units, memory, source_sequence_length, self.mode)

    # List of single cells (LSTM, GRU, NAS), with ResNet-style residual
    # shortcut connections added.
    cell_list = model_helper._cell_list(  # pylint: disable=protected-access
        unit_type=hparams.unit_type,
        num_units=num_units,
        num_layers=self.num_decoder_layers,
        num_residual_layers=self.num_decoder_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=self.num_gpus,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn,
        residual_fn=gnmt_residual_fn
    )

    # Only wrap the bottom layer with the attention mechanism.
    # Only the first layer is fed into attention.
    attention_cell = cell_list.pop(0)

    # Only generate alignment in greedy INFER mode.
    alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                         infer_mode != "beam_search")
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        attention_cell,
        attention_mechanism,
        attention_layer_size=None,  # don't use attention layer.
        output_attention=False,
        alignment_history=alignment_history,
        name="attention")
	# The attention-wrapped cell is passed to GNMTAttentionMultiCell.
    if attention_architecture == "gnmt":
      cell = GNMTAttentionMultiCell(
          attention_cell, cell_list)
    elif attention_architecture == "gnmt_v2":
      cell = GNMTAttentionMultiCell(
          attention_cell, cell_list, use_new_attention=True)
    else:
      raise ValueError(
          "Unknown attention_architecture %s" % attention_architecture)

    # When passing the encoder state through, each AttentionWrapperState in
    # the zero state gets its cell_state replaced by the matching encoder
    # state; every other entry is replaced by the encoder state itself.
    if hparams.pass_hidden_state:
      decoder_initial_state = tuple(
          zs.clone(cell_state=es)
          if isinstance(zs, tf.contrib.seq2seq.AttentionWrapperState) else es
          for zs, es in zip(
              cell.zero_state(batch_size, dtype), encoder_state))
    else:
      decoder_initial_state = cell.zero_state(batch_size, dtype)

    return cell, decoder_initial_state

其中AttentionWrapper执行流程如下，代码中分6步：

    # Step 1: Mix the `inputs` and previous step's `attention` output via
    # `cell_input_fn`.
	# 默认是将inputs和上一步的attention进行concat(前一步 attention 的信息可能会对当前预测有帮助，例如让模型避免连续两次注意到同一个地方，跟个结巴似的一直输出一个词。)
    cell_inputs = self._cell_input_fn(inputs, state.attention)
    
    # Step 2: Call the wrapped `cell` with this input and its previous state.
    cell_state = state.cell_state
    cell_output, next_cell_state = self._cell(cell_inputs, cell_state)

    all_alignments = []
    all_attentions = []
    all_attention_states = []
    maybe_all_histories = []
    for i, attention_mechanism in enumerate(self._attention_mechanisms):
      attention, alignments, next_attention_state = self._attention_fn(
          attention_mechanism, cell_output, previous_attention_state[i],
          self._attention_layers[i] if self._attention_layers else None)
      alignment_history = previous_alignment_history[i].write(
          state.time, alignments) if self._alignment_history else ()

      all_attention_states.append(next_attention_state)
      all_alignments.append(alignments)
      all_attentions.append(attention)
      maybe_all_histories.append(alignment_history)

    attention = array_ops.concat(all_attentions, 1)
    next_state = AttentionWrapperState(
        time=state.time + 1,
        cell_state=next_cell_state,
        attention=attention,
        attention_state=self._item_or_tuple(all_attention_states),
        alignments=self._item_or_tuple(all_alignments),
        alignment_history=self._item_or_tuple(maybe_all_histories))

    if self._output_attention:
      return attention, next_state
    else:
      return cell_output, next_state
    # 其中上面的self._attention_fn调用如下：
    	>>>> alignments, next_attention_state = attention_mechanism(
        		cell_output, state=attention_state)
	        # 以luong为例，调用attention_mechanism会触发LuongAttention的__call__
	        # 其中query是cell_output,即step 2的输出；state是attention_state
			>>>> def __call__(self, query, state):
				    with variable_scope.variable_scope(None, "luong_attention", [query]):
				      attention_g = None
				      if self._scale:
				        attention_g = variable_scope.get_variable(
				            "attention_g",
				            dtype=query.dtype,
				            initializer=init_ops.ones_initializer,
				            shape=())
				      # Step 3: Score the cell's output with `attention_mechanism`.
				      score = _luong_score(query, self._keys, attention_g)
				    # Step 4: Calculate the alignments by passing the score through the `normalizer`.
				    alignments = self._probability_fn(score, state)
				    next_state = alignments
				    return alignments, next_state
				    # _luong_score的keys是由memory（encoder的encoder_outputs)通过memory_layer的dense全连接得到
				    >>>> def _luong_score(query, keys, scale):
							  depth = query.get_shape()[-1]
							  key_units = keys.get_shape()[-1]
							  # Reshape from [batch_size, depth] to [batch_size, 1, depth]
							  # for matmul.
							  query = array_ops.expand_dims(query, 1)
							  # Inner product along the query units dimension.
							  # matmul shapes: query is [batch_size, 1, depth] and
							  #                keys is [batch_size, max_time, depth].
							  # the inner product is asked to **transpose keys' inner shape** to get a
							  # batched matmul on:
							  #   [batch_size, 1, depth] . [batch_size, depth, max_time]
							  # resulting in an output shape of:
							  #   [batch_size, 1, max_time].
							  # we then squeeze out the center singleton dimension.
							  score = math_ops.matmul(query, keys, transpose_b=True)
							  score = array_ops.squeeze(score, [1])
							  if scale is not None:
							    score = scale * score
							  return score
    # Step 5: Calculate the context vector as the inner product between the 
    # alignments and the attention_mechanism's values (memory).
	context_ = math_ops.matmul(expanded_alignments, attention_mechanism.values)
	context_ = array_ops.squeeze(context_, [1])
  
    # Step 6: Calculate the attention output by concatenating the cell output and 
    # context through the attention layer (a linear layer with `attention_layer_size` outputs).
    if attention_layer is not None:
        attention = attention_layer(tf.concat([cell_output, context_], 1))
    else:
        attention = context_

GNMTAttentionMultiCell的执行流程如下：

      with tf.variable_scope("cell_0_attention"):
        attention_cell = self._cells[0]
        attention_state = state[0]
        cur_inp, new_attention_state = attention_cell(inputs, attention_state)
        new_states.append(new_attention_state)

	  # 上一时间步的输出作为当前时间步的输入，级联起来。
      for i in range(1, len(self._cells)):
        with tf.variable_scope("cell_%d" % i):

          cell = self._cells[i]
          cur_state = state[i]

          if self.use_new_attention:
            cur_inp = tf.concat([cur_inp, new_attention_state.attention], -1)
          else:
            cur_inp = tf.concat([cur_inp, attention_state.attention], -1)

          cur_inp, new_state = cell(cur_inp, cur_state)
          new_states.append(new_state)

通过dynamic_decode的body调用BasicDecoder的step，从而递归调用cell。在训练阶段inputs初始化为target_input的embedding，next_inputs读取该embedding时序数据；在infer阶段inputs初始化为start_tokens，next_inputs是由helper的sample产生，以GreedyEmbeddingHelper为例，sample产生outputs中argmax的单词。

nmt普通Attention

将cell_list级联，以Attention的输出作为网络的输出（和前面的attention模型架构有些许区别：前面是将decoder的hidden state作为query，encoder_outputs作为key和value，将attention得到的context vector与target_input的embedding拼接后通过gru cell产生最终输出；而这里是将target_input通过gru cell的输出作为query，encoder_outputs作为key和value，将attention输出作为最终输出）

	# Attention
    attention_mechanism = self.attention_mechanism_fn(
        hparams.attention, num_units, memory, source_sequence_length, self.mode)

    cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=num_units,
        num_layers=num_layers,
        num_residual_layers=num_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=self.num_gpus,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn)

    # Only generate alignment in greedy INFER mode.
    alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
                         infer_mode != "beam_search")
    cell = tf.contrib.seq2seq.AttentionWrapper(
        cell,
        attention_mechanism,
        attention_layer_size=num_units,
        alignment_history=alignment_history,
        output_attention=hparams.output_attention,
        name="attention")

    # TODO(thangluong): do we need num_layers, num_gpus?
    cell = tf.contrib.rnn.DeviceWrapper(cell,
                                        model_helper.get_device_str(
                                            num_layers - 1, self.num_gpus))

    if hparams.pass_hidden_state:
      decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
          cell_state=encoder_state)
    else:
      decoder_initial_state = cell.zero_state(batch_size, dtype)

    return cell, decoder_initial_state

赵文淮

发布了10 篇原创文章 · 获赞 0 · 访问量 279

私信关注

随笔-Attention

Attention