Attention
Encoder
class Encoder(tf.keras.Model):
    """Encoder that embeds token ids and feeds them to a recurrent layer.

    The recurrent layer is produced by the `gru` factory, which is defined
    outside this snippet.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding table and the recurrent layer.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: width of each embedding vector.
            enc_units: unit count handed to the `gru` factory.
            batch_sz: batch size, stored on the instance only.
        """
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self, x, hidden):
        """Embed `x` and run the recurrent layer starting from `hidden`.

        Returns:
            The `(output, state)` pair produced by the recurrent layer.
        """
        embedded_tokens = self.embedding(x)
        sequence_output, final_state = self.gru(
            embedded_tokens, initial_state=hidden)
        return sequence_output, final_state
LSTM
输入 $x_t$
输出门 $o_t$:决定 $c_t$ 中哪些部分对输出 $h_t$ 有用
输入门 $i_t$:控制当前 $x_t$ 的信息融入细胞状态 $c_t$,判断当前词对全局的重要性
遗忘门 $f_t$:控制上一时刻的细胞状态 $c_{t-1}$ 的信息融入细胞状态 $c_t$,判断先前词的重要性
GRU
没有peephole连接
重置门 $r_t$:控制前一时刻隐藏单元 $h_{t-1}$ 对当前词 $x_t$ 的影响
更新门 $z_t$:决定是否忽略当前词 $x_t$ 而产生res短路连接
Decoder
class Decoder(tf.keras.Model):
    """Single-step decoder with additive (tanh) attention over encoder outputs.

    The recurrent layer is produced by the `gru` factory, which is defined
    outside this snippet.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the embedding, recurrent, output and attention layers.

        Args:
            vocab_size: embedding table size and width of the output layer.
            embedding_dim: width of each embedding vector.
            dec_units: unit count for the recurrent layer and for the two
                attention projections.
            batch_sz: batch size, stored on the instance only.
        """
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(units=vocab_size)

        # Attention parameters: W1 projects the encoder outputs, W2 projects
        # the decoder hidden state, V reduces each position to a scalar score.
        self.W1 = tf.keras.layers.Dense(units=self.dec_units)
        self.W2 = tf.keras.layers.Dense(units=self.dec_units)
        self.V = tf.keras.layers.Dense(units=1)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Expected shapes (assumed from the usage below, TODO confirm with the
        caller): `enc_output` is (batch_size, max_length, hidden_size) and
        `hidden` is (batch_size, hidden_size).

        Returns:
            A tuple `(logits, state, attention_weights)`.
        """
        # Insert a time axis so the hidden state broadcasts against every
        # encoder position when the two projections are added.
        query_with_time_axis = tf.expand_dims(hidden, 1)

        # score = V(tanh(W1(enc_output) + W2(hidden))); last axis has size 1.
        projected_memory = self.W1(enc_output)
        projected_query = self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected_memory + projected_query))

        # Normalise over axis 1 (the encoder positions).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs over axis 1.
        weighted_memory = attention_weights * enc_output
        context_vector = tf.reduce_sum(weighted_memory, axis=1)

        embedded_token = self.embedding(x)

        # Prepend the context vector to the embedded input on the last axis.
        gru_input = tf.concat(
            [tf.expand_dims(context_vector, 1), embedded_token], axis=-1)

        # NOTE(review): `hidden` is only used as the attention query; the
        # recurrent layer is called without an explicit initial state.
        output, state = self.gru(gru_input)

        # Collapse the batch and time axes before the output projection.
        flat_output = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flat_output)
        return logits, state, attention_weights
Attention模型架构
GNMT model
gnmt采用如下模型架构:
- encoder端是8层的LSTM单元,其中只有第一层采用bi-directional的LSTM模块,其他层都是单向的,出于并行化的考虑。
- data parallelism:Downpour SGD。
- model parallelism:encoder端每层运行在不同的gpu上,不需要等待前一层运行完之后再运行。decoder端的softmax层进行了partition。
- res net使深度能支持到8层
- decoder的第一层作为attention输入,attention的输出送入其他层产生softmax的输入。
- word piece: 解决生僻词问题,将词细分为词根和后缀。
- Quantizable Model and Quantized Inference:优化效率
- BLEU: The main problem with the maximum-likelihood training objective is that it does not reflect the task reward function as measured by the BLEU score in translation. Further, this objective does not explicitly encourage a ranking among incorrect output sequences – where outputs with higher BLEU scores should still obtain higher probabilities under the model – since incorrect outputs are never observed during training. In other words, using maximum-likelihood training only, the model will not learn to be robust to errors made during decoding since they are never observed, which is quite a mismatch between the training and testing procedure. GLEU在最大似然的基础上加入了reward(GNMT用句子级的GLEU分数作为强化学习的reward对模型进行refine)。
创建decoder网络:
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
source_sequence_length):
# Build the GNMT decoder cell and its initial state.
# Returns a (cell, decoder_initial_state) tuple.
# Raises ValueError when hparams.attention_architecture is neither
# "gnmt" nor "gnmt_v2".
# GNMT attention
attention_option = hparams.attention
attention_architecture = hparams.attention_architecture
num_units = hparams.num_units
infer_mode = hparams.infer_mode
# NOTE(review): lines are elided here; `memory`, `batch_size` and `dtype`
# used below are presumably defined in the elided part - confirm against
# the full source.
...
# Per the original note, this builds either a
# tf.contrib.seq2seq.LuongAttention or a tf.contrib.seq2seq.BahdanauAttention.
attention_mechanism = self.attention_mechanism_fn(
attention_option, num_units, memory, source_sequence_length, self.mode)
# List of single cells (LSTM, GRU, NAS), with residual (ResNet-style)
# shortcut connections added.
cell_list = model_helper._cell_list( # pylint: disable=protected-access
unit_type=hparams.unit_type,
num_units=num_units,
num_layers=self.num_decoder_layers,
num_residual_layers=self.num_decoder_residual_layers,
forget_bias=hparams.forget_bias,
dropout=hparams.dropout,
num_gpus=self.num_gpus,
mode=self.mode,
single_cell_fn=self.single_cell_fn,
residual_fn=gnmt_residual_fn
)
# Only wrap the bottom layer with the attention mechanism.
# pop(0) removes that bottom cell, so cell_list keeps only the upper layers.
attention_cell = cell_list.pop(0)
# Only generate alignment in greedy INFER mode.
alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
infer_mode != "beam_search")
attention_cell = tf.contrib.seq2seq.AttentionWrapper(
attention_cell,
attention_mechanism,
attention_layer_size=None, # don't use attention layer.
output_attention=False,
alignment_history=alignment_history,
name="attention")
# The attention cell and the remaining cells are combined in a
# GNMTAttentionMultiCell; "gnmt_v2" additionally sets use_new_attention.
if attention_architecture == "gnmt":
cell = GNMTAttentionMultiCell(
attention_cell, cell_list)
elif attention_architecture == "gnmt_v2":
cell = GNMTAttentionMultiCell(
attention_cell, cell_list, use_new_attention=True)
else:
raise ValueError(
"Unknown attention_architecture %s" % attention_architecture)
# When pass_hidden_state is set, seed each layer with the matching encoder
# state: an AttentionWrapperState gets the encoder state as its cell_state,
# any other zero state is replaced by the encoder state itself.
if hparams.pass_hidden_state:
decoder_initial_state = tuple(
zs.clone(cell_state=es)
if isinstance(zs, tf.contrib.seq2seq.AttentionWrapperState) else es
for zs, es in zip(
cell.zero_state(batch_size, dtype), encoder_state))
else:
decoder_initial_state = cell.zero_state(batch_size, dtype)
return cell, decoder_initial_state
其中AttentionWrapper执行流程如下,代码中分6步:
# Step 1: Mix the `inputs` and previous step's `attention` output via
# `cell_input_fn`.
# Per the original note, the default is to concatenate `inputs` with the
# previous step's attention (the previous attention may help the current
# prediction, e.g. by keeping the model from attending to the same place twice
# in a row and stuttering on a single word).
cell_inputs = self._cell_input_fn(inputs, state.attention)
# Step 2: Call the wrapped `cell` with this input and its previous state.
cell_state = state.cell_state
cell_output, next_cell_state = self._cell(cell_inputs, cell_state)
# NOTE(review): previous_attention_state and previous_alignment_history are
# not assigned in this excerpt; presumably they come from `state` - confirm
# against the full source.
# Per-mechanism results are collected in these lists.
all_alignments = []
all_attentions = []
all_attention_states = []
maybe_all_histories = []
for i, attention_mechanism in enumerate(self._attention_mechanisms):
attention, alignments, next_attention_state = self._attention_fn(
attention_mechanism, cell_output, previous_attention_state[i],
self._attention_layers[i] if self._attention_layers else None)
# Record the alignments at the current time only when history is enabled.
alignment_history = previous_alignment_history[i].write(
state.time, alignments) if self._alignment_history else ()
all_attention_states.append(next_attention_state)
all_alignments.append(alignments)
all_attentions.append(attention)
maybe_all_histories.append(alignment_history)
# Join the attention outputs of all mechanisms along axis 1.
attention = array_ops.concat(all_attentions, 1)
next_state = AttentionWrapperState(
time=state.time + 1,
cell_state=next_cell_state,
attention=attention,
attention_state=self._item_or_tuple(all_attention_states),
alignments=self._item_or_tuple(all_alignments),
alignment_history=self._item_or_tuple(maybe_all_histories))
# The step output is either the attention or the raw cell output.
if self._output_attention:
return attention, next_state
else:
return cell_output, next_state
# The self._attention_fn used above makes the following call:
>>>> alignments, next_attention_state = attention_mechanism(
cell_output, state=attention_state)
# Taking Luong attention as the example, calling attention_mechanism triggers
# LuongAttention.__call__.
# Here query is cell_output (the output of step 2) and state is attention_state.
>>>> def __call__(self, query, state):
with variable_scope.variable_scope(None, "luong_attention", [query]):
# The scalar scale variable is only created when self._scale is set.
attention_g = None
if self._scale:
attention_g = variable_scope.get_variable(
"attention_g",
dtype=query.dtype,
initializer=init_ops.ones_initializer,
shape=())
# Step 3: Score the cell's output with `attention_mechanism`.
score = _luong_score(query, self._keys, attention_g)
# Step 4: Calculate the alignments by passing the score through the `normalizer`.
alignments = self._probability_fn(score, state)
# The alignments themselves are returned as the next attention state.
next_state = alignments
return alignments, next_state
# The keys passed to _luong_score are obtained from memory (the encoder's
# encoder_outputs) through the dense memory_layer.
>>>> def _luong_score(query, keys, scale):
# NOTE(review): depth and key_units are read but not used anywhere in this
# excerpt.
depth = query.get_shape()[-1]
key_units = keys.get_shape()[-1]
# Reshape from [batch_size, depth] to [batch_size, 1, depth]
# for matmul.
query = array_ops.expand_dims(query, 1)
# Inner product along the query units dimension.
# matmul shapes: query is [batch_size, 1, depth] and
# keys is [batch_size, max_time, depth].
# the inner product is asked to **transpose keys' inner shape** to get a
# batched matmul on:
# [batch_size, 1, depth] . [batch_size, depth, max_time]
# resulting in an output shape of:
# [batch_size, 1, max_time].
# we then squeeze out the center singleton dimension.
score = math_ops.matmul(query, keys, transpose_b=True)
score = array_ops.squeeze(score, [1])
# Scaling is applied only when a scale value was supplied.
if scale is not None:
score = scale * score
return score
# Step 5: Calculate the context vector as the inner product between the
# alignments and the attention_mechanism's values (memory).
# NOTE(review): expanded_alignments is not defined in this excerpt;
# presumably it is `alignments` with a singleton axis inserted at position 1,
# which the squeeze below removes again - confirm against the full source.
context_ = math_ops.matmul(expanded_alignments, attention_mechanism.values)
context_ = array_ops.squeeze(context_, [1])
# Step 6: Calculate the attention output by concatenating the cell output and
# context through the attention layer (a linear layer with `attention_layer_size` outputs).
# Without an attention layer the context vector itself is the attention.
if attention_layer is not None:
attention = attention_layer(tf.concat([cell_output, context_], 1))
else:
attention = context_
GNMTAttentionMultiCell的执行流程如下:
# NOTE(review): `new_states` is not initialised in this excerpt; presumably
# it is an empty list created earlier - confirm against the full source.
# The first cell (index 0) is the attention cell and runs on the raw inputs.
with tf.variable_scope("cell_0_attention"):
attention_cell = self._cells[0]
attention_state = state[0]
cur_inp, new_attention_state = attention_cell(inputs, attention_state)
new_states.append(new_attention_state)
# Chain the remaining cells: the output of each cell becomes the input of
# the next cell.
for i in range(1, len(self._cells)):
with tf.variable_scope("cell_%d" % i):
cell = self._cells[i]
cur_state = state[i]
# Every upper cell also receives the attention vector on its last axis:
# the one just computed when use_new_attention is set, otherwise the one
# from the incoming state.
if self.use_new_attention:
cur_inp = tf.concat([cur_inp, new_attention_state.attention], -1)
else:
cur_inp = tf.concat([cur_inp, attention_state.attention], -1)
cur_inp, new_state = cell(cur_inp, cur_state)
new_states.append(new_state)
通过dynamic_decode的body调用BasicDecoder的step,从而递归调用cell。在训练阶段inputs初始化为target_input的embedding,next_inputs读取该embedding时序数据;在infer阶段inputs初始化为start_tokens,next_inputs是由helper的sample产生,以GreedyEmbeddingHelper为例,sample产生outputs中argmax的单词。
nmt普通Attention
将cell_list级联,以Attention的输出作为网络的输出(和前面的attention模型架构有些许区别:前面是将target_input作为query,encoder_outputs作为key和value,将attention的输出通过gru cell产生最终输出;而这里是将target_input通过gru cell作为query,encoder_outputs作为key和value,将attention输出作为最终输出)
# Attention
# NOTE(review): memory, num_units, num_layers, num_residual_layers,
# infer_mode, batch_size and dtype are not defined in this excerpt; they
# presumably come from the enclosing method - confirm against the full source.
attention_mechanism = self.attention_mechanism_fn(
hparams.attention, num_units, memory, source_sequence_length, self.mode)
# Build the RNN cell that is wrapped with attention below.
cell = model_helper.create_rnn_cell(
unit_type=hparams.unit_type,
num_units=num_units,
num_layers=num_layers,
num_residual_layers=num_residual_layers,
forget_bias=hparams.forget_bias,
dropout=hparams.dropout,
num_gpus=self.num_gpus,
mode=self.mode,
single_cell_fn=self.single_cell_fn)
# Only generate alignment in greedy INFER mode.
alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
infer_mode != "beam_search")
# The cell built above is wrapped as a whole; whether the wrapper outputs the
# attention or the cell output is controlled by hparams.output_attention.
cell = tf.contrib.seq2seq.AttentionWrapper(
cell,
attention_mechanism,
attention_layer_size=num_units,
alignment_history=alignment_history,
output_attention=hparams.output_attention,
name="attention")
# TODO(thangluong): do we need num_layers, num_gpus?
# The wrapped cell is placed on the device string computed for index
# num_layers - 1.
cell = tf.contrib.rnn.DeviceWrapper(cell,
model_helper.get_device_str(
num_layers - 1, self.num_gpus))
# Optionally start decoding from the encoder state instead of zeros.
if hparams.pass_hidden_state:
decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
cell_state=encoder_state)
else:
decoder_initial_state = cell.zero_state(batch_size, dtype)
return cell, decoder_initial_state