attention_mask=None,
cache=None,
decode_loop_step=None):
from_tensor = inputs[0]
to_tensor = inputs[1]
# Scalar dimensions referenced here:
#   B = batch size (number of sequences)
#   F = `from_tensor` sequence length
#   T = `to_tensor` sequence length
#   N = `num_attention_heads`
#   H = `size_per_head`
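# Each of the dense projections below maps its input's hidden dimension into
# N separate heads of size H.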
# `query_tensor` = [B, F, N, H]
query_tensor = self._query_dense(from_tensor)
# `key_tensor` = [B, T, N, H]
key_tensor = self._key_dense(to_tensor)
# `value_tensor` = [B, T, N, H]
value_tensor = self._value_dense(to_tensor)
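# During cached (autoregressive) decoding, fold the freshly computed key/value
# projections into the cache so earlier positions are reused rather than
# recomputed at every step.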
if cache:
  key_tensor, value_tensor = self._update_cache(key_tensor, value_tensor,
                                                cache, decode_loop_step)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
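# The scores are then scaled by 1 / sqrt(key_size) (scaled dot-product
# attention) so their magnitude does not grow with the per-head size.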
attention_scores = tf.einsum(self._dot_product_equation, key_tensor,
                             query_tensor)
attention_scores = tf.multiply(attention_scores,
                               1.0 / math.sqrt(float(self._key_size)))
# Normalize the attention scores to probabilities.
# `attention_scores` = [B, N, F, T]
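# Masked positions are excluded here: `_masked_softmax` is expected to push
# their logits toward -inf (e.g. via a large negative additive bias) before
# the softmax, so they receive ~zero probability.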
attention_scores = self._masked_softmax(attention_scores, attention_mask)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_scores = self._dropout_layer(attention_scores)
# `context_layer` = [B, F, N, H]
attention_output = tf.einsum(self._combine_equation, attention_scores,
                             value_tensor)
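# The output projection (`_output_dense`) merges the N per-head context
# vectors back into a single output tensor.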
attention_output = self._output_dense(attention_output)
if self._return_attention_scores:
  return attention_output, attention_scores, cache
return attention_output, cache
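# A minimal usage sketch (assumed names and cache layout, for illustration
# only): during step-by-step decoding the layer would be called once per step
# with a persistent cache, e.g.
#
#   cache = {
#       "key": tf.zeros([batch_size, max_decode_length, num_heads, head_size]),
#       "value": tf.zeros([batch_size, max_decode_length, num_heads, head_size]),
#   }
#   output, cache = attention_layer([step_inputs, step_inputs],
#                                   attention_mask=step_mask,
#                                   cache=cache,
#                                   decode_loop_step=step)
#
# The exact cache keys and shapes depend on `_update_cache`; the values above
# are assumptions, not part of this layer's contract.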