attention_mask=None,
cache=None,
decode_loop_step=None):
from_tensor = inputs[0]
to_tensor = inputs[1]
# Scalar dimensions referenced here:
#   B = batch size (number of sequences)
#   F = `from_tensor` sequence length
#   T = `to_tensor` sequence length
#   N = `num_attention_heads`
#   H = `size_per_head`
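# Each of the dense projections below maps its input's hidden dimension into
# N separate heads of size H.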
# `query_tensor` = [B, F, N, H]
query_tensor = self._query_dense(from_tensor)
# `key_tensor` = [B, T, N, H]
key_tensor = self._key_dense(to_tensor)
# `value_tensor` = [B, T, N, H]
value_tensor = self._value_dense(to_tensor)
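# During cached (autoregressive) decoding, fold the freshly computed key/value
# projections into the cache so earlier positions are reused rather than
# recomputed at every step.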
if cache:
  key_tensor, value_tensor = self._update_cache(key_tensor, value_tensor,
                                                cache, decode_loop_step)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
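# The scores are then scaled by 1 / sqrt(key_size) (scaled dot-product
# attention) so their magnitude does not grow with the per-head size.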
attention_scores = tf.einsum(self._dot_product_equation, key_tensor,
                             query_tensor)
attention_scores = tf.multiply(attention_scores,
                               1.0 / math.sqrt(float(self._key_size)))
# Normalize the attention scores to probabilities.
# `attention_scores` = [B, N, F, T]
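# Masked positions are excluded here: `_masked_softmax` is expected to push
# their logits toward -inf (e.g. via a large negative additive bias) before
# the softmax, so they receive ~zero probability.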
attention_scores = self._masked_softmax(attention_scores, attention_mask)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_scores = self._dropout_layer(attention_scores)
# `context_layer` = [B, F, N, H]
attention_output = tf.einsum(self._combine_equation, attention_scores,
                             value_tensor)
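# The output projection (`_output_dense`) merges the N per-head context
# vectors back into a single output tensor.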
attention_output = self._output_dense(attention_output)
if self._return_attention_scores:
  return attention_output, attention_scores, cache
return attention_output, cache
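# A minimal usage sketch (assumed names and cache layout, for illustration
# only): during step-by-step decoding the layer would be called once per step
# with a persistent cache, e.g.
#
#   cache = {
#       "key": tf.zeros([batch_size, max_decode_length, num_heads, head_size]),
#       "value": tf.zeros([batch_size, max_decode_length, num_heads, head_size]),
#   }
#   output, cache = attention_layer([step_inputs, step_inputs],
#                                   attention_mask=step_mask,
#                                   cache=cache,
#                                   decode_loop_step=step)
#
# The exact cache keys and shapes depend on `_update_cache`; the values above
# are assumptions, not part of this layer's contract.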