attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
After Change
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
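As a quick sanity check (a minimal standalone sketch, not part of the original module; the tensor shapes and mask values below are assumptions), nn.functional.softmax produces the same probabilities as constructing a throwaway nn.Softmax module on every forward pass, which is what this change relies on:

import torch
from torch import nn

# Assumed shapes: (batch, num_heads, query_len, key_len).
scores = torch.randn(2, 4, 8, 8)
# Additive attention mask: 0 keeps a key position, -inf removes it.
mask = torch.zeros(1, 1, 1, 8)
mask[..., 6:] = float("-inf")

masked = scores + mask
module_probs = nn.Softmax(dim=-1)(masked)
functional_probs = nn.functional.softmax(masked, dim=-1)
assert torch.allclose(module_probs, functional_probs)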