if scores_mask is not None:
padding_mask = tf.logical_not(scores_mask)
    # Bias so padding positions do not contribute to the attention distribution.
scores -= 1.e9 * tf.cast(padding_mask, dtype=K.floatx())
if training is None:
training = K.learning_phase()
weights = tf.compat.v1.math.softmax(scores)
After Change
if scores_mask is not None:
    padding_mask = tf.logical_not(scores_mask)
    # Bias so padding positions do not contribute to the attention distribution.
    # float16 cannot represent -1e9 (it overflows to -inf), so use its largest
    # finite value (65504) as the bias instead.
    if scores.dtype is tf.float16:
        scores -= 65504. * tf.cast(padding_mask, dtype=scores.dtype)
    else:
        scores -= 1.e9 * tf.cast(padding_mask, dtype=scores.dtype)
if training is None:
training = K.learning_phase()
weights = tf.compat.v1.math.softmax(scores)
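For context, below is a minimal, self-contained sketch of the dtype-aware masked softmax that the change performs. The function name `masked_softmax`, the toy tensors, and the use of `tf.nn.softmax` (rather than the layer's `tf.compat.v1.math.softmax`) are illustrative assumptions, not part of the original layer.

```python
import tensorflow as tf

def masked_softmax(scores, scores_mask):
    """Softmax over the last axis of `scores`, ignoring positions where `scores_mask` is False.

    Illustrative sketch only; not the original layer's implementation.
    """
    if scores_mask is not None:
        padding_mask = tf.logical_not(scores_mask)
        # float16 overflows at 1e9, so subtract its largest finite value (65504)
        # instead; either way, padded logits become so negative that softmax
        # assigns them ~0 probability.
        if scores.dtype is tf.float16:
            scores -= 65504. * tf.cast(padding_mask, dtype=scores.dtype)
        else:
            scores -= 1.e9 * tf.cast(padding_mask, dtype=scores.dtype)
    return tf.nn.softmax(scores)

# Toy usage: two length-4 score rows; the last position of the second row is padding.
scores = tf.constant([[1., 2., 3., 4.],
                      [1., 2., 3., 4.]], dtype=tf.float16)
mask = tf.constant([[True, True, True, True],
                    [True, True, True, False]])
print(masked_softmax(scores, mask))  # second row's last weight is ~0
```

Running the sketch in float16 produces finite weights for the padded position instead of the NaNs that a -1e9 bias would cause after overflowing to -inf.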