to_shape = tf_utils.get_shape_list(to_mask, expected_rank=2)
to_seq_length = to_shape[1]
to_mask = tf.cast(
tf.reshape(to_mask, [batch_size, 1, to_seq_length]),
dtype=from_tensor.dtype)
# We don't assume that `from_tensor` is a mask (although it could be). We
# don't actually care if we attend *from* padding tokens (only *to* padding
# tokens), so we create a tensor of all ones.