inverted_one_hot = mask - self.actions_one_hot[action]
# max_a [Q(s, a) + l(s, a_E, a)], where l(s, a_E, a) is 0 for the expert action and the margin value for all others
expert_margin = self.training_output[action][:-1] + inverted_one_hot * config.expert_margin
supervised_selector = tf.reduce_max(input_tensor=expert_margin, axis=1)
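For reference, the quantity built here is the inner maximum of the DQfD large-margin supervised loss (Hester et al., 2017); the subtraction of Q(s, a_E) is presumably applied elsewhere when the full loss is assembled:

J_E(Q) = \max_{a}\big[Q(s, a) + l(a_E, a)\big] - Q(s, a_E), \qquad l(a_E, a) = 0 \text{ if } a = a_E, \text{ and the margin value otherwise}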
After Change
for name, action in self.action.items():
    # Create the supervised margin loss
    # Zero for the action taken, one for all other actions, then multiplied by the expert margin
    one_hot = tf.one_hot(indices=action, depth=config.actions[name].num_actions)
    ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
    inverted_one_hot = ones - one_hot
    # max_a [Q(s, a) + l(s, a_E, a)], where l(s, a_E, a) is 0 for the expert action and the margin value for all others
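Below is a minimal, self-contained sketch of the same margin term in TF 2.x eager style; the names q_values, expert_actions, and expert_margin are illustrative placeholders rather than fields of the surrounding model.

import tensorflow as tf

def supervised_margin_term(q_values, expert_actions, expert_margin=0.5):
    # q_values:       float tensor, shape [batch, num_actions]
    # expert_actions: int tensor, shape [batch], the demonstrated actions
    num_actions = tf.shape(q_values)[-1]
    one_hot = tf.one_hot(indices=expert_actions, depth=num_actions)
    # 0 for the expert action, 1 for every other action
    inverted_one_hot = tf.ones_like(one_hot) - one_hot
    # Add the margin to every non-expert action, then take the max over actions
    return tf.reduce_max(q_values + inverted_one_hot * expert_margin, axis=1)

# Example: batch of 2 states, 3 actions, expert chose actions 0 and 2
q = tf.constant([[1.0, 2.0, 0.5], [0.2, 0.1, 0.3]])
a_e = tf.constant([0, 2])
print(supervised_margin_term(q, a_e))  # tf.Tensor([2.5 0.7], shape=(2,), dtype=float32)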