# Disable clipping to make sure we can see the difference in behavior.
agent.policy._clip = False
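# (_clip is the TFPolicy attribute that controls clipping of emitted actions to
# the action_spec bounds; with clipping on, small post-training changes in the
# raw actions could be hidden.)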
# Remove policy_info, as BehavioralCloningAgent expects none.
traj = traj.replace(policy_info=())
# TODO(b/123883319)
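# In eager mode, agent.train is wrapped in a callable so that each
# self.evaluate(train_and_loss) call below executes a fresh training step; in
# graph mode, agent.train builds a train op once that can be run repeatedly.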
if tf.executing_eagerly():
  train_and_loss = lambda: agent.train(traj)
else:
  train_and_loss = agent.train(traj)
replay = trajectory_replay.TrajectoryReplay(agent.policy)
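# TrajectoryReplay re-runs the current policy over the time steps recorded in
# `traj`; run() returns the actions first, which is what we compare before and
# after training.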
self.evaluate(tf.compat.v1.global_variables_initializer())
initial_actions = self.evaluate(replay.run(traj)[0])
for _ in range(TRAIN_ITERATIONS):
  self.evaluate(train_and_loss)
post_training_actions = self.evaluate(replay.run(traj)[0])
# We don't necessarily converge to the same actions as in trajectory after
# 10 steps of an untuned optimizer, but the policy does change.
self.assertFalse(np.all(initial_actions == post_training_actions))
After Change
# We don't necessarily converge to the same actions as in trajectory after
# 10 steps of an untuned optimizer, but the loss should go down.
self.assertGreater(initial_loss, loss)
def testTrainWithSingleOuterDimension(self):
  # Hard code a trajectory shaped (time=6, batch=1, ...).
  traj, time_step_spec, action_spec = create_arbitrary_trajectory()