float: The average return in the last epoch cycle.
if not self._eval_env:
self._eval_env = runner.get_env_copy()
last_return = None
runner.enable_logging = False
for _ in runner.step_epochs():
for cycle in range(self._steps_per_epoch):
runner.step_path = runner.obtain_trajectories(runner.step_itr)
last_return = self.train_once(runner.step_itr,
runner.step_path)
if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
self._min_buffer_size):
runner.enable_logging = True
log_performance(runner.step_itr,
obtain_evaluation_samples(
self.policy, self._eval_env),
discount=self._discount)
runner.step_itr += 1