rollouts.obs[0].copy_(obs)

# These variables are used to compute average rewards for all processes.
episode_rewards = torch.zeros([args.num_processes, 1])
final_rewards = torch.zeros([args.num_processes, 1])

rollouts.to(device)

start = time.time()
for j in range(num_updates):
    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                rollouts.obs[step],
                rollouts.recurrent_hidden_states[step],
                rollouts.masks[step])

        # Observe reward and next obs
        obs, reward, done, info = envs.step(action)
        episode_rewards += reward

        # If done, then clean the history of observations.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
        final_rewards *= masks
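The snippet cuts off after final_rewards *= masks. A minimal, self-contained sketch of this tensor-based bookkeeping pattern, where the continuation lines and example values are assumptions for illustration and not part of the snippet above:

import torch

num_processes = 4

# Per-process running return and the last completed-episode return.
episode_rewards = torch.zeros([num_processes, 1])
final_rewards = torch.zeros([num_processes, 1])

# Fake step results: process 2 just finished its episode.
reward = torch.tensor([[1.0], [0.5], [2.0], [0.0]])
done = [False, False, True, False]

episode_rewards += reward
# masks is 0.0 where the episode ended, 1.0 where it is still running.
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

# Zero the stored final return for finished processes, record the new one,
# and reset the running accumulator for those processes (assumed continuation).
final_rewards *= masks
final_rewards += (1 - masks) * episode_rewards
episode_rewards *= masks

print(final_rewards)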
After Change
        obs, reward, done, infos = envs.step(action)

        for info in infos:
            if "episode" in info.keys():
                episode_rewards.append(info["episode"]["r"])

        # If done, then clean the history of observations.
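After this change, episode_rewards can no longer be the per-process tensor from before, since it is appended to. A minimal sketch of the new bookkeeping, assuming episode_rewards is a collections.deque and using fake infos values; the window size and example numbers are assumptions, not taken from the snippet:

from collections import deque

# Rolling window of the most recent completed-episode returns
# (window size of 10 is illustrative only).
episode_rewards = deque(maxlen=10)

# Fake per-process info dicts mimicking what a Monitor-style wrapper returns
# from envs.step(): an "episode" entry appears only on the step where that
# process's episode ends.
infos = [
    {},                                   # process 0: episode still running
    {"episode": {"r": 12.5, "l": 200}},   # process 1: episode just finished
]

for info in infos:
    if "episode" in info.keys():
        episode_rewards.append(info["episode"]["r"])

# Logging can then average over the window of finished episodes.
if len(episode_rewards) > 0:
    print(sum(episode_rewards) / len(episode_rewards))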