rollouts.obs[0].copy_(obs)
# These variables are used to compute the average reward across all processes.
episode_rewards = torch.zeros([args.num_processes, 1])
final_rewards = torch.zeros([args.num_processes, 1])
rollouts.to(device)
start = time.time()
for j in range(num_updates):
    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                rollouts.obs[step],
                rollouts.recurrent_hidden_states[step],
                rollouts.masks[step])

        # Observe reward and next obs
        obs, reward, done, info = envs.step(action)
        episode_rewards += reward

        # If done then clean the history of observations.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
        # When an episode ends (mask == 0), move its accumulated return into
        # final_rewards and reset the running accumulator for that process.
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)
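
The four `masks` lines above are the bookkeeping that recovers per-episode returns from per-step rewards. A small self-contained example (toy values, not taken from the training loop) shows how the two tensors evolve when the second of two parallel processes finishes its episode:

import torch

# Toy setup: 2 parallel processes; the second one just finished its episode.
episode_rewards = torch.tensor([[3.0], [5.0]])  # running return per process
final_rewards = torch.tensor([[0.0], [2.0]])    # return of the last finished episode
reward = torch.tensor([[1.0], [1.0]])           # reward from envs.step(action)
done = [False, True]

episode_rewards += reward                        # -> [[4.0], [6.0]]
masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done])
final_rewards *= masks                           # keep the old value only where the episode continues
final_rewards += (1 - masks) * episode_rewards   # record the finished return   -> [[0.0], [6.0]]
episode_rewards *= masks                         # reset the finished process   -> [[4.0], [0.0]]
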
After Change
rollouts.obs[0].copy_(obs)
rollouts.to(device)
# Keep only the returns of the 10 most recent finished episodes
# (requires `from collections import deque`).
episode_rewards = deque(maxlen=10)
start = time.time()
for j in range(num_updates):
    for step in range(args.num_steps):
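
With this change, the mask arithmetic is no longer used to reconstruct episode returns; instead the deque is filled directly from per-episode statistics reported by the environments. A hedged sketch of how the loop body is typically continued with a Monitor-style wrapper (the `infos` name and the `info['episode']['r']` field are assumptions for illustration, not quoted from the updated file):

        # Observe reward and next obs
        obs, reward, done, infos = envs.step(action)

        # Assumption: a Monitor-style wrapper adds an 'episode' dict to a process's
        # info when its episode finishes; the 'r' field holds that episode's return.
        for info in infos:
            if 'episode' in info.keys():
                episode_rewards.append(info['episode']['r'])
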