rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)  # normalize returns
for (log_prob, value), r in zip(saved_actions, rewards):
    reward = r - value.data[0, 0]  # advantage: return minus the critic's value estimate
    policy_loss -= (log_prob * reward).sum()
    value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
optimizer.zero_grad()
(policy_loss + value_loss).backward()
optimizer.step()
del model.rewards[:]
del model.saved_actions[:]
After Change
for (log_prob, value), r in zip(saved_actions, rewards):
    reward = r - value.data[0, 0]  # advantage: return minus the critic's value estimate
    policy_losses.append(-log_prob * reward)
    value_losses.append(F.smooth_l1_loss(value, Variable(torch.Tensor([r]))))
optimizer.zero_grad()
loss = torch.cat(policy_losses).sum() + torch.cat(value_losses).sum()  # one combined scalar loss, one backward pass
loss.backward()
optimizer.step()
del model.rewards[:]
del model.saved_actions[:]
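The same list-append-then-combine pattern carries over to current PyTorch, where Variable is no longer needed. The sketch below is a minimal, self-contained illustration under assumed names: finish_episode, its saved_actions/returns/optimizer arguments, and the assumption that each log_prob and value is a scalar (0-dim) tensor are illustrative only, and torch.stack replaces torch.cat because present-day loss calls return zero-dimensional tensors.

import torch
import torch.nn.functional as F

def finish_episode(saved_actions, returns, optimizer, eps=1e-8):
    """One update in the after-change style: per-step losses are collected in
    lists and combined into a single scalar before a single backward pass.

    saved_actions: list of (log_prob, value) pairs, each a scalar tensor
    returns: 1-D tensor of discounted returns, one entry per step
    """
    # normalize returns to zero mean / unit variance
    returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_losses, value_losses = [], []
    for (log_prob, value), r in zip(saved_actions, returns):
        advantage = r - value.detach()                      # baseline-corrected return
        policy_losses.append(-log_prob * advantage)         # REINFORCE term
        value_losses.append(F.smooth_l1_loss(value, r))     # critic regression loss

    optimizer.zero_grad()
    # combine the per-step losses into one scalar and backprop once
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()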