v = self.v_function(self.past_states[i])
advantage = R - v
# Accumulate gradients of policy
log_prob = F.log(self.past_action_prob[i])
(- log_prob * float(advantage.data)).backward()
# Accumulate gradients of value function
(advantage ** 2).backward()
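In this version the n-step return R is built up recursively and an advantage is formed against the value estimate, but backward() is invoked twice per timestep, once for the policy term and once for the value term. As a rough sketch of what is being accumulated (the standard advantage actor-critic losses, written in LaTeX with symbols matching the code):

R_t = r_t + \gamma R_{t+1}, \qquad A_t = R_t - V(s_t)

L_\pi = -\sum_t \log \pi(a_t \mid s_t) \, A_t, \qquad L_V = \sum_t A_t^2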
After Change
self.optimizer.zero_grads()
pi_loss = 0
v_loss = 0
for i in reversed(xrange(self.t_start, self.t)):
    R *= self.gamma
    R += self.past_rewards[i]
    v = self.v_function(self.past_states[i])
    advantage = R - v
    # Accumulate the policy loss; gradients are computed in one backward pass below
    log_prob = self.past_action_log_prob[i]
    pi_loss += (- log_prob * float(advantage.data))
    # Accumulate the value function loss
    v_loss += advantage ** 2
pi_loss.backward()
v_loss.backward()
self.optimizer.update()
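For reference, here is a minimal, self-contained sketch of the same accumulate-then-backward pattern. It uses PyTorch instead of the Chainer-style API above (zero_grad/step in place of zero_grads/update), and the function name accumulate_and_update, the arguments rewards, values, log_probs, and the discount gamma are hypothetical stand-ins for the agent's stored history:

import torch

def accumulate_and_update(rewards, values, log_probs, optimizer, gamma=0.99):
    # rewards: list of floats r_t
    # values: list of scalar tensors V(s_t) that require grad
    # log_probs: list of scalar tensors log pi(a_t | s_t) that require grad
    optimizer.zero_grad()
    pi_loss = 0.0
    v_loss = 0.0
    R = 0.0  # return bootstrap; 0.0 assumes the rollout ended in a terminal state
    for i in reversed(range(len(rewards))):
        R = rewards[i] + gamma * R  # n-step return, built backwards
        advantage = R - values[i]
        # Detach the advantage in the policy term so its gradient flows
        # only through log_prob (the analogue of float(advantage.data) above).
        pi_loss += -log_probs[i] * advantage.detach()
        # Squared advantage regresses V(s_t) toward the return R.
        v_loss += advantage ** 2
    # One backward pass for the accumulated losses, then a single update.
    (pi_loss + v_loss).backward()
    optimizer.step()

Summing the two losses keeps a single backward pass even when the policy and value function share parameters; with fully separate networks, calling backward() on each loss individually, as in the snippet above, behaves the same.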