                                 self.xp, self.phi)
    batch_action = [
        self.policy(batch_xs[i]).sample()
        for i in range(len(batch_obs))]
    # Q is not needed here, but log it just for information
    # q = self.q_function(batch_xs, batch_action)
After Change
with chainer.using_config("train", False), chainer.no_backprop_mode():
    batch_xs = self.batch_states(batch_obs,
                                 self.xp, self.phi)
    batch_action = self.policy(batch_xs).sample()
    # Q is not needed here, but log it just for information
    q = self.q_function(batch_xs, batch_action)

# Update stats
self.average_q *= self.average_q_decay
self.average_q += (1 - self.average_q_decay) * float(q.array)
self.logger.debug("t:%s a:%s q:%s",
                  self.t, batch_action.array[0], q.array)
return [cuda.to_cpu(action.array) for action in batch_action]
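
The key difference is that the policy is now evaluated once on the whole batch and its distribution is sampled once, instead of sampling per observation in a list comprehension. A minimal toy sketch of that difference (ToyPolicy and ToyDistribution are hypothetical stand-ins for illustration, not ChainerRL classes):

import numpy as np

class ToyDistribution:
    """Toy stand-in for a batched action distribution."""
    def __init__(self, mean):
        self.mean = mean

    def sample(self):
        # One sample per row of the batched mean
        return self.mean + 0.1 * np.random.randn(*self.mean.shape)

class ToyPolicy:
    """Toy stand-in for self.policy: maps a batch of states to one distribution."""
    def __call__(self, batch_xs):
        return ToyDistribution(batch_xs @ np.ones((batch_xs.shape[1], 2)))

policy = ToyPolicy()
batch_xs = np.random.randn(4, 3)  # 4 observations, 3 features each

# Before: one forward pass (and one distribution) per observation
actions_before = [policy(batch_xs[i:i + 1]).sample()
                  for i in range(len(batch_xs))]

# After: a single batched forward pass; sample() returns all 4 actions at once
actions_after = policy(batch_xs).sample()  # shape (4, 2)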