if self.distributed_world_size:
    grads = []
    for p in self.model.parameters():
        if p.grad is not None:
            grads.append(p.grad)
    start = time.time()
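The block above only gathers the gradient tensors; in data-parallel training such a list is then typically summed across workers and divided by the world size before the optimizer step. Below is a minimal, illustrative sketch of that averaging pattern, assuming torch.distributed has already been initialized; the helper name and the all-reduce call are assumptions for illustration, not part of this change.

import torch
import torch.distributed as dist

def average_grads_across_workers(model: torch.nn.Module, world_size: int) -> None:
    # Illustrative helper (assumed name): average every existing parameter
    # gradient across all workers, in place.
    if world_size <= 1 or not dist.is_initialized():
        return  # nothing to synchronize in single-process training
    for p in model.parameters():
        if p.grad is not None:
            # Sum the gradient across workers, then divide to get the mean.
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
            p.grad /= world_size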
After Change
# Erase gradients in all vars of this optimizer.
opt.zero_grad()
# Recompute gradients of loss over all variables.
loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1))
grad_info.update(self.extra_grad_process(opt, loss_out[i]))
if self.distributed_world_size:
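In the rewritten loop, each optimizer's loss calls backward() on a graph that later losses may still need, so retain_graph stays True for every pass except the last. A small standalone sketch of why that flag matters, with a two-loss setup invented purely for illustration:

import torch

# Two losses that share one autograd graph (illustrative example).
x = torch.randn(4, 3, requires_grad=True)
hidden = x.sum(dim=1)  # shared intermediate node
losses = [hidden.mean(), (hidden ** 2).mean()]

for i, loss in enumerate(losses):
    # Keep the graph alive for every backward pass except the final one;
    # without retain_graph=True the second call would fail because the
    # graph is freed after the first backward.
    loss.backward(retain_graph=(i < len(losses) - 1))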