try:
    with torch.autograd.profiler.record_function("reduce-grads"):
        self.optimizer.all_reduce_grads(self.model)
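        # Some criteria carry learnable parameters of their own; their
        # gradients need to be synchronized across workers as well.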
        if utils.has_parameters(self.criterion):
            self.optimizer.all_reduce_grads(self.criterion)

    with torch.autograd.profiler.record_function("multiply-grads"):
        # multiply gradients by (data_parallel_size / sample_size) since
        # DDP already normalizes by the number of data parallel workers.
        # Thus we get (sum_of_gradients / sample_size) at the end.
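        # Worked example (illustrative numbers, not taken from the source):
        # with data_parallel_size = 8 and sample_size = 2048 target tokens,
        # the all-reduce above leaves (sum_of_gradients / 8) in each .grad;
        # scaling by 8 / 2048 then yields (sum_of_gradients / 2048), i.e. the
        # gradient averaged over all 2048 tokens across workers.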