# Copy the FP32 master params back into the FP16 model params.
# NOTE(review): fragment assumes `self.params` (iterable of FP16 params) and
# `self.fp32_params` (flat FP32 master buffer) are defined by the enclosing
# optimizer wrapper — not visible in this chunk; confirm against the class.
offset = 0
for p in self.params:
    # Params that don't require grad have no slice in the flat FP32 buffer.
    if not p.requires_grad:
        continue
    numel = p.data.numel()
    # Take p's slice of the flat buffer, reshape to p's shape; copy_ performs
    # the FP32 -> FP16 cast into the existing tensor storage.
    p.data.copy_(self.fp32_params.data[offset:offset + numel].view_as(p.data))
    offset += numel
# --- After change: cast params (and grads) back to FP16 ---
# Cast every parameter in the group — and its gradient, when one exists —
# down to FP16 in place.
# NOTE(review): fragment assumes `group` is an optimizer param-group dict
# (key "params") from the enclosing scope — not visible in this chunk.
for p in group["params"]:
    p.data = p.data.half()
    if p.grad is not None:
        p.grad.data = p.grad.data.half()
def zero_grad(self):
    """Clear the gradients of all optimized parameters.

    Delegates to the wrapped (inner) optimizer's ``zero_grad``.
    """
    self.wrapped_optimizer.zero_grad()