self.optimizer = optim.Adagrad(self.params, lr=self.learning_rate)
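# PyTorch's Adagrad initializes its per-parameter squared-gradient
# accumulator (state["sum"]) to zero by default; reset it here to the
# configured adagrad_accum starting value.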
for group in self.optimizer.param_groups:
for p in group["params"]:
self.optimizer.state[p]["sum"] = self.optimizer\
.state[p]["sum"].fill_(self.adagrad_accum)
elif self.method == "adadelta":
self.optimizer = optim.Adadelta(self.params, lr=self.learning_rate)
elif self.method == "adam":
self.optimizer = optim.Adam(self.params, lr=self.learning_rate,
betas=self.betas, eps=1e-9)
elif self.method == "sparseadam":
dense = []
sparse = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue
# TODO: Find a better way to check for sparse gradients.
if "embed" in name:
sparse.append(param)
else:
dense.append(param)
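# MultipleOptimizer is assumed here to be a thin wrapper that forwards
# step() / zero_grad() to each wrapped optimizer, so the dense and sparse
# parameter groups are updated together.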
self.optimizer = MultipleOptimizer(
[optim.Adam(dense, lr=self.learning_rate,