print("-" * 89)
print("| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | "
"valid ppl {:8.2f}".format(epoch, (time.time() - epoch_start_time),
val_loss, math.exp(val_loss)))
print("-" * 89)
if val_loss < best_val_loss:
After Change
nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
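######################################################################
# As a hedged sketch (not the tutorial's own ``TransformerModel`` class,
# which is defined earlier), the hyperparameters above map onto the
# built-in encoder modules roughly as follows; ``emsize`` and ``nhid`` are
# the embedding size and feedforward dimension set earlier in the tutorial.
#
example_layer = nn.TransformerEncoderLayer(emsize, nhead, nhid, dropout)
example_encoder = nn.TransformerEncoder(example_layer, nlayers)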
######################################################################
# Run the model
# -------------
#

######################################################################
# `CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
# is applied to track the loss, and
# `SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
# implements stochastic gradient descent as the optimizer. The initial
# learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
# applied to adjust the learning rate through epochs. During
# training, we use the
# `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
# function to scale all the gradients together to prevent them from exploding.
#
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
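######################################################################
# A minimal, self-contained sketch of how the loss, optimizer, scheduler,
# and gradient clipping combine in one update step. The toy linear model,
# the batch shapes, and the clipping threshold of 0.5 are assumptions for
# illustration only; the tutorial's actual ``train()`` function follows.
#
toy_model = nn.Linear(8, ntokens).to(device)            # stand-in for the transformer
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=lr)
toy_scheduler = torch.optim.lr_scheduler.StepLR(toy_optimizer, 1.0, gamma=0.95)

toy_input = torch.randn(16, 8, device=device)           # (batch, features)
toy_targets = torch.randint(ntokens, (16,), device=device)

toy_loss = criterion(toy_model(toy_input), toy_targets)
toy_optimizer.zero_grad()
toy_loss.backward()
torch.nn.utils.clip_grad_norm_(toy_model.parameters(), 0.5)  # rescale all gradients together
toy_optimizer.step()
toy_scheduler.step()                                     # in the tutorial, once per epoch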
import time
def train():