# equal to the length of the vocab object.
#

ntokens = len(vocab.stoi)  # the size of vocabulary
emsize = 200  # embedding dimension
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
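
######################################################################
# As a quick sanity check, we can count the model's trainable parameters
# to confirm it was built with the hyperparameters above (a minimal
# sketch; only the ``model`` object defined above is assumed).
#

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {n_params:,} trainable parameters')
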
######################################################################
# Run the model
# -------------
#

######################################################################
# `CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
# is applied as the loss function, and
# `SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
# implements the stochastic gradient descent method as the optimizer. The initial
# learning rate is set to 5.0, and `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
# applied to adjust the learning rate through the epochs. During
# training, we use the
# `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
# function to scale all the gradients together to prevent them from exploding.
#
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
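
######################################################################
# To make the schedule concrete, here is a minimal standalone sketch
# (the ``demo_*`` names are hypothetical and separate from the objects
# above): with ``step_size=1`` and ``gamma=0.95``, ``StepLR`` multiplies
# the learning rate by 0.95 after each epoch-level ``scheduler.step()``.
#

demo_param = torch.nn.Parameter(torch.zeros(1))
demo_optimizer = torch.optim.SGD([demo_param], lr=5.0)
demo_scheduler = torch.optim.lr_scheduler.StepLR(demo_optimizer, 1.0, gamma=0.95)
for _ in range(3):
    print(demo_optimizer.param_groups[0]['lr'])  # 5.0, then 4.75, then 4.5125
    demo_optimizer.step()
    demo_scheduler.step()
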
import time

def train():
    model.train()  # Turn on the train mode