# embed fc and att feats
fc_feats = [m.fc_embed(fc_feats) for m in self.models]
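# each ensemble member may expect a different attention-feature width,
# so slice the shared att_feats down to that model's att_feat_size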
att_feats = [pack_wrapper(m.att_embed, att_feats[..., :m.att_feat_size], att_masks) for m in self.models]
# Project the attention feats first to reduce memory and computation consumption.
p_att_feats = [m.ctx2att(att_feats[i]) for i, m in enumerate(self.models)]
for i in range(seq.size(1) - 1):
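    # scheduled sampling (assumed from the ss_prob name): with probability
    # self.ss_prob, feed back a word sampled from the model's previous output
    # distribution instead of the ground-truth token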
    if self.training and i >= 1 and self.ss_prob > 0.0:  # otherwise no need to sample