# feed forward
ff_in, _ = self.multiattn[i](out, out, out)
# ff_in = out
ff_out = self.linear_final[i](F.relu(self.linear_out[i](ff_in)))
out = self.layer_norm[i](ff_in + F.dropout(ff_out, p=0.1))
# out = self.layer_norm[i](F.relu(ff_in + ff_out)).contiguous()
return (mean, mean), out.transpose(0, 1).contiguous()
if lengths:
After Change
.expand(self.layers, pre_emb.size(1), pre_emb.size(2))
out = pre_emb.transpose(0, 1).contiguous()
for i in range(self.layers):
    out = self.transformer[i](out, input[:, :, 0])
return (mean, mean), out.transpose(0, 1).contiguous()
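
After the change, each self.transformer[i] presumably encapsulates the sublayers that were spelled out inline before. Below is a minimal sketch of such a layer module, mirroring only what is visible in the "before" code (self-attention, then a position-wise feed-forward block with ReLU, dropout, a residual connection, and layer norm). The class name, d_model, heads, d_ff, and the padding-mask argument are assumptions, and any residual/norm around the attention step is omitted because it is not shown in the fragment.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerEncoderLayer(nn.Module):
    """Sketch of one encoder layer: self-attention followed by a
    position-wise feed-forward sublayer with dropout, residual, and
    layer norm, as in the pre-change code above."""

    def __init__(self, d_model=512, heads=8, d_ff=2048, dropout=0.1):
        super().__init__()
        # assumed hyperparameters; only the wiring below comes from the diff
        self.self_attn = nn.MultiheadAttention(d_model, heads, batch_first=True)
        self.linear_out = nn.Linear(d_model, d_ff)    # expand to d_ff
        self.linear_final = nn.Linear(d_ff, d_model)  # project back to d_model
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = dropout

    def forward(self, out, pad_mask=None):
        # self-attention over the (batch, seq, d_model) input
        # (out is batch-first after the pre_emb.transpose(0, 1) above)
        ff_in, _ = self.self_attn(out, out, out, key_padding_mask=pad_mask)
        # position-wise feed forward, as in the "before" code
        ff_out = self.linear_final(F.relu(self.linear_out(ff_in)))
        # dropout + residual + layer norm
        return self.layer_norm(ff_in + F.dropout(ff_out, p=self.dropout,
                                                 training=self.training))

A ModuleList holding self.layers such modules, with a padding mask derived from the word indices input[:, :, 0], would reproduce the loop in the "After Change" code.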