# we choose to use an RNN instead.
# LSTM(32) is appended to the `answer` layer stack as the reduction step.
answer.add(LSTM(32))
# one regularization layer (dropout, rate 0.3) -- more would probably be needed.
answer.add(Dropout(0.3))
# project onto one unit per vocabulary entry
answer.add(Dense(vocab_size))
# we output a probability distribution over the vocabulary
answer.add(Activation("softmax"))
After Change
# Reduce the combined representation held in `answer` to a single vocabulary
# prediction, then wrap everything in a trainable model.
#
# the original paper uses a matrix multiplication for this reduction step.
# we choose to use an RNN instead.
answer = LSTM(32)(answer)  # (samples, 32)
# one regularization layer (dropout, rate 0.3) -- more would probably be needed.
answer = Dropout(0.3)(answer)
# project onto one unit per vocabulary entry
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
# we output a probability distribution over the vocabulary
answer = Activation("softmax")(answer)

# build the final model: two inputs (the input sequence and the question),
# one output (the softmax over the vocabulary)
model = Model([input_sequence, question], answer)
# categorical cross-entropy is paired with the softmax output above;
# NOTE(review): this presumably expects one-hot targets of width vocab_size --
# confirm against the training data preparation.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
              metrics=["accuracy"])