onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD})
featuresVocabs = []
reader = onmt.utils.FileReader.new(filename)
while True:
sent = reader.next()
if sent is None:
break
words, features, numFeatures = onmt.utils.Features.extract(sent)
if len(featuresVocabs) == 0 and numFeatures > 0:
for j in range(numFeatures):
featuresVocabs[j] = onmt.utils.Dict.new(
{onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD})
else:
assert(len(featuresVocabs) == numFeatures,
"all sentences must have the same numbers of additional features")
for i in range(len(words)):
wordVocab.add(word[i])
for j in range(numFeatures):
featuresVocabs[j].add(features[j][i])
reader.close()
originalSize = wordVocab.size()
wordVocab = wordVocab.prune(size)
print("Created dictionary of size %d (pruned from %d)" %