logging.info("storing corpus in Blei"s LDA-C format: %s" % fname)
with open(fname, "w") as fout:
offsets = []for doc in corpus:
doc = list(doc)
offsets.append(fout.tell())
fout.write("%i %s\n" % (len(doc), " ".join("%i:%s" % p for p in doc)))
fout.close()
# write out vocabulary, in a format compatible with Blei's topics.py script
fnameVocab = fname + ".vocab"
logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
fout = open(fnameVocab, "w")
for featureId in xrange(numTerms):
fout.write("%s\n" % utils.toUtf8(id2word.get(featureId, "---")))
return offsets
def docbyoffset(self, offset):
Return the document stored at file position `offset`.