self.corpus.append([bug["id"], textual_features])
// Assigning unique integer ids to all words
self.dictionary = Dictionary(text for bug_id, text in self.corpus)
// Convert each document to its bag-of-words (BoW) representation
corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]
// Initialize the TF-IDF model on the BoW corpus and apply it to that same corpus; the resulting corpus has the same dimensions
tfidf = models.TfidfModel(corpus_final)
corpus_tfidf = tfidf[corpus_final]
// Transform TF-IDF corpus to latent 300-D space via Latent Semantic Indexing
self.lsi = models.LsiModel(
corpus_tfidf, id2word=self.dictionary, num_topics=300
)
corpus_lsi = self.lsi[corpus_tfidf]
// Indexing the corpus
self.index = similarities.Similarity(
output_prefix="simdata.shdat", corpus=corpus_lsi, num_features=300