logger.debug("constructing sparse document matrix")
// construct the sparse matrix as lil_matrix first, convert to csc later
// lil_matrix can quickly update rows, so initialize it transposed (documents=rows)
mat = scipy.sparse.lil_matrix((1, 1), dtype = dtype)mat.rows, mat.data = [], []
for i, doc in enumerate(corpus):
doc = sorted(doc)
mat.rows.append([fid for fid, _ in doc])
mat.data.append([val for _, val in doc])
docs = i + 1
mat._shape = (docs, m)
mat = mat.tocsr().transpose() // transpose back to documents=columns
assert isinstance(mat, scipy.sparse.csc_matrix)
return mat
After Change
with documents as columns.
logger.debug("constructing sparse document matrix")
docs, data, indices, indptr = 0, [], [], [0]
for doc in corpus:
indptr.append(len(doc))
indices.extend([feature_id for feature_id, _ in doc])
data.extend([feature_weight for _, feature_weight in doc])
docs += 1
indptr = numpy.cumsum(indptr)
data = numpy.asarray(data)
indices = numpy.asarray(indices)
return scipy.sparse.csc_matrix((data, indices, indptr), shape = (num_terms, docs), dtype = dtype)