
Before Change

        no_above_abs = int(no_above * self.num_docs) // convert fractional threshold to absolute threshold

        # which ids do we keep statistics for?
        good_ids = (hash_id for hash_id in self.keys() if no_below <= self.dfs.get(hash_id, 0) <= no_above_abs)

        if keep_n is not None:
            good_ids = sorted(good_ids, key=lambda item: self.dfs.get(item, 0), reverse=True)
            good_ids = good_ids[:keep_n]
        good_ids = set(good_ids)

        self.id2token = dict((tokenid, freq) for tokenid, freq in self.id2token.iteritems() if tokenid in good_ids)
        self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if tokenid in good_ids)
        logger.info("kept statistics for %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents" %

After Change

        no_above_abs = int(no_above * self.num_docs) // convert fractional threshold to absolute threshold

        self.dfs_debug = dict((word, freq) for word, freq in self.dfs_debug.iteritems() if no_below <= freq <= no_above_abs)
        self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.iteritems() if token in self.dfs_debug)
        self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug)) for tokenid, tokens in self.id2token.iteritems())
        self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if self.id2token.get(tokenid, set()))

        # for word->document frequency
Italian Trulli

Frequency: 3

Non-data size: 5


Project Name: RaRe-Technologies/gensim
Commit Name: 3e2ebf76a530eca5305d1feba6be9c531ee33d71
Time: 2012-08-19
Author: radimrehurek@seznam.cz
File Name: gensim/corpora/hashdictionary.py
Class Name: HashDictionary
Method Name: filter_extremes

Project Name: deepfakes/faceswap
Commit Name: cbcd301150a30da76baa29b27026d85def796e3b
Time: 2020-08-22
Author: 36920800+torzdf@users.noreply.github.com
File Name: lib/gui/_config.py
Class Name:
Method Name: get_clean_fonts

Project Name: pantsbuild/pants
Commit Name: cb091ce8cd52691e9eb569dacd960cd326030c8b
Time: 2014-02-20
Author: jsirois@twitter.com
File Name: src/python/twitter/pants/tasks/builddictionary.py
Class Name: BuildBuildDictionary
Method Name: _gen_goals_reference