3e2ebf76a530eca5305d1feba6be9c531ee33d71,gensim/corpora/hashdictionary.py,HashDictionary,filter_extremes,#HashDictionary#Any#Any#Any#,172

Before Change


        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold

        # which ids do we keep statistics for?
        good_ids = (hash_id for hash_id in self.keys() if no_below <= self.dfs.get(hash_id, 0) <= no_above_abs)

        if keep_n is not None:
            good_ids = sorted(good_ids, key=lambda item: self.dfs.get(item, 0), reverse=True)
            good_ids = good_ids[:keep_n]
        good_ids = set(good_ids)

        self.id2token = dict((tokenid, freq) for tokenid, freq in self.id2token.iteritems() if tokenid in good_ids)
        self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if tokenid in good_ids)
        logger.info("kept statistics for %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents" %

After Change


        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold

        self.dfs_debug = dict((word, freq) for word, freq in self.dfs_debug.iteritems() if no_below <= freq <= no_above_abs)
        self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.iteritems() if token in self.dfs_debug)
        self.id2token = dict((tokenid, set(token for token in tokens if token in self.dfs_debug)) for tokenid, tokens in self.id2token.iteritems())
        self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if self.id2token.get(tokenid, set()))

        # for word->document frequency
In pattern: SUPERPATTERN

Frequency: 5

Non-data size: 6

Instances


Project Name: RaRe-Technologies/gensim
Commit Name: 3e2ebf76a530eca5305d1feba6be9c531ee33d71
Time: 2012-08-19
Author: radimrehurek@seznam.cz
File Name: gensim/corpora/hashdictionary.py
Class Name: HashDictionary
Method Name: filter_extremes


Project Name: brian-team/brian2
Commit Name: 724ad2c1d5f631d40b757766f5ae7dca308542e2
Time: 2014-02-25
Author: marcel.stimberg@ens.fr
File Name: brian2/codegen/generators/base.py
Class Name: CodeGenerator
Method Name: arrays_helper


Project Name: scipy/scipy
Commit Name: e7fbf0f7dbe36bcf9d0057f881b5faf7547e9a02
Time: 2017-01-07
Author: yoch.melka@gmail.com
File Name: scipy/sparse/dok.py
Class Name: dok_matrix
Method Name: tocoo


Project Name: pantsbuild/pants
Commit Name: dd6afb2e51df558f103ba8b3e11a3871b2015962
Time: 2020-07-15
Author: benjyw@gmail.com
File Name: src/python/pants/backend/project_info/source_file_validator.py
Class Name: MultiMatcher
Method Name: __init__


Project Name: OpenNMT/OpenNMT-py
Commit Name: 594f66417cf0a2abaead1ce6e5f15a2100441682
Time: 2017-09-21
Author: srush@seas.harvard.edu
File Name: onmt/IO.py
Class Name: ONMTDataset
Method Name: __init__