6e8898b82f4591fe2256ca56be3c98836fe94b88,simhash/__init__.py,Simhash,__init__,#Simhash#Any#Any#Any#Any#,25
Before Change
else:
self.hashfunc = hashfunc
if isinstance(value, Simhash):
self.value = value.value
elif isinstance(value, basestring):
self.build_by_text(unicode(value))
elif isinstance(value, collections.Iterable):
self.build_by_features(value)
elif isinstance(value, long):
self.value = value
else:
raise Exception("Bad parameter with type {}".format(type(value)))
def _slide(self, content, width=4):
return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))]
def _tokenize(self, content):
After Change
self.bucket = collections.defaultdict(set)
for i, q in enumerate(objs):
if i % 10000 == 0 or i == count - 1:
self.log.info("%s/%s", i + 1, count)
self.add(*q)
def get_near_dups(self, simhash):
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 5
Instances
Project Name: leonsim/simhash
Commit Name: 6e8898b82f4591fe2256ca56be3c98836fe94b88
Time: 2017-10-31
Author: akellne@users.noreply.github.com
File Name: simhash/__init__.py
Class Name: Simhash
Method Name: __init__
Project Name: biolab/orange3
Commit Name: c073de4002f7a72babc8fd6ecdcd90c141be2c08
Time: 2013-01-21
Author: janez.demsar@fri.uni-lj.si
File Name: Orange/widgets/utils/datacaching.py
Class Name:
Method Name: getCached
Project Name: tensorflow/models
Commit Name: 0270cac779f61a879f74d9495b9156f8f4127478
Time: 2018-05-11
Author: scottzhu@google.com
File Name: official/utils/logs/logger.py
Class Name: BaseBenchmarkLogger
Method Name: log_metric