bd334ef20fdccb74d310ca00b1134388645ba0a5,vendor/readability/encoding.py,,get_encoding,#Any#,4
Before Change
if not text.strip() or len(text) < 10:
return enc // can't guess
try:
diff = text.decode(enc, "ignore").encode(enc)
sizes = len(diff), len(text)
if abs(len(text) - len(diff)) < max(sizes) * 0.01: // 99% of utf-8
return enc
except UnicodeDecodeError:
pass
res = chardet.detect(text)
enc = res["encoding"]
After Change
pragma_re = re.compile(r"<meta.*?content=["\"]*;?charset=(.+?)["\">]", flags=re.I)
xml_re = re.compile(r"^<\?xml.*?encoding=["\"]*(.+?)["\">]")
declared_encodings = (charset_re.findall(page) +
pragma_re.findall(page) +
xml_re.findall(page))
// Try any declared encodings
if len(declared_encodings) > 0:
for declared_encoding in declared_encodings:
try:
page.decode(custom_decode(declared_encoding))
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 6
Instances
Project Name: samuelclay/NewsBlur
Commit Name: bd334ef20fdccb74d310ca00b1134388645ba0a5
Time: 2014-07-21
Author: samuel@ofbrooklyn.com
File Name: vendor/readability/encoding.py
Class Name:
Method Name: get_encoding
Project Name: chakki-works/doccano
Commit Name: 49d41416e440926f0a9a8243b4d77f6f5468efe9
Time: 2019-03-12
Author: light.tree.1.13@gmail.com
File Name: app/server/utils.py
Class Name: CoNLLHandler
Method Name: parse
Project Name: sony/nnabla
Commit Name: 86bcfb42aaa66a4a4da18fbba062bcda23d2b135
Time: 2021-03-08
Author: Yoshiyuki.Kobayashi@sony.com
File Name: python/src/nnabla/utils/data_source_implements.py
Class Name: CsvDataSource
Method Name: __init__