def get_encoding(page):
# Regexes for XML and HTML meta charset declarations
charset_re = re.compile(r"<meta.*?charset=["\"]*(.+?)["\">]", flags=re.I)
pragma_re = re.compile(r"<meta.*?content=["\"]*;?charset=(.+?)["\">]", flags=re.I)
xml_re = re.compile(r"^<\?xml.*?encoding=["\"]*(.+?)["\">]")
declared_encodings = (charset_re.findall(page) +
After Change
# Fall back to chardet if the declared encodings fail.
# Strip all HTML tags, leaving only the text for chardet to inspect.
text = re.sub(b"(\s*</?[^>]*>)+\s*", b" ", page).strip()
enc = "utf-8"
if len(text) < 10:
return enc // can"t guess