版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u013630017/article/details/78469254
def unicode_it(html):
    """Decode a raw HTML byte string to unicode using a detected encoding.

    The encoding is guessed with ``chardet.detect``. A guess of ``gb2312``
    is replaced by ``gbk`` when ``charset_pattern`` finds no charset in the
    document, or when the first charset it finds is ``gbk``.

    This code targets Python 2, where ``str`` is the byte-string type.

    Args:
        html: Page content. A ``str`` (byte string) is decoded; any other
            value is handed back untouched.

    Returns:
        The decoded text when the detection confidence is at least 0.7;
        ``None`` when ``html`` is a byte string but no encoding was detected
        with that confidence; ``html`` itself when it is not a ``str``.

    Raises:
        UnicodeDecodeError: If ``html`` contains bytes that are invalid in
            the chosen encoding.
        LookupError: If the detected encoding name is unknown to Python.
    """
    import logging

    # NOTE(review): the original indentation was lost. The trailing ``else``
    # is assumed to pair with the ``isinstance`` check, so low-confidence
    # detection yields None rather than the raw bytes -- confirm with callers.
    if not isinstance(html, str):
        return html

    guess = chardet.detect(html)
    encoding = guess["encoding"]
    # Check the encoding first: without this guard a missing encoding would
    # crash on ``.lower()`` below instead of reporting "could not detect".
    if not encoding or guess["confidence"] < 0.7:
        return None

    if encoding.lower() == "gb2312":
        # GBK is a superset of GB2312, so prefer it unless the document
        # explicitly declares some other charset.
        declared = charset_pattern.findall(html)
        if not declared or declared[0].lower().strip() == "gbk":
            encoding = "gbk"

    # Diagnostic only: goes to the logging system instead of stdout.
    logging.getLogger(__name__).debug("detected encoding: %s", encoding)
    return html.decode(encoding)