def open_in_browser(html):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it. Note that this does not delete the file after
    use. This is mainly meant for debugging.
    """
    import os
    import webbrowser
    import tempfile

    # Create a temporary HTML file
    handle, fn = tempfile.mkstemp(suffix=".html")
    # Open it for writing
    f = os.fdopen(handle, "wb")
    # Write the HTML text
    try:
        f.write(b"<meta charset='UTF-8' />")
        f.write(html.encode("utf-8"))
    finally:
        # we leak the file itself here, but we should at least close it
        f.close()
    # Build the file:// URL from the file path
    url = "file://" + fn.replace(os.path.sep, "/")
    # Let the browser open it
    webbrowser.open(url)
    return url
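
As a quick usage sketch (hypothetical, not part of the library source), any HTML string can be handed to it:

# Hypothetical example: write a throwaway page and open it for inspection.
url = open_in_browser("<h1>标题</h1><p>Hello, readability!</p>")
print(url)  # e.g. file:///tmp/tmpab12cd34.html
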
cleaner.py
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re

from lxml.html.clean import Cleaner

# Nuisance attributes
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
# Matches single-quoted text
single_quoted = "'[^']+'"
# Matches double-quoted text
double_quoted = '"[^"]+"'
# Matches anything that is not a space, quote, or tag delimiter
non_space = "[^ \"'>]+"
# Matches a tag that carries one of the nuisance attributes
htmlstrip = re.compile(
    "<"  # open
    "([^>]+) "  # prefix
    "(?:%s) *" % ("|".join(bad_attrs),)  # undesirable attributes
    + "= *(?:%s|%s|%s)"  # value
    % (non_space, single_quoted, double_quoted)
    + "([^>]*)"  # postfix
    ">",  # end
    re.I,
)


def clean_attributes(html):
    # While a tag with a nuisance attribute is found,
    while htmlstrip.search(html):
        # remove that attribute,
        html = htmlstrip.sub("<\\1\\2>", html)
    # until none are left
    return html
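
A hypothetical input shows what the loop strips: width and style go, while cellpadding (not in bad_attrs) survives.

# Hypothetical example, not part of the library source.
dirty = '<table width="100%" style="color: red" cellpadding="2"><tr><td>x</td></tr></table>'
print(clean_attributes(dirty))
# -> <table cellpadding="2"><tr><td>x</td></tr></table>
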
def normalize_spaces(s):
    # Return the empty string if `s` is empty
    if not s:
        return ""
    # Replace runs of whitespace `\s+` with a single space `\x20`
    return " ".join(s.split())


# Build the tag sanitizer from lxml's `Cleaner`
html_cleaner = Cleaner(
    # Remove `<script>` tags
    scripts=True,
    # Remove `onXXX` attributes
    javascript=True,
    # Remove comment nodes
    comments=True,
    # Remove `<style>` tags
    style=True,
    # Remove `<link>` tags
    links=True,
    # Keep `<meta>` tags
    meta=False,
    # Do not add `nofollow` to links
    add_nofollow=False,
    # Do not normalize the `<html> <head> <title>` structure
    page_structure=False,
    # Remove processing-instruction nodes
    processing_instructions=True,
    # Keep `<embed>` tags
    embedded=False,
    # Keep `<iframe>` tags
    frames=False,
    # Keep `<form>` tags and their controls
    forms=False,
    # Keep 'blink' and 'marquee' tags
    annoying_tags=False,
    # No custom list of tags to remove
    remove_tags=None,
    # Keep unknown tags
    remove_unknown_tags=False,
    # Keep unknown attributes
    safe_attrs_only=False,
)
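
A minimal sketch of the cleaner in action, on an invented document; Cleaner.clean_html accepts a string and returns the cleaned string.

# Hypothetical example: scripts, comments, and on* attributes are dropped,
# while the page structure is left alone.
cleaned = html_cleaner.clean_html(
    "<html><head><script>alert(1)</script></head>"
    "<body><!-- ad --><p onclick='x()'>text</p></body></html>"
)
print(cleaned)  # roughly: <html><head></head><body><p>text</p></body></html>
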
debug.py
import re

# FIXME: use with caution, can leak memory
uids = {}
uids_document = None


# Get a node's description text
def describe_node(node):
    global uids
    if node is None:
        return ""
    # If the node has no tag name, return a placeholder
    if not hasattr(node, "tag"):
        return "[%s]" % type(node)
    name = node.tag
    # Get the node's ID and class names, in selector form,
    # and append them to the name
    if node.get("id", ""):
        name += "#" + node.get("id")
    if node.get("class", "").strip():
        name += "." + ".".join(node.get("class").split())
    # If the node is a DIV and has an ID or a class,
    # drop the "div" from the description
    if name[:4] in ["div#", "div."]:
        name = name[3:]
    # For these four names,
    if name in ["tr", "td", "div", "p"]:
        # assign the node an auto-incrementing UID and cache it
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        # then append the UID to the description
        name += "{%02d}" % uid
    return name
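
A hypothetical sketch of the descriptions this produces, on elements built with lxml:

# Hypothetical examples, not part of the library source.
import lxml.html

print(describe_node(lxml.html.fromstring('<div class="post body">x</div>')))
# -> .post.body  (the "div" prefix is dropped; a classed div gets no UID)
print(describe_node(lxml.html.fromstring("<p>x</p>")))
# -> p{01}  (a bare p gets an auto-incrementing UID)
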
# Get a node's description text, prefixed with a given number of ancestors
def describe(node, depth=1):
    global uids, uids_document
    # Check whether `uids_document` is this tree's root;
    # if not, reset it and `uids`
    doc = node.getroottree().getroot()
    if doc != uids_document:
        uids = {}
        uids_document = doc

    # return repr(NodeRepr(node))
    parent = ""
    # If depth remains and the node has a parent,
    if depth and node.getparent() is not None:
        # recursively get the parent's description text,
        parent = describe(node.getparent(), depth=depth - 1) + ">"
    # then join the parent's description with this node's
    return parent + describe_node(node)


RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)


# Get a short excerpt of a node's content
def text_content(elem, length=40):
    # Collapse whitespace and strip every \r
    content = RE_COLLAPSE_WHITESPACES.sub(" ", elem.text_content().replace("\r", ""))
    # If the content is shorter than the limit, return it as-is
    if len(content) < length:
        return content
    # Otherwise truncate it and append an ellipsis
    return content[:length] + "..."
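
Continuing the sketch, describe prefixes ancestor descriptions and text_content truncates long text (input again invented):

# Hypothetical example, not part of the library source.
import lxml.html

doc = lxml.html.fromstring("<div id='main'><p>Some long paragraph text</p></div>")
p = doc.find(".//p")
print(describe(p))                 # -> #main>p{01}
print(text_content(p, length=10))  # -> Some long ...
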
encoding.py
import re

try:
    # cchardet is a faster drop-in replacement; alias it so that the
    # `chardet.detect()` call below works whichever library is installed
    import cchardet as chardet
except ImportError:
    import chardet
import sys

# Patterns for the three declarations that can carry an encoding:
# `<meta charset>`, the `<meta content>` pragma, and `<?xml ?>`.
# The page is handled as bytes, so the patterns are bytes too.
RE_CHARSET = re.compile(rb'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(rb'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(rb'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

CHARSETS = {
    "big5": "big5hkscs",
    "gb2312": "gb18030",
    "ascii": "utf-8",
    "maccyrillic": "cp1251",
    "win1251": "cp1251",
    "win-1251": "cp1251",
    "windows-1251": "cp1251",
}


# Replace the input encoding with its superset by looking it up in the table
def fix_charset(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger
    charset. Created because of issues with Chinese websites"""
    encoding = encoding.lower()
    return CHARSETS.get(encoding, encoding)


def get_encoding(page):
    # Regex for XML and HTML Meta charset declaration
    # Collect every encoding declared in the page
    declared_encodings = (
        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
    )
    # Try any declared encodings
    for declared_encoding in declared_encodings:
        try:
            # On Python 3, convert the byte string to `str` first
            if sys.version_info[0] == 3:
                # declared_encoding will actually be bytes but .decode() only
                # accepts `str` type. Decode blindly with ascii because no one should
                # ever use non-ascii characters in the name of an encoding.
                declared_encoding = declared_encoding.decode("ascii", "replace")
            encoding = fix_charset(declared_encoding)
            # Now let's decode the page
            page.decode(encoding)
            # It worked!
            return encoding
        except UnicodeDecodeError:
            pass

    # Fallback to chardet if declared encodings fail
    # Remove all HTML tags, and leave only text for chardet
    text = re.sub(rb"(\s*</?[^>]*>)+\s*", b" ", page).strip()
    # With fewer than 10 bytes of text we can't guess; return the default UTF-8
    enc = "utf-8"
    if len(text) < 10:
        return enc  # can't guess
    # Guess the encoding
    res = chardet.detect(text)
    # If the guess fails, fall back to UTF-8
    enc = res["encoding"] or "utf-8"
    # print '->', enc, "%.2f" % res['confidence']
    # Normalize the encoding name
    enc = fix_charset(enc)
    return enc
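
A quick sketch of both paths, with invented byte strings: a declared charset that gets widened via CHARSETS, and an undeclared page left to chardet.

# Hypothetical examples, not part of the library source.
# 1. The declared gb2312 is widened to its superset gb18030.
page = '<meta charset="gb2312"><p>你好,世界</p>'.encode("gb18030")
print(get_encoding(page))  # -> gb18030

# 2. No declaration: tags are stripped and chardet guesses from the text.
page = "<p>Привет, мир! Это просто пример текста.</p>".encode("cp1251")
print(get_encoding(page))  # e.g. windows-1251, normalized to cp1251
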