# Decorator: add the page title to the dict produced by a page fetcher.
def requestsTextDecorator(func):
    """Wrap a page fetcher so that its result also carries the page title.

    ``func`` must return a mapping with the keys ``'picBool'`` and
    ``'content'`` (the HTML text). The wrapped function returns a new dict
    with the keys ``'picBool'``, ``'title'`` and ``'content'``; the title is
    the stripped text of the first ``<title>`` element found in the content.

    Raises:
        ValueError: if the content has no ``<title>`` element.
        Exception: if ``func`` hands back an exception instance instead of a
            mapping, that exception is raised here rather than passed on as
            if it were page data.
    """
    import functools
    import re

    # DOTALL lets a title span several lines; IGNORECASE accepts <TITLE>;
    # [^>]* tolerates attributes on the opening tag.
    title_pattern = re.compile(r'<title\b[^>]*>(.*?)</title>',
                               re.IGNORECASE | re.DOTALL)

    @functools.wraps(func)
    def getHtml(*args, **kwargs):
        data = func(*args, **kwargs)
        # A fetcher may report failure by returning the exception object;
        # surface it instead of failing later on an unrelated subscript error.
        if isinstance(data, Exception):
            raise data
        found = title_pattern.search(data['content'])
        if found is None:
            raise ValueError('no <title> element found in page content')
        return {
            'picBool': data['picBool'],
            'title': found.group(1).strip(),
            'content': data['content'],
        }

    return getHtml
# Decorator: collect the image links or anchor links found on a fetched page.
def requestsLinksDecorator(func):
    """Wrap a page fetcher so that it returns the page title and its links.

    ``func`` must return a mapping with the keys ``'picBool'``, ``'title'``
    and ``'content'``. When ``data['picBool']`` is truthy the ``src`` of
    ``<img>`` tags is collected, otherwise the ``href`` of ``<a>`` tags.

    The page URL is taken from the first positional argument of the call, or
    from the ``url`` keyword argument when no positional argument is given.
    Relative links are resolved against that URL. Absolute links are kept
    only when they contain the page URL and differ from it; everything else
    (other sites, ``javascript:``/``mailto:`` targets, the page itself) is
    dropped.

    Returns (from the wrapped function):
        ``{'title': <page title>, 'links': [<resolved link>, ...]}``

    Raises:
        Exception: if ``func`` hands back an exception instance instead of a
            mapping, that exception is raised.
        KeyError: if the call supplies neither a positional URL nor ``url=``.
    """
    import functools
    import re
    from urllib.parse import urljoin, urlsplit

    @functools.wraps(func)
    def getLinks(*args, **kwargs):
        data = func(*args, **kwargs)
        # A fetcher may report failure by returning the exception object;
        # raise it so the caller sees the real cause.
        if isinstance(data, Exception):
            raise data

        base = args[0] if args else kwargs['url']
        if data['picBool']:
            pattern = '<img src="(.*?)".*?>'
        else:
            pattern = '<a href="(.*?)".*?>'

        result_links = []
        for link in re.findall(pattern, data['content']):
            try:
                parts = urlsplit(link)
                target = urljoin(base, link)
            except ValueError:
                # Malformed URL (e.g. a broken IPv6 literal): nothing usable.
                continue
            is_absolute = bool(parts.scheme or parts.netloc)
            # Absolute links must point below the page URL and not be the
            # page itself; relative links are on the same site by definition.
            if is_absolute and (target == base or base not in target):
                continue
            result_links.append(target)
        return {'title': data['title'], 'links': result_links}

    return getLinks
# Thin wrapper around the requests library: takes a URL, lets requests guess
# the text encoding, and returns the decoded HTML of the page.
@requestsLinksDecorator
@requestsTextDecorator
def requestsEncapsulation(url, picBool=False, **kwargs):
    """Fetch ``url`` and return its decoded HTML together with ``picBool``.

    Args:
        url: address of the page to download.
        picBool: flag passed through unchanged (as a bool) in the result.
        **kwargs: HTTP request headers, given as keyword arguments. When no
            header is supplied a default ``User-Agent`` is sent.

    Returns:
        ``{'picBool': <bool>, 'content': <page text>}`` before the decorators
        above are applied.

    Raises:
        Exception: with the message '链接打开失败!' when none of the five
            attempts produced an HTTP 200 response. The last network error,
            if any, is attached as the cause.
    """
    import requests

    # Use the caller's headers when given, otherwise the default browser
    # identification.
    headers = kwargs or {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }

    last_error = None
    res = None
    for _ in range(5):
        try:
            res = requests.get(url, headers=headers, timeout=5)
        except requests.RequestException as err:
            # Network-level failure (timeout, connection reset, ...): retry.
            last_error = err
            continue
        if res.status_code == 200:
            break
    else:
        # The loop ran out of attempts without a successful response.
        raise Exception('链接打开失败!') from last_error

    # Decode the body with the encoding detected from its content.
    res.encoding = res.apparent_encoding
    return {'picBool': bool(picBool), 'content': res.text}
# Demo run: fetch one page through the decorated fetcher and print the result.
demo_url = 'http://www.runoob.com/'
data = requestsEncapsulation(demo_url, picBool=False)
print(data)
Python3练习装饰器
猜你喜欢
转载自blog.csdn.net/weixin_43690548/article/details/88835148
今日推荐
周排行