问题描述
对某个页面中的日期+时间进行解析
解决方案
requests爬取+正则表达式解析
import re
import requests
# Compiled once at import time. The digit guards on both ends stop the pattern
# from carving a "date-time" out of a longer run of digits
# (e.g. '12020-05-20 13:14:15' or '2020-05-20 13:14:155').
_DATETIME_RE = re.compile(
    r'(?<!\d)\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}(?!\d)'
)


def match_datetime(text):
    '''Extract every "date + time" substring from a piece of text.

    A match has the shape ``YYYY-M-D h:m:s``: a four-digit year, then one- or
    two-digit month, day, hour, minute and second, with exactly one whitespace
    character between the date and the time. Only the shape is checked, not
    whether the values form a real calendar date. Fractional seconds are not
    part of the match.

    :param text: text to search
    :return: list of matched strings in order of appearance; empty if none

    >>> match_datetime('The date is 2020-05-20 13:14:15.477062.')
    ['2020-05-20 13:14:15']
    >>> match_datetime('no timestamp here')
    []
    '''
    return _DATETIME_RE.findall(text)
def extract_datetime(url, headers=None, timeout=10):
    '''Download a page and return every "date + time" string found in it.

    The page is fetched with ``requests.get`` and its decoded body
    (``response.text``) is passed to ``match_datetime``. The HTTP status code
    is not checked, so the body of an error page is searched as well.

    :param url: address of the page
    :param headers: optional dict of request headers forwarded to ``requests.get``
    :param timeout: seconds to wait for the server, forwarded to
        ``requests.get``; pass ``None`` to wait without limit
    :return: whatever ``match_datetime`` returns for the page body

    >>> extract_datetime('https://xercis.blog.csdn.net/article/details/104752851')  # doctest: +SKIP
    ['2020-03-20 15:55:17', '2020-03-20 15:55:17', '2020-03-20 15:55:17']
    '''
    # requests applies no timeout by default, so an unresponsive server would
    # otherwise block this call forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    return match_datetime(response.text)
if __name__ == '__main__':
    # Demo: fetch the same page once with default headers and once with
    # browser-like headers, printing the date-time strings found each time.
    url = 'https://xercis.blog.csdn.net/article/details/104752851'
    headers = {
        'cookie': '',  # fill this in if the request fails
        # Adjacent literals are concatenated into one user-agent string.
        'user-agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        ),
    }
    print(extract_datetime(url))
    print(extract_datetime(url, headers))