上一篇:【Python3爬虫(五)】【数据解析】【正则表达式】
++++++++++开始线++++++++++++++++++
文章目录
一、 正则表达式
1.1 拆分
re-split_chinese.py
# Demonstrate regex splitting and matching Chinese characters.
import re

# 1. Split the string at every occurrence of 's'.
one = 'asdsfsgsh'
pattern = re.compile(r's')
result1 = pattern.split(one)
# print(result1)
# ['a', 'd', 'f', 'g', 'h']
# (no trailing '' — the string ends with 'h', not with the separator)

# 2. Match runs of Chinese characters.
two = '<a href="https://www.baidu.com/" nslog="normal" nslog-type="10600112" ' \
      'data-href="https://www.baidu.com/s?ie=utf-8&fr=bks0000&wd=">网页是最新版本的,适配移动端</a> '
# \u4e00-\u9fa5 is the Unicode range of CJK Unified Ideographs used for Chinese.
pattern = re.compile(r'[\u4e00-\u9fa5]+')
result2 = pattern.findall(two)
print(result2)
# ['网页是最新版本的', '适配移动端']
1.2 用正则解析网页
news_re.py
import re
import requests

# Fetch the Baidu News homepage and extract (href, mon, title) tuples
# for every news link using a regular expression.
target_url = 'http://news.baidu.com/'
request_headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/70.0.3538.77 Safari/537.36 '
}
page_html = requests.get(target_url, headers=request_headers).content.decode()

# Each news anchor keeps target="_blank" fixed and carries a mon="..."
# attribute, so the three lazy groups capture (url, mon, title) per link.
link_pattern = re.compile('<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>')
news_items = link_pattern.findall(page_html)
print(news_items)

# To verify the request succeeded, dump the raw HTML and open it locally:
# with open('02news.html', 'w') as f:
#     f.write(page_html)
++++++++++结束线++++++++++++++++++