1. 匹配单字符
import re
#match 从最开始的位置开始查找匹配,如果第一个字符不匹配则返回
# text = "abc"
# ret = re.match('a', text)
# print(ret.group())
# . 匹配任意的字符, 除了'\n'
# text = '2bc'
# ret1 = re.match('.', text)
# print(ret1.group())
# \d 匹配任意的数字
# text = "2345"
# ret2 = re.match('\d', text)
# print(ret2.group())
# \D 匹配任意非数字
# text = "+S345"
# ret3 = re.match('\D', text)
# print(ret3.group())
# \s 匹配空白字符, \n, \t, \r 空格
# text = "\n2345"
# ret3 = re.match('\s', text)
# print('=' * 20)
# print(ret3.group())
# print('=' * 20)
# \S 匹配非空白字符, \n, \t, \r 空格
# text = "2345"
# ret4 = re.match('\S', text)
# print(ret4.group())
# \w 匹配字母a-z, A-Z 以及数字和下划线w
# text = "ac45"
# ret5 = re.match('\w', text)
# print(ret5.group())
# \W 匹配与\w相反
# text = "+ac45"
# ret6 = re.match('\W', text)
# print(ret6.group())a
# [] 组合匹配中括号中满足一个的条件
# text = "bc"
# ret6 = re.match('[1bc]', text)
# print(ret6.group())
# text = "123"
# ret6 = re.match('[0-9]', text)
# print(ret6.group())
# text = "bc"
# ret6 = re.match('[^0-9]', text)
# print(ret6.group())
text = "bc"
ret6 = re.match('[a-zA-Z0-9]', text)
print(ret6.group())
2. 匹配多字符
import re
# *, 匹配0个或多个字符,匹配条件由前面设置
# text = "+abc"
# result = re.match('\w*', text)
# print(result.group()) #空
# +, 匹配1个或多个字符,匹配条件由前面设置
# text = "abc"
# result = re.match('\w+', text)
# print(result.group()) #abc
# ?, 匹配前一个字符要求的0个或多个字符,匹配条件由前面设置
# text = "1abc"
# result = re.match('\w?', text)
# print(result.group()) #1
# {m}, 匹配m个字符,匹配条件由前面设置
# text = "12abc"
# result = re.match('\w{3}', text)
# print(result.group())
# {m,n}, 匹配m-n之间个字符,匹配条件由前面设置
text = "12abc"
result = re.match('\w{1,3}', text)#注意{m,n}中间没有空格,加上空格会报错
print(result.group())
3. 开始 / 结束 / 贪婪 / 非贪婪
import re
# ^ 以...开头
# text = "hello world"
# result = re.search('^hello', text)
# print(result.group())
#$ 以..结尾
# text = "hello world"
# result = re.search('world$', text)
# print(result.group())
#
# text = ""
# result = re.search('^$', text)
# print(result.group())
# | 匹配多个表达式
# 贪婪和非贪婪
# text = "12345"
# result = re.search('\d+?', text)#非贪婪模式, 满足条件下尽可能少的匹配字符
# print(result.group())
#案例
# text = "<h1>这是个大标题</h1>"
# result1 = re.search('<.+>', text) #贪婪模式,满足条件下尽可能多的匹配字符
# result2 = re.search('<.+?>', text)#非贪婪模式, 满足条件下尽可能少的匹配字符
# print(result1.group())
# print(result2.group())
#案例: 验证一个字符是不是0-100之间的数字0,1, 99, 100
text = "100"
result = re.match('0$|[1-9]\d?$|100$', text)
print(result.group())
4. 转义字符和原生字符
import re
# 转义字符
# text = "hello\\nworld"
# print(text)
# 正则表达式中的转义字符
# text = r"hello\nworld"
# print(text)
# 正则表达式中的转义字符
# text = "apple price is $99, orange price is $88"
# result = re.findall("\$\d+", text)
# print(result)
# 原生字符串 和正则表达式
# 正则表达式的字符串解析规则
# 先把字符串放到python层面解析,之后放到正则表达式解析
text = "\cba a"
result = re.match("\\\\c", text) # \\\\c (python层面) > \\c (正则表达式) > \c
result2 = re.match(r'\\c', text)
print(result.group())
print(result2.group())
5. 分组
import re
#分组
text = "apple price is $99, orange price is $88"
result = re.search('.+(\$\d+).+(\$\d+)', text)
print(result.group(0)) # / group() 匹配整个分组 apple price is $99, orange price is $88
print(result.group(1)) #匹配第一个分组 $99
print(result.group(2)) #匹配第二个分组... $88
print(result.groups()) # 获取所有分组('$99', '$88')
6. 常用函数
6.1 sub
#sub 根据规则替换其他字符串
text = "nuhao zhongque, lala hello world"
# new_text = text.replace(" ", "\n")
# print(new_text)
new_text = re.sub(r' |,', '\n', text)
print(new_text)
# nuhao
# zhongque
#
# lala
# hello
# world
html = '''
<div class="job-detail">
<p>工作职责:<br>1.从事计算机视觉相关服务、以及平台研发,以及现有服务维护及优化;<br>2.参与研发过程中的需求分析和架构设计。<br>任职资格:<br>1.三年及以上python开发经验, 熟悉django flask等常见开发框架;<br>2. 计算机相关专业,具备扎实的编程功底,熟悉常用数据结构和算法、设计模式;<br>3. 具备独立数据库设计,以及系统设计能力,如mongo、redis、mysql,缓存、消息队列等相关技术;<br>4. 具有良好的编程思想、沟通、团队合作精神、优秀的分析问题和解决问题的能力;具备强烈的责任心。</p>
</div>
'''
new_html = re.sub(r'<.+?>', '', html)
print(new_html)
6.2 split
text = "nihao zhongguo, hello world"
result = re.split(r' |,', text)
print(result) #['nihao', 'zhongguo', '', 'hello', 'world']
6.3 compile
# compile 编译正则表达式
text = "apple price is 34.56"
result = re.compile(r'\d+\.?\d*')
r = re.search(result, text)
print(r.group()) #34.56
6.4 VERBOSE
# compile 编译正则表达式
text = "apple price is 34.56"
result = re.compile(r'''
\d+ #整数部分
\.? #小数点
\d* #小数部分
''', re.VERBOSE)#如果在正则表达式中加注释,需要在函数最后加re.VERBOSE
r = re.search(result, text)
print(r.group())
6.5 findall
#findall
# text = "apple price is $99, orange price is $88"
# result = re.findall(r'\$\d+', text) # findall()返回是一个list
# print(result) #['$99', '$88']