# python进阶宝典6-正则表达式 (Python Advanced Guide 6: Regular Expressions)

#正则表达式匹配步骤：
# 1）用import re导入正则表达式模块。
# 2）用re.compile()函数创建一个Regex对象（记得使用原始字符串）。
# 3）向Regex对象的search()方法传入想查找的字符串。它返回一个Match对象。
# 4）调用Match对象的group()方法，返回实际匹配文本的字符串。
import re
# Basic workflow: compile a pattern, search a string, read the matched text.
pregex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# search() returns a Match object for the first occurrence only, so the
# second number in the sentence is not reported.
mo = pregex.search('My number is 323-123-3322, 566-767-5555.')
print(f'Phone number found: {mo.group()}')

# Parentheses in the pattern create numbered groups. The r prefix marks a raw
# string, so backslashes reach the regex engine without Python escape handling.
pregex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
# To match literal parentheses around the area code, escape them instead:
# pregex = re.compile(r'(\(\d\d\d\))-(\d\d\d-\d\d\d\d)')
mo = pregex.search('My number is 323-123-3322, 566-767-5555.')
mo.group(1)   # '323'
mo.group(2)   # '123-3322'
mo.group()    # '323-123-3322' (the whole match)
mo.groups()   # ('323', '123-3322')
# Pull the two groups out into separate names.
code1 = mo.group(1)
code2 = mo.group(2)
print(code2)

#正则表达式测试程序： http://regexpal.com/
# Alternative patterns to experiment with (only the uncommented assignment is live):
# batRegex = re.compile(r'Batman|Tina')        # pipe: match one of several alternatives; the first one found is returned
# batRegex = re.compile(r'Bat(wo)?man')        # question mark: the group is optional
# batRegex = re.compile(r'Bat(wo)*man')        # star: zero or more repetitions
# batRegex = re.compile(r'Bat(wo)+man')        # plus: one or more repetitions
# batRegex = re.compile(r'Bat(wo){3,}man')     # braces: specific counts. {3,} 3 or more, {,5} at most 5, {3,5} 3 to 5, {3} exactly 3
# batRegex = re.compile(r'Bat(wo){3,5}man')    # braces are greedy by default: the longest qualifying string is matched
# batRegex = re.compile(r'Bat(wo){3,5}?man')   # a question mark after the braces makes them non-greedy: the shortest qualifying string is matched
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel Batwoman Batwowowowowoman Batwowowoman')
mo.group()    # 'Batmobile'
mo.group(1)   # 'mobile'

# findall() returns every non-overlapping match as a list of strings
# (here the pattern has no groups, so each item is the full match).
batRegex = re.compile(r'wowowoman')
batRegex.findall('Batmobile lost a wheel Batwoman Batwowowowowoman Batwowowoman')
# ['wowowoman', 'wowowoman']

## Character classes
# \d  a digit 0-9 (str patterns also accept other Unicode decimal digits unless re.ASCII is set)
# \D  any character that is not a digit
# \w  any letter, digit, or underscore (think of it as a "word" character)
# \W  any character that is not a letter, digit, or underscore
# \s  whitespace such as a space, tab, or newline
# \S  any character that is not whitespace
# Each assignment below replaces the previous pattern; they are independent examples.
xmasRegex = re.compile(r'\d+\s\w+')        # + means one or more; matches e.g. '23 hens'
xmasRegex = re.compile(r'[a-zA-Z0-9]')     # any one ASCII letter or digit; inside [] most metacharacters are taken literally
xmasRegex = re.compile(r'[aeiouAEIOU]')    # any one vowel, upper or lower case
xmasRegex.findall('Roboxy ai eats baby EIIU')
# ['o', 'o', 'a', 'i', 'e', 'a', 'a', 'E', 'I', 'I', 'U']
xmasRegex = re.compile(r'[^aeiouAEIOU]')   # ^ inside the brackets negates the class: any one non-vowel character
xmasRegex = re.compile(r'^aeiouAEIOU')     # ^ outside brackets anchors at the start: text beginning with 'aeiouAEIOU'
xmasRegex = re.compile(r'aeiouAEIOU$')     # $ anchors at the end: text ending with 'aeiouAEIOU'
xmasRegex = re.compile(r'\d$')             # text ending with a digit
# Strictly match a string made up only of digits. The + is required: without
# it, ^\d$ would accept exactly one digit and reject '42'.
xmasRegex = re.compile(r'^\d+$')
## Wildcard
atRegex = re.compile(r'.at')      # . (dot) matches any single character except a newline
atRegex = re.compile(r'.*at')     # any text from the start up to an 'at'
atRegex = re.compile(r'<.*?>')    # trailing ? makes .* non-greedy
mo = atRegex.search('<To serve man> for dinner.>')
mo.group()   # '<To serve man>'

atRegex = re.compile(r'<.*>')     # no ?: greedy, runs to the last '>'
mo = atRegex.search('<To serve man> for dinner.>')
mo.group()   # '<To serve man> for dinner.>'

atRegex = re.compile(r'.*')               # .* matches any text, newlines excluded
atRegex = re.compile(r'.*', re.S)         # with re.DOTALL (alias re.S) the dot also matches newlines

atRegex = re.compile(r'robat', re.IGNORECASE)   # case-insensitive matching; re.I is the short alias

## The sub() method: replace every match with the given text
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bog.')
# 'CENSORED gave the secret documents to CENSORED.'

# In the replacement string, \1, \2, \3 ... insert the text captured by
# group 1, 2, 3 ... of the match. Here group 1 is the first letter of the name.
namesRegex = re.compile(r'Agent (\w)\w*')
namesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eva knew Agent Bob ws a dd agent.')
# 'A**** told C**** that E**** knew B**** ws a dd agent.'

## re.VERBOSE makes the regex engine ignore whitespace and # comments inside
## the pattern, which lets a long expression be laid out over several lines.
# Triple quotes (''') create the multi-line string.
# Literal parentheses around the area code must be written \( and \); an
# unescaped $ is an end-of-string anchor and would never match there.
# The digit runs are captured so that groups()/findall() expose each part:
# group 2 area code, 4 first three digits, 6 last four digits, 9 extension digits.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code, bare or in parentheses
    (\s|-|\.)?                      # separator
    (\d{3})                         # first 3 digits
    (\s|-|\.)                       # separator
    (\d{4})                         # last 4 digits
    (\s*(ext\.?|x)\s*(\d{2,5}))?    # extension
    )''', re.VERBOSE)

############################################################################
########## 电话号码和Email地址提取程序
###
#1）从剪贴板取得文本
#2）找出文本中所有的电话号码和Email地址
#3）将它们粘贴到剪贴板

#step1:为电话号码创建一个正则表达式
import pyperclip,re
# Phone-number pattern. Literal parentheses around the area code are escaped
# as \( and \) (an unescaped $ would be an end-of-string anchor), and every
# digit run is captured so findall() tuples have fixed positions:
#   [0] whole number, [1] area code, [3] first 3 digits,
#   [5] last 4 digits, [8] extension digits.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code, bare or in parentheses
    (\s|-|\.)?                      # separator
    (\d{3})                         # first 3 digits
    (\s|-|\.)                       # separator
    (\d{4})                         # last 4 digits
    (\s*(ext\.?|x)\s*(\d{2,5}))?    # extension
    )''', re.VERBOSE)

# Step 2: build a regular expression for email addresses
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+               # username
    @                               # @ symbol
    [a-zA-Z0-9.-]+                  # domain name
    (\.[a-zA-Z]{2,4})               # dot-something
    )''', re.VERBOSE)

# Step 3: find every match in the clipboard text
text = str(pyperclip.paste())
print(text)
matches = []
for groups in phoneRegex.findall(text):
    # Normalise to dash-separated form. The area code is optional, so drop it
    # when empty rather than producing a number that starts with '-'.
    phoneNum = '-'.join(part for part in (groups[1], groups[3], groups[5]) if part)
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    # The pattern has groups, so findall() yields tuples; [0] is the full address.
    matches.append(groups[0])

# Step 4: join all matches into one string and copy it to the clipboard
if matches:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')

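# python进阶宝典6-正则表达式 (Python Advanced Guide 6: Regular Expressions)

# 猜你喜欢 ("You may also like" - web page footer text, not part of the script)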