Python——re

#正则表达式regex
#特殊符号和字符 ---> 元字符

(?:pattern)	非获取匹配，匹配pattern但不获取匹配结果，不进行存储供以后使用。这在使用或字符“(\|)”来组合一个模式的各个部分时很有用。例如“industr(?:y\|ies)”就是一个比“industry\|industries”更简略的表达式。
(?=pattern)	非获取匹配，正向肯定预查，在任何匹配pattern的字符串开始处匹配查找字符串，该匹配不需要获取供以后使用。例如，“Windows(?=95\|98\|NT\|2000)”能匹配“Windows2000”中的“Windows”，但不能匹配“Windows3.1”中的“Windows”。预查不消耗字符，也就是说，在一个匹配发生后，在最后一次匹配之后立即开始下一次匹配的搜索，而不是从包含预查的字符之后开始。
(?!pattern)	非获取匹配，正向否定预查，在任何不匹配pattern的字符串开始处匹配查找字符串，该匹配不需要获取供以后使用。例如“Windows(?!95\|98\|NT\|2000)”能匹配“Windows3.1”中的“Windows”，但不能匹配“Windows2000”中的“Windows”。
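(?<=pattern)	非获取匹配，反向肯定预查，与正向肯定预查类似，只是方向相反。例如，“(?<=95\|98\|NT\|2000)Windows”能匹配“2000Windows”中的“Windows”，但不能匹配“3.1Windows”中的“Windows”。注意：Python的re模块要求反向预查的模式宽度固定，此例各分支长度不同(2和4)，会报错“look-behind requires fixed-width pattern”；在Python中可写成“(?:(?<=95)\|(?<=98)\|(?<=NT)\|(?<=2000))Windows”。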
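(?<!pattern)	非获取匹配，反向否定预查，与正向否定预查类似，只是方向相反。例如“(?<!95\|98\|NT\|2000)Windows”能匹配“3.1Windows”中的“Windows”，但不能匹配“2000Windows”中的“Windows”。注意：Python的re模块要求反向预查的模式宽度固定，此例各分支长度不同(2和4)，会报错“look-behind requires fixed-width pattern”；在Python中可写成“(?<!95)(?<!98)(?<!NT)(?<!2000)Windows”。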

正则表达式基础知识
通配符	含义	正则示例	匹配结果
reg1 \| reg2	匹配正则表达式reg1或reg2	foo \| bar	foo
.	匹配任何字符(\n除外)	a.a	aba
^	匹配字符串起始部分	^a	ab....
$	匹配字符串终止部分	.txt$	a.txt
*	匹配0次或者多次前面出现的正则表达式	a*	aaaaa
+	匹配1次或者多次前面出现的正则表达式	[a-z]+	aasx
?	匹配0次或者1次前面出现的正则表达式	first?	first
{N}	匹配N次前面出现的正则表达式	.*\.c{2}	first.cc abc.cc
{M,N}	匹配M~N次前面出现的正则表达式	.*\.c{0,1}	one.c
[...]	匹配来自字符集的任意单个字符	[abc]	b
[...x-y...]	匹配x~y范围中的任意单个字符	[0-9]	9
[^...]	匹配不在此字符集中的任意单个字符	[^0-9]	a
(*\|+\|?\|{})?	匹配上面重复限定符(*、+、?、{})的非贪婪版本，即尽可能少地匹配	<.+?>	<a>(在“<a><b>”中)
(...)	匹配封闭的正则表达式，然后另存为子组	([0-1][0-9])?	12
\d	匹配任何十进制数字	\d.txt	1.txt
\w	匹配任何字母数字字符(含下划线)	\w{2}txt	12txt
\s	匹配任何空格字符	a\sb	a b
\b	匹配任何单词边界(零宽度，不消耗字符)	\bThe\b	The
\N	匹配已保存的子组	([0-9])\1	11
\.	匹配"."这个字符	a\.txt	a.txt

常用正则表达式
正则表达式	描述	匹配结果
\d+(\.\d*)?	任意整数和浮点数	0.004 2 75.
\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b	首字母只能大写	Boo Foo
^http:\/\/([\w-]+(\.[\w-]+)+(\/[\w-.\/\?%&=\u4e00-\u9fa5]*)?)?$	验证网址	http://www.baidu.com/?id=1
^[\u4e00-\u9fa5]{0,}$	验证汉字	汉字汉字
\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*	验证电子邮件	user@example.com
^[1-9]([0-9]{16}\|[0-9]{13})[xX0-9]$	验证身份证	14525419951215445X
^13[0-9]{1}[0-9]{8}\|^15[9]{1}[0-9]{8}	验证手机号	138459572***
^(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9])\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[0-9])$	验证IP	192.168.1.1
^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com\|org\|net\|cn)$	验证域名	baidu.com
^([a-zA-Z]\:\|\\)\$[^\\]+\$[^\/:?"<>\|]+\.txt(l)?$	验证文件路径	C:\user\wo
<(.*)>(.*)<\/(.*)>\|<(.*)\/>	HTML标签匹配	<a>xxxx</a>

#re模块
#常用的方法
compile(pattern, flags = 0) 　　　　　　使用任何可选的标记来编译正则表达式的模式，然后返回一个正则表达式对象
match(pattern, string, flags = 0)　　　　从字符串的起始位置开始，使用带有可选标记的正则表达式的模式来匹配字符串。如果匹配成功，返回匹配对象，否则返回None
search(pattern, string ,flags = 0) 　　　　使用可选标记搜索字符串中第一次出现的正则表达式模式。如果匹配成功，则返回匹配对象，否则返回None
findall(pattern, string[,flags] ) 　　　　    查找字符串中所有(非重叠)出现的正则表达式模式，并返回一个匹配列表
finditer(pattern, string[,flags] ) 　　　　   与findall()相同，但返回的是一个迭代器。对于每一次匹配，迭代器都能返回一个匹配对象
split(pattern, string, maxsplit = 0) 　　　　根据正则表达式的模式分隔符，split函数将字符串分割为列表，返回分割后的列表，最多分割maxsplit次(默认为0，表示分割所有匹配位置)
group(num = 0) 　　　　　　　　  　　匹配对象的方法，返回整个匹配到的字符串(num为0时)，或者编号为num的特定子组

import re

# search() scans the whole string for the pattern. match() would find nothing
# here, because it only tries to match starting at the first character ('a').
m = re.search('foo', 'asdasdfooasd')
if m:
    print(m.group(0))

# Matches either a paired tag such as <a>...</a> or a self-closing tag such
# as <br/>. The pattern has to be a string literal; a raw string keeps the
# backslashes intact for the regex engine.
regex = r'<(.*)>(.*)<\/(.*)>|<(.*)\/>'
m = re.search(regex, "aa<a>aaaa</a>")
# As in the previous example, only search() finds the tag: the text starts
# with "aa", so match() (which is anchored at the start) would return None.
if m is not None:
    print(m.group())

# Two capture groups: "foo" plus one word character, then one more word
# character. The raw string keeps \w from being read as a string escape, and
# the pattern is defined once and passed to match() instead of being repeated.
regex = r'(foo\w)(\w)'
m = re.match(regex, 'fooasdfooasd')
if m is not None:
    print(m.group(1))   # text captured by the first group
    print(m.groups())   # tuple with the text of every group
# Output:
# fooa
# ('fooa', 's')

# findall() returns the matched text of every non-overlapping occurrence.
regex = 'apple'
m = re.findall(regex, 'apple1 apple2 apple3')
# print() belongs at top level; indenting it without an enclosing block is an
# IndentationError.
print(m)
# Output:
# ['apple', 'apple', 'apple']

# finditer() yields one match object per occurrence; collect the matched text
# of each of them into a list.
regex = 'apple'
m = list(map(lambda hit: hit.group(), re.finditer(regex, 'apple1 apple2 apple3')))
print(m)
# Output:
# ['apple', 'apple', 'apple']

# Named "lines" rather than "list" so the built-in list type is not shadowed
# for the rest of the script.
lines = [
    'aaa, bbb ccc',
    'ddd, eee fff',
]
# Split on ", " or on a single space that is followed by three lowercase
# letters (the lookahead checks for that text without consuming it).
for line in lines:
    print(re.split(', |(?= (?:[a-z]{3})) ', line))
# Output:
# ['aaa', 'bbb', 'ccc']
# ['ddd', 'eee', 'fff']

re模块小实例：

__author__ = 'cq'

import  re
from random import randrange,choice,randint
from string import ascii_lowercase as lc
from time import ctime


# Generate the data file
def generate_data():
    """Write 20 to 30 random records to ./data.txt and print each one.

    Every record is one line of the form
    ``<ctime text>::<login>@<domain>.<tld>::<timestamp>-<login len>-<domain len>``.
    The login is 4-7 random lowercase letters; the domain is at least as long
    as the login and at most 12 letters.
    """
    tlds = ('com', 'edu', 'net', 'org', 'gov')
    with open('./data.txt', 'w') as out:
        record_count = randint(20, 30)
        for _ in range(record_count):
            dtint = randint(100000000, 1200000000)  # random timestamp
            dtstr = ctime(dtint)                    # timestamp as readable text

            llen = randrange(4, 8)                  # login length
            login = ''.join([choice(lc) for _ in range(llen)])

            dlen = randrange(llen, 13)              # domain length
            dom = ''.join([choice(lc) for _ in range(dlen)])

            fields = (dtstr, login, dom, choice(tlds), dtint, llen, dlen)
            data_line = "%s::%s@%s.%s::%d-%d-%d\n" % fields
            out.write(data_line)
            # The record already ends with a newline, so print() leaves a
            # blank line after each one.
            print(data_line)



# Match the date part of each record
def match_date():
    """Print a ``(weekday, rest of line)`` tuple for each match in ./data.txt.

    A match starts at a three-letter weekday abbreviation (Mon ... Sun) and
    runs to the end of that line.
    """
    regex = '(Mon|Tue|Wed|Thu|Fri|Sat|Sun)(.*)'
    with open('./data.txt', 'r') as f:
        text = f.read()
    for hit in re.finditer(regex, text):
        print(hit.groups())




# Match the records that fall inside a given time window
def match_time_slot():
    """Print records from ./data.txt dated day 20-31 of a year in 2000-2020.

    Each regex hit is a ``(day, year, rest)`` tuple of strings: a 1-2 digit
    number surrounded by spaces, four digits directly before a ``::``, and
    the remainder of the line after that ``::``. Matching tuples are printed
    unchanged.
    """
    regex = ' ([0-9]{1,2}) .*([0-9]{4})::(.*)'
    with open('./data.txt', 'r') as f:
        hits = re.findall(regex, f.read())
    for hit in hits:
        day, year = int(hit[0]), int(hit[1])
        if 2000 <= year <= 2020 and 20 <= day <= 31:
            print(hit)


# Match the login / domain part of each record
def match_name():
    """Print a ``(login, domain, tld)`` tuple for each address in ./data.txt.

    An address is recognised as ``::`` followed by 2-13 lowercase letters,
    ``@``, 2-13 lowercase letters, a dot, and one of com/edu/net/org/gov.
    """
    # Raw string: in a normal string literal "\." is an invalid escape
    # sequence, which newer Python versions warn about.
    regex = r'::([a-z]{2,13})@([a-z]{2,13})\.(com|edu|net|org|gov)'
    with open('./data.txt', 'r') as f:
        m = re.findall(regex, f.read())
    for i in m:
        print(i)



def main():
    """Generate the data file, then run each matcher under its own banner."""
    generate_data()
    sections = (
        ("\n---------------match_date--------------------\n", match_date),
        ("\n---------------match_time_slot--------------------\n", match_time_slot),
        ("\n---------------match_name--------------------\n", match_name),
    )
    for banner, run_section in sections:
        print(banner)
        run_section()


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

猜你喜欢