-
正则
"""Regex quantifier demo: greedy vs. non-greedy matching, plus group back-references.

The quantifiers * + ? {} are greedy by default: they match the longest text
that still lets the whole pattern succeed.  Appending ? (*? +? ?? {}?) makes
them non-greedy: they match the shortest such text.  E.g. r'ab+?' against
'abbbbb...' yields 'ab' instead of 'abbbbb'.
"""
import re

# Non-greedy +? stops after a single 'b'.
s = 'abbbbbHello'
result = re.match(r'ab+?', s)
print(result.group())

# Group references: \number for numbered groups; (?P<name>...) declares a
# named group and (?P=name) back-references it inside the same pattern.
s = '<div><a href="http://www.baidu.com">百度</a></div>123'

# Numbered-group variant (kept for reference):
# result = re.match(r'<(.+)><(.+) href="(.+?)">(.+?)</\2></\1>', s)
# print(result.group(1))
# print(result.group(2))
# print(result.group(3))
# print(result.group(4))

print('————————————————————————————————————')

# Named-group variant.  (?P=e2) and (?P=e1) are back-references, NOT new
# capture groups, so the trailing (\d+) is group 5.
result = re.match(r'<(?P<e1>.+)><(?P<e2>.+) href="(.+?)">(.+?)</(?P=e2)></(?P=e1)>(\d+)', s)
print(result)
print(result.group(1))
print(result.group(2))
print(result.group(3))
print(result.group(4))
print(result.group(5))
-
简单爬虫
"""Mini crawler: regex + requests.

A regular expression pulls every .jpg image URL out of a blob of text, then
``requests`` acts as a browser (spoofed User-Agent header) to fetch each
image and save it under the local ``images/`` directory.
"""
import os
import re

# Browser-like User-Agent so image hosts do not reject the request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36')

# Sample text containing the image links (would normally be a fetched page).
ele = '''
http://n.sinaimg.cn/sinacn17/213/w1680h933/20180710/0273-hezpzwu8730048.jpg
https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1578368718950&di=99e6f9921699450fe2da48e4ae90c51b&imgtype=0&src=http%3A%2F%2Fp2.qhimgs4.com%2Ft0128307802c64fd817.jpg
http://img0.imgtn.bdimg.com/it/u=4250364844,2026637142&fm=26&gp=0.jpg
'''


def extract_image_urls(text):
    """Return every http/https .jpg URL found in *text*, in order of appearance.

    Bug fix: the original pattern '(http://.+?jpg)' silently skipped https
    links (e.g. the second sample URL above); 'https?://' covers both
    schemes, and the escaped '\\.jpg' anchors on the real file extension.
    """
    return re.findall(r'(https?://.+?\.jpg)', text)


def download_images(urls, dest_dir='images'):
    """Fetch each URL and save it into *dest_dir*, named after its last path segment."""
    import requests  # local import: only needed when actually downloading

    for image in urls:
        response = requests.get(image, headers={'User-Agent': USER_AGENT})
        filename = os.path.split(image)[1]
        # Binary mode: the payload is raw image bytes, not text.
        with open(os.path.join(dest_dir, filename), 'wb') as ws:
            ws.write(response.content)


if __name__ == '__main__':
    imagelist = extract_image_urls(ele)
    print(imagelist)
    download_images(imagelist)
正则+简单爬虫
猜你喜欢
转载自blog.csdn.net/piduocheng0577/article/details/105107132
今日推荐
周排行