#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
import random
import re # 正则库
'''
正则匹配-抓取静态网页(内涵吧)的图片
Python3.7.0
'''
class Spider:
    """Interactive crawler that saves images from neihan8.com listing pages.

    Each call to :meth:`loadPage` fetches one listing page
    (``/mm/index_<page>.html``), extracts the image URLs with a regular
    expression and saves every image into the current working directory.
    :meth:`startWork` drives the crawl one page at a time, prompting on
    stdin between pages.

    Attributes:
        page: Number of the next listing page to fetch (starts at 2).
        switch: Loop flag; the crawl keeps going while it is true.
    """

    # Listing-page URL template; ``{page}`` is replaced by the page number.
    PAGE_URL = "https://www.neihan8.com/mm/index_{page}.html"

    # Headers sent with the listing-page request.
    HEADERS = {
        "User-Agent": (
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) "
            "Presto/2.8.131 Version/11.11"
        )
    }

    # Matches one `<a class="img" href="/mm/….html"> <img src="…"></a>`
    # entry and captures only the image `src`.  Raw string so `\s` reaches
    # the regex engine untouched; the dot before "html" is escaped so it
    # matches a literal dot.  re.S lets `.` span newlines.
    _IMAGE_PATTERN = re.compile(
        r'<a\sclass="img"\shref="/mm/.*?\.html">\s<img\ssrc="(.*?)"></a>',
        re.S,
    )

    def __init__(self):
        """Start at listing page 2 with the crawl loop enabled."""
        self.page = 2
        self.switch = True

    @classmethod
    def _extract_image_urls(cls, html, base_url):
        """Return the absolute image URLs found in *html*, in document order.

        Args:
            html: Source text of a listing page.
            base_url: URL the page was fetched from; relative and
                protocol-relative ``src`` values are resolved against it.

        Returns:
            A list of URL strings; empty ``src`` values are skipped.
        """
        from urllib.parse import urljoin

        sources = (src.strip() for src in cls._IMAGE_PATTERN.findall(html))
        return [urljoin(base_url, src) for src in sources if src]

    def loadPage(self):
        """Download the current listing page and save every image on it.

        A failure to fetch the page, or to download an individual image, is
        reported on stdout and skipped so that one bad URL does not end the
        whole interactive session.
        """
        url = self.PAGE_URL.format(page=self.page)
        request = urllib.request.Request(url, headers=self.HEADERS)
        try:
            # The context manager closes the HTTP response even on error.
            with urllib.request.urlopen(request) as response:
                # Only ASCII URLs are extracted, so undecodable bytes are
                # replaced instead of aborting the page.
                html = response.read().decode("utf-8", errors="replace")
        except OSError as err:  # urllib.error.URLError derives from OSError
            print(f"页面加载失败: {url} ({err})")
            return

        for image_url in self._extract_image_urls(html, url):
            print(image_url)
            try:
                self.download_web_image(image_url)
            except (OSError, ValueError) as err:
                # ValueError covers URLs urllib cannot interpret.
                print(f"图片下载失败: {image_url} ({err})")

    def download_web_image(self, url):
        """Save the image at *url* into the current working directory.

        Args:
            url: Absolute URL of the image.

        Returns:
            The name of the file that was written.

        Raises:
            OSError: If the download or the file write fails (includes
                ``urllib.error.URLError``).
            ValueError: If *url* is not a URL urllib can open.
        """
        # NOTE(review): the name is random, so two downloads can collide and
        # overwrite each other, and ".jpg" is appended whatever the real
        # image type is.
        name = random.randrange(1, 10000000000)
        full_name = f"{name}.jpg"
        urllib.request.urlretrieve(url, full_name)
        return full_name

    def startWork(self):
        """Run the interactive crawl loop.

        Before each page the user is prompted: pressing Enter crawls the
        next page, typing ``quit`` stops the loop.  The page counter advances
        after every attempt, including failed ones, so an unreachable page is
        not retried.
        """
        while self.switch:
            command = input("如果继续爬取,请按回车(退出输入quit)")
            if command == "quit":
                self.switch = False
                break
            self.loadPage()
            self.page += 1
        print("谢谢使用!")
if __name__ == "__main__":
    # Script entry point: build the crawler and hand control to its
    # interactive loop.  Nothing runs when the module is merely imported.
    Spider().startWork()
# --- Residue from the blog page this script was copied from (not code) ---
# PythonScript_004_正则匹配_抓取静态网页(内涵吧)的图片
# 猜你喜欢
# 转载自 blog.csdn.net/weixin_40022980/article/details/84797480
# 今日推荐
# 周排行