# -*- coding: utf-8 -*-
__author__ = '木之易'
__date__ = '2018/8/7 20:17'
# from urllib.request import urlretrieve
from urllib import request
import re, os
class WeddingImageSpider(object):
    """Crawl a paginated listing on www.mmonly.cc and download its images.

    Pages are fetched with ``urllib.request``, decoded as gb2312 and scanned
    with regular expressions. Images are stored as ``<n>.jpg`` inside a
    directory named after the page's ``<title>``.

    Attributes:
        url: absolute URL of the page that will be fetched next.
        title: name of the directory the images are written to.
        html: decoded source of the most recently fetched page.
        headers: HTTP headers sent with every page request.
        count: running number of images processed so far.
    """

    BASE_URL = 'http://www.mmonly.cc'
    DEFAULT_TITLE = 'images'

    # Compiled once at class level instead of on every call.
    _TITLE_RE = re.compile(r'<title>(.*?)</title>', re.S)
    _PAGES_RE = re.compile(r'<div class="pages">(.*?)</div>', re.S)
    _PAGE_LINK_RE = re.compile(r"<li.*?a href='(.*?)'", re.S)
    _IMAGE_RE = re.compile(
        r'<a target="_blank".*?<img.*?alt=.*?src="(.*?)"', re.S)
    # Characters that are illegal (or path separators) in file names on
    # common platforms, plus ASCII control characters.
    _UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')

    def __init__(self, t_id):
        """Build the start URL and prepare the output directory.

        Note that construction performs a network request, because
        ``create_directry`` fetches the start page to read its title.

        :param t_id: site-relative path appended to ``BASE_URL``,
            e.g. ``'/tag/cs/'``.
        """
        self.url = self.BASE_URL + t_id
        self.title = self.DEFAULT_TITLE
        self.html = ''
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
        self.count = 0
        self.create_directry()

    @classmethod
    def _sanitize_title(cls, title):
        """Turn a raw page title into a name that is safe to use as a directory.

        Whitespace runs (including newlines) are collapsed to single spaces,
        characters that cannot appear in a file name are replaced by ``_``,
        and leading/trailing spaces and dots are removed.

        :param title: text captured from the ``<title>`` element.
        :return: the cleaned name, or ``DEFAULT_TITLE`` if nothing is left.
        """
        collapsed = ' '.join(title.split())
        cleaned = cls._UNSAFE_NAME_RE.sub('_', collapsed).strip(' .')
        return cleaned or cls.DEFAULT_TITLE

    @classmethod
    def _next_page_url(cls, html, current_url):
        """Extract the URL of the page to crawl after ``current_url``.

        The pager is the ``<div class="pages">`` element. The page is
        considered to have a successor when the second-to-last pager link
        contains ``'tag'``; the last pager link is then used as the target.
        NOTE(review): which pager entry is really "next page" depends on the
        site's markup, which is not visible here - confirm against a live page.

        :param html: decoded page source.
        :param current_url: absolute URL the source was fetched from.
        :return: absolute URL of the next page, or ``None`` when there is no
            pager, the pager has fewer than two links, the marker is absent,
            or the target is the page that was just crawled.
        """
        pager = cls._PAGES_RE.search(html)
        if not pager:
            return None
        links = cls._PAGE_LINK_RE.findall(pager.group())
        # Guard the negative indexing below against short pagers.
        if len(links) < 2 or 'tag' not in links[-2]:
            return None
        candidate = cls.BASE_URL + links[-1]
        # Moving to the page we are already on would loop forever.
        if candidate == current_url:
            return None
        return candidate

    def create_directry(self):
        """Create the directory that will hold the downloaded images.

        Fetches ``self.url``; if the page has a ``<title>``, its sanitized
        text becomes ``self.title``. Otherwise the current ``self.title`` is
        kept. The directory is created if it does not exist yet.

        :return: None
        """
        self.get_html(self.url)
        found = self._TITLE_RE.search(self.html)
        if found:
            self.title = self._sanitize_title(found.group(1))
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.title, exist_ok=True)

    def get_total(self):
        """Advance ``self.url`` to the next page if there is one.

        Re-fetches ``self.url`` first, so the method also works when it is
        called on its own.

        :return: True if ``self.url`` was moved to another page, else False.
        """
        self.get_html(self.url)
        next_url = self._next_page_url(self.html, self.url)
        if next_url is None:
            return False
        self.url = next_url
        return True

    def get_html(self, url):
        """Fetch ``url`` and store its decoded source in ``self.html``.

        The body is decoded as gb2312; undecodable bytes are dropped.

        :param url: absolute URL to request.
        :raises urllib.error.URLError: if the request fails.
        """
        req = request.Request(url=url, headers=self.headers)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req) as response:
            raw = response.read()
        self.html = raw.decode('gb2312', 'ignore')

    def parser_html(self):
        """Download every image referenced in ``self.html``.

        Each matched link increments ``self.count`` and is saved as
        ``<count>.jpg`` under ``self.title``. A link that cannot be
        downloaded is reported and skipped so that one bad image does not
        abort the whole crawl.
        """
        for link in self._IMAGE_RE.findall(self.html):
            self.count += 1
            print('正在下载第%s张图片' % self.count)
            path = os.path.join(self.title, '%s.jpg' % self.count)
            try:
                request.urlretrieve(link, path)
            except (OSError, ValueError) as err:
                # OSError covers URLError / HTTP / file errors;
                # ValueError is raised for malformed (e.g. relative) links.
                print('第%s张图片下载失败: %s' % (self.count, err))

    def run(self):
        """Crawl page after page until ``get_total`` reports no next page."""
        page = 0
        while True:
            self.get_html(self.url)
            page += 1
            print('.........正在爬取第%s页数据,请稍后........' % page)
            self.parser_html()
            if not self.get_total():
                print('图片爬取完毕')
                break
if __name__ == '__main__':
    # Script entry point: crawl the '/tag/cs/' listing.
    # Another site-relative tag path (e.g. '/tag/hb2/') can be passed instead.
    WeddingImageSpider('/tag/cs/').run()
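# Article title: crawling large numbers of model photos with regular
# expressions (the output folder is created automatically and the images
# are saved into it).
# Reposted from blog.csdn.net/A_fool_Program_ape/article/details/81569735
# (Remaining lines were blog navigation labels: "You may also like",
# "Today's picks", "Weekly ranking".)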