# -*- coding:utf-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os
class IvskySpider(object):
    """Crawler for www.ivsky.com: walks category list pages and downloads
    every image into a per-category folder under ./images/."""

    def __init__(self):
        # Site root, prepended to the relative hrefs found in pages.
        self.url = 'http://www.ivsky.com'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'Host': "www.ivsky.com"
        }
        # Body of the most recently fetched page (decoded UTF-8).
        self.html = ''

    def get_html(self, url):
        """Fetch *url* and store the decoded response body in self.html."""
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        self.html = response.read().decode('utf-8')

    def parse_list(self):
        """Parse a category list page from self.html, crawl each category,
        then recurse through the "next page" pagination link."""
        bs = BeautifulSoup(self.html, 'lxml')
        res = bs.select('.ali p a')
        for ele in res:
            href = ele['href']
            title = ele.string
            # Build the absolute detail-page URL and fetch it.
            url = self.url + href
            self.get_html(url)
            # One folder per category, named after its title.
            path = 'images/' + title
            # makedirs creates the parent 'images' directory too; the
            # original os.mkdir raised FileNotFoundError when 'images'
            # did not yet exist. exist_ok replaces the manual check.
            os.makedirs(path, exist_ok=True)
            self.parse_detail(path)
        # Follow the pagination link, if any.
        next_ele = bs.find(class_='page-next')
        if next_ele:
            next_href = next_ele['href']
            # Page number is the trailing "_N" before the extension.
            page = next_href.split('_')[-1].split('.')[0]
            print('正在获取第%s页' % page)
            url = self.url + next_href
            self.get_html(url)
            # Recurse: parse the next list page.
            self.parse_list()
        else:
            print('没有下一页')

    def parse_detail(self, path):
        """Parse a detail page from self.html and download each image
        into *path*, following detail-page pagination."""
        # Thumbnail vs. full-size URLs differ only in /t/ vs /pre/:
        #   http://img.ivsky.com/img/tupian/t/201712/11/limao_nanshi.jpg
        #   http://img.ivsky.com/img/tupian/pre/201712/11/limao_nanshi.jpg
        bs = BeautifulSoup(self.html, 'lxml')
        imgs = bs.select('.pli li div img')
        for img in imgs:
            try:
                src = img['src']
                # Swap /t/ for /pre/ to get the high-resolution image.
                src = src.replace('/t/', '/pre/')
                # File name is the last URL path component.
                name = src.split('/')[-1]
                img_path = path + '/' + name
                request.urlretrieve(src, img_path)
            except Exception as e:
                # Best-effort download: report the failure, keep going.
                print(e)
        # BUG FIX: the original bs.find('page-next') searched for a
        # <page-next> TAG (never present), so detail pagination never
        # ran. Match by CSS class, consistent with parse_list.
        next_ele = bs.find(class_='page-next')
        if next_ele:
            next_href = next_ele['href']
            url = self.url + next_href
            self.get_html(url)
            self.parse_detail(path)
        else:
            print('没有下一页')

    def start(self):
        """Entry point: fetch the category index page and begin crawling."""
        self.get_html('http://www.ivsky.com/tupian/')
        self.parse_list()
if __name__ == '__main__':
    # Run the crawler when executed as a script.
    spider = IvskySpider()
    spider.start()
from urllib import request
from bs4 import BeautifulSoup
import os
class IvskySpider(object):
    """Crawler for www.ivsky.com: walks category list pages and downloads
    every image into a per-category folder under ./images/."""

    def __init__(self):
        # Site root, prepended to the relative hrefs found in pages.
        self.url = 'http://www.ivsky.com'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'Host': "www.ivsky.com"
        }
        # Body of the most recently fetched page (decoded UTF-8).
        self.html = ''

    def get_html(self, url):
        """Fetch *url* and store the decoded response body in self.html."""
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        self.html = response.read().decode('utf-8')

    def parse_list(self):
        """Parse a category list page from self.html, crawl each category,
        then recurse through the "next page" pagination link."""
        bs = BeautifulSoup(self.html, 'lxml')
        res = bs.select('.ali p a')
        for ele in res:
            href = ele['href']
            title = ele.string
            # Build the absolute detail-page URL and fetch it.
            url = self.url + href
            self.get_html(url)
            # One folder per category, named after its title.
            path = 'images/' + title
            # makedirs creates the parent 'images' directory too; the
            # original os.mkdir raised FileNotFoundError when 'images'
            # did not yet exist. exist_ok replaces the manual check.
            os.makedirs(path, exist_ok=True)
            self.parse_detail(path)
        # Follow the pagination link, if any.
        next_ele = bs.find(class_='page-next')
        if next_ele:
            next_href = next_ele['href']
            # Page number is the trailing "_N" before the extension.
            page = next_href.split('_')[-1].split('.')[0]
            print('正在获取第%s页' % page)
            url = self.url + next_href
            self.get_html(url)
            # Recurse: parse the next list page.
            self.parse_list()
        else:
            print('没有下一页')

    def parse_detail(self, path):
        """Parse a detail page from self.html and download each image
        into *path*, following detail-page pagination."""
        # Thumbnail vs. full-size URLs differ only in /t/ vs /pre/:
        #   http://img.ivsky.com/img/tupian/t/201712/11/limao_nanshi.jpg
        #   http://img.ivsky.com/img/tupian/pre/201712/11/limao_nanshi.jpg
        bs = BeautifulSoup(self.html, 'lxml')
        imgs = bs.select('.pli li div img')
        for img in imgs:
            try:
                src = img['src']
                # Swap /t/ for /pre/ to get the high-resolution image.
                src = src.replace('/t/', '/pre/')
                # File name is the last URL path component.
                name = src.split('/')[-1]
                img_path = path + '/' + name
                request.urlretrieve(src, img_path)
            except Exception as e:
                # Best-effort download: report the failure, keep going.
                print(e)
        # BUG FIX: the original bs.find('page-next') searched for a
        # <page-next> TAG (never present), so detail pagination never
        # ran. Match by CSS class, consistent with parse_list.
        next_ele = bs.find(class_='page-next')
        if next_ele:
            next_href = next_ele['href']
            url = self.url + next_href
            self.get_html(url)
            self.parse_detail(path)
        else:
            print('没有下一页')

    def start(self):
        """Entry point: fetch the category index page and begin crawling."""
        self.get_html('http://www.ivsky.com/tupian/')
        self.parse_list()
if __name__ == '__main__':
    # Run the crawler when executed as a script.
    spider = IvskySpider()
    spider.start()