Purpose of this assignment
Get to know the three optimization algorithms covered in the course: gradient descent, gradient descent with momentum, and Adam.
Observe how each algorithm performs the optimization and compare their results.
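For reference, the update rules that the code below implements can be summarized as follows (a sketch in standard notation; W and b denote one layer's parameters, dW and db the gradients on the current mini-batch, and \alpha the learning rate; the updates for b are analogous to those for W):

Gradient descent:
    W = W - \alpha\, dW, \qquad b = b - \alpha\, db

Momentum (exponentially weighted average of the gradients, with coefficient \beta):
    v_{dW} = \beta\, v_{dW} + (1-\beta)\, dW, \qquad W = W - \alpha\, v_{dW}

Adam (momentum plus RMSprop, with bias correction at step t):
    v_{dW} = \beta_1 v_{dW} + (1-\beta_1)\, dW, \qquad s_{dW} = \beta_2 s_{dW} + (1-\beta_2)\, dW^2
    v^{corrected}_{dW} = v_{dW}/(1-\beta_1^t), \qquad s^{corrected}_{dW} = s_{dW}/(1-\beta_2^t)
    W = W - \alpha\, v^{corrected}_{dW} / (\sqrt{s^{corrected}_{dW}} + \varepsilon)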
Download
Link: https://pan.baidu.com/s/1av5v-tEbnx0cMjIlLLk_JQ
Extraction code: wk8o
Code:
# -*- coding: utf-8 -*-
# 1. Split the dataset into mini-batches
# 2. Optimize gradient descent:
#    2.1 Without any optimization algorithm (plain batch gradient descent)
#    2.2 Mini-batch gradient descent
#    2.3 Gradient descent with momentum
#    2.4 Adam
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets
import opt_utils
import testCase
plt.rcParams['figure.figsize'] = (7.0, 4.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# Gradient descent
def update_parameters_with_gd(parameters, grads, learning_rate):
    L = len(parameters) // 2  # number of layers
    for l in range(L):
        parameters['W' + str(l + 1)] = parameters['W' + str(l + 1)] - learning_rate * grads['dW' + str(l + 1)]
        parameters['b' + str(l + 1)] = parameters['b' + str(l + 1)] - learning_rate * grads['db' + str(l + 1)]
    return parameters
# Mini-batch gradient descent: split (X, Y) into shuffled mini-batches
def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    np.random.seed(seed)
    m = X.shape[1]  # number of training examples
    mini_batches = []
    # Shuffle the examples
    permutation = list(np.random.permutation(m))  # a random permutation of 0..m-1
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))
    # Partition into full mini-batches
    num_complete_minibatches = math.floor(m / mini_batch_size)  # m may not be divisible by mini_batch_size
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    # Handle the last, smaller mini-batch when m is not divisible by mini_batch_size
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, mini_batch_size * num_complete_minibatches:]
        mini_batch_Y = shuffled_Y[:, mini_batch_size * num_complete_minibatches:]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    return mini_batches
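# For example (hypothetical numbers): with m = 300 examples and mini_batch_size = 64,
# math.floor(300 / 64) = 4 full mini-batches of 64 are produced, and the
# "m % mini_batch_size != 0" branch appends one last mini-batch with the
# remaining 300 - 4 * 64 = 44 examples.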
# Gradient descent with momentum
def initialize_velocity(parameters):
    L = len(parameters) // 2
    v = {}
    for l in range(L):
        v['dW' + str(l + 1)] = np.zeros_like(parameters['W' + str(l + 1)])
        v['db' + str(l + 1)] = np.zeros_like(parameters['b' + str(l + 1)])
    return v

def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        # Exponentially weighted average of the gradients (the "velocity")
        v['dW' + str(l + 1)] = beta * v['dW' + str(l + 1)] + (1 - beta) * grads['dW' + str(l + 1)]
        v['db' + str(l + 1)] = beta * v['db' + str(l + 1)] + (1 - beta) * grads['db' + str(l + 1)]
        # Update parameters using the velocity instead of the raw gradient
        parameters['W' + str(l + 1)] = parameters['W' + str(l + 1)] - learning_rate * v['dW' + str(l + 1)]
        parameters['b' + str(l + 1)] = parameters['b' + str(l + 1)] - learning_rate * v['db' + str(l + 1)]
    return parameters, v
# Adam
def initialize_adam(parameters):
    L = len(parameters) // 2
    v = {}  # first-moment (momentum) estimates
    s = {}  # second-moment (RMSprop) estimates
    for l in range(L):
        v['dW' + str(l + 1)] = np.zeros_like(parameters['W' + str(l + 1)])
        v['db' + str(l + 1)] = np.zeros_like(parameters['b' + str(l + 1)])
        s['dW' + str(l + 1)] = np.zeros_like(parameters['W' + str(l + 1)])
        s['db' + str(l + 1)] = np.zeros_like(parameters['b' + str(l + 1)])
    return (v, s)

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    for l in range(L):
        # Moving average of the gradients, with bias correction
        v['dW' + str(l + 1)] = beta1 * v['dW' + str(l + 1)] + (1 - beta1) * grads['dW' + str(l + 1)]
        v['db' + str(l + 1)] = beta1 * v['db' + str(l + 1)] + (1 - beta1) * grads['db' + str(l + 1)]
        v_corrected['dW' + str(l + 1)] = v['dW' + str(l + 1)] / (1 - np.power(beta1, t))
        v_corrected['db' + str(l + 1)] = v['db' + str(l + 1)] / (1 - np.power(beta1, t))
        # Moving average of the squared gradients, with bias correction
        s['dW' + str(l + 1)] = beta2 * s['dW' + str(l + 1)] + (1 - beta2) * np.square(grads['dW' + str(l + 1)])
        s['db' + str(l + 1)] = beta2 * s['db' + str(l + 1)] + (1 - beta2) * np.square(grads['db' + str(l + 1)])
        s_corrected['dW' + str(l + 1)] = s['dW' + str(l + 1)] / (1 - np.power(beta2, t))
        s_corrected['db' + str(l + 1)] = s['db' + str(l + 1)] / (1 - np.power(beta2, t))
        # Update parameters
        parameters['W' + str(l + 1)] = parameters['W' + str(l + 1)] - learning_rate \
            * v_corrected['dW' + str(l + 1)] / (np.sqrt(s_corrected['dW' + str(l + 1)]) + epsilon)
        parameters['b' + str(l + 1)] = parameters['b' + str(l + 1)] - learning_rate \
            * v_corrected['db' + str(l + 1)] / (np.sqrt(s_corrected['db' + str(l + 1)]) + epsilon)
    return parameters, v, s
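# Note on the bias correction above: at the first step (t = 1), v = (1 - beta1) * dW,
# so dividing by (1 - beta1**1) recovers dW exactly; without this correction the
# moving averages would start heavily biased toward their zero initialization.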
# Load the training data
train_X, train_Y = opt_utils.load_dataset(is_plot=False)

def model(X, Y, layers_dims, optimizer, learning_rate=0.0007, mini_batch_size=64,
          beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True, is_plot=True):
    L = len(layers_dims)
    costs = []
    t = 0  # Adam step counter, incremented after every mini-batch update
    seed = 10
    # Initialize parameters
    parameters = opt_utils.initialize_parameters(layers_dims)
    # Initialize the chosen optimizer
    if optimizer == 'gd':
        pass
    elif optimizer == 'momentum':
        v = initialize_velocity(parameters)
    elif optimizer == 'adam':
        v, s = initialize_adam(parameters)
    else:
        print('Unknown optimizer: ' + str(optimizer))
        exit(1)
    for i in range(num_epochs):
        seed += 1  # change the seed each epoch so the data is reshuffled into new mini-batches
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        for minibatch in minibatches:
            # Select one mini-batch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            A3, cache = opt_utils.forward_propagation(minibatch_X, parameters)
            # Compute the cost
            cost = opt_utils.compute_cost(A3, minibatch_Y)
            # Backward propagation
            grads = opt_utils.backward_propagation(minibatch_X, minibatch_Y, cache)
            # Update the parameters
            if optimizer == 'gd':
                parameters = update_parameters_with_gd(parameters, grads, learning_rate)
            elif optimizer == 'momentum':
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == 'adam':
                t = t + 1
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t,
                                                               learning_rate, beta1, beta2, epsilon)
        if i % 100 == 0:
            costs.append(cost)
        if print_cost and i % 1000 == 0:
            print('Cost after epoch ' + str(i) + ': ' + str(cost))
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('epochs (per 100)')
        plt.title('Learning rate = ' + str(learning_rate))
        plt.show()
    return parameters
layers_dims = [train_X.shape[0], 5, 2, 1]
# Gradient descent with Adam optimization
parameters = model(train_X, train_Y, layers_dims, optimizer="adam",is_plot=True)
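To compare all three optimizers on the same data, the remaining runs can be sketched as below. This assumes opt_utils also provides a predict(X, Y, parameters) helper that reports training accuracy, as in the original assignment files; if your copy differs, substitute your own accuracy computation.

# Hypothetical comparison loop (opt_utils.predict is assumed to exist in the downloaded helper files)
for opt in ['gd', 'momentum', 'adam']:
    print('Optimizer: ' + opt)
    trained = model(train_X, train_Y, layers_dims, optimizer=opt, is_plot=True)
    predictions = opt_utils.predict(train_X, train_Y, trained)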
Note: reference: https://blog.csdn.net/u013733326/article/details/79907419