Code in practice: gradient checking for backpropagation (analytical solution vs. numerical approximation)

import h5py
import matplotlib.pyplot as plt
import numpy as np

def load_dataset():
    # Open the HDF5 files; each behaves like a dictionary of datasets
    train_dataset = h5py.File('./train_catvnoncat.h5','r')
    test_dataset =  h5py.File('./test_catvnoncat.h5','r')
    # List all keys in the file
#     for key in train_dataset.keys():
        # Print each key/value pair
#         print (train_dataset[key])
    # list_classes holds the class names, train_set_x the training images, train_set_y the training labels
    # There are 209 training images, each of size 64*64*3
    #<HDF5 dataset "list_classes": shape (2,), type "|S7">
    #<HDF5 dataset "train_set_x": shape (209, 64, 64, 3), type "|u1">
    #<HDF5 dataset "train_set_y": shape (209,), type "<i8">

    # Load the 209 images into NumPy arrays
    train_set_x = np.array(train_dataset["train_set_x"][:])
    train_set_y = np.array(train_dataset["train_set_y"][:])
    test_set_x = np.array(test_dataset["test_set_x"][:])
    test_set_y = np.array(test_dataset["test_set_y"][:])
    # Print the shape
#     print(train_set_x.shape)
    #(209, 64, 64, 3)

    # Set the figure size
#     plt.figure(figsize = (2,2))
    # Show sample image 11
#     plt.imshow(train_set_x[11])
#     plt.show()
    # Print its label (cat or not)
#     print(train_set_y[11])
    # Keep the first axis (number of images), flatten the rest with -1, then transpose to get (12288,209)
    train_set_x = train_set_x.reshape(train_set_x.shape[0],-1).T
#     print(train_set_x.shape)
    test_set_x = test_set_x.reshape(test_set_x.shape[0],-1).T
    # Reshape the label vectors
    train_set_y = train_set_y.reshape(train_set_y.shape[0],-1).T #(1,209)
    test_set_y = test_set_y.reshape(test_set_y.shape[0],-1).T #(1, number of test samples)
    
    return train_set_x,train_set_y,test_set_x,test_set_y
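# A quick shape check for load_dataset (an added sketch, not from the original post):
#   X_train, Y_train, X_test, Y_test = load_dataset()
#   print(X_train.shape, Y_train.shape)   # (12288, 209), (1, 209)
#   print(X_test.shape, Y_test.shape)     # (12288, m_test), (1, m_test)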

def init_parameters(fc_net):
    # Dictionary holding the parameter matrices W1,b1,W2,b2,W3,b3,W4,b4
    # a1 = W1*a0 + b1
    parameters = {}
    # Number of layers (input layer included)
    layers_num = len(fc_net)
#     print(layers_num)
    for L in range(1,layers_num):
        # Gaussian initialization: a random matrix with fc_net[L] rows and fc_net[L-1] columns
        parameters["W"+str(L)] = np.random.randn(fc_net[L],fc_net[L-1]) 
        # fc_net[L] rows, 1 column
        parameters["b"+str(L)] = np.zeros((fc_net[L],1))
#     for L in range(1,layers_num):
        # Print W1..W4 and b1..b4
#         print("W"+str(L) +"=",parameters["W"+str(L)])
#         print("b"+str(L) +"=",parameters["b"+str(L)])
    
    return parameters
# Sigmoid activation function
def sigmoid(Z):
    return 1/(1+np.exp(-Z))
    

def forward_pass(A0,parameters):
    # Cache for intermediate values (Z and A of every layer)
    cache = {}
    A = A0
    # Cache A0 (the input)
    cache["A0"] = A0
    # In Python, // is integer (floor) division
    Layer_num = len(parameters) // 2
    # range(1, 5) iterates over 1, 2, 3, 4
    for L in range(1, Layer_num+1):
        # np.dot performs matrix multiplication
        # b has a single column; NumPy broadcasting expands it across all 209 sample columns
        Z = np.dot(parameters["W"+str(L)],A) +parameters["b"+str(L)]
        #A1 = (4,12288)*(12288,209) +(4,1) = (4,209) +(4,1) = (4,209)
        #A2 = (3,4)*(4,209) +(3,1) = (3,209)+(3,1) = (3,209)
        #A3 = (2,3)*(3,209) +(2,1) = (2,209)
        #A4 = (1,209)
        A = sigmoid(Z)
        # Cache all intermediate values Z1-Z4 and A1-A4
        cache["Z"+str(L)] = Z
        cache["A"+str(L)] = A
        
    return A,cache
    
def compute_cost(AL,Y):
    # Quadratic cost averaged over the m samples
    m = Y.shape[1]   # Y = (1,209)
    cost = (1/m)*np.sum((1/2)*(AL-Y)*(AL-Y)) # J = (1/m) * sum( (1/2)*(AL - Y)^2 )
    return cost
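# A short sketch of the chain rule used by backward_pass below (added for clarity, not
# part of the original post). With J = (1/m)*sum( (1/2)*(AL - Y)^2 ) and sigmoid
# activations, whose derivative is sigma(z)*(1 - sigma(z)) = A*(1 - A):
#   output layer:   dZ4 = (AL - Y) * AL*(1 - AL)        (the 1/m factor is applied later, inside dW and db)
#   hidden layers:  dZ_L = (W_{L+1}^T . dZ_{L+1}) * A_L*(1 - A_L)
#   per layer:      dW_L = (1/m) * dZ_L . A_{L-1}^T,   db_L = (1/m) * sum(dZ_L over the samples)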
   
def backward_pass(AL,parameters,cache,Y):
    # Total number of samples (209)
    m = Y.shape[1]
    # Dictionary storing the gradients of every layer
    gradient = {}
    # Number of weight layers
    Layer_num = len(parameters) // 2
    # Output-layer error dJ/dZ, dZL.shape = (1,209)
    dZL = (AL - Y)*(AL*(1 - AL))
    # Gradients of the last layer: dW4 = (1/m) * dZL . A3^T
    gradient["dW"+str(Layer_num)] = (1/m)*np.dot(dZL,cache["A"+str(Layer_num-1)].T)
    gradient["db"+str(Layer_num)] = (1/m)*np.sum(dZL,axis=1,keepdims=True) # axis=1 sums across samples; keepdims=True keeps the (n,1) column shape
    # Iterate over the remaining layers [3,2,1]
    for L in reversed(range(1,Layer_num)):
        # Propagate the error back one layer; the sigmoid derivative must use this layer's activation A_L, not the final output AL
        dZL = np.dot(parameters["W"+str(L+1)].T,dZL)*(cache["A"+str(L)]*(1-cache["A"+str(L)]))
        gradient["dW"+str(L)] = (1/m)*np.dot(dZL,cache["A"+str(L-1)].T)
        gradient["db"+str(L)] = (1/m)*np.sum(dZL,axis=1,keepdims=True)
    return gradient
    
# Gradient checking
# Analytical method: derive a closed-form expression for the gradient (the exact solution)
# Numerical approximation: estimate the gradient with finite differences (an approximate solution)
# Convert the dictionaries to column vectors: concatenate all W and b into one (49182,1) matrix,
# then measure the Euclidean distance between the numerical and the analytical gradient
#
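# A minimal illustration of the numerical side (added, not from the original post): for each
# parameter theta_i the gradient is approximated with a central difference,
#     dJ/dtheta_i ~ (J(theta + eps*e_i) - J(theta - eps*e_i)) / (2*eps)
# and the two gradient vectors are compared with the relative error
#     diff = ||g_analytic - g_numeric|| / (||g_analytic|| + ||g_numeric||)
# e.g. for J(theta) = theta**2 at theta = 3 with eps = 1e-4:
#     (3.0001**2 - 2.9999**2) / (2*1e-4)   # = 6.0, matching the analytical dJ/dtheta = 2*theta = 6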
def  grad_dict_to_vector(gradient):
    Layer_num = len(gradient) // 2 #gradients=4
    count = 0
    for L in range(1,Layer_num+1):  # iterate over [1,2,...,Layer_num]
        dW_vector = np.reshape(gradient["dW"+str(L)], (-1,1))  # flatten this layer's dW into a column, e.g. (4,12288)->(49152,1)
        db_vector = np.reshape(gradient["db"+str(L)], (-1,1))  # flatten this layer's db into a column, (4,1)->(4,1)
        vec_L = np.concatenate((dW_vector, db_vector), axis=0)   # stack this layer's dW and db
        if count == 0:
            vec_output = vec_L  # the first layer starts the output column
        else:
            vec_output = np.concatenate((vec_output, vec_L), axis=0) # append layer by layer
        count = count + 1
    return vec_output # return the column vector

# Convert the parameters dictionary into a column vector
def param_dict_to_vector(parameters): # parameter dict -> column vector
    # Flatten all parameters into a single column vector of the required shape
    Layer_num = len(parameters) // 2 # Layer_num=4
    count = 0
    for L in range(1,Layer_num+1):  # iterate over [1,2,...,Layer_num]
        W_vector = np.reshape(parameters["W"+str(L)], (-1,1))  # flatten this layer's W into a column
        b_vector = np.reshape(parameters["b"+str(L)], (-1,1))  # flatten this layer's b into a column
        vec_L = np.concatenate((W_vector, b_vector), axis=0)   # stack W and b
        if count == 0:
            vec_output = vec_L
        else:
            vec_output = np.concatenate((vec_output, vec_L), axis=0) # append layer by layer
        count = count + 1
    return vec_output

# Convert a column vector back into a parameter dictionary
def vector_to_param_dict(vec,param_src):   # vec: column vector; param_src: parameter dict that supplies the shapes of W and b
    Layer_num = len(param_src) // 2 # Layer_num=4
    param_epsilon = param_src.copy()  # shallow copy of the dict, so the caller's parameters are not overwritten in place
    idx_start = 0
    idx_end = 0
    for L in range(1,Layer_num+1):  # iterate over [1,2,...,Layer_num]
        row = param_src["W"+str(L)].shape[0]
        col = param_src["W"+str(L)].shape[1]
        idx_end = idx_start + row*col # number of elements in this layer's W
        #print("idx_start=",idx_start,";idx_end=",idx_end)
        param_epsilon["W"+str(L)] = vec[idx_start:idx_end].reshape((row,col))
        idx_start = idx_end
      
        row = param_src["b"+str(L)].shape[0]
        col = param_src["b"+str(L)].shape[1]
        idx_end = idx_start+row*col # number of elements in this layer's b
        #print("b.shape=",param_src["b"+str(L)].shape)
        #print("idx_start=",idx_start,";idx_end=",idx_end)
        param_epsilon["b"+str(L)] = vec[idx_start:idx_end].reshape((row,col))
        idx_start = idx_end
    return param_epsilon # return the rebuilt (possibly perturbed) parameter dictionary
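# Usage sketch (added, not from the original post): the two conversions round-trip, e.g.
#   params = init_parameters([3, 2, 1])
#   vec = param_dict_to_vector(params)            # shape (3*2 + 2 + 2*1 + 1, 1) = (11, 1)
#   restored = vector_to_param_dict(vec, params)
#   all(np.array_equal(params[k], restored[k]) for k in params)   # True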


# Gradient check
def gradient_check(A0,Y,gradient,parameters,epsilon=1e-4):
    print("Running gradient check...")
    grad_vec = grad_dict_to_vector(gradient) # gradient dict -> column vector (analytical gradient)
    param_vec = param_dict_to_vector(parameters) # parameter dict -> column vector, (49182,1)
    param_num = param_vec.shape[0]  # 49182 parameters in total
    # Column vector that will hold the numerical gradient
    grad_vec_approach = np.zeros(grad_vec.shape)
    # Estimate the numerical gradient of every parameter
    for i in range(0,param_num):
        if i%1000==0:
            print("grad checking i=",i)
        # Perturb the i-th parameter by +epsilon
        param_vec_plus = np.copy(param_vec)
        param_vec_plus[i][0] = param_vec_plus[i][0] + epsilon
        # Convert the perturbed vector back into a dict and run a forward pass;
        # the returned cache is not needed here
        AL,_ = forward_pass(A0,vector_to_param_dict(param_vec_plus,parameters))
        # Cost with +epsilon
        J_plus_epsilon = compute_cost(AL,Y)
        # Cost with -epsilon
        param_vec_minus = np.copy(param_vec)
        param_vec_minus[i][0] = param_vec_minus[i][0] - epsilon
        AL,_ = forward_pass(A0,vector_to_param_dict(param_vec_minus,parameters))
        J_minus_epsilon = compute_cost(AL,Y)
        # Central difference: the numerical gradient of the i-th parameter
        grad_vec_approach[i][0]= (J_plus_epsilon-J_minus_epsilon)/(2*epsilon)
        
    # Two common ways to measure the difference between two vectors: the L2 norm (Euclidean distance) and the cosine distance
    # L2 norm: captures differences in magnitude (what we need here)
    # Cosine distance: captures differences in direction
    # np.sqrt(np.sum((grad_vec-grad_vec_approach)**2)) alone would work, but both gradients can be
    # very small and numerical error creeps in, so the relative error is used instead
    diff = np.sqrt(np.sum((grad_vec-grad_vec_approach)**2))/(np.sqrt(np.sum((grad_vec)**2))+np.sqrt(np.sum((grad_vec_approach)**2)))
    if diff > 1e-4:
        print("Maybe a mistake in your backward pass!!!  diff=",diff)
    else:
        print("No problem in your backward pass!!!  diff=",diff)
    return diff



    
    
    

# Gradient descent update of the parameters
def update_parameters(gradient,parameters,LearnRate):
    #     w := w - LearnRate * dw
    #     b := b - LearnRate * db
    Layer_num = len(parameters)//2
    for L in range(1,Layer_num+1):
        # Iterate over [1,2,3,4]
        parameters["W"+str(L)] = parameters["W"+str(L)] - LearnRate*gradient["dW"+str(L)]
        parameters["b"+str(L)] = parameters["b"+str(L)] - LearnRate*gradient["db"+str(L)]
    return parameters
        

def Train_Net(fc_net,train_set_x,train_set_y,iterations=2000,LearnRate=0.01):
    # 4. Initialize the fully connected layers
    parameters = init_parameters(fc_net)
    # z = W*x + b; a = f(z)
    # AL = (1,209)
    # costs stores the cost recorded every 500 iterations
    costs = []
    for iteration in range (0,iterations):
        AL,cache = forward_pass(train_set_x,parameters) 

        # 6. Cost over all samples
        loss = compute_cost(AL, train_set_y)
        if iteration%500 == 0:
            # Record and print once every 500 iterations
            costs.append(loss)
            print("loss == ",loss)
        # 7. Backward pass: AL is the network output, parameters holds W and b, cache holds Z and A, train_set_y is the label
        gradient = backward_pass(AL,parameters,cache,train_set_y)

        # Gradient check (only once, on the last iteration, since it is expensive)
        if iteration == iterations-1:
            diff = gradient_check(train_set_x,train_set_y,gradient,parameters)
        # 8. Gradient descent: update the parameters once using the gradients
        parameters = update_parameters(gradient,parameters,LearnRate)
        
    plt.plot(costs,'p')
    plt.xlabel("opooc-iteration")
    plt.ylabel("opooc-cost")
    plt.show()   
    return parameters
    
    
    
    
if __name__ == '__main__':
    # 1. Load the data
    train_set_x,train_set_y,test_set_x,test_set_y = load_dataset()
    # 2. Normalize the input pixel values to [0, 1]
    train_set_x = train_set_x/255.0
    test_set_x = test_set_x/255.0
    # 3. Define the number of neurons in each fully connected layer; 12288 is the number of input pixels (64*64*3)
    fc_net = [12288,4,3,2,1]
    
    parameters = Train_Net(fc_net,train_set_x,train_set_y,iterations=2000,LearnRate=0.01)
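    # Optional quick check (an added sketch, not from the original post): checking all 49182
    # parameters needs about 2*49182 forward passes, so it is much faster to validate the
    # backward pass on a tiny network with a few random samples first, e.g.:
    #   tiny_net = [5, 4, 3, 2, 1]
    #   tiny_X = np.random.randn(5, 10)
    #   tiny_Y = (np.random.rand(1, 10) > 0.5).astype(float)
    #   tiny_params = init_parameters(tiny_net)
    #   tiny_AL, tiny_cache = forward_pass(tiny_X, tiny_params)
    #   tiny_grad = backward_pass(tiny_AL, tiny_params, tiny_cache, tiny_Y)
    #   gradient_check(tiny_X, tiny_Y, tiny_grad, tiny_params)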
    
    


Source: blog.csdn.net/opooc/article/details/89212620