Algorithm Reinforcement: Feedforward Neural Networks

Network Diagram and Activation Functions

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles

Preparing the Data

def load_data():
    # 300 training samples and 100 test samples
    train_X,train_Y = make_circles(n_samples=300,noise=.05)
    test_X,test_Y = make_circles(n_samples=100,noise=.05)
    # Visualize the data
    plt.scatter(train_X[:,0],train_X[:,1],c = train_Y,s = 40,cmap=plt.cm.Spectral)
    train_X = train_X.T
    train_Y = train_Y.reshape((1,train_Y.shape[0]))
    test_X = test_X.T
    test_Y = test_Y.reshape((1,test_Y.shape[0]))
    return train_X,train_Y,test_X,test_Y

train_X,train_Y,test_X,test_Y = load_data()

print(train_X,train_Y,test_X,test_Y)

Several Activation Functions

$sigmoid(z) = \frac{1}{1+e^{-z}}$
$tanh(z) = \frac{e^{z}-e^{-z}}{e^{z}+e^{-z}}$
$relu(z) = \max(z, 0)$

def sigmoid(z):
    # Sigmoid: squashes z into (0, 1)
    return 1./(1+np.exp(-z))

def tanh(z):
    # Hyperbolic tangent: squashes z into (-1, 1)
    return (np.exp(z)-np.exp(-z)) / (np.exp(z)+np.exp(-z))

def relu(z):
    # ReLU: max(z, 0), written here as (|z|+z)/2
    return (np.abs(z)+z)/2

x = np.linspace(-5,5,100)
y = sigmoid(x)
plt.plot(x,y)
plt.show()


y = tanh(x)
plt.plot(x,y)
plt.show()

y = relu(x)
plt.plot(x,y)
plt.show()

Derivative of the sigmoid function: $s^{\prime}(x) = s(x)\,(1-s(x))$
Derivative of the ReLU function:
$relu^{\prime}(z)=\left\{\begin{array}{ll} 1, & z>0 \\ 0, & z \le 0 \end{array}\right.$
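As a quick reference, here is a minimal NumPy sketch of these two derivatives (the helper names sigmoid_derivative and relu_derivative are illustrative, not part of the original code):

def sigmoid_derivative(z):
    # s'(z) = s(z) * (1 - s(z))
    s = sigmoid(z)
    return s * (1 - s)

def relu_derivative(z):
    # 1 where z > 0, 0 elsewhere (z = 0 is mapped to 0 by convention)
    return (z > 0).astype(float)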

The Forward Propagation Process

First, each training sample has an input $\left[\begin{array}{l} x_{1} \\ x_{2} \end{array}\right]_{(2,1)}$.
We call the first hidden layer $H_1$; its hidden_size is 20. Its value before the ReLU activation is denoted $Z_1$, so we immediately have:
$Z_1 = W_1X + b_1$
$H_1 = ReLU(Z_1)$
where $W_1$ is the weight matrix from the input layer to the first hidden layer and $b_1$ is its bias term.
Here we must pay close attention to the dimensions: $W_1$ is (20,2) and $b_1$ is (20,1), so $Z_1$ is (20,1) and likewise $H_1$ is (20,1). The computation looks roughly like this:
$$Z^{[1]}=W X+b^{[1]}=\left[\begin{array}{cc} w_{1,1}^{[1]} & w_{1,2}^{[1]} \\ w_{2,1}^{[1]} & w_{2,2}^{[1]} \\ \vdots & \vdots \\ w_{20,1}^{[1]} & w_{20,2}^{[1]} \end{array}\right]_{(20,2)} \times\left[\begin{array}{c} x_{1} \\ x_{2} \end{array}\right]_{(2,1)}+\left[\begin{array}{c} b_{1}^{[1]} \\ b_{2}^{[1]} \\ \vdots \\ b_{20}^{[1]} \end{array}\right]_{(20,1)}$$
For the second hidden layer, whose hidden_size is 5, the forward pass is:
$Z_2 = W_2H_1 + b_2$
$H_2 = ReLU(Z_2)$
with dimensions $H_1:(20,1)$, $W_2:(5,20)$, $b_2:(5,1)$, $Z_2:(5,1)$, $H_2:(5,1)$.
Finally, in the output layer:
$Z_3 = W_3H_2 + b_3$
$\hat{y} = sigmoid(Z_3)$
with dimensions $H_2:(5,1)$, $W_3:(1,5)$, $b_3:(1,1)$, $Z_3:(1,1)$, $\hat{y}:(1,1)$.
The result is a single scalar. We then apply a threshold: for example, if the value is greater than 0.5 we classify the sample as 1, otherwise as 0. That completes the classification, and with it the whole forward propagation process.
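To make the dimension bookkeeping concrete, here is a minimal sketch (not from the original post) that pushes one (2,1) input through the three layers with randomly initialized weights and prints the shape after each step, reusing the relu and sigmoid defined above:

np.random.seed(0)
x = np.random.randn(2,1)                   # a single input sample, shape (2,1)
W1,b1 = np.random.randn(20,2), np.zeros((20,1))
W2,b2 = np.random.randn(5,20), np.zeros((5,1))
W3,b3 = np.random.randn(1,5),  np.zeros((1,1))

H1 = relu(np.dot(W1,x)+b1)                 # (20,1)
H2 = relu(np.dot(W2,H1)+b2)                # (5,1)
y_hat = sigmoid(np.dot(W3,H2)+b3)          # (1,1)
print(H1.shape, H2.shape, y_hat.shape)     # (20,1) (5,1) (1,1)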

def initialize_parameters(layer_dims):
    # Initialize the weights and biases; weights are scaled by 1/sqrt(fan_in) to keep activations stable
    Weight = {}
    bias = {}
    for d in range(1,len(layer_dims)):
        Weight['W'+str(d)] = np.random.randn(layer_dims[d],layer_dims[d-1]) / np.sqrt(layer_dims[d-1])
        bias['b'+str(d)] = np.zeros((layer_dims[d],1))

    return Weight,bias

# Set the size of each layer
layer_dims = [2,20,5,1]  # the two hidden layers have sizes 20 and 5
W,b = initialize_parameters(layer_dims)
W["W3"].shape
# prints the shape of W3: (1, 5)

def forward_propagation(X,Weight,bias,activation):
    # Forward propagation
    Z = {}
    H = {}
    # The input X is usually treated as the output of layer 0, H0
    H['H0'] = X
    L = len(Weight)
    # Map activation names to the functions defined above
    act_funcs = {'sigmoid': sigmoid, 'tanh': tanh, 'relu': relu}
    for l in range(1,L+1):
        Z['Z'+str(l)] = np.dot(Weight['W'+str(l)],H['H'+str(l-1)]) + bias['b'+str(l)]
        H['H'+str(l)] = act_funcs[activation[l-1]](Z['Z'+str(l)])

    return H,Z
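A minimal usage sketch (assuming the W and b initialized above, with one activation name per layer): the final output H['H3'] should have shape (1, 300), one prediction per training sample.

activation = ['relu','relu','sigmoid']   # activation function name for each layer
H,Z = forward_propagation(train_X,W,b,activation)
print(H['H3'].shape)                     # expected: (1, 300)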

Choosing the Loss Function

We use the cross-entropy (CE) loss:
$L\left(\hat{y}^{(i)}, y^{(i)}\right)=-\left(y^{(i)} \log \left(\hat{y}^{(i)}\right)+\left(1-y^{(i)}\right) \log \left(1-\hat{y}^{(i)}\right)\right)$
Over the whole training set, the cost function can be defined as:
$J(w, b)=\frac{1}{m} \sum_{i=1}^{m} L\left(\hat{y}^{(i)}, y^{(i)}\right)=-\frac{1}{m} \sum_{i=1}^{m}\left[y^{(i)} \log \left(\hat{y}^{(i)}\right)+\left(1-y^{(i)}\right) \log \left(1-\hat{y}^{(i)}\right)\right]+\frac{\lambda}{2 m}\|w\|_{F}^{2}$
We append an L2 regularization term to the loss to decay the weights and prevent overfitting.

def compute_cost(H,Y,Weight,lambd = 0.7):
    m = Y.shape[1]
    # L2 regularization term: sum of squared weights over all layers
    L2_term = 0
    for key in Weight.keys():
        L2_term += (np.sum(np.square(Weight[key])))

    # Cross-entropy between the network output (the last entry of H) and the labels
    Y_hat = H['H'+str(len(H)-1)]
    logprobs = np.multiply(-np.log(Y_hat),Y)+np.multiply(-np.log(1-Y_hat),1-Y)
    # We iterate over a batch of data, so take the mean of the summed losses
    cost = 1./m*np.nansum(logprobs)
    cost += L2_term * lambd / (2*m)
    return cost
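Putting the pieces together, a quick sanity check (this simply reuses the forward pass from above; the exact cost value will vary with the random initialization):

H,Z = forward_propagation(train_X,W,b,['relu','relu','sigmoid'])
cost = compute_cost(H,train_Y,W,lambd = 0.7)
print(cost)   # a single scalar; it should decrease once backpropagation and training are added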
