本章主要介绍如何用 Python(NumPy)从零手写实现神经网络的前向传播和反向传播,激活函数使用 sigmoid。
前向传播:逐层计算得到网络的预测值,再与目标值 Y 相减得到误差
反向传播:根据前向传播得到的输出误差,利用链式法则由后向前逐层计算 delta 值,再用这些 delta 值更新各层权重
直接上代码:
# -*- coding:utf-8 -*-
__author__ = 'xuy'
import numpy as np
"""
这个是10.1.3反向传播和多层网络
"""
class NeuralNetwork:
    """Fully connected feed-forward network with sigmoid activations.

    The network is trained one sample at a time with gradient descent, using
    backpropagation to obtain the weight gradients. Biases are not stored
    separately: a constant column of ones is appended to the input so the
    bias becomes one more row of each weight matrix.

    Attributes:
        W: list of weight matrices, one per pair of adjacent layers
            (``len(layers) - 1`` entries).
        losses: loss value recorded after every epoch run by ``fit``.
        layers: the architecture passed to the constructor.
        alpha: learning rate used for the weight updates.
    """

    def __init__(self, layers, alpha=0.1):
        """Create the network and randomly initialise its weights.

        Args:
            layers: sequence of integers giving the number of nodes in each
                layer, e.g. ``[2, 2, 1]`` is 2 inputs, one hidden layer of
                2 nodes and 1 output node.
            alpha: learning rate.
        """
        self.W = []
        self.losses = []
        self.layers = layers
        self.alpha = alpha

        # Every connection except the final one: both sides get one extra
        # column/row for the bias, hence the "+ 1" on both dimensions.
        for i in range(len(layers) - 2):
            w = np.random.randn(layers[i] + 1, layers[i + 1] + 1)
            # Scale by sqrt(fan-in) to keep the initial net inputs small.
            self.W.append(w / np.sqrt(layers[i]))

        # The last connection is a special case: its input side still carries
        # the bias row, but the output layer has no bias column.
        w = np.random.randn(layers[-2] + 1, layers[-1])
        self.W.append(w / np.sqrt(layers[-2]))

    def __repr__(self):
        """Return the architecture as a string, e.g. ``NeuralNetwork: 2-2-1``."""
        return "NeuralNetwork: {}".format("-".join(str(l) for l in self.layers))

    def sigmoid(self, x):
        """Return the sigmoid activation ``1 / (1 + exp(-x))`` of ``x``."""
        return 1.0 / (1 + np.exp(-x))

    def sigmoid_deriv(self, x):
        """Return the sigmoid derivative, given an already-activated value.

        ``x`` must be the *output* of ``sigmoid`` (not the raw net input),
        which is why the derivative reduces to ``x * (1 - x)``.
        """
        return x * (1 - x)

    def fit(self, X, y, epochs=1000, display_update=100):
        """Train the network on ``X`` / ``y``.

        Args:
            X: 2-D array of training samples, one row per sample.
            y: targets, one entry (scalar or row) per sample.
            epochs: number of full passes over the training data.
            display_update: print the loss on the first epoch and then every
                ``display_update`` epochs.

        The loss after each epoch is appended to ``self.losses``.
        """
        # Append a column of ones so the bias is learned as part of the
        # weight matrix.
        X = np.c_[X, np.ones((X.shape[0]))]

        for epoch in range(epochs):
            # One forward + backward pass per individual training sample.
            for (x, target) in zip(X, y):
                self.fit_partial(x, target)

            # X already contains the bias column, so calculate_loss must not
            # add it again (it calls predict with add_bias=False).
            loss = self.calculate_loss(X, y)
            self.losses.append(loss)

            if epoch == 0 or (epoch + 1) % display_update == 0:
                print("[INFO] epoch={}, loss={:.7f}".format(
                    epoch + 1, loss))

    def fit_partial(self, x, y):
        """Run one forward pass and one backpropagation step for one sample.

        Args:
            x: a single sample that already includes the trailing bias entry.
            y: the target for that sample.

        Updates ``self.W`` in place.
        """
        # A[k] holds the output activation of layer k; A[0] is the input
        # itself, promoted to a 1 x len(x) row vector.
        A = [np.atleast_2d(x)]

        # FEEDFORWARD: A ends up with len(self.W) + 1 entries.
        for layer in range(len(self.W)):
            net = A[layer].dot(self.W[layer])
            A.append(self.sigmoid(net))

        # BACKPROPAGATION: start from the output error ...
        error = A[-1] - y
        # ... and the output-layer delta (error times activation derivative).
        D = [error * self.sigmoid_deriv(A[-1])]

        # Walk backwards over the hidden layers (the input layer is
        # excluded). Each delta is the most recently computed delta pushed
        # back through the weights, times the derivative of this layer's
        # activation.
        for layer in range(len(A) - 2, 0, -1):
            delta = D[-1].dot(self.W[layer].T)
            delta = delta * self.sigmoid_deriv(A[layer])
            D.append(delta)

        # Deltas were collected output-first; reverse so D[k] lines up with
        # self.W[k].
        D = D[::-1]

        # WEIGHT UPDATE: gradient descent step for every weight matrix.
        for layer in range(len(self.W)):
            self.W[layer] += -self.alpha * A[layer].T.dot(D[layer])

    def calculate_loss(self, X, targets):
        """Return the sum-of-squared-errors loss ``0.5 * sum((pred - t)**2)``.

        Args:
            X: samples that ALREADY include the bias column.
            targets: target values for those samples.
        """
        predictions = self.predict(X, add_bias=False)

        targets = np.asarray(targets)
        if targets.ndim == 1 and predictions.shape[1] == 1:
            # One scalar label per sample: make it a column so it lines up
            # with the (N, 1) predictions. Promoting it to a (1, N) row
            # instead would broadcast the difference to (N, N) and inflate
            # the loss.
            targets = targets.reshape(-1, 1)
        else:
            targets = np.atleast_2d(targets)

        return 0.5 * np.sum((predictions - targets) ** 2)

    def predict(self, X, add_bias=True):
        """Forward-propagate ``X`` and return the output-layer activations.

        Args:
            X: a single sample or a 2-D array with one sample per row.
            add_bias: append the bias column of ones first; pass ``False``
                when ``X`` already contains it.

        Returns:
            A 2-D array with one row of output activations per sample.
        """
        p = np.atleast_2d(X)

        if add_bias:
            p = np.c_[p, np.ones((p.shape[0]))]

        for layer in range(len(self.W)):
            p = self.sigmoid(np.dot(p, self.W[layer]))

        return p