This is a fully-connected network with the architecture: {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax
(It is also one of the networks in fc_net.py, written entirely from scratch.) For example, with L = 3 the forward pass runs the affine - [batch norm] - relu - [dropout] block twice, followed by a final affine layer and the softmax loss.
class FullyConnectedNet(object):
"""
Author::Chenx
"""
"""
A fully-connected neural network with an arbitrary number of hidden layers,
ReLU nonlinearities, and a softmax loss function. This will also implement
dropout and batch normalization as options. For a network with L layers,
the architecture will be
{affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax
where batch normalization and dropout are optional, and the {...} block is
repeated L - 1 times.
Similar to the TwoLayerNet above, learnable parameters are stored in the
self.params dictionary and will be learned using the Solver class.
"""
def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
dropout=0, use_batchnorm=False, reg=0.0,
weight_scale=1e-2, dtype=np.float32, seed=None):
"""
Initialize a new FullyConnectedNet.
Inputs:
- hidden_dims: A list of integers giving the size of each hidden layer.
- input_dim: An integer giving the size of the input.
- num_classes: An integer giving the number of classes to classify.
- dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
the network should not use dropout at all.
- use_batchnorm: Whether or not the network should use batch normalization.
- reg: Scalar giving L2 regularization strength.
- weight_scale: Scalar giving the standard deviation for random
initialization of the weights.
- dtype: A numpy datatype object; all computations will be performed using
this datatype. float32 is faster but less accurate, so you should use
float64 for numeric gradient checking.
- seed: If not None, then pass this random seed to the dropout layers. This
will make the dropout layers deterministic so we can gradient check the
model.
"""
self.use_batchnorm = use_batchnorm
self.use_dropout = dropout > 0
self.reg = reg
self.num_layers = 1 + len(hidden_dims)
self.dtype = dtype
self.params = {}
############################################################################
# TODO: Initialize the parameters of the network, storing all values in #
# the self.params dictionary. Store weights and biases for the first layer #
# in W1 and b1; for the second layer use W2 and b2, etc. Weights should be #
# initialized from a normal distribution with standard deviation equal to #
# weight_scale and biases should be initialized to zero. #
# #
# When using batch normalization, store scale and shift parameters for the #
# first layer in gamma1 and beta1; for the second layer use gamma2 and #
# beta2, etc. Scale parameters should be initialized to one and shift #
# parameters should be initialized to zero. #
############################################################################
i=1
b = input_dim
#Initialize the weights, biases, and batchnorm (gamma/beta) parameters for the (L-1) hidden layers
for d in hidden_dims:
stringw = 'W' + str(i)
stringb = 'b' + str(i)
self.params[stringw] = weight_scale*np.random.randn(b,d)
self.params[stringb] = np.zeros(d)
if use_batchnorm:
stringg = 'gamma' + str(i)
stringbe = 'beta' + str(i)
self.params[stringg] = np.ones(d)
self.params[stringbe] = np.zeros(d)
b=d
i+=1
#Initialize the output layer weights and biases
stringw = 'W' + str(i)
stringb = 'b' + str(i)
self.params[stringw] = weight_scale*np.random.randn(b,num_classes)
self.params[stringb] = np.zeros(num_classes)
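#For example (hypothetical sizes, just to illustrate the naming scheme): with
#hidden_dims=[100, 50], input_dim=3072 and num_classes=10, self.params holds
#W1 (3072x100), b1 (100,), W2 (100x50), b2 (50,), W3 (50x10), b3 (10,),
#plus gamma1/beta1 (100,) and gamma2/beta2 (50,) when use_batchnorm is True.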
############################################################################
# END OF YOUR CODE #
############################################################################
# When using dropout we need to pass a dropout_param dictionary to each
# dropout layer so that the layer knows the dropout probability and the mode
# (train / test). You can pass the same dropout_param to each dropout layer.
self.dropout_param = {}
if self.use_dropout:
self.dropout_param = {'mode': 'train', 'p': dropout}
if seed is not None:
self.dropout_param['seed'] = seed
# With batch normalization we need to keep track of running means and
# variances, so we need to pass a special bn_param object to each batch
# normalization layer. You should pass self.bn_params[0] to the forward pass
# of the first batch normalization layer, self.bn_params[1] to the forward
# pass of the second batch normalization layer, etc.
self.bn_params = []
if self.use_batchnorm:
self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]
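#At test time batchnorm_forward uses the running_mean/running_var accumulated in
#these bn_param dicts during training, which is why loss() switches their mode below.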
# Cast all parameters to the correct datatype
for k, v in self.params.items():
#All parameters are numpy arrays, so a plain astype cast is sufficient
self.params[k] = v.astype(dtype)
def loss(self, X, y=None):
"""
Compute loss and gradient for the fully-connected net.
Input / output: Same as TwoLayerNet above.
"""
X = X.astype(self.dtype)
mode = 'test' if y is None else 'train'
# Set train/test mode for batchnorm params and dropout param since they
# behave differently during training and testing.
if self.use_dropout:
self.dropout_param['mode'] = mode
if self.use_batchnorm:
for bn_param in self.bn_params:
bn_param['mode'] = mode
scores = None
############################################################################
# TODO: Implement the forward pass for the fully-connected net, computing #
# the class scores for X and storing them in the scores variable. #
# #
# When using dropout, you'll need to pass self.dropout_param to each #
# dropout forward pass. #
# #
# When using batch normalization, you'll need to pass self.bn_params[0] to #
# the forward pass for the first batch normalization layer, pass #
# self.bn_params[1] to the forward pass for the second batch normalization #
# layer, etc. #
############################################################################
#Compute the class scores
num_hiddenlayers = self.num_layers-1
cache = {} #cache of each stage of each layer, e.g. layer 1: affine -> cache11, batchnorm -> cache12, ...
z={} #output of each stage of each layer, e.g. layer 1: affine -> z11, batchnorm -> z12, ...
#Initialize the network input
stringz_last="input"
z[stringz_last] = X
#Loop over the (L-1) hidden layers
for i in range(num_hiddenlayers):
#Extract the current layer's weights and biases
stringW = 'W' + str(i+1)
stringb = 'b' + str(i+1)
W = self.params[stringW]
b = self.params[stringb]
#Stage 1 of the current layer: affine_forward
stringc = 'cache' + str(i+1) + str(1)
stringz = 'z' + str(i+1) + str(1)
z[stringz],cache[stringc] = affine_forward(z[stringz_last],W,b)
stringz_last=stringz
#Stage 2 of the current layer (optional): batchnorm_forward
if self.use_batchnorm:
#Extract gamma and beta
stringg = 'gamma' + str(i+1)
stringbe = 'beta' + str(i+1)
gamma = self.params[stringg]
beta = self.params[stringbe]
#Run the batchnorm forward pass
stringc = 'cache' + str(i+1) + str(2)
stringz = 'z' + str(i+1) + str(2)
z[stringz],cache[stringc]=batchnorm_forward(z[stringz_last], gamma, beta, self.bn_params[i])
stringz_last=stringz
#Stage 3 of the current layer: relu_forward
stringc = 'cache' + str(i+1) + str(3)
stringz = 'z' + str(i+1) + str(3)
z[stringz],cache[stringc] = relu_forward(z[stringz_last])
stringz_last=stringz
#Stage 4 of the current layer (optional): dropout_forward
if self.use_dropout:
stringc = 'cache' + str(i+1) + str(4)
stringz = 'z' + str(i+1) + str(4)
z[stringz],cache[stringc]=dropout_forward(z[stringz_last], self.dropout_param)
stringz_last=stringz
#End of the hidden-layer loop
#Output layer: affine_forward (the softmax loss is computed in the backward pass below)
#Extract the output layer's weights and biases
stringW = 'W' + str(num_hiddenlayers+1)
stringb = 'b' + str(num_hiddenlayers+1)
W = self.params[stringW]
b = self.params[stringb]
stringc = 'cache' + '_out'
stringz = 'z' + '_out'
z[stringz],cache[stringc] = affine_forward(z[stringz_last],W,b)
#The class scores are z_out, an (N, num_classes) array
scores = z['z_out']
############################################################################
# END OF YOUR CODE #
############################################################################
# If test mode return early
if mode == 'test':
return scores
loss, grads = 0.0, {}
############################################################################
# TODO: Implement the backward pass for the fully-connected net. Store the #
# loss in the loss variable and gradients in the grads dictionary. Compute #
# data loss using softmax, and make sure that grads[k] holds the gradients #
# for self.params[k]. Don't forget to add L2 regularization! #
# #
# When using batch normalization, you don't need to regularize the scale #
# and shift parameters. #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
#Backward pass
dz = {} #intermediate gradients flowing backward through the network
#Compute the loss: softmax data loss plus L2 regularization
loss_without_reg,dz['dz_out'] = softmax_loss(scores,y)
for i in range(num_hiddenlayers+1):
stringW = 'W' + str(i+1)
W = self.params[stringW]
loss += 0.5*self.reg*((W**2).sum())
loss = loss+loss_without_reg
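#The 0.5 factor keeps the regularizer's gradient clean: d/dW of 0.5*reg*sum(W**2)
#is reg*W, which matches the "grads[...] += self.reg * W" terms added below.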
#Compute the gradients
j=num_hiddenlayers #j=L-1
#Backprop through the last affine layer and store its gradients
stringW = 'W' + str(j+1)
stringb = 'b' + str(j+1)
stringdz = 'dz' + str(j) + str(4)
dz[stringdz],grads[stringW], grads[stringb] = affine_backward(dz['dz_out'], cache['cache_out'])
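#affine_forward's cache is (x, W, b) in the assignment layers, so cache['cache_out'][1]
#is the weight matrix W; add the L2 term's gradient reg*W to dW.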
grads[stringW] += self.reg*cache['cache_out'][1]
stringdz_last = stringdz
#Backprop through the (L-1) hidden layers and store their gradients
for i in range(j):
#dropout backward
if self.use_dropout:
stringdz = 'dz' + str(j-i) + str(3)
stringc = 'cache' + str(j-i) + str(4)
dz[stringdz] = dropout_backward(dz[stringdz_last], cache[stringc])
stringdz_last = stringdz
#relu backward (the relu cache is always stored under 'cache{layer}3')
stringdz = 'dz' + str(j-i) + str(2)
stringc = 'cache' + str(j-i) + str(3)
dz[stringdz] = relu_backward(dz[stringdz_last], cache[stringc])
stringdz_last = stringdz
#batchnorm backward and store the gamma/beta gradients
if self.use_batchnorm:
stringdz = 'dz' + str(j-i) + str(1)
stringc = 'cache' + str(j-i) + str(2)
stringg = 'gamma' + str(j-i)
stringbe = 'beta' + str(j-i)
dz[stringdz],grads[stringg],grads[stringbe] = batchnorm_backward(dz[stringdz_last], cache[stringc])
stringdz_last = stringdz
#affine backward and store the W/b gradients (the affine cache is always stored under 'cache{layer}1')
stringdz = 'dz' + str(j-i-1) + str(4)
stringc = 'cache' + str(j-i) + str(1)
stringW = 'W' + str(j-i)
stringb = 'b' + str(j-i)
dz[stringdz],grads[stringW],grads[stringb] = affine_backward(dz[stringdz_last],cache[stringc])
grads[stringW] += self.reg*cache[stringc][1]
stringdz_last = stringdz
############################################################################
# END OF YOUR CODE #
############################################################################
return loss, grads
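Below is a minimal sketch of how this network could be gradient-checked, assuming the standard cs231n assignment utilities (eval_numerical_gradient from cs231n.gradient_check) are available and FullyConnectedNet is importable; the data and layer sizes here are random and purely illustrative:

import numpy as np
from cs231n.gradient_check import eval_numerical_gradient  # assumes the assignment's utility

np.random.seed(231)
N, D, H1, H2, C = 2, 15, 20, 30, 10  # tiny hypothetical sizes, just for the check
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))

# float64 plus a fixed dropout seed keeps the numerical gradient check stable
model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,
                          dropout=0.25, use_batchnorm=True, reg=0.0,
                          weight_scale=5e-2, dtype=np.float64, seed=123)

loss, grads = model.loss(X, y)
for name in sorted(grads):
    f = lambda _: model.loss(X, y)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)
    rel_err = np.max(np.abs(grad_num - grads[name]) /
                     np.maximum(1e-8, np.abs(grad_num) + np.abs(grads[name])))
    print('%s relative error: %.2e' % (name, rel_err))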