mnist_loader.py
"""
mnist_loader
~~~~~~~~~~~~
A library to load the MNIST image data. For details of the data
structures that are returned, see the doc strings for ``load_data``
and ``load_data_wrapper``. In practice, ``load_data_wrapper`` is the
function usually called by our neural network code.
"""
#### Libraries
# Standard library
import pickle
import gzip
# Third-party libraries
import numpy as np
def load_data():
"""Return the MNIST data as a tuple containing the training data,
the validation data, and the test data.
The ``training_data`` is returned as a tuple with two entries.
The first entry contains the actual training images. This is a
numpy ndarray with 50,000 entries. Each entry is, in turn, a
numpy ndarray with 784 values, representing the 28 * 28 = 784
pixels in a single MNIST image.
The second entry in the ``training_data`` tuple is a numpy ndarray
containing 50,000 entries. Those entries are just the digit
values (0...9) for the corresponding images contained in the first
entry of the tuple.
The ``validation_data`` and ``test_data`` are similar, except
each contains only 10,000 images.
This is a nice data format, but for use in neural networks it's
helpful to modify the format of the ``training_data`` a little.
That's done in the wrapper function ``load_data_wrapper()``, see
below.
"""
    with gzip.open('data/mnist.pkl.gz', 'rb') as f:
        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    return (training_data, validation_data, test_data)
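
# Illustration (our addition, matching the docstring above): the raw tuples look like
#   tr_d, va_d, te_d = load_data()
#   tr_d[0].shape == (50000, 784)  # each row is one flattened 28x28 image
#   tr_d[1].shape == (50000,)      # integer labels 0..9
#   va_d[0].shape == te_d[0].shape == (10000, 784)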
def load_data_wrapper():
"""Return a tuple containing ``(training_data, validation_data,
test_data)``. Based on ``load_data``, but the format is more
convenient for use in our implementation of neural networks.
In particular, ``training_data`` is a list containing 50,000
2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray
containing the input image. ``y`` is a 10-dimensional
numpy.ndarray representing the unit vector corresponding to the
correct digit for ``x``.
``validation_data`` and ``test_data`` are lists containing 10,000
2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional
    numpy.ndarray containing the input image, and ``y`` is the
    corresponding classification, i.e., the digit value (an integer)
    corresponding to ``x``.
Obviously, this means we're using slightly different formats for
the training data and the validation / test data. These formats
turn out to be the most convenient for use in our neural network
code."""
tr_d, va_d, te_d = load_data()
training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
training_results = [vectorized_result(y) for y in tr_d[1]]
training_data = list(zip(training_inputs, training_results))
validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
validation_data = list(zip(validation_inputs, va_d[1]))
test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
test_data = list(zip(test_inputs, te_d[1]))
return (training_data, validation_data, test_data)
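
# Illustration (our addition): after wrapping, each training pair is column-shaped,
#   x.shape == (784, 1), y.shape == (10, 1) for (x, y) in training_data,
# while validation_data and test_data pair a (784, 1) image with a plain integer label.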
def vectorized_result(j):
"""Return a 10-dimensional unit vector with a 1.0 in the jth
position and zeroes elsewhere. This is used to convert a digit
(0...9) into a corresponding desired output from the neural
network."""
e = np.zeros((10, 1))
e[j] = 1.0
return e
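
A minimal smoke test for the loader, assuming ``data/mnist.pkl.gz`` is in place (the guarded block below is our addition, not part of the original file):

if __name__ == '__main__':
    training_data, validation_data, test_data = load_data_wrapper()
    x, y = training_data[0]
    print(len(training_data), len(validation_data), len(test_data))  # 50000 10000 10000
    print(x.shape, y.shape)  # (784, 1) (10, 1)
    print(np.argmax(y))  # the digit encoded by the one-hot vector y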
Backpropagation principles and formulas: https://blog.csdn.net/qq_46456049/article/details/112751769
import numpy as np
import random
from mnist_loader import load_data_wrapper
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
def sigmoid_prime(z):
    return sigmoid(z) * (1 - sigmoid(z))
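
# Sanity-check sketch (our addition): verify sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z))
# against a central finite difference. The helper name ``check_sigmoid_prime`` is ours.
def check_sigmoid_prime(z, eps=1e-5):
    numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
    return np.max(np.abs(numeric - sigmoid_prime(z)))  # expect ~1e-10 or smaller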
class MLP_np(object):
def __init__(self, sizes):
"""
:param sizes: [784, 30, 10]
"""
self.sizes = sizes
        self.num_layers = len(sizes) - 1  # number of weight/bias layers (the input layer has none)
# sizes:[784, 30, 10]
# w: [ch_out, ch_in]
# b: [ch_out]
        self.weights = [np.random.randn(ch2, ch1) for ch1, ch2 in zip(sizes[:-1], sizes[1:])]  # shapes: [30, 784], [10, 30]
self.biases = [np.random.randn(ch, 1) for ch in sizes[1:]]
def forward(self, x):
"""
:params x: [784, 1]
:return: [10, 1]
"""
for b, w in zip(self.biases, self.weights):
# [30, 784]@[784, 1] => [30, 1]+[30, 1] => [30, 1]
z = np.dot(w, x) + b
# [30, 1]
x = sigmoid(z)
return x
def backprop(self, x, y):
"""
:param x: [784, 1]
:param y: [10, 1], one_hot encoding
:return
"""
nabla_w = [np.zeros(w.shape) for w in self.weights]
nabla_b = [np.zeros(b.shape) for b in self.biases]
# 1. forward
# save activation for every layer
activations = [x]
# save z for every layer
zs = []
activation = x
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
activation = sigmoid(z)
zs.append(z)
activations.append(activation)
        loss = np.power(activations[-1] - y, 2).sum()  # reported squared error; the deltas below use the 1/2 * ||a - y||^2 convention
# 2. backward
# 2.1 compute gradient on output layer
# [10, 1] with [10, 1] => [10, 1]
        delta = activations[-1] * (1 - activations[-1]) * (activations[-1] - y)
nabla_b[-1] = delta
# [10, 1]@[1, 30] => [10, 30]
# activation: [30, 1]
nabla_w[-1] = np.dot(delta, activations[-2].T)
# 2.2 compute hidden gradient
        for l in range(2, self.num_layers + 1):
            l = -l  # negative index, so l walks backward through the hidden layers
z = zs[l]
a = activations[l]
# delta_j
# [10, 30]T @ [10, 1] => [30, 10]@[10, 1] => [30, 1] * [30, 1] => [30, 1]
delta = np.dot(self.weights[l+1].T, delta) * a * (1 - a)
nabla_b[l] = delta
# [30, 1] @ [784, 1]T => [30, 784]
nabla_w[l] = np.dot(delta, activations[l-1].T)
return nabla_w, nabla_b, loss
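
    # Sketch (our addition, not part of the original code): a finite-difference check for a
    # single weight entry. The deltas in ``backprop`` follow the C = 1/2 * ||a - y||^2
    # convention, so we difference that half-squared error rather than the returned loss.
    def grad_check(self, x, y, i=0, j=0, eps=1e-5):
        nabla_w, nabla_b, _ = self.backprop(x, y)
        half_se = lambda: 0.5 * np.power(self.forward(x) - y, 2).sum()
        old = self.weights[-1][i, j]
        self.weights[-1][i, j] = old + eps
        loss_plus = half_se()
        self.weights[-1][i, j] = old - eps
        loss_minus = half_se()
        self.weights[-1][i, j] = old  # restore the weight
        numeric = (loss_plus - loss_minus) / (2 * eps)
        # should be tiny (~1e-9 or smaller) if backprop is correct
        return abs(numeric - nabla_w[-1][i, j])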
def train(self, training_data, epochs, batchsz, lr, test_data):
"""
:param training_data: list of (x, y)
:param epochs: 1000
:param batchsz: 10
:param lr: 0.01
:param test_data: list of (x, y)
:return
"""
if test_data:
n_test = len(test_data)
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [
training_data[k: k+batchsz]
for k in range(0, n, batchsz)
]
# for every batch in current_data
for mini_batch in mini_batches:
loss = self.update_mini_batch(mini_batch, lr)
            if test_data:
                # report test-set accuracy plus the loss of the last mini-batch
                print("Epoch {0}: {1} / {2}".format(j, self.evaluate(test_data), n_test), loss)
            else:
                print("Epoch {0} complete".format(j))
def update_mini_batch(self, batch, lr):
"""
:param batch: list of (x, y)
:param lr: 0.01
:return
"""
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        loss = 0
        # for every sample in the current batch, accumulate the per-sample gradients
        for x, y in batch:
            # lists with one gradient per layer: [dw1, dw2, ...], [db1, db2, ...]
            nabla_w_, nabla_b_, loss_ = self.backprop(x, y)
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, nabla_w_)]
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, nabla_b_)]
            loss += loss_
        # average over the batch
        nabla_w = [nw / len(batch) for nw in nabla_w]
        nabla_b = [nb / len(batch) for nb in nabla_b]
        loss = loss / len(batch)
        # gradient step: w = w - lr * nabla_w
        self.weights = [w - lr * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - lr * nb for b, nb in zip(self.biases, nabla_b)]
        return loss
def evaluate(self, test_data):
result = [(np.argmax(self.forward(x)), y) for x, y in test_data]
correct = sum(int(pred == y) for pred, y in result)
return correct
def main():
training_data, validation_data, test_data = load_data_wrapper()
print(training_data[0][0].shape, training_data[0][1].shape)
net = MLP_np([784, 30, 10])
net.train(training_data, 1000, 10, 0.1, test_data=test_data)
if __name__ == '__main__':
main()
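Note: ``load_data`` expects the pickled dataset at ``data/mnist.pkl.gz`` relative to the working directory; with the [784, 30, 10] network above, each epoch prints test-set accuracy as ``correct / 10000`` together with the loss of its last mini-batch.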