URL of the course notes and their translation:
https://zhuanlan.zhihu.com/p/21930884?refer=intelligentunit
1. SGD:
import numpy as np

def sgd(w, dw, config=None):
"""
Performs vanilla stochastic gradient descent.
config format:
- learning_rate: Scalar learning rate.
"""
if config is None: config = {}
config.setdefault('learning_rate', 1e-3)
w -= config['learning_rate'] * dw
return w, config
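The vanilla rule is simply w = w - learning_rate * dw. Below is a minimal usage sketch on a toy quadratic; the loss f(w) = 0.5 * ||w||^2 and the starting point are assumptions made purely for illustration, not part of the assignment.

# Toy problem (assumed): minimize f(w) = 0.5 * ||w||^2, whose gradient is w itself.
w = np.array([3.0, -2.0])
config = {'learning_rate': 1e-1}
for step in range(3):
    dw = w.copy()                      # analytic gradient of 0.5 * ||w||^2
    w, config = sgd(w, dw, config)
    print(step, w)                     # w shrinks toward 0 by a factor of 0.9 per step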
2. SGD_momentum:
def sgd_momentum(w, dw, config=None):
"""
Performs stochastic gradient descent with momentum.
config format:
- learning_rate: Scalar learning rate.
- momentum: Scalar between 0 and 1 giving the momentum value.
Setting momentum = 0 reduces to sgd.
- velocity: A numpy array of the same shape as w and dw used to store a
moving average of the gradients.
"""
if config is None: config = {}
config.setdefault('learning_rate', 1e-2)
config.setdefault('momentum', 0.9)
v = config.get('velocity', np.zeros_like(w))
next_w = None
###########################################################################
# TODO: Implement the momentum update formula. Store the updated value in #
# the next_w variable. You should also use and update the velocity v. #
###########################################################################
v = config['momentum'] * v - config['learning_rate'] * dw
next_w = w + v
###########################################################################
# END OF YOUR CODE #
###########################################################################
config['velocity'] = v
return next_w, config
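The momentum update keeps a running velocity, v = momentum * v - learning_rate * dw, and then steps with w = w + v; the velocity is carried between calls through config['velocity']. A minimal sketch on the same assumed toy quadratic as above, showing that the returned config must be fed back in so the velocity accumulates:

w = np.array([3.0, -2.0])
config = {'learning_rate': 1e-1, 'momentum': 0.9}
for step in range(3):
    dw = w.copy()                          # gradient of 0.5 * ||w||^2
    w, config = sgd_momentum(w, dw, config)
    print(step, w, config['velocity'])     # with momentum = 0 this reduces to plain sgd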
3. RMSProp:
def rmsprop(x, dx, config=None):
"""
Uses the RMSProp update rule, which uses a moving average of squared
gradient values to set adaptive per-parameter learning rates.
config format:
- learning_rate: Scalar learning rate.
- decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
gradient cache.
- epsilon: Small scalar used for smoothing to avoid dividing by zero.
- cache: Moving average of second moments of gradients.
"""
if config is None: config = {}
config.setdefault('learning_rate', 1e-2)
config.setdefault('decay_rate', 0.99)
config.setdefault('epsilon', 1e-8)
config.setdefault('cache', np.zeros_like(x))
next_x = None
###########################################################################
# TODO: Implement the RMSprop update formula, storing the next value of x #
# in the next_x variable. Don't forget to update cache value stored in #
# config['cache']. #
###########################################################################
config['cache'] = config['decay_rate'] * config['cache'] + (1 - config['decay_rate']) * dx**2
next_x = x - config['learning_rate'] * dx / (np.sqrt(config['cache']) + config['epsilon'])
###########################################################################
# END OF YOUR CODE #
###########################################################################
return next_x, config
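RMSProp keeps a decaying average of squared gradients, cache = decay_rate * cache + (1 - decay_rate) * dx**2, and divides the step by sqrt(cache) + epsilon, so parameters with consistently large gradients get a smaller effective learning rate. A minimal usage sketch, again on the assumed toy quadratic used above:

x = np.array([3.0, -2.0])
config = {'learning_rate': 1e-2}           # decay_rate, epsilon, cache use the defaults
for step in range(3):
    dx = x.copy()                          # gradient of 0.5 * ||x||^2
    x, config = rmsprop(x, dx, config)
    print(step, x, config['cache'])        # cache grows as squared gradients accumulate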
4. Adam:
def adam(x, dx, config=None):
"""
Uses the Adam update rule, which incorporates moving averages of both the
gradient and its square and a bias correction term.
config format:
- learning_rate: Scalar learning rate.
- beta1: Decay rate for moving average of first moment of gradient.
- beta2: Decay rate for moving average of second moment of gradient.
- epsilon: Small scalar used for smoothing to avoid dividing by zero.
- m: Moving average of gradient.
- v: Moving average of squared gradient.
- t: Iteration number.
"""
if config is None: config = {}
config.setdefault('learning_rate', 1e-3)
config.setdefault('beta1', 0.9)
config.setdefault('beta2', 0.999)
config.setdefault('epsilon', 1e-8)
config.setdefault('m', np.zeros_like(x))
config.setdefault('v', np.zeros_like(x))
config.setdefault('t', 0)
next_x = None
###########################################################################
# TODO: Implement the Adam update formula, storing the next value of x in #
# the next_x variable. Don't forget to update the m, v, and t variables #
# stored in config. #
###########################################################################
config['t'] += 1
beta1 = config['beta1']
beta2 = config['beta2']
epsilon = config['epsilon']
learning_rate = config['learning_rate']
config['m'] = beta1 * config['m'] + (1 - beta1) * dx        # first-moment estimate
config['v'] = beta2 * config['v'] + (1 - beta2) * dx**2     # second-moment estimate
mb = config['m'] / (1 - beta1**config['t'])                 # bias-corrected first moment
vb = config['v'] / (1 - beta2**config['t'])                 # bias-corrected second moment
next_x = x - learning_rate * mb / (np.sqrt(vb) + epsilon)
###########################################################################
# END OF YOUR CODE #
###########################################################################
return next_x, config
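Adam combines both ideas: m = beta1 * m + (1 - beta1) * dx tracks the gradient, v = beta2 * v + (1 - beta2) * dx**2 tracks its square, the corrections m / (1 - beta1**t) and v / (1 - beta2**t) compensate for both estimates starting at zero, and the step is learning_rate * m_hat / (sqrt(v_hat) + epsilon). A minimal usage sketch on the same assumed toy quadratic:

x = np.array([3.0, -2.0])
config = {'learning_rate': 1e-1}           # beta1, beta2, epsilon, m, v, t use the defaults
for step in range(3):
    dx = x.copy()                          # gradient of 0.5 * ||x||^2
    x, config = adam(x, dx, config)
    print(step, x, config['t'])            # thanks to bias correction the first step has size ~learning_rate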