莫烦tensorflow Batch Normalization总结与tf.identity()/EMA功能

代码来源于莫烦tensorflow Batch Normalization一节。我添加了一些注释（自己的理解），添加了一些函数的功能介绍。

# 对输入的分散的数据要统一数据规格，这样神经网络可以更好的学习数据的规律，因为数据分布对神经网络是有很大影响的，比如：
# w = 0.1, x1 = 1, x2 = 20 --> w*x1 = 0.1, w*x2 = 2 --> tanh activation function --> tanh(w*x1)接近0, tanh(w*x2)接近1，
#也就是说随着x增大，tanh(w*x)会接近1或者-1(进入饱和区)，激活函数对较大的x特征范围不再敏感.这时候可以对输入数据进行Normalization.

#但是这种情况不仅发生在输入层，所以引入Batch Normalization,在每个batch前向传播时，都要在全连接层与激活函数之间进行Normalization,保持数据分布集中在敏感区域.SGD
# Batch Normalization 包括标准化工序 x' = (x - u) / sqrt(s + ε)，其中u是batch均值，s是batch方差，ε是防止除零的小常数.
# 反标准化工序 y = r * x' + b = BN(x')(参数r, b) 自动学习，用于与标准化工序共同调节数据分布.r:扩展参数,b:平移参数

#=================tf.identity()函数作用与control_dependencies===================
# 第一种：
# x_plus_1 = tf.assign_add(x, 1)
#
# #control_dependencies的意义是，在执行with包含的内容（在这里就是 y = x）前，
# #先执行control_dependencies参数中的内容（在这里就是 x_plus_1），这里的解释不准确，先接着看。。。
# with tf.control_dependencies([x_plus_1]):
# y = x
# 第二种：
# x_plus_1 = tf.assign_add(x, 1)
# with tf.control_dependencies([x_plus_1]):
# y = tf.identity(x)#修改部分
#
# 结论：
# 对于control_dependencies这个管理器，只有当里面的操作是一个op时，才会生效，也就是先执行传入的参数op，再执行里面的op。
# 而y=x仅仅是tensor的一个简单赋值，不是定义的op，所以在图中不会形成一个节点，这样该管理器就失效了。
# tf.identity是返回一个一模一样新的tensor的op，这会增加一个新节点到graph中，这时control_dependencies就会生效，所以第二种情况的输出符合预期。
#
#=========================================================================

#=============================EMA=========================================
# 指数滑动平均(ExponentialMovingAverage, EMA)被广泛应用于深度学习中，例如BN层以及RMSprop、Adadelta、Adam等梯度下降优化方法。
# 1. tensorflow中提供了tf.train.ExponentialMovingAverage来实现滑动平均模型，他使用指数衰减来计算变量的移动平均值。
# tf.train.ExponentialMovingAverage.__init__(self, decay, num_updates=None, zero_debias=False, name="ExponentialMovingAverage"):
# decay是衰减率在创建ExponentialMovingAverage对象时，需指定衰减率(decay)，用于控制模型的更新速度。decay设置为接近1的值比较合理，通常为：0.999,0.9999。
#
# 2. 影子变量( shadow variable)的初始值与训练变量(variable)的初始值相同。当运行变量更新时，每个影子变量都会更新为：
# shadow variable = decay * shadow variable + (1 - decay) * variable
# num_updates是ExponentialMovingAverage提供用来动态设置decay的参数，当初始化时提供了参数，即不为none时，每次的衰减率是：
# decay = min{decay, (1 + num_updates) / (10 + num_updates)}
# 3. apply()方法添加了训练变量的影子副本，并保持了其影子副本中训练变量的移动平均值操作。在每次训练之后调用此操作，更新移动平均值。
# 4. average()和average_name()方法可以获取影子变量及其名称。
# 5. https://blog.csdn.net/qq_14845119/article/details/78767544 以上信息来源于这个网址。
#==========================================================================

#代码可运行，如下。

# -*- coding: utf-8 -*-

"""

Know more, visit my Python tutorial page: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou

Dependencies:
tensorflow: 1.1.0
matplotlib
numpy
"""
#import packages

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Hyperparameters shared by both networks built below.
# Author's observation: with relu the loss curve eventually disappears from the
# plot because the data distribution ends up extremely uneven (the loss is left
# with as little as a single data point); tanh works well.
ACTIVATION = tf.nn.tanh
N_LAYERS = 7 # number of hidden layers
N_HIDDEN_UNITS = 30 # number of units in each hidden layer

def built_net(xs, ys, norm):
    """Build a fully connected regression network, optionally batch-normalized.

    The graph has ``N_LAYERS`` hidden layers of ``N_HIDDEN_UNITS`` units with
    ``ACTIVATION``, followed by a linear output layer with one unit. When
    ``norm`` is true, the network input and every hidden layer's
    pre-activation are batch-normalized; the output layer never is.

    Args:
        xs: 2-D input tensor; its second dimension must be statically known.
        ys: Target tensor compared against the one-unit prediction.
        norm: Whether to insert batch normalization.

    Returns:
        ``[train_op, cost, layers_inputs]`` where ``layers_inputs`` holds the
        (possibly normalized) network input followed by each hidden layer's
        output.
    """
    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        """Add one dense layer; normalize between the affine map and the activation."""
        weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0, stddev=1.))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
        wx_plus_b = tf.matmul(inputs, weights) + biases

        if norm:
            wx_plus_b = _batch_norm(wx_plus_b, out_size)

        if activation_function is None:
            return wx_plus_b
        return activation_function(wx_plus_b)

    tf.set_random_seed(1)
    np.random.seed(1)

    if norm:
        # Normalize the raw input as well; the feature count is read from the
        # static shape instead of being hard-coded to 1.
        xs = _batch_norm(xs, xs.get_shape()[1].value)

    # layers_inputs[k] is the input of hidden layer k (and the output of layer k-1).
    layers_inputs = [xs]

    for l_n in range(N_LAYERS):
        layer_input = layers_inputs[l_n]
        in_size = layer_input.get_shape()[1].value
        layers_inputs.append(
            add_layer(layer_input, in_size, N_HIDDEN_UNITS, ACTIVATION, norm))

    # Output layer: take the width from the previous layer rather than a
    # literal 30, so changing N_HIDDEN_UNITS (or N_LAYERS) keeps shapes valid.
    last_size = layers_inputs[-1].get_shape()[1].value
    prediction = add_layer(layers_inputs[-1], last_size, 1, activation_function=None)

    cost = tf.reduce_mean(
        tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]


def _batch_norm(x, size, epsilon=0.001, decay=0.5):
    """Batch-normalize ``x`` over axis 0 with a learnable scale and shift.

    Computes ``(x - mean) / sqrt(var + epsilon) * scale + shift`` through
    ``tf.nn.batch_normalization``, using the statistics of the current batch.

    An exponential moving average of the batch mean/variance is updated
    whenever the result is evaluated (the identity ops sit behind a control
    dependency on the EMA update). The averaged values are only maintained:
    nothing here reads them back, so normalization always uses the current
    batch statistics.
    """
    batch_mean, batch_var = tf.nn.moments(x, axes=[0])  # per-column statistics
    scale = tf.Variable(tf.ones([size]))    # learnable scale (gamma)
    shift = tf.Variable(tf.zeros([size]))   # learnable shift (beta)
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    ema_apply_op = ema.apply([batch_mean, batch_var])
    # tf.identity creates real ops, so the control dependency actually forces
    # the EMA update to run before the statistics are consumed.
    with tf.control_dependencies([ema_apply_op]):
        mean, var = tf.identity(batch_mean), tf.identity(batch_var)
    return tf.nn.batch_normalization(x, mean, var, shift, scale, epsilon)

#画hist，即各层输入数据分布图
def plot_his(inputs, inputs_norm):
    """Plot a histogram of every layer's input for both networks.

    The figure is a 2-row grid: row 0 shows ``inputs`` (network without
    normalization), row 1 shows ``inputs_norm`` (network with normalization),
    one column per layer.
    """
    for row, layer_values in enumerate((inputs, inputs_norm)):
        n_cols = len(layer_values)
        for col, values in enumerate(layer_values):
            plt.subplot(2, n_cols, row * n_cols + (col + 1))
            plt.cla()
            # The first column is the network input and gets a wider range
            # than the layer outputs in the remaining columns.
            the_range = (-7, 10) if col == 0 else (-1, 1)
            plt.hist(values.ravel(), bins=15, range=the_range, color='#FF5733')
            plt.yticks(())
            # Only the bottom row shows x ticks.
            plt.xticks(the_range if row == 1 else ())
            axes = plt.gca()
            for side in ('right', 'top'):
                axes.spines[side].set_color('none')
        # Title is placed on the last subplot drawn in this row.
        plt.title("%s normalizing" % ("Without" if row == 0 else "With"))
    plt.draw()
    plt.pause(0.01)

if __name__ == '__main__':
    # Synthetic data: a noisy parabola sampled at 500 points in [-7, 10].
    tf.set_random_seed(1)
    np.random.seed(1)
    x_data = np.linspace(-7, 10, 500)[:, np.newaxis]
    # np.random.normal takes (mean, standard deviation, shape).
    noise = np.random.normal(0, 8, x_data.shape)
    y_data = np.square(x_data) - 5 + noise

    # To inspect the raw data, uncomment:
    # plt.scatter(x_data, y_data)
    # plt.show()

    # Placeholders shaped (batch_size, 1).
    xs = tf.placeholder(tf.float32, [None, 1])
    ys = tf.placeholder(tf.float32, [None, 1])

    # Two networks over the same placeholders: without and with normalization.
    train_op, cost, layers_inputs = built_net(xs, ys, norm=False)
    train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True)

    cost_hist = []
    cost_hist_norm = []
    record_step = 5
    feed = {xs: x_data, ys: y_data}  # every step trains on the full data set

    # The context manager closes the session even if training raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        plt.ion()
        plt.figure(figsize=(7, 3))
        for i in range(251):
            sess.run(train_op, feed_dict=feed)
            sess.run(train_op_norm, feed_dict=feed)
            if i % 50 == 0:
                # Histogram of every layer's input for both networks.
                all_inputs, all_inputs_norm = sess.run(
                    [layers_inputs, layers_inputs_norm], feed_dict=feed)
                plot_his(all_inputs, all_inputs_norm)

            if i % record_step == 0:
                # Record the cost of both networks.
                cost_hist.append(sess.run(cost, feed_dict=feed))
                cost_hist_norm.append(sess.run(cost_norm, feed_dict=feed))

    plt.ioff()
    plt.figure()
    # Loss curves with and without normalization; each curve's x-axis is
    # derived from its own history length.
    plt.plot(np.arange(len(cost_hist)) * record_step, np.array(cost_hist), label='no BN')
    plt.plot(np.arange(len(cost_hist_norm)) * record_step, np.array(cost_hist_norm), label='BN')
    plt.legend()
    #plt.show()
    plt.savefig('cost-tanh.png')

莫烦tensorflow Batch Normalization总结与tf.identity()/EMA功能

猜你喜欢