Forked project repository: SSD
The author writes the training code in a distributed-training style, which makes this part of the code rather bloated; I have added comments to parts of it. I am not very familiar with multi-machine distributed training, and it is not the focus here, so I will not cover it in depth. Instead, I will briefly walk through the optimization techniques the author uses during training, including the choice of optimizer.
1. Moving Averages
```python
# =================================================================== #
# Configure the moving averages.
# =================================================================== #
if FLAGS.moving_average_decay:
    moving_average_variables = slim.get_model_variables()
    variable_averages = tf.train.ExponentialMovingAverage(
        FLAGS.moving_average_decay, global_step)
else:
    moving_average_variables, variable_averages = None, None
```
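Creating the `ExponentialMovingAverage` object by itself does not update anything; later in the training script the shadow (averaged) variables still have to be refreshed alongside the train op. A minimal sketch of that pattern, reusing the names from the snippet above (where exactly this happens in the project may differ):

```python
# Names (FLAGS, variable_averages, moving_average_variables) follow the
# snippet above; placement in the actual script may differ.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
if FLAGS.moving_average_decay:
    # Refresh the shadow (averaged) copies of the model variables each step.
    update_ops.append(variable_averages.apply(moving_average_variables))
```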
2. Learning Rate Decay
```python
with tf.device(deploy_config.optimizer_device()):
    learning_rate = tf_utils.configure_learning_rate(FLAGS,
                                                     dataset.num_samples,
                                                     global_step)
```
The helper function supports three modes: a fixed (constant) learning rate and two different decay schedules (default: exponential):
```python
def configure_learning_rate(flags, num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.
    Returns:
      A `Tensor` representing the learning rate.
    """
    decay_steps = int(num_samples_per_epoch / flags.batch_size *
                      flags.num_epochs_per_decay)

    if flags.learning_rate_decay_type == 'exponential':
        return tf.train.exponential_decay(flags.learning_rate,
                                          global_step,
                                          decay_steps,
                                          flags.learning_rate_decay_factor,
                                          staircase=True,
                                          name='exponential_decay_learning_rate')
    elif flags.learning_rate_decay_type == 'fixed':
        return tf.constant(flags.learning_rate, name='fixed_learning_rate')
    elif flags.learning_rate_decay_type == 'polynomial':
        return tf.train.polynomial_decay(flags.learning_rate,
                                         global_step,
                                         decay_steps,
                                         flags.end_learning_rate,
                                         power=1.0,
                                         cycle=False,
                                         name='polynomial_decay_learning_rate')
```
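With `staircase=True`, `exponential_decay` keeps the rate piecewise constant: the base rate is multiplied by `learning_rate_decay_factor` once every `decay_steps` steps. A pure-Python sketch of the same rule (the numbers below are illustrative, not the project's defaults):

```python
def staircase_exponential_decay(base_lr, decay_factor, decay_steps, step):
    """Pure-Python equivalent of tf.train.exponential_decay(staircase=True)."""
    return base_lr * decay_factor ** (step // decay_steps)

# Illustrative values only.
for step in (0, 1000, 5000, 10000):
    print(step, staircase_exponential_decay(0.01, 0.94, 2000, step))
```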
3. Optimizer Selection
```python
optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
```
There is a rich set of choices (default: rmsprop):
```python
def configure_optimizer(flags, learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.
    Returns:
      An instance of an optimizer.
    """
    if flags.optimizer == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate,
            rho=flags.adadelta_rho,
            epsilon=flags.opt_epsilon)
    elif flags.optimizer == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(
            learning_rate,
            initial_accumulator_value=flags.adagrad_initial_accumulator_value)
    elif flags.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate,
            beta1=flags.adam_beta1,
            beta2=flags.adam_beta2,
            epsilon=flags.opt_epsilon)
    elif flags.optimizer == 'ftrl':
        optimizer = tf.train.FtrlOptimizer(
            learning_rate,
            learning_rate_power=flags.ftrl_learning_rate_power,
            initial_accumulator_value=flags.ftrl_initial_accumulator_value,
            l1_regularization_strength=flags.ftrl_l1,
            l2_regularization_strength=flags.ftrl_l2)
    elif flags.optimizer == 'momentum':
        optimizer = tf.train.MomentumOptimizer(
            learning_rate,
            momentum=flags.momentum,
            name='Momentum')
    elif flags.optimizer == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate,
            decay=flags.rmsprop_decay,
            momentum=flags.rmsprop_momentum,
            epsilon=flags.opt_epsilon)
    elif flags.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
        raise ValueError('Optimizer [%s] was not recognized' % flags.optimizer)
    return optimizer
```
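To exercise these two helpers in isolation, all that is needed is an object exposing the relevant flag attributes. The sketch below uses a hypothetical `argparse.Namespace` with made-up values; the real defaults live in the project's training script:

```python
from argparse import Namespace
import tensorflow as tf

# Hypothetical flag values, for illustration only.
flags = Namespace(optimizer='rmsprop',
                  rmsprop_decay=0.9, rmsprop_momentum=0.9, opt_epsilon=1.0,
                  learning_rate=0.01, learning_rate_decay_type='exponential',
                  learning_rate_decay_factor=0.94, num_epochs_per_decay=2.0,
                  batch_size=32, end_learning_rate=0.0001)

global_step = tf.train.create_global_step()
learning_rate = configure_learning_rate(flags, num_samples_per_epoch=5000,
                                        global_step=global_step)
optimizer = configure_optimizer(flags, learning_rate)
```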
4. Training
In between there is actually a fairly long stretch of distributed gradient computation, which I will not cover in detail here. Roughly: gradients are computed on each clone, aggregated, and applied to update the clone networks; the resulting update node is then exposed as train_tensor, and so on. A condensed sketch of this pattern is given below.
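This follows the standard slim `model_deploy` pattern; the sketch below is a simplified version of the usual training loop, not a verbatim copy of the project's code (variable names are assumptions):

```python
# Compute the total loss and the gradients aggregated over all clones.
total_loss, clones_gradients = model_deploy.optimize_clones(
    clones, optimizer, var_list=variables_to_train)

# Apply the aggregated gradients once, then group with the other update ops.
grad_updates = optimizer.apply_gradients(clones_gradients,
                                         global_step=global_step)
update_ops.append(grad_updates)
update_op = tf.group(*update_ops)

# train_tensor returns the loss value, but only after the update has run.
with tf.control_dependencies([update_op]):
    train_tensor = tf.identity(total_loss, name='train_op')
```

The actual training loop is then driven by `slim.learning.train`: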
```python
config = tf.ConfigProto(log_device_placement=False,
                        gpu_options=gpu_options)
saver = tf.train.Saver(max_to_keep=5,
                       keep_checkpoint_every_n_hours=1.0,
                       write_version=2,
                       pad_step_number=False)

slim.learning.train(
    train_tensor,
    logdir=FLAGS.train_dir,
    master='',
    is_chief=True,
    init_fn=tf_utils.get_init_fn(FLAGS),
    summary_op=summary_op,
    number_of_steps=FLAGS.max_number_of_steps,
    log_every_n_steps=FLAGS.log_every_n_steps,
    save_summaries_secs=FLAGS.save_summaries_secs,
    saver=saver,
    save_interval_secs=FLAGS.save_interval_secs,
    session_config=config,
    sync_optimizer=None)
```
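The `init_fn=tf_utils.get_init_fn(FLAGS)` argument is what enables warm-starting from a pre-trained checkpoint (for example, VGG weights): it returns a function that restores matching variables while skipping excluded scopes. A rough sketch of the usual slim-style implementation, with flag names assumed rather than copied from the project's tf_utils.py:

```python
def get_init_fn(flags):
    """Returns a callable the chief worker runs once to warm-start training."""
    if flags.checkpoint_path is None:
        return None
    # Variables whose scope is listed in checkpoint_exclude_scopes are not
    # restored (e.g. the SSD-specific layers when starting from VGG weights).
    exclusions = []
    if flags.checkpoint_exclude_scopes:
        exclusions = [scope.strip()
                      for scope in flags.checkpoint_exclude_scopes.split(',')]
    variables_to_restore = []
    for var in slim.get_model_variables():
        if not any(var.op.name.startswith(ex) for ex in exclusions):
            variables_to_restore.append(var)
    return slim.assign_from_checkpoint_fn(
        flags.checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=flags.ignore_missing_vars)
```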
This concludes the walkthrough of the SSD project.
For how to use the trained model, see the last part of the article in the Jizhi (集智) column.