【深度学习】 目标检测之SSD(Single Shot MultiBox Detector)算法



# Training-phase input layer for SSD: reads annotated images from an LMDB
# database and configures the augmentation pipeline (mirroring, resizing,
# photometric distortion, expansion and batch sampling).
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "data"
  type: "AnnotatedData"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    mirror: true
    # Specify if we want to randomly mirror data. default = false
    mean_value: 104
    mean_value: 117
    mean_value: 123
    resize_param {
      prob: 1
      # Probability of using this resize policy.
      resize_mode: WARP
      height: 512
      width: 512
      interp_mode: LINEAR
      # Interpolation mode for resizing. Caffe supports five modes in total:
      # LINEAR, AREA, NEAREST, CUBIC and LANCZOS4; all five are listed here.
      interp_mode: AREA
      interp_mode: NEAREST
      interp_mode: CUBIC
      interp_mode: LANCZOS4
    }
    emit_constraint {
      emit_type: CENTER
      # Condition for emitting annotations. default = CENTER
    }
    distort_param {
      brightness_prob: 0.5
      # The probability of adjusting brightness.
      brightness_delta: 32
      # Amount to add to the pixel values within [-delta, delta].
      contrast_prob: 0.5
      # The probability of adjusting contrast.
      contrast_lower: 0.5
      # Lower bound for random contrast factor.
      contrast_upper: 1.5
      # Upper bound for random contrast factor.
      hue_prob: 0.5
      # The probability of adjusting hue.
      hue_delta: 18
      # Amount to add to the hue channel within [-delta, delta].
      saturation_prob: 0.5
      # The probability of adjusting saturation.
      saturation_lower: 0.5
      # Lower bound for the random saturation factor.
      saturation_upper: 1.5
      # Upper bound for the random saturation factor.
      random_order_prob: 0
      # The probability of randomly ordering the image channels.
    }
    expand_param {
      prob: 0.5
      # Probability of using this expansion policy.
      max_expand_ratio: 4.0
      # The ratio to expand the image.
    }
  }
  data_param {
    source: "/home/zhangchen/task/data/object_detection/PASCAL-VOC/VOCdevkit/VOC0712/lmdb/VOC0712_trainval_lmdb"
    batch_size: 1
    backend: LMDB
  }
  annotated_data_param {
    # Define the samplers.
    batch_sampler {
      # There is also a use_original_image parameter: use the original image
      # as the source for sampling. default = true
      max_sample: 1
      # If provided, break when found certain number of samples satisfying
      # the sample_constraint.
      max_trials: 1
      # Maximum number of trials for sampling to avoid infinite loop.
    }
    batch_sampler {
      # Sample a batch of bboxes with provided constraints.
      sampler {
        # Sample a bbox in the normalized space [0, 1] with provided constraints.
        min_scale: 0.3
        # Minimum scale of the sampled bbox. default = 1.
        max_scale: 1
        # Maximum scale of the sampled bbox. default = 1.
        min_aspect_ratio: 0.5
        # Minimum aspect ratio of the sampled bbox. default = 1.
        max_aspect_ratio: 2.0
        # Maximum aspect ratio of the sampled bbox. default = 1.
      }
      sample_constraint {
        # Constraints for selecting sampled bbox.
        min_jaccard_overlap: 0.1
        # Minimum Jaccard overlap between sampled bbox and all bboxes in
        # AnnotationGroup.
      }
      max_sample: 1
      max_trials: 50
    }
    batch_sampler {
      sampler {
        min_scale: 0.3
        max_scale: 1
        min_aspect_ratio: 0.5
        max_aspect_ratio: 2.0
      }
      sample_constraint {
        min_jaccard_overlap: 0.3
      }
      max_sample: 1
      max_trials: 50
    }
    batch_sampler {
      sampler {
        min_scale: 0.3
        max_scale: 1
        min_aspect_ratio: 0.5
        max_aspect_ratio: 2.0
      }
      sample_constraint {
        min_jaccard_overlap: 0.5
      }
      max_sample: 1
      max_trials: 50
    }
    batch_sampler {
      sampler {
        min_scale: 0.3
        max_scale: 1
        min_aspect_ratio: 0.5
        max_aspect_ratio: 2.0
      }
      sample_constraint {
        min_jaccard_overlap: 0.7
      }
      max_sample: 1
      max_trials: 50
    }
    batch_sampler {
      sampler {
        min_scale: 0.3
        max_scale: 1
        min_aspect_ratio: 0.5
        max_aspect_ratio: 2.0
      }
      sample_constraint {
        min_jaccard_overlap: 0.9
      }
      max_sample: 1
      max_trials: 50
    }
    batch_sampler {
      sampler {
        min_scale: 0.3
        max_scale: 1
        min_aspect_ratio: 0.5
        max_aspect_ratio: 2.0
      }
      sample_constraint {
        max_jaccard_overlap: 1
        # Maximum Jaccard overlap between sampled bbox and all bboxes in
        # AnnotationGroup.
      }
      max_sample: 1
      max_trials: 50
    }
    label_map_file: "/home/zhangchen/task/code/object_detection/SSD_ResNet-50/PASCAL-VOC/labelmap_voc.prototxt"
  }
}


# First convolution: 7x7 kernel, stride 2, pad 3, 64 outputs, applied to "data".
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
    # The multiplier on the global learning rate for this parameter. default = 1.0
    decay_mult: 1
    # The multiplier on the global weight decay for this parameter. default = 1.0
  }
  param {
    # NOTE(review): bias_term is false below, so this second param spec (and
    # the bias_filler) presumably target a bias blob that does not exist;
    # Caffe may reject the extra spec. Confirm, then either drop them or
    # enable the bias.
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 64
    # The number of outputs for the layer.
    bias_term: false
    # Whether to have bias terms. default = true
    pad: 3
    # The padding size; defaults to 0.
    kernel_size: 7
    # The kernel size.
    stride: 2
    # The stride; defaults to 1.
    weight_filler {
      # The filler for the weight.
      type: "msra"
    }
    bias_filler {
      # The filler for the bias.
      type: "constant"
      value: 0
    }
  }
}


# Batch normalization applied in place on "conv1" (bottom and top are the
# same blob). All three param specs use zero learning-rate and weight-decay
# multipliers. Closing braces are restored.
layer {
  name: "bn_conv1"
  type: "BatchNorm"
  bottom: "conv1"
  top: "conv1"
  param {
    lr_mult: 0
    decay_mult: 0
  }
  param {
    lr_mult: 0
    decay_mult: 0
  }
  param {
    lr_mult: 0
    decay_mult: 0
  }
}


Batch Normalization作为最近一年来DL的重要成果,已经广泛被证明其有效性和重要性。虽然有些细节处理还解释不清其理论原因,但是实践证明好用才是真的好,别忘了DL从Hinton对深层网络做Pre-Train开始就是一个经验领先于理论分析的偏经验的一门学问。本文是对论文《Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift》的导读。



为什么深度神经网络随着网络深度加深,训练起来越来越困难,收敛越来越慢?这是个在DL领域很接近本质的好问题。很多论文都是解决这个问题的,比如ReLU激活函数,再比如Residual Network,BN本质上也是解释并从某个不同的角度来解决这个问题的。
一、“Internal Covariate Shift”问题

从论文名字可以看出,BN是用来解决“Internal Covariate Shift”问题的,那么首先得理解什么是“Internal Covariate Shift”?

论文首先说明Mini-Batch SGD相对于One Example SGD的两个优势:梯度更新方向更准确;并行计算速度快;(为什么要说这些?因为BatchNorm是基于Mini-Batch SGD的,所以先夸下Mini-Batch SGD,当然也是大实话);然后吐槽下SGD训练的缺点:超参数调起来很麻烦。(作者隐含意思是用BN就能解决很多SGD的缺点)

接着引入covariate shift的概念:如果ML系统实例集合<X,Y>中的输入值X的分布老是变,这不符合IID假设,网络模型很难稳定地学习规律,这不得引入迁移学习才能搞定吗,我们的ML系统还得去学习怎么迎合这种分布变化啊。对于深度学习这种包含很多隐层的网络结构,在训练过程中,因为各层参数不停在变化,所以每个隐层都会面临covariate shift的问题,也就是在训练过程中,隐层的输入分布老是变来变去,这就是所谓的“Internal Covariate Shift”,Internal指的是深层网络的隐层,是发生在网络内部的事情,而不像一般的covariate shift问题那样只发生在输入层。

然后提出了BatchNorm的基本思想:能不能让每个隐层节点的激活输入分布固定下来呢?这样就避免了“Internal Covariate Shift”问题了。



THAT’S IT。其实一句话就是:对于每个隐层神经元,把逐渐向非线性函数映射后向取值区间极限饱和区靠拢的输入分布强制拉回到均值为0方差为1的比较标准的正态分布,使得非线性变换函数的输入值落入对输入比较敏感的区域,以此避免梯度消失问题。因为梯度一直都能保持比较大的状态,所以很明显对神经网络的参数调整效率比较高,就是变动大,就是说向损失函数最优值迈动的步子大,也就是说收敛得快。BN说到底就是这么个机制,方法很简单,道理很深刻。









所以BN为了保证非线性的获得,对变换后的满足均值为0方差为1的x又进行了scale加上shift操作(y=scale*x+shift),每个神经元增加了两个参数scale和shift参数,这两个参数是通过训练学习到的,意思是通过scale和shift把这个值从标准正态分布左移或者右移一点并长胖一点或者变瘦一点,每个实例挪动的程度不一样,这样等价于非线性函数的值从正中心周围的线性区往非线性区动了动。核心思想应该是想找到一个线性和非线性的较好平衡点,既能享受非线性的较强表达能力的好处,又避免太靠非线性区两头使得网络收敛速度太慢。当然,这是我的理解,论文作者并未明确这样说。但是很明显这里的scale和shift操作是会有争议的,因为按照论文作者论文里写的理想状态,就会又通过scale和shift操作把变换后的x调整回未变换的状态,那不是绕了一圈又绕回去原始的“Internal Covariate Shift”问题里去了吗,感觉论文作者并未能够清楚地解释scale和shift操作的理论原因。




# Scale layer applied in place on "conv1", with a learned bias enabled.
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "scale_conv1"
  type: "Scale"
  bottom: "conv1"
  top: "conv1"
  scale_param {
    bias_term: true
    # Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
    # may be more efficient). Initialized with bias_filler (defaults to 0).
  }
}


# ReLU activation applied in place on "conv1" (bottom and top are the same
# blob). The closing brace is restored.
layer {
  name: "conv1_relu"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}


# 3x3 max pooling with stride 2 over "conv1", producing "pool1".
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    # The pooling method. default = MAX
    kernel_size: 3
    # The kernel size (square).
    stride: 2
    # The stride (equal in Y, X). default = 1
    # There is also a pad parameter: the padding size (equal in Y, X). default = 0
  }
}


# Element-wise combination of "res2a_branch1" and "res2a_branch2c" into "res2a".
# The closing brace is restored and the comment uses '#', the only comment
# syntax accepted by the protobuf text format.
layer {
  name: "res2a"
  type: "Eltwise"
  bottom: "res2a_branch1"
  bottom: "res2a_branch2c"
  top: "res2a"
  # An EltwiseOp "operation" parameter (the element-wise operation) can be
  # set; it is omitted here, so the default applies. default = SUM
}


# Normalize layer on "res4f" with a per-channel scale (channel_shared: false)
# initialized to the constant 20.
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "conv4f_norm"
  type: "Normalize"
  bottom: "res4f"
  top: "conv4f_norm"
  norm_param {
    across_spatial: false
    scale_filler {
      type: "constant"
      value: 20
    }
    channel_shared: false
    # Whether or not scale parameters are shared across channels.
  }
}


# Reorders the axes of "conv4f_norm_mbox_loc" to (0, 2, 3, 1).
# NOTE(review): presumably this moves channels last (N,C,H,W -> N,H,W,C)
# before flattening - confirm against the producing layer.
# Closing braces are restored.
layer {
  name: "conv4f_norm_mbox_loc_perm"
  type: "Permute"
  bottom: "conv4f_norm_mbox_loc"
  top: "conv4f_norm_mbox_loc_perm"
  permute_param {
    order: 0
    order: 2
    order: 3
    order: 1
  }
}


# Flattens "conv4f_norm_mbox_loc_perm" starting at axis 1.
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "conv4f_norm_mbox_loc_flat"
  type: "Flatten"
  bottom: "conv4f_norm_mbox_loc_perm"
  top: "conv4f_norm_mbox_loc_flat"
  flatten_param {
    axis: 1
    # The first axis to flatten: all preceding axes are retained in the
    # output. May be negative to index from the end (e.g., -1 for the last axis).
    # There is also an end_axis parameter: the last axis to flatten; all
    # following axes are retained in the output. May be negative to index
    # from the end (e.g., the default -1 for the last axis).
  }
}


# Generates prior (default) boxes for the "conv4f_norm" feature map, using
# "data" as the second bottom.
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "conv4f_norm_mbox_priorbox"
  type: "PriorBox"
  bottom: "conv4f_norm"
  bottom: "data"
  top: "conv4f_norm_mbox_priorbox"
  prior_box_param {
    min_size: 20.48
    # Minimum box size (in pixels). Required!
    # This value is important and is discussed in detail in the original paper.
    max_size: 51.2
    # Maximum box size (in pixels). Required!
    # This value is important and is discussed in detail in the original paper.
    aspect_ratio: 2
    # Various aspect ratios. Duplicate ratios will be ignored. If none is
    # provided, the default ratio 1 is used.
    flip: true
    # If true, will flip each aspect ratio. For example, if there is aspect
    # ratio "r", aspect ratio "1.0/r" is generated as well.
    clip: false
    # If true, will clip the prior so that it is within [0, 1].
    variance: 0.1
    # Variance for adjusting the prior bboxes (four values in total).
    variance: 0.1
    variance: 0.2
    variance: 0.2
    step: 16
    # Explicitly provide the step size.
  }
}


# Concatenates the flattened localization predictions from the six source
# feature maps along axis 1 into "mbox_loc".
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "mbox_loc"
  type: "Concat"
  bottom: "conv4f_norm_mbox_loc_flat"
  bottom: "fc7_mbox_loc_flat"
  bottom: "conv6_2_mbox_loc_flat"
  bottom: "conv7_2_mbox_loc_flat"
  bottom: "conv8_2_mbox_loc_flat"
  bottom: "conv9_2_mbox_loc_flat"
  top: "mbox_loc"
  concat_param {
    axis: 1
    # By default, ConcatLayer concatenates blobs along the "channels" axis (1).
  }
}


# Training-phase SSD loss over "mbox_loc", "mbox_conf", "mbox_priorbox" and
# "label".
# Closing braces are restored and comments use '#', the only comment syntax
# accepted by the protobuf text format.
layer {
  name: "mbox_loss"
  type: "MultiBoxLoss"
  bottom: "mbox_loc"
  bottom: "mbox_conf"
  bottom: "mbox_priorbox"
  bottom: "label"
  top: "mbox_loss"
  include {
    phase: TRAIN
  }
  # One propagate_down flag per bottom, in the order listed above: gradients
  # are propagated to mbox_loc and mbox_conf, not to mbox_priorbox or label.
  propagate_down: true
  propagate_down: true
  propagate_down: false
  propagate_down: false
  loss_param {
    normalization: VALID
  }
  multibox_loss_param {
    loc_loss_type: SMOOTH_L1
    conf_loss_type: SOFTMAX
    loc_weight: 1.0
    # Weight for localization loss. default = 1.0
    num_classes: 21
    # Number of classes to be predicted. Required!
    share_location: true
    # If true, bounding boxes are shared among different classes.
    match_type: PER_PREDICTION
    # Matching method during training: BIPARTITE or PER_PREDICTION.
    overlap_threshold: 0.5
    # If match_type is PER_PREDICTION, use overlap_threshold to determine the
    # extra matching bboxes.
    use_prior_for_matching: true
    # Use prior for matching.
    background_label_id: 0
    # Background label id.
    use_difficult_gt: false
    # If true, also consider difficult ground truth.
    neg_pos_ratio: 3.0
    # The negative/positive ratio.
    neg_overlap: 0.5
    # The negative overlap upper bound for the unmatched predictions.
    code_type: CENTER_SIZE
    ignore_cross_boundary_bbox: false
    # If true, ignore cross boundary bbox during matching. A cross boundary
    # bbox is a bbox that is outside of the image region.
    mining_type: MAX_NEGATIVE
    # Mining type during training.
    # NONE : use all negatives.
    # MAX_NEGATIVE : select negatives based on the score.
    # HARD_EXAMPLE : select hard examples based on "Training Region-based
    # Object Detectors with Online Hard Example Mining", Shrivastava et al.
  }
}



