【深度学习】 目标检测之SSD(Single Shot MultiBox Detector)算法
caffe关键层参数解读
AnnotatedData
layer {
name: "data"
type: "AnnotatedData"
top: "data"
top: "label"
include {
phase: TRAIN
}
transform_param {
mirror: true
//Specify if we want to randomly mirror data.default = false
mean_value: 104
//如果是opencv,B通道要减的值,test要和train保持一致,可以计算数据值的mean值得到
mean_value: 117
//如果是opencv,G通道要减的值,test要和train保持一致,可以计算数据值的mean值得到
mean_value: 123
//如果是opencv,R通道要减的值,test要和train保持一致,可以计算数据值的mean值得到
resize_param {
prob: 1
//Probability of using this resize policy
resize_mode: WARP
//WARP、FIT_SMALL_SIZE、FIT_LARGE_SIZE_AND_PAD三种模式
height: 512
width: 512
//如果是FIT_SMALL_SIZE时,还要有height_scale和width_scale参数
interp_mode: LINEAR
//interpolation for resizing,resize插值的方式,caffe总共支持五种方式,LINEAR、AREA、NEAREST、CUBIC和LANCZOS4。interp_mode是repeated字段,同时列出多个时,每次resize会从中随机选用一种
interp_mode: AREA
interp_mode: NEAREST
interp_mode: CUBIC
interp_mode: LANCZOS4
}
emit_constraint {
emit_type: CENTER
//Condition for emitting annotations.default = CENTER
}
distort_param {
brightness_prob: 0.5
//The probability of adjusting brightness.随机亮度调整的几率
brightness_delta: 32
//Amount to add to the pixel values within [-delta, delta].
contrast_prob: 0.5
//The probability of adjusting contrast.随机调整对比度的几率
contrast_lower: 0.5
//Lower bound for random contrast factor.
contrast_upper: 1.5
//Upper bound for random contrast factor.
hue_prob: 0.5
//The probability of adjusting hue.随机调整色度的几率
hue_delta: 18
//Amount to add to the hue channel within [-delta, delta].
saturation_prob: 0.5
//The probability of adjusting saturation.随机调整饱和度的几率
saturation_lower: 0.5
//Lower bound for the random saturation factor.
saturation_upper: 1.5
//Upper bound for the random saturation factor.
random_order_prob: 0
//The probability of randomly order the image channels.通道顺序随机的几率
}
expand_param {
prob: 0.5
//Probability of using this expansion policy
//expand是指对图像进行缩小,图像的其余区域补0
max_expand_ratio: 4.0
//The ratio to expand the image.
}
}
data_param {
source: "/home/zhangchen/task/data/object_detection/PASCAL-VOC/VOCdevkit/VOC0712/lmdb/VOC0712_trainval_lmdb"
batch_size: 1
backend: LMDB
}
annotated_data_param {
//Define the sampler.
batch_sampler {
//有参数use_original_image,Use original image as the source for sampling.default = true
max_sample: 1
//If provided, break when found certain number of samples satisfying the sample_constraint.
max_trials: 1
//Maximum number of trials for sampling to avoid infinite loop.
}
batch_sampler {
//Sample a batch of bboxes with provided constraints.
sampler {
//Sample a bbox in the normalized space [0, 1] with provided constraints.
//在区间[min_scale,max_scale]内随机生成一个值,这个值作为patch的高Height,然后在[min_aspect_ratio,max_aspect_ratio]范围内生成ratio,从而得到patch的Width。到此为止patch的宽和高随机得到,然后在图像中进行一次patch,要求满足与GT的最小IOU不低于sample_constraint中的min_jaccard_overlap(例如该值为0.9时,也就是IOU>=0.9)。如果随机patch满足这个条件,patch被resize到输入大小,作为数据集
min_scale: 0.3
//Minimum scale of the sampled bbox.default = 1.
max_scale: 1
//Maximum scale of the sampled bbox.default = 1.
min_aspect_ratio: 0.5
//Minimum aspect ratio of the sampled bbox.default = 1.
max_aspect_ratio: 2.0
//Maximum aspect ratio of the sampled bbox.default = 1.
}
sample_constraint {
//Constraints for selecting sampled bbox.
min_jaccard_overlap: 0.1
//Minimum Jaccard overlap between sampled bbox and all bboxes in AnnotationGroup.
//最小IoU
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.3
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.5
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.7
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
min_jaccard_overlap: 0.9
}
max_sample: 1
max_trials: 50
}
batch_sampler {
sampler {
min_scale: 0.3
max_scale: 1
min_aspect_ratio: 0.5
max_aspect_ratio: 2.0
}
sample_constraint {
max_jaccard_overlap: 1
//Maximum Jaccard overlap between sampled bbox and all bboxes in AnnotationGroup.
}
max_sample: 1
max_trials: 50
}
label_map_file: "/home/zhangchen/task/code/object_detection/SSD_ResNet-50/PASCAL-VOC/labelmap_voc.prototxt"
}
}
Convolution
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
//The multiplier on the global learning rate for this parameter.default = 1.0
decay_mult: 1
//The multiplier on the global weight decay for this parameter.default = 1.0
}
//这个是针对weight的
param {
lr_mult: 2
decay_mult: 0
}
//这个是针对bias的
convolution_param {
num_output: 64
//The number of outputs for the layer
bias_term: false
//whether to have bias terms,default = true
//如果这个是false,关于bias的设置都不起作用
pad: 3
//The padding size; defaults to 0
kernel_size: 7
//The kernel size
stride: 2
//The stride; defaults to 1
weight_filler {
//The filler for the weight
type: "msra"
}
bias_filler {
//The filler for the bias
type: "constant"
value: 0
}
}
}
BatchNorm
layer {
name: "bn_conv1"
type: "BatchNorm"
bottom: "conv1"
top: "conv1"
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
param {
lr_mult: 0
decay_mult: 0
}
}
BN(批量归一化)层原理参考自以下博文:
https://www.cnblogs.com/guoyaohua/p/8724433.html
原作者写的很好,这里直接贴过来:
Batch Normalization作为最近一年来DL的重要成果,已经广泛被证明其有效性和重要性。虽然有些细节处理还解释不清其理论原因,但是实践证明好用才是真的好,别忘了DL从Hinton对深层网络做Pre-Train开始就是一个经验领先于理论分析的偏经验的一门学问。本文是对论文《Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift》的导读。
机器学习领域有个很重要的假设:IID独立同分布假设,就是假设训练数据和测试数据是满足相同分布的,这是通过训练数据获得的模型能够在测试集获得好的效果的一个基本保障。那BatchNorm的作用是什么呢?BatchNorm就是在深度神经网络训练过程中使得每一层神经网络的输入保持相同分布的。
接下来一步一步的理解什么是BN。
为什么深度神经网络随着网络深度加深,训练起来越来越困难,收敛越来越慢?这是个在DL领域很接近本质的好问题。很多论文都是解决这个问题的,比如ReLU激活函数,再比如Residual Network,BN本质上也是解释并从某个不同的角度来解决这个问题的。
一、“Internal Covariate Shift”问题
从论文名字可以看出,BN是用来解决“Internal Covariate Shift”问题的,那么首先得理解什么是“Internal Covariate Shift”?
论文首先说明Mini-Batch SGD相对于One Example SGD的两个优势:梯度更新方向更准确;并行计算速度快;(为什么要说这些?因为BatchNorm是基于Mini-Batch SGD的,所以先夸下Mini-Batch SGD,当然也是大实话);然后吐槽下SGD训练的缺点:超参数调起来很麻烦。(作者隐含意思是用BN就能解决很多SGD的缺点)
接着引入covariate shift的概念:如果ML系统实例集合<X,Y>中的输入值X的分布老是变,这不符合IID假设,网络模型很难稳定的学规律,这不得引入迁移学习才能搞定吗,我们的ML系统还得去学习怎么迎合这种分布变化啊。对于深度学习这种包含很多隐层的网络结构,在训练过程中,因为各层参数不停在变化,所以每个隐层都会面临covariate shift的问题,也就是在训练过程中,隐层的输入分布老是变来变去,这就是所谓的“Internal Covariate Shift”,Internal指的是深层网络的隐层,是发生在网络内部的事情,而不是covariate shift问题只发生在输入层。
然后提出了BatchNorm的基本思想:能不能让每个隐层节点的激活输入分布固定下来呢?这样就避免了“Internal Covariate Shift”问题了。
BN不是凭空拍脑袋拍出来的好点子,它是有启发来源的:之前的研究表明如果在图像处理中对输入图像进行白化(Whiten)操作的话——所谓白化,就是对输入数据分布变换到0均值,单位方差的正态分布——那么神经网络会较快收敛,那么BN作者就开始推论了:图像是深度神经网络的输入层,做白化能加快收敛,那么其实对于深度网络来说,其中某个隐层的神经元是下一层的输入,意思是其实深度神经网络的每一个隐层都是输入层,不过是相对下一层来说而已,那么能不能对每个隐层都做白化呢?这就是启发BN产生的原初想法,而BN也确实就是这么做的,可以理解为对深层神经网络每个隐层神经元的激活值做简化版本的白化操作。
二、BatchNorm的本质思想
BN的基本思想其实相当直观:因为深层神经网络在做非线性变换前的激活输入值(就是那个x=WU+B,U是输入)随着网络深度加深或者在训练过程中,其分布逐渐发生偏移或者变动,之所以训练收敛慢,一般是整体分布逐渐往非线性函数的取值区间的上下限两端靠近(对于Sigmoid函数来说,意味着激活输入值WU+B是大的负值或正值),所以这导致反向传播时低层神经网络的梯度消失,这是训练深层神经网络收敛越来越慢的本质原因,而BN就是通过一定的规范化手段,把每层神经网络任意神经元这个输入值的分布强行拉回到均值为0方差为1的标准正态分布,其实就是把越来越偏的分布强制拉回比较标准的分布,这样使得激活输入值落在非线性函数对输入比较敏感的区域,这样输入的小变化就会导致损失函数较大的变化,意思是这样让梯度变大,避免梯度消失问题产生,而且梯度变大意味着学习收敛速度快,能大大加快训练速度。
THAT’S IT。其实一句话就是:对于每个隐层神经元,把逐渐向非线性函数映射后向取值区间极限饱和区靠拢的输入分布强制拉回到均值为0方差为1的比较标准的正态分布,使得非线性变换函数的输入值落入对输入比较敏感的区域,以此避免梯度消失问题。因为梯度一直都能保持比较大的状态,所以很明显对神经网络的参数调整效率比较高,就是变动大,就是说向损失函数最优值迈动的步子大,也就是说收敛得快。BN说到底就是这么个机制,方法很简单,道理很深刻。
BN其实就是把每个隐层神经元的激活输入分布从偏离均值为0方差为1的正态分布通过平移均值压缩或者扩大曲线尖锐程度,调整为均值为0方差为1的正态分布。
那么把激活输入x调整到这个正态分布有什么用?首先我们看下均值为0,方差为1的标准正态分布代表什么含义:
这意味着在一个标准差范围内,也就是说68%的概率x其值落在[-1,1]的范围内,在两个标准差范围内,也就是说95%的概率x其值落在了[-2,2]的范围内。那么这又意味着什么?我们知道,激活值x=WU+B,U是真正的输入,x是某个神经元的激活值,假设非线性函数是sigmoid,那么看下sigmoid(x)其图形:
假设没有经过BN调整前x的原先正态分布均值是-6,方差是1,那么意味着95%的值落在了[-8,-4]之间,那么对应的Sigmoid(x)函数的值明显接近于0,这是典型的梯度饱和区,在这个区域里梯度变化很慢,为什么是梯度饱和区?请看下sigmoid(x)如果取值接近0或者接近于1的时候对应导数函数取值,接近于0,意味着梯度变化很小甚至消失。而假设经过BN后,均值是0,方差是1,那么意味着95%的x值落在了[-2,2]区间内,很明显这一段是sigmoid(x)函数接近于线性变换的区域,意味着x的小变化会导致非线性函数值较大的变化,也即是梯度变化较大,对应导数函数图中明显大于0的区域,就是梯度非饱和区。
从上面几个图应该看出来BN在干什么了吧?其实就是把隐层神经元激活输入x=WU+B从变化不拘一格的正态分布通过BN操作拉回到了均值为0,方差为1的正态分布,即原始正态分布中心左移或者右移到以0为均值,拉伸或者缩减形态形成以1为方差的图形。什么意思?就是说经过BN后,目前大部分Activation的值落入非线性函数的线性区内,其对应的导数远离导数饱和区,这样来加速训练收敛过程。
但是很明显,看到这里,稍微了解神经网络的读者一般会提出一个疑问:如果都通过BN,那么不就跟把非线性函数替换成线性函数效果相同了?这意味着什么?我们知道,如果是多层的线性函数变换其实这个深层是没有意义的,因为多层线性网络跟一层线性网络是等价的。这意味着网络的表达能力下降了,这也意味着深度的意义就没有了。
所以BN为了保证非线性的获得,对变换后的满足均值为0方差为1的x又进行了scale加上shift操作(y=scale*x+shift),每个神经元增加了两个参数scale和shift参数,这两个参数是通过训练学习到的,意思是通过scale和shift把这个值从标准正态分布左移或者右移一点并长胖一点或者变瘦一点,每个实例挪动的程度不一样,这样等价于非线性函数的值从正中心周围的线性区往非线性区动了动。核心思想应该是想找到一个线性和非线性的较好平衡点,既能享受非线性的较强表达能力的好处,又避免太靠非线性区两头使得网络收敛速度太慢。当然,这是我的理解,论文作者并未明确这样说。但是很明显这里的scale和shift操作是会有争议的,因为按照论文作者论文里写的理想状态,就会又通过scale和shift操作把变换后的x调整回未变换的状态,那不是绕了一圈又绕回去原始的“Internal Covariate Shift”问题里去了吗,感觉论文作者并未能够清楚地解释scale和shift操作的理论原因。
BatchNorm效果好。①不仅仅极大提升了训练速度,收敛过程大大加快;②还能增加分类效果,一种解释是这是类似于Dropout的一种防止过拟合的正则化表达方式,所以不用Dropout也能达到相当的效果;③另外调参过程也简单多了,对于初始化要求没那么高,而且可以使用大的学习率等。总而言之,经过这么简单的变换,带来的好处多得很,这也是为何现在BN这么快流行起来的原因。
更多内容请看作者原文。
https://www.cnblogs.com/guoyaohua/p/8724433.html
Scale
layer {
name: "scale_conv1"
type: "Scale"
bottom: "conv1"
top: "conv1"
scale_param {
bias_term: true
//Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but may be more efficient). Initialized with bias_filler (defaults to 0).
}
}
ReLU
layer {
name: "conv1_relu"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
Pooling
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
//The pooling method,default = MAX
kernel_size: 3
// The kernel size (square)
stride: 2
//The stride (equal in Y, X),default = 1
//还有pad参数,The padding size (equal in Y, X),default = 0
}
}
Eltwise
layer {
name: "res2a"
type: "Eltwise"
bottom: "res2a_branch1"
bottom: "res2a_branch2c"
top: "res2a"
//有参数EltwiseOp operation,element-wise operation,default = SUM
}
Normalize
layer {
name: "conv4f_norm"
type: "Normalize"
bottom: "res4f"
top: "conv4f_norm"
norm_param {
across_spatial: false
//如果是false,对每个空间位置(单个像素点)沿通道方向做L2归一化;如果是true,对整个1xcxhxw做归一化
scale_filler {
type: "constant"
value: 20
}
channel_shared: false
//Whether or not scale parameters are shared across channels.
}
}
Permute
layer {
name: "conv4f_norm_mbox_loc_perm"
type: "Permute"
bottom: "conv4f_norm_mbox_loc"
top: "conv4f_norm_mbox_loc_perm"
permute_param {
//置换索引轴顺序
order: 0
order: 2
order: 3
order: 1
}
}
Flatten
layer {
name: "conv4f_norm_mbox_loc_flat"
type: "Flatten"
bottom: "conv4f_norm_mbox_loc_perm"
top: "conv4f_norm_mbox_loc_flat"
flatten_param {
axis: 1
//The first axis to flatten: all preceding axes are retained in the output. May be negative to index from the end (e.g., -1 for the last axis).
//还有参数end_axis,The last axis to flatten: all following axes are retained in the output. May be negative to index from the end (e.g., the default -1 for the last axis).
}
}
PriorBox
layer {
name: "conv4f_norm_mbox_priorbox"
type: "PriorBox"
bottom: "conv4f_norm"
bottom: "data"
top: "conv4f_norm_mbox_priorbox"
prior_box_param {
min_size: 20.48
// Minimum box size (in pixels). Required!
//in pixels,这个值很重要,在原文里也有着重描写
max_size: 51.2
//Maximum box size (in pixels). Required!
//in pixels,这个值很重要,在原文里也有着重描写
aspect_ratio: 2
//Various of aspect ratios. Duplicate ratios will be ignored. If none is provided, we use default ratio 1.
flip: true
//If true, will flip each aspect ratio. For example, if there is aspect ratio "r", we will generate aspect ratio "1.0/r" as well.
clip: false
//If true, will clip the prior so that it is within [0, 1]
variance: 0.1
//Variance for adjusting the prior bboxes.
variance: 0.1
variance: 0.2
variance: 0.2
step: 16
//Explicitly provide the step size.
}
}
Concat
layer {
name: "mbox_loc"
type: "Concat"
bottom: "conv4f_norm_mbox_loc_flat"
bottom: "fc7_mbox_loc_flat"
bottom: "conv6_2_mbox_loc_flat"
bottom: "conv7_2_mbox_loc_flat"
bottom: "conv8_2_mbox_loc_flat"
bottom: "conv9_2_mbox_loc_flat"
top: "mbox_loc"
concat_param {
axis: 1
//By default, ConcatLayer concatenates blobs along the "channels" axis (1).
}
}
MultiBoxLoss
layer {
name: "mbox_loss"
type: "MultiBoxLoss"
bottom: "mbox_loc"
bottom: "mbox_conf"
bottom: "mbox_priorbox"
bottom: "label"
top: "mbox_loss"
include {
phase: TRAIN
}
propagate_down: true
propagate_down: true
propagate_down: false
propagate_down: false
loss_param {
normalization: VALID
}
multibox_loss_param {
loc_loss_type: SMOOTH_L1
conf_loss_type: SOFTMAX
loc_weight: 1.0
//Weight for localization loss.default = 1.0
num_classes: 21
//Number of classes to be predicted. Required!
share_location: true
//If true, bounding box are shared among different classes.
match_type: PER_PREDICTION
//matching method during training.BIPARTITE和PER_PREDICTION
overlap_threshold: 0.5
//If match_type is PER_PREDICTION, use overlap_threshold to determine the extra matching bboxes.
use_prior_for_matching: true
// Use prior for matching.
background_label_id: 0
//Background label id.
use_difficult_gt: false
//If true, also consider difficult ground truth.
neg_pos_ratio: 3.0
//The negative/positive ratio.
neg_overlap: 0.5
//The negative overlap upperbound for the unmatched predictions.
code_type: CENTER_SIZE
ignore_cross_boundary_bbox: false
//If true, ignore cross boundary bbox during matching. Cross boundary bbox is a bbox who is outside of the image region.
mining_type: MAX_NEGATIVE
//Mining type during training.
//NONE : use all negatives.
//MAX_NEGATIVE : select negatives based on the score.
//HARD_EXAMPLE : select hard examples based on "Training Region-based Object Detectors with Online Hard Example Mining", Shrivastava et al.
}
}
结语
如果您有修改意见或问题,欢迎留言或者通过邮箱和我联系。
手打很辛苦,如果我的文章对您有帮助,转载请注明出处。