This post walks through the TensorFlow implementation of SSD, source: GitHub: balancap/SSD-Tensorflow.
Overview
Run the model to obtain localization and classification outputs, and generate the default anchor boxes, giving the predictions.
Preprocess the ground truth so that it corresponds one-to-one with the predictions.
Compute the loss from the predictions and the ground truth, and train with an optimizer (a simplified sketch of how these steps are wired together follows).
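These three steps correspond to a handful of calls in train_ssd_network.py, each of which is examined below. As a rough orientation, here is a heavily simplified sketch of how they fit together; data loading, batching/queues and the optimizer are omitted, and the variable names (ssd_net, glabels, gbboxes, b_image, b_gclasses, ...) are abbreviated from the real script, following the snippets quoted later in this post.
# Heavily simplified wiring of the training script (illustrative only).
ssd_net = ssd_vgg_300.SSDNet()                      # SSD-300 network object, default params
ssd_shape = ssd_net.params.img_shape                # (300, 300)

# 1. Default anchor boxes for all 6 feature layers.
ssd_anchors = ssd_net.anchors(ssd_shape)

# 2. Encode the ground truth against the anchors (per image).
gclasses, glocalisations, gscores = \
    ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)

# 3. Forward pass and loss (b_* are the batched versions of the tensors above).
with slim.arg_scope(ssd_net.arg_scope(weight_decay=0.0005)):
    predictions, localisations, logits, end_points = \
        ssd_net.net(b_image, is_training=True)
ssd_net.losses(logits, localisations,
               b_gclasses, b_glocalisations, b_gscores,
               match_threshold=0.5, negative_ratio=3., alpha=1.)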
1. Anchor box generation
#####SSD-Tensorflow-master\train_ssd_network.py#####
# ssd_anchors: a list of 6 (y, x, h, w) tuples, one per feature layer,
# i.e. the center coordinates and heights/widths of all anchor boxes on each of the 6 feature maps
ssd_anchors = ssd_net.anchors(ssd_shape)
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def anchors(self, img_shape, dtype=np.float32):
"""Compute the default anchor boxes, given an image shape.
"""
# returns a list of 6 (y, x, h, w) tuples, one per feature layer
return ssd_anchors_all_layers(img_shape,
self.params.feat_shapes,
self.params.anchor_sizes,
self.params.anchor_ratios,
self.params.anchor_steps,
self.params.anchor_offset,
dtype)
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_anchors_all_layers(img_shape,
layers_shape,
anchor_sizes,
anchor_ratios,
anchor_steps,
offset=0.5,
dtype=np.float32):
"""Compute anchor boxes for all feature layers.
img_shape = (300,300)
layers_shape = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
anchor_sizes = [(21., 45.),
(45., 99.),
(99., 153.),
(153., 207.),
(207., 261.),
(261., 315.)]
anchor_ratios = [[2, .5],
[2, .5, 3, 1./3],
[2, .5, 3, 1./3],
[2, .5, 3, 1./3],
[2, .5],
[2, .5]]
anchor_steps = [8, 16, 32, 64, 100, 300]
"""
layers_anchors = []
for i, s in enumerate(layers_shape):
anchor_bboxes = ssd_anchor_one_layer(img_shape, s,
anchor_sizes[i],
anchor_ratios[i],
anchor_steps[i],
offset=offset, dtype=dtype)
layers_anchors.append(anchor_bboxes)
# layers_anchors: a list of 6 (y, x, h, w) tuples, one per feature layer
return layers_anchors
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_anchor_one_layer(img_shape,
feat_shape,
sizes,
ratios,
step,
offset=0.5,
dtype=np.float32):
"""Computer SSD default anchor boxes for one feature layer.
Determine the relative position grid of the centers, and the relative
width and height.
Arguments:
feat_shape: Feature shape, used for computing relative position grids;
size: Absolute reference sizes;
ratios: Ratios to use on these features;
img_shape: Image shape, used for computing height, width relatively to the
former;
offset: Grid offset.
Return:
y, x, h, w: Relative x and y grids, and height and width.
"""
'''
img_shape = (300,300)
feat_shape = (38,38)
sizes = (21., 45.)
ratios = [2, .5]
step = 8
'''
# Compute the position grid: simple way.
# y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
# y = (y.astype(dtype) + offset) / feat_shape[0]
# x = (x.astype(dtype) + offset) / feat_shape[1]
# Weird SSD-Caffe computation using steps values...
'''
y.shape,x.shape = (38,38)
y=array([[0,0,...,0],
[1,1,...,1],
...
[36,36,...,36],
[37,37,...,37]])
x=array([[0,1,...,36,37],
[0,1,...,36,37],
...
[0,1,...,36,37]])
'''
y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
'''
y = (y + 0.5) * 8 / 300
x = (x + 0.5) * 8 / 300
Normalized to [0, 1] relative coordinates so they can be compared with the GT boxes.
'''
y = (y.astype(dtype) + offset) * step / img_shape[0]
x = (x.astype(dtype) + offset) * step / img_shape[1]
# Expand dims to support easy broadcasting.
'''
Expand dims to shape (38, 38, 1).
In np.expand_dims, axis counts 0, 1, 2 from the left and -1, -2, -3 from the right.
y = array([[[0.01],[0.01],...,[0.01]],
[[0.04],[0.04],...,[0.04]],
...
[[0.97],[0.97],...,[0.97]],
[[1],[1],...,[1]]])
x = array([[[0.01],[0.04],...,[1]],
[[0.01],[0.04],...,[1]],
...
[[0.01],[0.04],...,[1]],
[[0.01],[0.04],...,[1]]])
'''
y = np.expand_dims(y, axis=-1)
x = np.expand_dims(x, axis=-1)
# Compute relative height and width.
# Tries to follow the original implementation of SSD for the order.
# num_anchors = 2+2
num_anchors = len(sizes) + len(ratios)
# h,w = array([0.,0.,0.,0.])
h = np.zeros((num_anchors, ), dtype=dtype)
w = np.zeros((num_anchors, ), dtype=dtype)
# Add first anchor boxes with ratio=1.
# h[0],w[0] = 21/300
h[0] = sizes[0] / img_shape[0]
w[0] = sizes[0] / img_shape[1]
di = 1
if len(sizes) > 1:
h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
di += 1
for i, r in enumerate(ratios):
h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
# y, x have shape (38, 38, 1); h, w have shape (4,)
return y, x, h, w
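As a quick sanity check, the same arithmetic can be reproduced in a few lines of standalone NumPy (this is not repository code, just a verification of the shapes and values quoted in the comments above):
import numpy as np

# First layer: feat_shape=(38, 38), sizes=(21., 45.), ratios=[2, .5], step=8, offset=0.5.
y, x = np.mgrid[0:38, 0:38]
y = (y + 0.5) * 8 / 300.0
x = (x + 0.5) * 8 / 300.0
print(y[0, 0], y[37, 0])              # ~0.0133 and 1.0: normalized cell centers
y = np.expand_dims(y, -1)             # (38, 38, 1), broadcasts against the 4 h/w values
x = np.expand_dims(x, -1)

sizes, ratios = (21., 45.), [2, .5]
h = [sizes[0] / 300., np.sqrt(sizes[0] * sizes[1]) / 300.] + \
    [sizes[0] / 300. / np.sqrt(r) for r in ratios]
w = [sizes[0] / 300., np.sqrt(sizes[0] * sizes[1]) / 300.] + \
    [sizes[0] / 300. * np.sqrt(r) for r in ratios]
print(len(h))                         # 4 anchor shapes per cell on this layer

# Anchors per cell are 4, 6, 6, 6, 4, 4 on the six layers, so the total number of
# default boxes per image is:
print(38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4)   # 8732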
2. Ground truth preprocessing
#####SSD-Tensorflow-master\train_ssd_network.py#####
# Encode groundtruth labels and bboxes.
'''
Matching is based on IoU (Jaccard overlap):
gclasses        class label assigned to every anchor box on each feature layer
glocalisations  encoded box offsets (the transform fed into the localization loss)
gscores         Jaccard overlap between each anchor box and its matched GT box
'''
gclasses, glocalisations, gscores = \
ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def bboxes_encode(self, labels, bboxes, anchors,
scope=None):
"""Encode labels and bounding boxes.
"""
return ssd_common.tf_ssd_bboxes_encode(
labels, bboxes, anchors,
self.params.num_classes,
self.params.no_annotation_label,
ignore_threshold=0.5,
prior_scaling=self.params.prior_scaling,
scope=scope)
#####SSD-Tensorflow-master\nets\ssd_common.py#####
def tf_ssd_bboxes_encode(labels,
bboxes,
anchors,
num_classes,
no_annotation_label,
ignore_threshold=0.5,
prior_scaling=[0.1, 0.1, 0.2, 0.2],
dtype=tf.float32,
scope='ssd_bboxes_encode'):
"""Encode groundtruth labels and bounding boxes using SSD net anchors.
Encoding boxes for all feature layers.
Arguments:
labels: 1D Tensor(int64) containing groundtruth labels;
bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
anchors: List of Numpy array with layer anchors;
matching_threshold: Threshold for positive match with groundtruth bboxes;
prior_scaling: Scaling of encoded coordinates.
Return:
(target_labels, target_localizations, target_scores):
Each element is a list of target Tensors.
"""
# anchors: a list of 6 (y, x, h, w) tuples, one per feature layer
with tf.name_scope(scope):
target_labels = []
target_localizations = []
target_scores = []
for i, anchors_layer in enumerate(anchors):
with tf.name_scope('bboxes_encode_block_%i' % i):
t_labels, t_loc, t_scores = \
tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
num_classes, no_annotation_label,
ignore_threshold,
prior_scaling, dtype)
target_labels.append(t_labels)
target_localizations.append(t_loc)
target_scores.append(t_scores)
return target_labels, target_localizations, target_scores
#####SSD-Tensorflow-master\nets\ssd_common.py#####
def tf_ssd_bboxes_encode_layer(labels,
bboxes,
anchors_layer,
num_classes,
no_annotation_label,
ignore_threshold=0.5,
prior_scaling=[0.1, 0.1, 0.2, 0.2],
dtype=tf.float32):
"""Encode groundtruth labels and bounding boxes using SSD anchors from
one layer.
Arguments:
labels: 1D Tensor(int64) containing groundtruth labels;
bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
anchors_layer: Numpy array with layer anchors;
matching_threshold: Threshold for positive match with groundtruth bboxes;
prior_scaling: Scaling of encoded coordinates.
Return:
(target_labels, target_localizations, target_scores): Target Tensors.
"""
# Anchors coordinates and volume.
'''
Center positions and heights/widths of the anchor boxes: anchors_layer = (y, x, h, w).
yref, xref have shape (38, 38, 1); href, wref have shape (4,).
'''
yref, xref, href, wref = anchors_layer
'''
ymin, xmin, ymax, xmax are the top-left and bottom-right coordinates of the 4 default boxes
in every feature map cell; each has shape (38, 38, 4).
There are 38*38 cells, each with 4 boxes of different sizes, so each cell has 4 different
ymin values (and likewise 4 different xmin, ymax, xmax). In other words, every one of the
38*38 cells carries [ymin of box 1, ymin of box 2, ymin of box 3, ymin of box 4], and so on.
'''
ymin = yref - href / 2.
xmin = xref - wref / 2.
ymax = yref + href / 2.
xmax = xref + wref / 2.
# Area of each of the 4 default boxes in every feature map cell; shape (38, 38, 4)
vol_anchors = (xmax - xmin) * (ymax - ymin)
# Initialize tensors...
# shape = (38,38,4)
shape = (yref.shape[0], yref.shape[1], href.size)
feat_labels = tf.zeros(shape, dtype=tf.int64)
feat_scores = tf.zeros(shape, dtype=dtype)
feat_ymin = tf.zeros(shape, dtype=dtype)
feat_xmin = tf.zeros(shape, dtype=dtype)
feat_ymax = tf.ones(shape, dtype=dtype)
feat_xmax = tf.ones(shape, dtype=dtype)
def jaccard_with_anchors(bbox):
"""Compute jaccard score between a box and the anchors.
"""
# Compare the ground-truth box (bbox) against every anchor (default) box:
# the 4 different ymin values in each cell are each compared with bbox[0], giving 4 int_ymin.
# bbox[0..3] are scalars; broadcasting does the rest.
# int_ymin, int_xmin, int_ymax, int_xmax all have shape (38, 38, 4).
int_ymin = tf.maximum(ymin, bbox[0])
int_xmin = tf.maximum(xmin, bbox[1])
int_ymax = tf.minimum(ymax, bbox[2])
int_xmax = tf.minimum(xmax, bbox[3])
# h, w have shape (38, 38, 4)
h = tf.maximum(int_ymax - int_ymin, 0.)
w = tf.maximum(int_xmax - int_xmin, 0.)
# Volumes.
inter_vol = h * w
union_vol = vol_anchors - inter_vol \
+ (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
jaccard = tf.div(inter_vol, union_vol)
# jaccard has shape (38, 38, 4):
# the Jaccard overlap of each of the 38*38*4 anchor boxes with the i-th GT box (bbox)
return jaccard
def intersection_with_anchors(bbox):
"""Compute intersection between score a box and the anchors.
"""
int_ymin = tf.maximum(ymin, bbox[0])
int_xmin = tf.maximum(xmin, bbox[1])
int_ymax = tf.minimum(ymax, bbox[2])
int_xmax = tf.minimum(xmax, bbox[3])
h = tf.maximum(int_ymax - int_ymin, 0.)
w = tf.maximum(int_xmax - int_xmin, 0.)
inter_vol = h * w
scores = tf.div(inter_vol, vol_anchors)
return scores
def condition(i, feat_labels, feat_scores,
feat_ymin, feat_xmin, feat_ymax, feat_xmax):
"""Condition: check label index.
"""
'''
tf.less(x, y) takes two tensors and returns a bool tensor, element-wise x < y.
This simply iterates over the labels: body increments i on every iteration,
and the loop stops once all GT boxes have been visited.
tf.shape(labels) is the number of labels as a tensor; r stays true until then.
'''
r = tf.less(i, tf.shape(labels))
return r[0]
def body(i, feat_labels, feat_scores,
feat_ymin, feat_xmin, feat_ymax, feat_xmax):
"""Body: update feature labels, scores and bboxes.
Follow the original SSD paper for that purpose:
- assign values when jaccard > 0.5;
- only update if beat the score of other bboxes.
"""
# Jaccard score.
# class of the i-th GT box; label is a scalar
label = labels[i]
# coordinates of the i-th GT box; bbox has shape (4,)
bbox = bboxes[i]
# overlap of every anchor box on this layer (38*38*4 of them) with the i-th GT box; shape (38, 38, 4)
jaccard = jaccard_with_anchors(bbox)
# Mask: check threshold + scores + no annotations + num_classes.
# tf.greater(x, y) takes two tensors and returns a bool tensor, element-wise x > y.
# After comparing with the first GT box, positions whose jaccard value is 0 are False;
# mask is a bool tensor of shape (38, 38, 4).
mask = tf.greater(jaccard, feat_scores)
# mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
# logical AND: True only where both operands are True
mask = tf.logical_and(mask, feat_scores > -0.5)
mask = tf.logical_and(mask, label < num_classes)
# cast the bool mask to numeric types for the masked updates below
imask = tf.cast(mask, tf.int64)
# dtype = tf.float32
fmask = tf.cast(mask, dtype)
# Update values using mask.
# mask bool, imask int64, fmask float32, all of shape (38, 38, 4).
# Where mask is True, imask is 1 and fmask is 1.0.
# After comparing with the first GT box, mask is True where jaccard > feat_scores (i.e. > 0),
# feat_scores > -0.5 and label < num_classes; it is False everywhere else.
# Positions where imask is 1 are assigned label; where imask is 0, feat_labels keeps its old value.
feat_labels = imask * label + (1 - imask) * feat_labels
# Where mask is True take jaccard, otherwise keep feat_scores:
# entries at False positions are unchanged, the rest are replaced by the jaccard values.
feat_scores = tf.where(mask, jaccard, feat_scores)
# Where fmask is 1.0, feat_ymin is set to bbox[0]; where fmask is 0.0, it keeps its old value.
feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax
# Check no annotation label: ignore these anchors...
# interscts = intersection_with_anchors(bbox)
# mask = tf.logical_and(interscts > ignore_threshold,
# label == no_annotation_label)
# # Replace scores by -1.
# feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)
'''
Every anchor box on this layer (38*38*4 of them) is compared against each GT box in turn.
If a later GT box has a higher jaccard value with a given anchor than an earlier one,
it overwrites that anchor's entries, so in the end only the best-matching GT box is kept.
After the loop:
  feat_labels holds, for each anchor box, the class of the GT box it overlaps most;
  feat_scores holds the corresponding maximum jaccard value;
  feat_ymin/xmin/ymax/xmax hold the coordinates of that best-matching GT box.
'''
return [i+1, feat_labels, feat_scores,
feat_ymin, feat_xmin, feat_ymax, feat_xmax]
# Main loop definition.
'''
tf.while_loop(cond, body, loop_vars):
Repeat body while the condition cond is true.
loop_vars is a (possibly nested) tuple, namedtuple or list of tensors that is passed to both cond and body.
cond and body both take as many arguments as there are loop_vars.
Roughly equivalent to:
    loop_vars = [...]
    while cond(*loop_vars):
        loop_vars = body(*loop_vars)
Here it compares every anchor box on this layer (38*38*4 of them) against all GT boxes.
'''
i = 0
[i, feat_labels, feat_scores,
feat_ymin, feat_xmin,
feat_ymax, feat_xmax] = tf.while_loop(condition, body,
[i, feat_labels, feat_scores,
feat_ymin, feat_xmin,
feat_ymax, feat_xmax])
# Transform to center / size.
# feat_cy, feat_cx, feat_h, feat_w have shape (38, 38, 4):
# center position and height/width of the GT box matched to each anchor box
feat_cy = (feat_ymax + feat_ymin) / 2.
feat_cx = (feat_xmax + feat_xmin) / 2.
feat_h = feat_ymax - feat_ymin
feat_w = feat_xmax - feat_xmin
# Encode features.
# yref, xref have shape (38, 38, 1); href, wref have shape (4,).
# Encode the matched GT box relative to the anchor box; this is the target of the localization loss.
feat_cy = (feat_cy - yref) / href / prior_scaling[0]
feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
feat_h = tf.log(feat_h / href) / prior_scaling[2]
feat_w = tf.log(feat_w / wref) / prior_scaling[3]
# Use SSD ordering: x / y / w / h instead of ours.
# tf.stack: axis specifies along which dimension the tensors are stacked.
# feat_localizations has shape (38, 38, 4, 4).
feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
# Returns, for each anchor box: its target class, the encoded offsets, and its overlap with the matched GT box.
# feat_labels and feat_scores both have shape [feature_map_height, feature_map_width, num_anchors].
return feat_labels, feat_localizations, feat_scores
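Before moving on to the network itself, a small self-contained NumPy sketch (illustration only, not repository code) may help make the Jaccard matching and the encode/decode transform above concrete:
import numpy as np

prior_scaling = [0.1, 0.1, 0.2, 0.2]

# Two anchors given as (cy, cx, h, w) in normalized coordinates.
anchors = np.array([[0.50, 0.50, 0.30, 0.30],
                    [0.20, 0.20, 0.10, 0.20]])
# One GT box given as (ymin, xmin, ymax, xmax).
gt = np.array([0.40, 0.40, 0.70, 0.65])

cy, cx, h, w = anchors.T
ymin, xmin, ymax, xmax = cy - h/2, cx - w/2, cy + h/2, cx + w/2

# Jaccard overlap of the GT box with each anchor (same logic as jaccard_with_anchors).
int_h = np.maximum(np.minimum(ymax, gt[2]) - np.maximum(ymin, gt[0]), 0.)
int_w = np.maximum(np.minimum(xmax, gt[3]) - np.maximum(xmin, gt[1]), 0.)
inter = int_h * int_w
union = (ymax - ymin) * (xmax - xmin) + (gt[2] - gt[0]) * (gt[3] - gt[1]) - inter
print(inter / union)                  # anchor 0 overlaps well (~0.61), anchor 1 not at all

# Encode the GT box relative to each anchor (same transform as feat_cy/feat_cx/feat_h/feat_w).
g_cy, g_cx = (gt[0] + gt[2]) / 2, (gt[1] + gt[3]) / 2
g_h, g_w = gt[2] - gt[0], gt[3] - gt[1]
t_cy = (g_cy - cy) / h / prior_scaling[0]
t_cx = (g_cx - cx) / w / prior_scaling[1]
t_h = np.log(g_h / h) / prior_scaling[2]
t_w = np.log(g_w / w) / prior_scaling[3]

# Decoding inverts the transform and recovers the GT center/size exactly.
print(cy + t_cy * prior_scaling[0] * h)     # -> g_cy for both anchors
print(h * np.exp(t_h * prior_scaling[2]))   # -> g_h  for both anchors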
3. Network architecture
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
# SSD class definition.
class SSDNet(object):
"""Implementation of the SSD VGG-based 300 network.
The default features layers with 300x300 image input are:
conv4 ==> 38 x 38
conv7 ==> 19 x 19
conv8 ==> 10 x 10
conv9 ==> 5 x 5
conv10 ==> 3 x 3
conv11 ==> 1 x 1
The default image size used to train this network is 300x300.
"""
default_params = SSDParams(
img_shape=(300, 300), # input image size
num_classes=21, # number of object classes + background
no_annotation_label=21,
feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
anchor_size_bounds=[0.15, 0.90],
# anchor_size_bounds=[0.20, 0.90],
anchor_sizes=[(21., 45.),
(45., 99.),
(99., 153.),
(153., 207.),
(207., 261.),
(261., 315.)],
# anchor_sizes=[(30., 60.),
# (60., 111.),
# (111., 162.),
# (162., 213.),
# (213., 264.),
# (264., 315.)],
anchor_ratios=[[2, .5],
[2, .5, 3, 1./3],
[2, .5, 3, 1./3],
[2, .5, 3, 1./3],
[2, .5],
[2, .5]],
anchor_steps=[8, 16, 32, 64, 100, 300], # stride of each feature map in input-image pixels
anchor_offset=0.5,
normalizations=[20, -1, -1, -1, -1, -1],
prior_scaling=[0.1, 0.1, 0.2, 0.2]
)
def __init__(self, params=None):
"""Init the SSD net with some parameters. Use the default ones
if none provided.
"""
if isinstance(params, SSDParams):
self.params = params
else:
self.params = SSDNet.default_params
# ======================================================================= #
def net(self, inputs,
is_training=True,
update_feat_shapes=True,
dropout_keep_prob=0.5,
prediction_fn=slim.softmax,
reuse=None,
scope='ssd_300_vgg'):
"""SSD network definition.
"""
# inputs: the training images
r = ssd_net(inputs,
num_classes=self.params.num_classes,
feat_layers=self.params.feat_layers,
anchor_sizes=self.params.anchor_sizes,
anchor_ratios=self.params.anchor_ratios,
normalizations=self.params.normalizations,
is_training=is_training,
dropout_keep_prob=dropout_keep_prob,
prediction_fn=prediction_fn,
reuse=reuse,
scope=scope)
# Update feature shapes (try at least!)
if update_feat_shapes:
shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
self.params = self.params._replace(feat_shapes=shapes)
return r
def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
"""Network arg_scope.
"""
return ssd_arg_scope(weight_decay, data_format=data_format)
def arg_scope_caffe(self, caffe_scope):
"""Caffe arg_scope used for weights importing.
"""
return ssd_arg_scope_caffe(caffe_scope)
# ======================================================================= #
def update_feature_shapes(self, predictions):
"""Update feature shapes from predictions collection (Tensor or Numpy
array).
"""
shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes)
self.params = self.params._replace(feat_shapes=shapes)
def anchors(self, img_shape, dtype=np.float32):
"""Compute the default anchor boxes, given an image shape.
"""
# returns a list of 6 (y, x, h, w) tuples, one per feature layer
return ssd_anchors_all_layers(img_shape,
self.params.feat_shapes,
self.params.anchor_sizes,
self.params.anchor_ratios,
self.params.anchor_steps,
self.params.anchor_offset,
dtype)
def bboxes_encode(self, labels, bboxes, anchors,
scope=None):
"""Encode labels and bounding boxes.
"""
return ssd_common.tf_ssd_bboxes_encode(
labels, bboxes, anchors,
self.params.num_classes,
self.params.no_annotation_label,
ignore_threshold=0.5,
prior_scaling=self.params.prior_scaling,
scope=scope)
def bboxes_decode(self, feat_localizations, anchors,
scope='ssd_bboxes_decode'):
"""Encode labels and bounding boxes.
"""
return ssd_common.tf_ssd_bboxes_decode(
feat_localizations, anchors,
prior_scaling=self.params.prior_scaling,
scope=scope)
def detected_bboxes(self, predictions, localisations,
select_threshold=None, nms_threshold=0.5,
clipping_bbox=None, top_k=400, keep_top_k=200):
"""Get the detected bounding boxes from the SSD network output.
"""
# Select top_k bboxes from predictions, and clip
rscores, rbboxes = \
ssd_common.tf_ssd_bboxes_select(predictions, localisations,
select_threshold=select_threshold,
num_classes=self.params.num_classes)
rscores, rbboxes = \
tfe.bboxes_sort(rscores, rbboxes, top_k=top_k)
# Apply NMS algorithm.
rscores, rbboxes = \
tfe.bboxes_nms_batch(rscores, rbboxes,
nms_threshold=nms_threshold,
keep_top_k=keep_top_k)
if clipping_bbox is not None:
rbboxes = tfe.bboxes_clip(clipping_bbox, rbboxes)
return rscores, rbboxes
def losses(self, logits, localisations,
gclasses, glocalisations, gscores,
match_threshold=0.5,
negative_ratio=3.,
alpha=1.,
label_smoothing=0.,
scope='ssd_losses'):
"""Define the SSD network losses.
"""
return ssd_losses(logits, localisations,
gclasses, glocalisations, gscores,
match_threshold=match_threshold,
negative_ratio=negative_ratio,
alpha=alpha,
label_smoothing=label_smoothing,
scope=scope)
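The detected_bboxes pipeline above (select by score, sort, apply NMS, clip) relies on helpers from the tf_extended package that are not reproduced in this post. As an illustration of the NMS step only, a minimal single-class NumPy version might look like this (this is not tfe.bboxes_nms_batch, just the idea behind it):
import numpy as np

def simple_nms(scores, boxes, nms_threshold=0.5, keep_top_k=200):
    """Minimal single-class NMS. boxes are (ymin, xmin, ymax, xmax)."""
    order = np.argsort(-scores)                 # highest score first
    keep = []
    while order.size > 0 and len(keep) < keep_top_k:
        i = order[0]
        keep.append(i)
        rest = order[1:]
        # IoU of the kept box with all remaining candidates.
        ymin = np.maximum(boxes[i, 0], boxes[rest, 0])
        xmin = np.maximum(boxes[i, 1], boxes[rest, 1])
        ymax = np.minimum(boxes[i, 2], boxes[rest, 2])
        xmax = np.minimum(boxes[i, 3], boxes[rest, 3])
        inter = np.maximum(ymax - ymin, 0.) * np.maximum(xmax - xmin, 0.)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        area_r = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
        iou = inter / (area_i + area_r - inter)
        order = rest[iou <= nms_threshold]      # drop candidates overlapping the kept box
    return keep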
#####SSD-Tensorflow-master\train_ssd_network.py#####
# Construct SSD network.
arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
data_format=DATA_FORMAT)
# Print arg_scope to inspect how the nested defaults are applied
with slim.arg_scope(arg_scope):
'''
b_image: batch of training images, fed through the convolutional network.
predictions: class predictions after the softmax activation
localisations: predicted box offsets
logits: class predictions before the activation
end_points: the output of every layer
'''
predictions, localisations, logits, end_points = \
ssd_net.net(b_image, is_training=True)
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
"""Network arg_scope.
"""
return ssd_arg_scope(weight_decay, data_format=data_format)
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
"""Defines the VGG arg scope.
Args:
weight_decay: The l2 regularization coefficient.
Returns:
An arg_scope.
"""
'''
arg_scope supplies default arguments for a set of ops; scopes can be nested and reused.
Functions called inside the scope need not repeat those arguments, but may still override them.
'''
with slim.arg_scope([slim.conv2d, slim.fully_connected],
activation_fn=tf.nn.relu,
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=tf.contrib.layers.xavier_initializer(),
biases_initializer=tf.zeros_initializer()):
with slim.arg_scope([slim.conv2d, slim.max_pool2d],
padding='SAME',
data_format=data_format):
with slim.arg_scope([custom_layers.pad2d,
custom_layers.l2_normalization,
custom_layers.channel_to_last],
data_format=data_format) as sc:
return sc
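A tiny illustration of this nesting/override behaviour (assuming TF 1.x, where slim is tf.contrib.slim; not part of the repository):
import tensorflow as tf
slim = tf.contrib.slim

inputs = tf.placeholder(tf.float32, [None, 300, 300, 3])

# Every conv2d inside the scope gets ReLU and L2 regularization by default,
# yet an individual call can still override any of these arguments.
with slim.arg_scope([slim.conv2d],
                    activation_fn=tf.nn.relu,
                    weights_regularizer=slim.l2_regularizer(0.0005)):
    net = slim.conv2d(inputs, 64, [3, 3], scope='conv_a')        # uses the defaults
    net = slim.conv2d(net, 64, [3, 3],
                      activation_fn=None, scope='conv_b')        # overrides activation_fn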
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
def ssd_net(inputs,
num_classes=SSDNet.default_params.num_classes,
feat_layers=SSDNet.default_params.feat_layers,
anchor_sizes=SSDNet.default_params.anchor_sizes,
anchor_ratios=SSDNet.default_params.anchor_ratios,
normalizations=SSDNet.default_params.normalizations,
is_training=True,
dropout_keep_prob=0.5,
prediction_fn=slim.softmax,
reuse=None,
scope='ssd_300_vgg'):
"""SSD net definition.
"""
# if data_format == 'NCHW':
# inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))
# End_points collect relevant activations for external use.
# inputs: the training images
# end_points: a dict holding the outputs of the different feature maps; SSD predicts from
# several feature layers rather than a single one, so their outputs are all kept here.
end_points = {}
with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
# Original VGG-16 blocks.
net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
end_points['block1'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool1')
# Block 2.
net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
end_points['block2'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool2')
# Block 3.
net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
end_points['block3'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool3')
# Block 4.
net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
end_points['block4'] = net
net = slim.max_pool2d(net, [2, 2], scope='pool4')
# Block 5.
net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
end_points['block5'] = net
net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')
# Additional SSD blocks.
# Block 6: let's dilate the hell out of it!
net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
end_points['block6'] = net
net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training)
# Block 7: 1x1 conv. Because the fuck.
net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
end_points['block7'] = net
net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training)
# Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
end_point = 'block8'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
net = custom_layers.pad2d(net, pad=(1, 1))
net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
end_points[end_point] = net
end_point = 'block9'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
net = custom_layers.pad2d(net, pad=(1, 1))
net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
end_points[end_point] = net
end_point = 'block10'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
end_points[end_point] = net
end_point = 'block11'
with tf.variable_scope(end_point):
net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
end_points[end_point] = net
# Prediction and localisations layers.
predictions = []
logits = []
localisations = []
for i, layer in enumerate(feat_layers):
with tf.variable_scope(layer + '_box'):
# p = cls_pred, l = loc_pred: the class and box predictions of this layer
p, l = ssd_multibox_layer(end_points[layer],
num_classes,
anchor_sizes[i],
anchor_ratios[i],
normalizations[i])
# apply softmax to the class predictions
predictions.append(prediction_fn(p))
logits.append(p)
localisations.append(l)
# predictions holds the class scores after softmax, logits the raw scores before it;
# localisations holds the predicted boxes, end_points the output of every layer.
return predictions, localisations, logits, end_points
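ssd_multibox_layer itself is not reproduced in this post. Conceptually, each selected feature layer gets a 3x3 convolution with num_anchors*num_classes output channels for classification and another with num_anchors*4 channels for box regression, reshaped so that every anchor has its own scores and offsets. A simplified sketch of that idea (not the repository's exact function, which additionally applies the optional L2 normalization controlled by the normalizations parameter; tf and slim are assumed to be imported as above):
# Simplified per-layer prediction head (illustrative only).
def multibox_head(net, num_classes, num_anchors):
    # Static spatial size of this feature map (e.g. 38x38 for block4 with 300x300 input).
    fh, fw = net.get_shape().as_list()[1:3]
    # Classification: num_anchors * num_classes channels at every position.
    cls_pred = slim.conv2d(net, num_anchors * num_classes, [3, 3],
                           activation_fn=None, scope='conv_cls')
    cls_pred = tf.reshape(cls_pred, [-1, fh, fw, num_anchors, num_classes])
    # Localization: 4 box offsets per anchor at every position.
    loc_pred = slim.conv2d(net, num_anchors * 4, [3, 3],
                           activation_fn=None, scope='conv_loc')
    loc_pred = tf.reshape(loc_pred, [-1, fh, fw, num_anchors, 4])
    return cls_pred, loc_pred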
4. Loss function
#####SSD-Tensorflow-master\train_ssd_network.py#####
# Add loss function.
ssd_net.losses(logits, localisations,
b_gclasses, b_glocalisations, b_gscores,
match_threshold=FLAGS.match_threshold,
negative_ratio=FLAGS.negative_ratio,
alpha=FLAGS.loss_alpha,
label_smoothing=FLAGS.label_smoothing)
#####SSD-Tensorflow-master\nets\ssd_vgg_300.py#####
# SSD loss function.
def ssd_losses(logits, localisations,
gclasses, glocalisations, gscores,
match_threshold=0.5,
negative_ratio=3.,
alpha=1.,
label_smoothing=0.,
device='/cpu:0',
scope=None):
'''
logits          raw class predictions from the conv layers (no activation)
localisations   predicted box offsets from the conv layers
gclasses        IoU-based target class of every anchor box on each feature layer
glocalisations  IoU-based encoded target offsets used by the localization loss
gscores         IoU-based overlap between each anchor box and its matched GT box
'''
with tf.name_scope(scope, 'ssd_losses'):
# extract the number of classes and the batch size
lshape = tfe.get_shape(logits[0], 5)
num_classes = lshape[-1]
batch_size = lshape[0]
# Flatten out all vectors!
flogits = []
fgclasses = []
fgscores = []
flocalisations = []
fglocalisations = []
# loop over the SSD feature layers
for i in range(len(logits)):
flogits.append(tf.reshape(logits[i], [-1, num_classes]))
fgclasses.append(tf.reshape(gclasses[i], [-1]))
fgscores.append(tf.reshape(gscores[i], [-1]))
flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
# And concat the crap!
'''
[<tf.Tensor 'ssd_losses/concat:0' shape=(279424, 21) dtype=float32>,
<tf.Tensor 'ssd_losses/concat_1:0' shape=(279424,) dtype=int64>,
<tf.Tensor 'ssd_losses/concat_2:0' shape=(279424,) dtype=float32>,
<tf.Tensor 'ssd_losses/concat_3:0' shape=(279424, 4) dtype=float32>,
<tf.Tensor 'ssd_losses/concat_4:0' shape=(279424, 4) dtype=float32>]
279424 = 32 (batch size) * 8732 anchors per image,
where 8732 = 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4.
'''
logits = tf.concat(flogits, axis=0)
gclasses = tf.concat(fgclasses, axis=0)
gscores = tf.concat(fgscores, axis=0)
localisations = tf.concat(flocalisations, axis=0)
glocalisations = tf.concat(fglocalisations, axis=0)
dtype = logits.dtype
# Compute positive matching mask...
pmask = gscores > match_threshold
# cast the bool mask to float
fpmask = tf.cast(pmask, dtype)
# number of positives: reduce-sum of the float mask
n_positives = tf.reduce_sum(fpmask)
# Hard negative mining...
# {0, 1}: 1 for foreground (matched) anchors, 0 for background
no_classes = tf.cast(pmask, tf.int32)
# softmax turns each row of 21 logits into probabilities
predictions = slim.softmax(logits)
# logical AND: mark as candidate negatives the anchors that are not positives
# (IoU below the threshold) but are still valid (gscores > -0.5)
nmask = tf.logical_and(tf.logical_not(pmask),
gscores > -0.5)
fnmask = tf.cast(nmask, dtype)
# Where nmask is True take predictions[:, 0], otherwise 1. - fnmask (= 1.0):
# candidate negatives get their predicted background probability, everything else gets 1,
# so non-negatives can never be selected as "hard" below.
nvalues = tf.where(nmask,
predictions[:, 0],
1. - fnmask)
# pass '[-1]' to flatten 't'
# reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
nvalues_flat = tf.reshape(nvalues, [-1])
# Number of negative entries to select.
# target number of negatives: negative_ratio times the number of positives (plus batch_size),
# capped below by the number of available negatives
max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
n_neg = tf.minimum(n_neg, max_neg_entries)
'''
The number of negatives is kept at most 3x the number of positives so training can converge.
Since these negatives are known to be background (insufficient IoU with any GT box), their
class 0 prediction should ideally be large. We therefore select the negatives whose class 0
prediction is smallest (the hardest negatives), up to 3x the number of positives, and train
to raise their class 0 prediction, which minimizes the loss.
After this filtering, fnmask marks exactly those negatives with the smallest class 0 predictions
(a small NumPy illustration of this selection follows the end of this function).
'''
# the n_neg anchors least likely to be background (hardest negatives)
val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
max_hard_pred = -val[-1]
# Final negative mask.
nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
fnmask = tf.cast(nmask, dtype)
# Add cross-entropy loss.
# classification loss for the positives
# gclasses takes values 0-20
with tf.name_scope('cross_entropy_pos'):
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
labels=gclasses)
# fpmask zeroes out the negatives: it is 0 for negative anchors and 1 for positives
loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
tf.losses.add_loss(loss)
# classification loss for the selected negatives
# no_classes takes values 0 or 1
with tf.name_scope('cross_entropy_neg'):
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
labels=no_classes)
# fnmask likewise zeroes out everything except the selected hard negatives
loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
tf.losses.add_loss(loss)
# Add localization loss: smooth L1, L2, ...
with tf.name_scope('localization'):
# Weights Tensor: positive mask + random negative.
# fpmask keeps only anchors that matched an object
weights = tf.expand_dims(alpha * fpmask, axis=-1)
loss = custom_layers.abs_smooth(localisations - glocalisations)
loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
tf.losses.add_loss(loss)
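To see the hard-negative selection in isolation, here is a small NumPy sketch (illustration only, single image, made-up numbers) of the same idea: among anchors not matched to any object, keep only the n_neg whose predicted background probability is lowest, i.e. the negatives the classifier is currently most wrong about:
import numpy as np

np.random.seed(0)
negative_ratio = 3.
gscores = np.random.rand(12)               # IoU of each anchor with its best-matching GT box
p_background = np.random.rand(12)          # predicted probability of class 0 (background)

pmask = gscores > 0.5                      # positives: anchors matched above the threshold
nmask = ~pmask                             # candidate negatives
n_pos = int(pmask.sum())
# "+ 1" plays the role of "+ batch_size" in the TF code for this single-image example.
n_neg = min(int(negative_ratio * n_pos) + 1, int(nmask.sum()))

# Candidate negatives keep their background probability; everything else is set to 1.0
# so it can never be picked. The n_neg smallest values are the hardest negatives.
nvalues = np.where(nmask, p_background, 1.0)
hard_idx = np.argsort(nvalues)[:n_neg]
print(n_pos, n_neg, sorted(hard_idx.tolist()))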