训练、测试都是调用了网络。那么网络是如何展开、恢复、调用的,数据又是如何在网络里面流动的呢?
network类基本包含了整个网络的架构设计,直接给出代码解析:
# --------------------------------------------------------
# Tensorflow Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Xinlei Chen
# 南石北岸生2019.4.7
# https://mp.csdn.net/postedit
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim import losses
from tensorflow.contrib.slim import arg_scope
import numpy as np
from layer_utils.snippets import generate_anchors_pre, generate_anchors_pre_tf
from layer_utils.proposal_layer import proposal_layer, proposal_layer_tf
from layer_utils.proposal_top_layer import proposal_top_layer, proposal_top_layer_tf
from layer_utils.anchor_target_layer import anchor_target_layer
from layer_utils.proposal_target_layer import proposal_target_layer
from utils.visualization import draw_bounding_boxes
from model.config import cfg
class Network(object):
def __init__(self):#构造方法,自动生成下列变量,用了self,可以不传入值得生成变量。
self._predictions = {}#
self._losses = {}
self._anchor_targets = {}
self._proposal_targets = {}
self._layers = {}
self._gt_image = None
self._act_summaries = []
self._score_summaries = {}
self._train_summaries = []
self._event_summaries = {}
self._variables_to_fix = {}
#最后还原图像的时候加上均值并且进行通道变换
def _add_gt_image(self):
    """Rebuild a displayable image from the network input and cache it in ``self._gt_image``.

    ``cfg.PIXEL_MEANS`` is added back onto ``self._image``, the result is
    resized to ``self._im_info[:2] / self._im_info[2]`` and the last
    (channel) axis is reversed.
    """
    # Add back the mean.
    restored = self._image + cfg.PIXEL_MEANS
    # Target size: the first two im_info entries divided by the third one.
    target_size = tf.to_int32(self._im_info[:2] / self._im_info[2])
    rescaled = tf.image.resize_bilinear(restored, target_size)
    # BGR to RGB (opencv uses BGR): flip the channel axis.
    self._gt_image = tf.reverse(rescaled, axis=[-1])
#以下都是tensorboard用到的summary
def _add_gt_image_summary(self):
    """Return an image summary named 'GROUND_TRUTH' with the ground-truth boxes drawn in.

    The drawing itself is delegated to ``draw_bounding_boxes`` (a customized
    visualization function) through ``tf.py_func``.
    """
    # Build the de-normalised image on first use only.
    if self._gt_image is None:
        self._add_gt_image()
    drawing_inputs = [self._gt_image, self._gt_boxes, self._im_info]
    annotated = tf.py_func(draw_bounding_boxes, drawing_inputs,
                           tf.float32, name="gt_boxes")
    return tf.summary.image('GROUND_TRUTH', annotated)
def _add_act_summary(self, tensor):
    """Record a histogram and the zero fraction of an activation tensor."""
    tag_root = 'ACT/' + tensor.op.name
    tf.summary.histogram(tag_root + '/activations', tensor)
    sparsity = tf.nn.zero_fraction(tensor)
    tf.summary.scalar(tag_root + '/zero_fraction', sparsity)
def _add_score_summary(self, key, tensor):
    """Record a histogram of ``tensor`` under a 'SCORE/<op name>/<key>/scores' tag."""
    tag_root = 'SCORE/' + tensor.op.name
    tf.summary.histogram(tag_root + '/' + key + '/scores', tensor)
def _add_train_summary(self, var):
    """Record a histogram of ``var`` under a 'TRAIN/<op name>' tag."""
    tag = 'TRAIN/' + var.op.name
    tf.summary.histogram(tag, var)
#rpn对block4的feature maps 利用18个1x1的卷积得到一个18通道的特征图,对于特征图上每一个点,都对应一个长度为18的向量
#这个向量对应这9个尺寸的anchor,每个anchor对应了前景和背景两类。
def _reshape_layer(self, bottom, num_dim, name):
    """Regroup the channel axis of ``bottom`` so that it has ``num_dim`` channels.

    The tensor is transposed with permutation [0, 3, 1, 2], reshaped to
    ``[1, num_dim, -1, tf.shape(bottom)[2]]`` and transposed back with
    permutation [0, 2, 3, 1].

    Args:
        bottom: 4-D input tensor (the transposes require rank 4).
        num_dim: channel count to force during the reshape.
        name: variable scope under which the ops are created.

    Returns:
        The regrouped tensor, with ``num_dim`` as its last dimension.
    """
    dyn_shape = tf.shape(bottom)
    with tf.variable_scope(name):
        # Change the channel to the caffe format (channels first).
        channels_first = tf.transpose(bottom, [0, 3, 1, 2])
        # Force the channel axis to num_dim; the -1 entry absorbs the rest.
        target_shape = tf.concat(axis=0, values=[[1, num_dim, -1], [dyn_shape[2]]])
        regrouped = tf.reshape(channels_first, target_shape)
        # Swap the channel axis back to the last position.
        channels_last = tf.transpose(regrouped, [0, 2, 3, 1])
    return channels_last
#针对以上,给出1组示例,帮助理解。
#feature map 大小为38x38x24,24是因为我设置的anchor参数为[4,8,16,32]和[0.5,1,2]。每个anchor centre有3x4=12个anchor框,每个框有前景和背景2类得分值。所以是24.
#softmax用于将得分转换为概率
#key:tf.nn.softmax()
def _softmax_layer(self, bottom, name):
    """Turn scores into probabilities with ``tf.nn.softmax``.

    When ``name`` starts with 'rpn_cls_prob_reshape' the input is flattened
    to 2-D (keeping the last axis) before the softmax and reshaped back to
    its original shape afterwards. Any other input is handed to
    ``tf.nn.softmax`` unchanged.
    """
    if not name.startswith('rpn_cls_prob_reshape'):
        return tf.nn.softmax(bottom, name=name)

    original_shape = tf.shape(bottom)
    flat_scores = tf.reshape(bottom, [-1, original_shape[-1]])
    flat_probs = tf.nn.softmax(flat_scores, name=name)
    return tf.reshape(flat_probs, original_shape)
#测试的时候有两种模式,一种是top,一种是nms,都是对众多框的选测方法,nms快,top慢,但是文档解释说top更好,默认是nms。
def _proposal_top_layer(self, rpn_cls_prob, rpn_bbox_pred, name):
    """Build the 'top' proposal selection and return ``(rois, rpn_scores)``.

    With ``cfg.USE_E2E_TF`` set, the TensorFlow implementation
    ``proposal_top_layer_tf`` is called directly. Otherwise the NumPy
    implementation ``proposal_top_layer`` is wrapped in ``tf.py_func``.
    Both outputs get a static first dimension of ``cfg.TEST.RPN_TOP_N``.

    NOTE(review): the original author remarks that USE_E2E_TF defaults to
    True and was only exercised on the forward/test path — confirm against
    model.config before changing it.
    """
    with tf.variable_scope(name):
        layer_args = [rpn_cls_prob, rpn_bbox_pred, self._im_info,
                      self._feat_stride, self._anchors, self._num_anchors]
        if cfg.USE_E2E_TF:
            proposals, scores = proposal_top_layer_tf(*layer_args)
        else:
            # py_func hands the tensors to the NumPy code and converts the
            # returned arrays back into tensors.
            proposals, scores = tf.py_func(proposal_top_layer, layer_args,
                                           [tf.float32, tf.float32],
                                           name="proposal_top")
        top_n = cfg.TEST.RPN_TOP_N
        proposals.set_shape([top_n, 5])
        scores.set_shape([top_n, 1])
    return proposals, scores
#和上面对应,下面的是nms模式。函数名字的区别就是有无top。
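#_proposal_layer selects, from all anchor boxes, the rois that are handed to the Fast R-CNN part (the author cites C.TRAIN.RPN_POST_NMS_TOP_N = 2000; that value is defined outside this file), i.e. rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")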
def _proposal_layer(self, rpn_cls_prob, rpn_bbox_pred, name):
    """Build the proposal selection and return ``(rois, rpn_scores)``.

    Same two execution paths as ``_proposal_top_layer``
    (``proposal_layer_tf`` directly, or ``proposal_layer`` through
    ``tf.py_func``), but ``self._mode`` is passed along as well. The
    outputs get static shapes ``[None, 5]`` and ``[None, 1]``.
    """
    with tf.variable_scope(name):
        layer_args = [rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode,
                      self._feat_stride, self._anchors, self._num_anchors]
        if cfg.USE_E2E_TF:
            proposals, scores = proposal_layer_tf(*layer_args)
        else:
            proposals, scores = tf.py_func(proposal_layer, layer_args,
                                           [tf.float32, tf.float32],
                                           name="proposal")
        # The number of proposals is only known at run time.
        proposals.set_shape([None, 5])
        scores.set_shape([None, 1])
    return proposals, scores
# Only use it if you have roi_pooling op written in tf.image
#实际上并没有调用到,roi-pooling在resnet里面定义为crop_pool_layer,在network类里面也有,但是在resnet子类里进行了方法重定义,覆盖了network的_crop_pool_layer
#这个roi-pooling调用的是tensorflow本身的roi-pooling,但tf.image本身没有哦。所以要不自己写一个添加进去,不过这样比较麻烦,所以直接在faster项目代码里面写到_crop_pool_layer()里。
def _roi_pool_layer(self, bootom, rois, name):
    """Pool rois through ``tf.image.roi_pooling`` and return element 0 of its result.

    Only use it if you have a roi_pooling op written in tf.image. Nothing
    in this file calls this method; ``_build_network`` uses
    ``_crop_pool_layer`` instead.

    NOTE(review): the ``spatial_scale`` of 1/16 is hard-coded here rather
    than derived from ``self._feat_stride`` — confirm before reuse.
    """
    pool_size = cfg.POOLING_SIZE
    with tf.variable_scope(name):
        pooled = tf.image.roi_pooling(bootom, rois,
                                      pooled_height=pool_size,
                                      pooled_width=pool_size,
                                      spatial_scale=1. / 16.)
        return pooled[0]
#roi-pooling
def _crop_pool_layer(self, bottom, rois, name):
    """RoI pooling built from ``tf.image.crop_and_resize`` plus a 2x2 max pooling.

    Args:
        bottom: feature map to crop from.
        rois: 2-D tensor; column 0 is used as the batch index and
            columns 1-4 as x1, y1, x2, y2.
        name: variable scope for the cropping ops.

    Returns:
        The max-pooled crops. Crops are taken at ``2 * cfg.POOLING_SIZE``
        per side before the 2x2 pooling.
    """
    stride = np.float32(self._feat_stride[0])
    with tf.variable_scope(name):
        # Column 0 of rois, squeezed to a vector of batch indices.
        batch_index = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1])

        # Normalisation constants: (feature map size - 1) * stride.
        feat_shape = tf.shape(bottom)
        norm_h = (tf.to_float(feat_shape[1]) - 1.) * stride
        norm_w = (tf.to_float(feat_shape[2]) - 1.) * stride

        # Get the normalized coordinates of the bounding boxes.
        layout = [("x1", norm_w), ("y1", norm_h), ("x2", norm_w), ("y2", norm_h)]
        coords = {}
        for column, (label, extent) in enumerate(layout, start=1):
            coords[label] = tf.slice(rois, [0, column], [-1, 1], name=label) / extent

        # Won't be back-propagated to rois anyway, but to save time.
        # crop_and_resize receives the boxes in (y1, x1, y2, x2) order.
        boxes = tf.stop_gradient(
            tf.concat([coords["y1"], coords["x1"], coords["y2"], coords["x2"]], axis=1))

        # Crop at twice the pooling size so the 2x2 max pool below brings
        # the result back down to cfg.POOLING_SIZE.
        pre_pool_size = cfg.POOLING_SIZE * 2
        crops = tf.image.crop_and_resize(bottom, boxes, tf.to_int32(batch_index),
                                         [pre_pool_size, pre_pool_size], name="crops")
    return slim.max_pool2d(crops, [2, 2], padding='SAME')
#利用tf.nn.dropoutDropout就是在不同的训练过程中随机扔掉一部分神经元。对每个神经元,以一定的概率ratio,让其停止工作,这次训练过程中不更新权值,也不参加神经网络的计算。
def _dropout_layer(self, bottom, name, ratio=0.5):
    """Apply ``tf.nn.dropout`` to ``bottom``.

    ``ratio`` is forwarded as the second positional argument of
    ``tf.nn.dropout``.

    NOTE(review): in the TF 1.x API that argument is ``keep_prob`` (the
    probability of keeping a unit), not the drop rate. The two only
    coincide for the default of 0.5 — confirm which meaning callers intend.
    """
    dropped = tf.nn.dropout(bottom, ratio, name=name)
    return dropped
#_anchor_target_layer从所有anchor框中选择batch个框训练RPN网络,即rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
def _anchor_target_layer(self, rpn_cls_score, name):
    """Compute the RPN training targets and return the anchor labels.

    The NumPy function ``anchor_target_layer`` is run through
    ``tf.py_func``. Its four outputs (labels, box targets, inside and
    outside weights) are stored in ``self._anchor_targets`` and registered
    for score summaries. Labels are cast to int32; entries equal to -1 are
    filtered out later in ``_add_losses``.
    """
    with tf.variable_scope(name):
        py_inputs = [rpn_cls_score, self._gt_boxes, self._im_info,
                     self._feat_stride, self._anchors, self._num_anchors]
        labels, targets, inside_w, outside_w = tf.py_func(
            anchor_target_layer, py_inputs, [tf.float32] * 4,
            name="anchor_target")

        # py_func outputs have no static shape, so declare what is known.
        labels.set_shape([1, 1, None, None])
        box_shape = [1, None, None, self._num_anchors * 4]
        for box_tensor in (targets, inside_w, outside_w):
            box_tensor.set_shape(box_shape)

        # Labels are integral class ids.
        labels = tf.to_int32(labels, name="to_int32")
        self._anchor_targets['rpn_labels'] = labels
        self._anchor_targets['rpn_bbox_targets'] = targets
        self._anchor_targets['rpn_bbox_inside_weights'] = inside_w
        self._anchor_targets['rpn_bbox_outside_weights'] = outside_w

        self._score_summaries.update(self._anchor_targets)
    return labels
#_proposal_target_layer在完成_anchor_target_layer的基础上从5000个rois中选择出128个训练fast rcnn,即rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
def _proposal_target_layer(self, rois, roi_scores, name):
    """Sample the rois used to train the detection head; return ``(rois, roi_scores)``.

    ``proposal_target_layer`` (NumPy) is run through ``tf.py_func``. Every
    output gets a static first dimension of ``cfg.TRAIN.BATCH_SIZE``. The
    rois, the int32 labels, the box targets and both weight tensors are
    stored in ``self._proposal_targets`` and registered for score summaries.
    """
    with tf.variable_scope(name):
        outputs = tf.py_func(
            proposal_target_layer,
            [rois, roi_scores, self._gt_boxes, self._num_classes],
            [tf.float32] * 6,
            name="proposal_target")
        (rois, roi_scores, labels,
         bbox_targets, bbox_inside_weights, bbox_outside_weights) = outputs

        batch = cfg.TRAIN.BATCH_SIZE
        rois.set_shape([batch, 5])
        roi_scores.set_shape([batch])
        labels.set_shape([batch, 1])
        # Four box values per class.
        per_class_shape = [batch, self._num_classes * 4]
        for box_tensor in (bbox_targets, bbox_inside_weights, bbox_outside_weights):
            box_tensor.set_shape(per_class_shape)

        self._proposal_targets['rois'] = rois
        self._proposal_targets['labels'] = tf.to_int32(labels, name="to_int32")
        self._proposal_targets['bbox_targets'] = bbox_targets
        self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights
        self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights

        self._score_summaries.update(self._proposal_targets)
        return rois, roi_scores
#self.proposal_layer()、self._anchor_target_layer()以及self._proposal_target_layer()三个函数其实是一个完整的proposal结构
#同样两种模式,一种用tf函数计算,一种用numpy计算,前者是tf端到端的模型。
#generate_anchors_pre_tf或者generate_anchors_pre是重点,作用是生成anchors。
def _anchor_component(self):
    """Generate the anchors for the current image and store them on ``self``.

    The feature map height and width are taken as
    ``ceil(im_info[0] / stride)`` and ``ceil(im_info[1] / stride)``.
    Anchors come from ``generate_anchors_pre_tf`` when ``cfg.USE_E2E_TF``
    is set, otherwise from ``generate_anchors_pre`` via ``tf.py_func``.
    Results land in ``self._anchors`` and ``self._anchor_length``.
    """
    with tf.variable_scope('ANCHOR_' + self._tag):
        stride = np.float32(self._feat_stride[0])
        # Just to get the shape right.
        feat_h = tf.to_int32(tf.ceil(self._im_info[0] / stride))
        feat_w = tf.to_int32(tf.ceil(self._im_info[1] / stride))

        gen_args = [feat_h, feat_w,
                    self._feat_stride, self._anchor_scales, self._anchor_ratios]
        if cfg.USE_E2E_TF:
            anchors, anchor_length = generate_anchors_pre_tf(*gen_args)
        else:
            anchors, anchor_length = tf.py_func(generate_anchors_pre, gen_args,
                                                [tf.float32, tf.int32],
                                                name="generate_anchors")
        anchors.set_shape([None, 4])
        anchor_length.set_shape([])

        # Kept on the instance; the proposal and target layers read them.
        self._anchors = anchors
        self._anchor_length = anchor_length
#建立网络,默认可训练模式
def _build_network(self, is_training=True):
    """Assemble backbone, RPN, RoI pooling and the classification head.

    Args:
        is_training: forwarded to every sub-builder.

    Returns:
        ``(rois, cls_prob, bbox_pred)``.

    Raises:
        NotImplementedError: if ``cfg.POOLING_MODE`` is not 'crop'.
    """
    # Select initializers: truncated or plain normal, same mean/stddev.
    if cfg.TRAIN.TRUNCATED:
        make_initializer = tf.truncated_normal_initializer
    else:
        make_initializer = tf.random_normal_initializer
    initializer = make_initializer(mean=0.0, stddev=0.01)
    initializer_bbox = make_initializer(mean=0.0, stddev=0.001)

    # Backbone feature map (implemented by the subclass).
    net_conv = self._image_to_head(is_training)
    with tf.variable_scope(self._scope, self._scope):
        # Build the anchors for the image.
        self._anchor_component()
        # Region proposal network.
        rois = self._region_proposal(net_conv, is_training, initializer)
        # Region of interest pooling: each roi is cropped, then pooled.
        if cfg.POOLING_MODE != 'crop':
            raise NotImplementedError
        pool5 = self._crop_pool_layer(net_conv, rois, "pool5")

    # Head-to-tail layers (implemented by the subclass).
    fc7 = self._head_to_tail(pool5, is_training)
    with tf.variable_scope(self._scope, self._scope):
        # Region classification and box regression.
        cls_prob, bbox_pred = self._region_classification(
            fc7, is_training, initializer, initializer_bbox)

    self._score_summaries.update(self._predictions)
    return rois, cls_prob, bbox_pred
#对于回归的loss计算
def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=(1,)):
    """Smooth-L1 box regression loss.

    With ``x = bbox_inside_weights * (bbox_pred - bbox_targets)`` the
    per-element loss is ``0.5 * sigma**2 * x**2`` where ``|x| < 1 / sigma**2``
    and ``|x| - 0.5 / sigma**2`` elsewhere. It is multiplied by
    ``bbox_outside_weights``, summed over ``dim`` and averaged over the
    remaining axes.

    Args:
        bbox_pred: predicted box deltas.
        bbox_targets: regression targets, same shape as ``bbox_pred``.
        bbox_inside_weights: element-wise weights applied to the difference.
        bbox_outside_weights: element-wise weights applied to the loss.
        sigma: controls where the quadratic and linear parts meet.
        dim: axes to sum over. The default is an immutable tuple so that no
            mutable object is shared between calls; lists are still accepted.

    Returns:
        Scalar loss tensor.
    """
    sigma_2 = sigma ** 2
    box_diff = bbox_pred - bbox_targets
    in_box_diff = bbox_inside_weights * box_diff
    abs_in_box_diff = tf.abs(in_box_diff)
    # 1.0 where the quadratic branch applies, 0.0 where the linear one does.
    # The mask is a constant with respect to the gradient.
    smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2)))
    in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \
        + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign)
    out_loss_box = bbox_outside_weights * in_loss_box
    loss_box = tf.reduce_mean(tf.reduce_sum(
        out_loss_box,
        axis=dim
    ))
    return loss_box
def _add_losses(self, sigma_rpn=3.0):
    """Build the four loss terms plus the regularised total.

    The losses are stored in ``self._losses`` under 'cross_entropy',
    'loss_box', 'rpn_cross_entropy', 'rpn_loss_box' and 'total_loss', and
    registered as event summaries.

    Args:
        sigma_rpn: ``sigma`` passed to the smooth-L1 loss of the RPN boxes.

    Returns:
        The sum of the four loss terms WITHOUT the regularization loss.
        The regularised value is only available as
        ``self._losses['total_loss']``.
    """
    with tf.variable_scope('LOSS_' + self._tag):
        # RPN, class loss: cross entropy over anchors whose label is not -1.
        all_scores = tf.reshape(self._predictions['rpn_cls_score_reshape'], [-1, 2])
        all_labels = tf.reshape(self._anchor_targets['rpn_labels'], [-1])
        keep = tf.where(tf.not_equal(all_labels, -1))
        kept_scores = tf.reshape(tf.gather(all_scores, keep), [-1, 2])
        kept_labels = tf.reshape(tf.gather(all_labels, keep), [-1])
        rpn_cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=kept_scores, labels=kept_labels))

        # RPN, bbox loss: smooth L1 summed over axes 1, 2 and 3.
        anchor_targets = self._anchor_targets
        rpn_loss_box = self._smooth_l1_loss(
            self._predictions['rpn_bbox_pred'],
            anchor_targets['rpn_bbox_targets'],
            anchor_targets['rpn_bbox_inside_weights'],
            anchor_targets['rpn_bbox_outside_weights'],
            sigma=sigma_rpn, dim=[1, 2, 3])

        # RCNN, class loss (computed on the sampled proposals).
        cls_score = self._predictions["cls_score"]
        rcnn_labels = tf.reshape(self._proposal_targets["labels"], [-1])
        cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=cls_score, labels=rcnn_labels))

        # RCNN, bbox loss.
        proposal_targets = self._proposal_targets
        loss_box = self._smooth_l1_loss(
            self._predictions['bbox_pred'],
            proposal_targets['bbox_targets'],
            proposal_targets['bbox_inside_weights'],
            proposal_targets['bbox_outside_weights'])

        # Keep the individual terms for reporting.
        self._losses['cross_entropy'] = cross_entropy
        self._losses['loss_box'] = loss_box
        self._losses['rpn_cross_entropy'] = rpn_cross_entropy
        self._losses['rpn_loss_box'] = rpn_loss_box

        # Sum the terms, then add the regularization losses for the total.
        loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box
        regularization_loss = tf.add_n(tf.losses.get_regularization_losses(), 'regu')
        self._losses['total_loss'] = loss + regularization_loss

        self._event_summaries.update(self._losses)
    return loss
#搭建RPN网络,首先输入conv4计算得到的feature map,关于原图的变量已经在全局变量里面,
def _region_proposal(self, net_conv, is_training, initializer):
    """Build the region proposal network on top of ``net_conv`` and return the rois.

    Args:
        net_conv: backbone feature map.
        is_training: selects the training or the test branch below and is
            passed as ``trainable`` to the conv layers.
        initializer: weights initializer for the three conv layers.

    Returns:
        The rois tensor. In training mode these are the rois returned by
        ``_proposal_target_layer``; in test mode the output of the layer
        selected by ``cfg.TEST.MODE``.

    Raises:
        NotImplementedError: in test mode when ``cfg.TEST.MODE`` is neither
            'nms' nor 'top'.
    """
    # Shared 3x3 convolution with cfg.RPN_CHANNELS output channels.
    rpn = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training, weights_initializer=initializer,
                      scope="rpn_conv/3x3")
    self._act_summaries.append(rpn)
    # 1x1 convolution producing two scores per anchor (no activation).
    rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None, scope='rpn_cls_score')
    # change it so that the score has 2 as its channel size
    rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
    # Softmax over the two scores to obtain probabilities.
    rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
    # Index of the larger of the two scores for every anchor position.
    rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1, name="rpn_cls_pred")
    # Reshape the probabilities back to num_anchors * 2 channels.
    rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")
    # 1x1 convolution producing four box deltas per anchor (no activation).
    rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None, scope='rpn_bbox_pred')
    if is_training:
        # Training: generate proposals, compute the anchor targets for the
        # RPN loss, then sample the proposals used to train the head.
        rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
        # Try to have a deterministic order for the computing graph, for reproducibility
        with tf.control_dependencies([rpn_labels]):
            rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
    else:
        # Testing: no target sampling, only the proposal selection chosen
        # by cfg.TEST.MODE.
        if cfg.TEST.MODE == 'nms':
            rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        elif cfg.TEST.MODE == 'top':
            rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        else:
            raise NotImplementedError
    # Keep the intermediate results; they are read by the loss computation
    # and by the test-time fetches.
    self._predictions["rpn_cls_score"] = rpn_cls_score
    self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
    self._predictions["rpn_cls_prob"] = rpn_cls_prob
    self._predictions["rpn_cls_pred"] = rpn_cls_pred
    self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
    self._predictions["rois"] = rois
    return rois
#对rois的分类和回归都写在一起了,其实是classification and regression
#也就是RCNN对rois的分类
def _region_classification(self, fc7, is_training, initializer, initializer_bbox):
    """Classification and box regression heads on top of ``fc7``.

    Two parallel fully connected layers without activation are built: one
    with ``self._num_classes`` outputs (class scores) and one with
    ``self._num_classes * 4`` outputs (four box deltas per class). Scores,
    argmax predictions, softmax probabilities and box deltas are stored in
    ``self._predictions``.

    Returns:
        ``(cls_prob, bbox_pred)``.
    """
    shared_kwargs = dict(trainable=is_training, activation_fn=None)

    # Class scores, one output per class.
    class_logits = slim.fully_connected(fc7, self._num_classes,
                                        weights_initializer=initializer,
                                        scope='cls_score', **shared_kwargs)
    # Scores -> probabilities.
    class_probs = self._softmax_layer(class_logits, "cls_prob")
    # Index of the highest-scoring class for every roi.
    class_ids = tf.argmax(class_logits, axis=1, name="cls_pred")
    # Box deltas, four per class, in parallel with the classifier.
    box_deltas = slim.fully_connected(fc7, self._num_classes * 4,
                                      weights_initializer=initializer_bbox,
                                      scope='bbox_pred', **shared_kwargs)

    self._predictions["cls_score"] = class_logits
    self._predictions["cls_pred"] = class_ids
    self._predictions["cls_prob"] = class_probs
    self._predictions["bbox_pred"] = box_deltas
    return class_probs, box_deltas
#在网络子类里面实现
def _image_to_head(self, is_training, reuse=None):
raise NotImplementedError
def _head_to_tail(self, pool5, is_training, reuse=None):
raise NotImplementedError
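#create_architecture builds the whole network; according to the author it is called from demo.py as well as from the training and test scripts.
#It is the top-level wrapper of the network structure, so let us walk through it.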
def create_architecture(self, mode, num_classes, tag=None,
                        anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
    """Create the placeholders, build the network and wire up losses and summaries.

    Args:
        mode: 'TRAIN' or 'TEST'. Only 'TEST' takes the test branch; any
            other value builds the losses and summaries.
        num_classes: number of object classes.
        tag: suffix used in variable scope names (e.g. 'ANCHOR_<tag>').
            Must not be None.
        anchor_scales: anchor scales.
        anchor_ratios: anchor aspect ratios.

    Returns:
        Dict with 'rois', every entry of ``self._predictions`` and, outside
        test mode, every entry of ``self._losses``.

    Raises:
        AssertionError: if ``tag`` is None.
    """
    # Input image batch (exactly one image, 3 channels).
    self._image = tf.placeholder(tf.float32, shape=[1, None, None, 3])
    # Three image values: height, width and the divisor used by
    # _add_gt_image() when it resizes the image for display.
    self._im_info = tf.placeholder(tf.float32, shape=[3])
    # Ground-truth boxes, five values per box.
    self._gt_boxes = tf.placeholder(tf.float32, shape=[None, 5])
    self._tag = tag

    self._num_classes = num_classes
    # 'TRAIN' or 'TEST'; also forwarded to the proposal layer.
    self._mode = mode
    self._anchor_scales = anchor_scales
    self._num_scales = len(anchor_scales)

    self._anchor_ratios = anchor_ratios
    self._num_ratios = len(anchor_ratios)

    # One anchor per (scale, ratio) combination.
    self._num_anchors = self._num_scales * self._num_ratios

    training = mode == 'TRAIN'
    testing = mode == 'TEST'

    # A tag is required: the scope names below concatenate it.
    assert tag is not None

    # Handle most of the regularizers here.
    weights_regularizer = tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)
    if cfg.TRAIN.BIAS_DECAY:
        # Biases share the weight regularizer.
        biases_regularizer = weights_regularizer
    else:
        biases_regularizer = tf.no_regularizer

    # List as many types of layers as possible, even if they are not used
    # now, so that they all pick up the same regularizers and initializer.
    with arg_scope([slim.conv2d, slim.conv2d_in_plane,
                    slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected],
                   weights_regularizer=weights_regularizer,
                   biases_regularizer=biases_regularizer,
                   biases_initializer=tf.constant_initializer(0.0)):
        rois, cls_prob, bbox_pred = self._build_network(training)

    layers_to_output = {'rois': rois}

    # Queue every trainable variable for a histogram summary.
    for var in tf.trainable_variables():
        self._train_summaries.append(var)

    if testing:
        # Undo the target normalisation: tile the per-coordinate stds and
        # means once per class and apply them to the predicted deltas.
        stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (self._num_classes))
        means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (self._num_classes))
        self._predictions["bbox_pred"] *= stds
        self._predictions["bbox_pred"] += means
    else:
        # Build the RPN / RCNN losses first, then the summaries.
        self._add_losses()
        layers_to_output.update(self._losses)

        val_summaries = []
        with tf.device("/cpu:0"):
            val_summaries.append(self._add_gt_image_summary())
            for key, var in self._event_summaries.items():
                val_summaries.append(tf.summary.scalar(key, var))
            for key, var in self._score_summaries.items():
                self._add_score_summary(key, var)
            for var in self._act_summaries:
                self._add_act_summary(var)
            for var in self._train_summaries:
                self._add_train_summary(var)

        # _summary_op merges everything; _summary_op_val only the image
        # and the scalar event summaries.
        self._summary_op = tf.summary.merge_all()
        self._summary_op_val = tf.summary.merge(val_summaries)

    layers_to_output.update(self._predictions)

    return layers_to_output
def get_variables_to_restore(self, variables, var_keep_dic):
    """Checkpoint-restore hook: subclasses must override it.

    Raises:
        NotImplementedError: always, in this base class.
    """
    raise NotImplementedError()
def fix_variables(self, sess, pretrained_model):
    """Variable fix-up hook: concrete networks must override it.

    Raises:
        NotImplementedError: always, in this base class.
    """
    raise NotImplementedError()
# Extract the head feature maps, for example for vgg16 it is conv5_3
# only useful during testing mode
#没用到,对于self._layers["head"]这个op,指定数据输入feed_dict={self._image: image},得到feature map
#本来是想设计为backbone输出的fp,但是这个版本的faster没有定义这个op
def extract_head(self, sess, image):
    """Run ``self._layers["head"]`` for ``image`` and return the result.

    Only useful during testing mode. Nothing in this file stores a "head"
    entry in ``self._layers``, so subclasses are presumably expected to
    register it — TODO confirm.
    """
    inputs = {self._image: image}
    head_tensor = self._layers["head"]
    return sess.run(head_tensor, feed_dict=inputs)
# only useful during testing mode
#test.py里面调用到,主要是输入图像,并且run出计算结果。
def test_image(self, sess, image, im_info):
feed_dict = {self._image: image,
self._im_info: im_info}
#run的op是在test_net.py里面用下面语句生成的
#saver.restore(sess, args.model)
#sess.run(tf.global_variables_initializer())
else:
print(('Loading initial weights from {:s}').format(args.weight))
sess.run(tf.global_variables_initializer())#首先初始化,然后run出op得到结果
cls_score, cls_prob, bbox_pred, rois = sess.run([self._predictions["cls_score"],
self._predictions['cls_prob'],
self._predictions['bbox_pred'],
self._predictions['rois']],
feed_dict=feed_dict)
return cls_score, cls_prob, bbox_pred, rois
def get_summary(self, sess, blobs):
    """Evaluate ``self._summary_op_val`` for one input blob and return it.

    ``blobs`` must provide the keys 'data', 'im_info' and 'gt_boxes'.
    """
    inputs = {
        self._image: blobs['data'],
        self._im_info: blobs['im_info'],
        self._gt_boxes: blobs['gt_boxes'],
    }
    return sess.run(self._summary_op_val, feed_dict=inputs)
#训练train_val.py里面调用到,用于获取loss进行阶段性汇报
def train_step(self, sess, blobs, train_op):
    """Run one optimisation step and return the five loss values.

    Args:
        sess: session used to run the graph.
        blobs: mapping with the keys 'data', 'im_info' and 'gt_boxes'.
        train_op: the training op to execute alongside the loss fetches.

    Returns:
        ``(rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss)`` where
        ``loss`` is the 'total_loss' entry of ``self._losses``.
    """
    inputs = {
        self._image: blobs['data'],
        self._im_info: blobs['im_info'],
        self._gt_boxes: blobs['gt_boxes'],
    }
    fetches = [
        self._losses["rpn_cross_entropy"],
        self._losses['rpn_loss_box'],
        self._losses['cross_entropy'],
        self._losses['loss_box'],
        self._losses['total_loss'],
        train_op,
    ]
    results = sess.run(fetches, feed_dict=inputs)
    # The last entry is the (discarded) result of train_op.
    rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, _ = results
    return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss
#训练的时候,一组图片组成blobs进行输入
def train_step_with_summary(self, sess, blobs, train_op):
    """Like ``train_step`` but also evaluates ``self._summary_op``.

    Returns:
        ``(rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary)``.
    """
    inputs = {
        self._image: blobs['data'],
        self._im_info: blobs['im_info'],
        self._gt_boxes: blobs['gt_boxes'],
    }
    fetches = [
        self._losses["rpn_cross_entropy"],
        self._losses['rpn_loss_box'],
        self._losses['cross_entropy'],
        self._losses['loss_box'],
        self._losses['total_loss'],
        self._summary_op,
        train_op,
    ]
    results = sess.run(fetches, feed_dict=inputs)
    # The last entry is the (discarded) result of train_op.
    rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary, _ = results
    return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary
#跟上面的def一样,只是多了一个_summary_op
#下面的函数没有用到
def train_step_no_return(self, sess, blobs, train_op):
    """Run ``train_op`` once for the given blob without fetching any loss.

    ``blobs`` must provide the keys 'data', 'im_info' and 'gt_boxes'.
    Returns None.
    """
    inputs = {
        self._image: blobs['data'],
        self._im_info: blobs['im_info'],
        self._gt_boxes: blobs['gt_boxes'],
    }
    sess.run([train_op], feed_dict=inputs)