faster RCNN(keras版本)代码讲解(5)-RPN层详情

faster RCNN(keras版本)代码讲解博客索引:
1.faster RCNN(keras版本)代码讲解(1)-概述
 2.faster RCNN(keras版本)代码讲解(2)-数据准备
 3.faster RCNN(keras版本)代码讲解(3)-训练流程详情
 4.faster RCNN(keras版本)代码讲解(4)-共享卷积层详情
 5.faster RCNN(keras版本)代码讲解(5)-RPN层详情
6.faster RCNN(keras版本)代码讲解(6)-ROI Pooling层详情

一.RPN层网络
其实就是ResNet的卷积层+一层Conv,然后进行分类(该框是否是物体),回归(定位一个框的四个坐标值)

def rpn(base_layers,num_anchors):

    x = Convolution2D(512, (3, 3), padding='same', activation='relu', kernel_initializer='normal', name='rpn_conv1')(base_layers)
    #对于rpn的2分类使用的是sigmoid激活函数，所以这里的filter数量为num_anchors，而论文中使用的softmax，所以论文中filter数量为num_anchors*2，使用1*1卷积代替全连接
    x_class = Convolution2D(num_anchors, (1, 1), activation='sigmoid', kernel_initializer='uniform', name='rpn_out_class')(x)
    x_regr = Convolution2D(num_anchors * 4, (1, 1), activation='linear', kernel_initializer='zero', name='rpn_out_regress')(x)

    return [x_class, x_regr, base_layers]

二.RPN网络层的标签
1.根据特征图的大小，对每个特征图上的点，反推回原始图像，然后在原始图像上找到框
2.如果该框和实际的框gt IOU>=0.7,则认为该框为正例，如果IOU<=0.3,则认为为背景，如果0.3大于IOU小于0.7则将该框丢弃

# RPN，生成region proposal
def calc_rpn(C, img_data, width, height, resized_width, resized_height, img_length_calc_function):

    #downscale = 16，因为共享卷积层会将resize后的图片压缩16倍
    downscale = float(C.rpn_stride)
    anchor_sizes = C.anchor_box_scales #[128, 256, 512]
    anchor_ratios = C.anchor_box_ratios
    num_anchors = len(anchor_sizes) * len(anchor_ratios)    

    # calculate the output map size based on the network architecture
    # 计算图片经过共享卷积以后输出的特征图的大小
    (output_width, output_height) = img_length_calc_function(resized_width, resized_height)
    print("经过共享卷积层后特征图大小after share conv",output_height,output_width)

    n_anchratios = len(anchor_ratios)

    # initialise empty output objectives
    #保存和gt 是否重叠(是正例还是反例)，根据IOU值，output_height, output_width中的内容对应在其中一种模板框下框的坐标，总体表示为在这个坐标下，是否和gt重叠，重叠为1正例，不重叠为0反例，num_anchors表示厚度，表示9重不同规模的框
    y_rpn_overlap = np.zeros((output_height, output_width, num_anchors))
    #保存 框是否启用
    y_is_box_valid = np.zeros((output_height, output_width, num_anchors))
    #保存 框的4个坐标值
    y_rpn_regr = np.zeros((output_height, output_width, num_anchors * 4))

    num_bboxes = len(img_data['bboxes'])
    print("图片中物体个数",num_bboxes)
    num_anchors_for_bbox = np.zeros(num_bboxes).astype(int)
    best_anchor_for_bbox = -1*np.ones((num_bboxes, 4)).astype(int)
    best_iou_for_bbox = np.zeros(num_bboxes).astype(np.float32)
    best_x_for_bbox = np.zeros((num_bboxes, 4)).astype(int)
    best_dx_for_bbox = np.zeros((num_bboxes, 4)).astype(np.float32)

    # get the GT box coordinates, and resize to account for image resizing
    gta = np.zeros((num_bboxes, 4))
    for bbox_num, bbox in enumerate(img_data['bboxes']):
        # get the GT box coordinates, and resize to account for image resizing
        #重新获得resize过后物体的位置
        gta[bbox_num, 0] = bbox['x1'] * (resized_width / float(width))
        gta[bbox_num, 1] = bbox['x2'] * (resized_width / float(width))
        gta[bbox_num, 2] = bbox['y1'] * (resized_height / float(height))
        gta[bbox_num, 3] = bbox['y2'] * (resized_height / float(height))

    # rpn ground truth
    # 对9种框进行遍历，里面还有个对图片中所有物体的遍历，所有for循环次数  len(anchor_sizes)
    # 遍历顺序，先遍历框，然后拿特征图和原始图框坐标，然后再和原始图中物体进行IOU计算，拿IOU最高的框
    for anchor_size_idx in range(len(anchor_sizes)):
        for anchor_ratio_idx in range(n_anchratios):
            anchor_x = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][0]
            anchor_y = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][1]   

            # 根据特征图的大小，对每个特征图上的点，反推回原始图像，然后在原始图像上找到框
            for ix in range(output_width):                  
                # x-coordinates of the current anchor box   
                x1_anc = downscale * (ix + 0.5) - anchor_x / 2
                x2_anc = downscale * (ix + 0.5) + anchor_x / 2  

                # ignore boxes that go across image boundaries
                # 超过边界的框就不要了                    
                if x1_anc < 0 or x2_anc > resized_width:
                    continue

                for jy in range(output_height):

                    # y-coordinates of the current anchor box
                    y1_anc = downscale * (jy + 0.5) - anchor_y / 2
                    y2_anc = downscale * (jy + 0.5) + anchor_y / 2

                    # ignore boxes that go across image boundaries
                    if y1_anc < 0 or y2_anc > resized_height:
                        continue

                    # bbox_type indicates whether an anchor should be a target 
                    bbox_type = 'neg' #negative

                    # this is the best IOU for the (x,y) coord and the current anchor
                    # note that this is different from the best IOU for a GT bbox
                    best_iou_for_loc = 0.0

                    for bbox_num in range(num_bboxes):

                        # get IOU of the current GT box and the current anchor box
                        # 对框进行一个IOU计算，IOU越高，越说明候选框anchor和gt重叠比列越高，越接近gt
                        curr_iou = iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1_anc, y1_anc, x2_anc, y2_anc])
                        # calculate the regression targets if they will be needed
                        # 这里没看懂，求gt和框的中心点重叠程度？
                        if curr_iou > best_iou_for_bbox[bbox_num] or curr_iou > C.rpn_max_overlap:
                            cx = (gta[bbox_num, 0] + gta[bbox_num, 1]) / 2.0
                            cy = (gta[bbox_num, 2] + gta[bbox_num, 3]) / 2.0
                            cxa = (x1_anc + x2_anc)/2.0
                            cya = (y1_anc + y2_anc)/2.0

                            tx = (cx - cxa) / (x2_anc - x1_anc)
                            ty = (cy - cya) / (y2_anc - y1_anc)
                            tw = np.log((gta[bbox_num, 1] - gta[bbox_num, 0]) / (x2_anc - x1_anc))
                            th = np.log((gta[bbox_num, 3] - gta[bbox_num, 2]) / (y2_anc - y1_anc))

                        if img_data['bboxes'][bbox_num]['class'] != 'bg':

                            # all GT boxes should be mapped to an anchor box, so we keep track of which anchor box was best
                            if curr_iou > best_iou_for_bbox[bbox_num]:
                                #保存当前最好的特征图上的anchor坐标，注意是当前
                                best_anchor_for_bbox[bbox_num] = [jy, ix, anchor_ratio_idx, anchor_size_idx]
                                #保存gt对应当前最好anchor的IOU值
                                best_iou_for_bbox[bbox_num] = curr_iou
                                #保存当前最好anchor对应在原图上的坐标
                                best_x_for_bbox[bbox_num,:] = [x1_anc, x2_anc, y1_anc, y2_anc]
                                #这个应该是留着反向传播用吧
                                best_dx_for_bbox[bbox_num,:] = [tx, ty, tw, th]

                            # we set the anchor to positive if the IOU is >0.7 (it does not matter if there was another better box, it just indicates overlap)
                            # 如果IOU>0.7,我们就设置为这个anchor为正例
                            if curr_iou > C.rpn_max_overlap:
                                bbox_type = 'pos' #positive
                                #记录每个物体框有几个anchor有较高重叠率IOU>0.7的
                                num_anchors_for_bbox[bbox_num] += 1
                                # we update the regression layer target if this IOU is the best for the current (x,y) and anchor position
                                if curr_iou > best_iou_for_loc:
                                    best_iou_for_loc = curr_iou
                                    best_regr = (tx, ty, tw, th)

                            # if the IOU is >0.3 and <0.7, it is ambiguous and no included in the objective
                            #if IOU is >0.3 and <0.7，就丢弃这些anchor
                            if C.rpn_min_overlap < curr_iou < C.rpn_max_overlap:
                                # gray zone between neg and pos
                                if bbox_type != 'pos':
                                    bbox_type = 'neutral'

                    # turn on or off outputs depending on IOUs
                    if bbox_type == 'neg':
                        y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                    elif bbox_type == 'neutral':
                        y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                        y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
                    elif bbox_type == 'pos':
                        y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
                        start = 4 * (anchor_ratio_idx + n_anchratios * anchor_size_idx)
                        y_rpn_regr[jy, ix, start:start+4] = best_regr

    # we ensure that every bbox has at least one positive RPN region
    # 确保每一个物体框都有一个正例
    for idx in range(num_anchors_for_bbox.shape[0]):
        if num_anchors_for_bbox[idx] == 0:
            # no box with an IOU greater than zero ...
            if best_anchor_for_bbox[idx, 0] == -1:
                continue
            y_is_box_valid[
                best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios *
                best_anchor_for_bbox[idx,3]] = 1
            y_rpn_overlap[
                best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios *
                best_anchor_for_bbox[idx,3]] = 1
            start = 4 * (best_anchor_for_bbox[idx,2] + n_anchratios * best_anchor_for_bbox[idx,3])
            y_rpn_regr[
                best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], start:start+4] = best_dx_for_bbox[idx, :]

    y_rpn_overlap = np.transpose(y_rpn_overlap, (2, 0, 1))
    y_rpn_overlap = np.expand_dims(y_rpn_overlap, axis=0)

    y_is_box_valid = np.transpose(y_is_box_valid, (2, 0, 1))
    y_is_box_valid = np.expand_dims(y_is_box_valid, axis=0)

    y_rpn_regr = np.transpose(y_rpn_regr, (2, 0, 1))
    y_rpn_regr = np.expand_dims(y_rpn_regr, axis=0)

    pos_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 1, y_is_box_valid[0, :, :, :] == 1))
    neg_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 0, y_is_box_valid[0, :, :, :] == 1))

    num_pos = len(pos_locs[0])

    # one issue is that the RPN has many more negative than positive regions, so we turn off some of the negative
    # regions. We also limit it to 256 regions.
    #因为RPN中negative要比positive regions多，所以要剪掉一些negative，然后现在regions的总量256，正例不能超过128
    num_regions = 256

    if len(pos_locs[0]) > num_regions/2:
        val_locs = random.sample(range(len(pos_locs[0])), len(pos_locs[0]) - num_regions/2)
        y_is_box_valid[0, pos_locs[0][val_locs], pos_locs[1][val_locs], pos_locs[2][val_locs]] = 0
        num_pos = num_regions/2

    if len(neg_locs[0]) + num_pos > num_regions:
        val_locs = random.sample(range(len(neg_locs[0])), len(neg_locs[0]) - num_pos)
        y_is_box_valid[0, neg_locs[0][val_locs], neg_locs[1][val_locs], neg_locs[2][val_locs]] = 0

    y_rpn_cls = np.concatenate([y_is_box_valid, y_rpn_overlap], axis=1)
    y_rpn_regr = np.concatenate([np.repeat(y_rpn_overlap, 4, axis=1), y_rpn_regr], axis=1)
    print("特征图上每点(其实是点映射的框)对应的类别y_rpn_cls",y_rpn_cls.shape)
    print("特征图上每点(其实是点映射的框)对应的4个坐标y_rpn_regr",y_rpn_regr.shape)

    return np.copy(y_rpn_cls), np.copy(y_rpn_regr)

三.RPN层损失函数的计算

#RPN分类损失，先传递框的数量
def rpn_loss_cls(num_anchors):
    def rpn_loss_cls_fixed_num(y_true, y_pred):
        if K.image_dim_ordering() == 'tf':
            #K.sum(y_true[:, :, :, :num_anchors]为框是否启用，binary_crossentropy采用交叉熵，/K.sum(epsilon + y_true[:, :, :, :num_anchors])，再求个平均
            return lambda_rpn_class * K.sum(y_true[:, :, :, :num_anchors] * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, :, :, num_anchors:])) / K.sum(epsilon + y_true[:, :, :, :num_anchors])
        else:
            return lambda_rpn_class * K.sum(y_true[:, :num_anchors, :, :] * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, num_anchors:, :, :])) / K.sum(epsilon + y_true[:, :num_anchors, :, :])

    return rpn_loss_cls_fixed_num

faster RCNN(keras版本)代码讲解(5)-RPN层详情

猜你喜欢