一、导包

首先，导入包。

import os
import sys
import itertools
import math
import logging
import json
import re
import random
from collections import OrderedDict
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.lines as lines
from matplotlib.patches import Polygon

# Root directory of the project, resolved two levels above the current working directory
ROOT_DIR = os.path.abspath("../../")

# Put the project root on sys.path so the Mask R-CNN package (mrcnn) can be imported
sys.path.append(ROOT_DIR)
from mrcnn import utils
from mrcnn import visualize
from mrcnn.visualize import display_images
import mrcnn.model as modellib
from mrcnn.model import log

%matplotlib inline

二、配置

配置信息。

#下面两个代码块选择运行其一即可

# Option 1: Shapes toy dataset (disabled; uncomment to use it instead of COCO)
# import shapes
# config = shapes.ShapesConfig()

# Option 2: MS COCO dataset (active)
import coco
config = coco.CocoConfig()
# Path to the COCO dataset; placeholder value, replace with a real location
COCO_DIR = "path/to/COCO"

三、数据集

# Load the dataset that matches the active configuration
if config.NAME == 'shapes':
    # Shapes toy dataset, sized from the configured image shape.
    # NOTE(review): relies on `shapes` having been imported in the
    # configuration cell (that import is commented out by default).
    dataset = shapes.ShapesDataset()
    dataset.load_shapes(500, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])
elif config.NAME == "coco":
    # COCO dataset, "train" subset, read from COCO_DIR
    dataset = coco.CocoDataset()
    dataset.load_coco(COCO_DIR, "train")
else:
    # Fail with a clear message instead of a NameError on `dataset` below
    raise ValueError("Unsupported config.NAME: {!r}".format(config.NAME))

# Must be called before the dataset is used
dataset.prepare()

# Print a summary of the dataset: image count, class count and class names
print("Image Count: {}".format(len(dataset.image_ids)))
print("Class Count: {}".format(dataset.num_classes))
for i, info in enumerate(dataset.class_info):
    print("{:3}. {:50}".format(i, info['name']))

四、可视化

可视化数据集信息和mask。

# Load and display four randomly drawn samples (np.random.choice samples with
# replacement by default, so an image may appear more than once)
image_ids = np.random.choice(dataset.image_ids, 4)
for image_id in image_ids:
    image = dataset.load_image(image_id)
    mask, class_ids = dataset.load_mask(image_id)
    visualize.display_top_masks(image, mask, class_ids, dataset.class_names)

五、Bounding Boxes

这里代码没有使用数据集提供的bounding box坐标，而是通过mask计算得到。这样可以不管是什么数据集都可以用同一方法处理bounding boxes，并且可以很容易进行缩放，旋转和裁剪图像，因为我们是通过更新mask来产生bounding boxes，而不是通过每一种图像变换来计算bounding box的变换。

# Pick a random image and load its masks
image_id = random.choice(dataset.image_ids)
image = dataset.load_image(image_id)
mask, class_ids = dataset.load_mask(image_id)
# Compute the bounding boxes from the masks rather than from dataset annotations
bbox = utils.extract_bboxes(mask)

# Print the image reference and a summary of each array
print("image_id ", image_id, dataset.image_reference(image_id))
log("image", image)
log("mask", mask)
log("class_ids", class_ids)
log("bbox", bbox)
# Display the image together with its instances
visualize.display_instances(image, bbox, mask, class_ids, dataset.class_names)

image_id  74886 http://cocodataset.org/#explore?id=118535
image                    shape: (375, 500, 3)         min:    0.00000  max:  255.00000
mask                     shape: (375, 500, 5)         min:    0.00000  max:    1.00000
class_ids                shape: (5,)                  min:    1.00000  max:   35.00000
bbox                     shape: (5, 4)                min:    1.00000  max:  329.00000

六、缩放图像

因为在一个batch中要处理多幅图像，所以所有图像统一缩放到同一个尺寸（1024x1024）。缩放时保持图像的长宽比不变，当图像不是正方形时，会在图像上/下或者左/右填充0。

# Pick a random image and load it with its masks
image_id = np.random.choice(dataset.image_ids, 1)[0]
image = dataset.load_image(image_id)
mask, class_ids = dataset.load_mask(image_id)
# Remember the shape before resizing so it can be printed below
original_shape = image.shape
# Resize the image according to the configuration, then apply the same
# scale and padding to the masks so they stay aligned with the image.
# NOTE(review): the fifth value returned by resize_image is discarded and not
# forwarded to resize_mask — confirm this is fine for the configured resize mode.
image, window, scale, padding, _ = utils.resize_image(
    image, 
    min_dim=config.IMAGE_MIN_DIM, 
    max_dim=config.IMAGE_MAX_DIM,
    mode=config.IMAGE_RESIZE_MODE)
mask = utils.resize_mask(mask, scale, padding)
# Compute the bounding boxes from the resized masks
bbox = utils.extract_bboxes(mask)

# Print the image reference, the original shape and a summary of each array
print("image_id: ", image_id, dataset.image_reference(image_id))
print("Original shape: ", original_shape)
log("image", image)
log("mask", mask)
log("class_ids", class_ids)
log("bbox", bbox)
# Display the resized image together with its instances
visualize.display_instances(image, bbox, mask, class_ids, dataset.class_names)

image_id:  6480 http://cocodataset.org/#explore?id=402563
Original shape:  (476, 640, 3)
image                    shape: (1024, 1024, 3)       min:    0.00000  max:  255.00000
mask                     shape: (1024, 1024, 32)      min:    0.00000  max:    1.00000
class_ids                shape: (32,)                 min:    1.00000  max:   77.00000
bbox                     shape: (32, 4)               min:    1.00000  max:  991.00000

七、Mini Masks

当训练高分辨率图像时，产生的instance二值masks会比较大。比如，当训练1024x1024分辨率的图像时，单个instance的mask需要1MB的内存空间。如果一幅图像有100个instances，就需要100MB的空间来存储masks。

为加速训练速度，对masks做了以下优化：

只存储在bounding box内的物体的mask像素，而不是整幅图像的mask。大部分物体相比整幅图像来说都很小，所以节省了很多存储0值的空间。
将mask缩放到更小的尺寸（例如 56x56）。尽管在一些比我们选择尺寸大的物体上会损失一些精度，但是大部分物体的标注都不是很精确，所以这一点损失在实际应用中可以忽略不计。mini_mask的尺寸可以在config类中设置。

# Load a random image with its ground truth, keeping full-size masks
# (use_mini_mask=False) as the baseline for the mini-mask comparison
image_id = np.random.choice(dataset.image_ids, 1)[0]
image, image_meta, class_ids, bbox, mask = modellib.load_image_gt(
    dataset, config, image_id, use_mini_mask=False)

# Print a summary of each returned array
log("image", image)
log("image_meta", image_meta)
log("class_ids", class_ids)
log("bbox", bbox)
log("mask", mask)

# Show the image followed by at most the first 7 instance masks, then the instances
display_images([image]+[mask[:,:,i] for i in range(min(mask.shape[-1], 7))])
visualize.display_instances(image, bbox, mask, class_ids, dataset.class_names)

image                    shape: (1024, 1024, 3)       min:    0.00000  max:  255.00000
image_meta               shape: (89,)                 min:    0.00000  max: 23221.00000
bbox                     shape: (1, 5)                min:   62.00000  max:  578.00000
mask                     shape: (1024, 1024, 1)       min:    0.00000  max:    1.00000

# Load the same image again, this time with mini masks (and augment=True)
image, image_meta, class_ids, bbox, mask = modellib.load_image_gt(
    dataset, config, image_id, augment=True, use_mini_mask=True)
log("mask", mask)
# Show the image followed by at most the first 7 mini masks
display_images([image]+[mask[:,:,i] for i in range(min(mask.shape[-1], 7))])
# Expand the mini masks back to the image shape before drawing the instances
mask = utils.expand_mask(bbox, mask, image.shape)
visualize.display_instances(image, bbox, mask, class_ids, dataset.class_names)

mask                     shape: (56, 56, 1)           min:    0.00000  max:    1.00000

八、Anchors

Anchors的顺序是很重要的。在训练和预测阶段使用的顺序应该是相同的，并且必须匹配卷积执行的顺序。

在FPN网络，anchors的顺序必须易于匹配卷积层的输出，用于预测anchor的分数和偏移。

首先，按照金字塔的层次排序。先是第一层的所有anchors，再是第二层。。。这样容易用不同的层次分离anchors。
在每一层次内部，按照feature map的处理顺序进行排序。通常一个卷积层处理一个feature map的顺序是，从左上方开始，然后一行一行的往右移动。
在每一个feature map cell内部，以任意顺序选择不同比例的anchors。这里根据传递给函数的比例的顺序选择。

Anchor Stride：在FPN结构中，feature maps只在前几层的分辨率较高。例如，输入图像的尺寸是1024x1024，则第一层的feature map的大小是256x256，产生大约200k个anchors（$256 \times 256 \times 3$）。这些anchors的大小是32x32像素，它们的stride相对于图像来说是4个像素，所以这里有很大的重叠。如果我们在feature map上每隔一个cell生成anchors的话就可以显著的减少负载。例如，将stride设置为2可以将anchors的数量减少到1/4。

所以和paper不同的是，这里设置anchor的stride等于2。

# Build the anchors for all levels of the backbone feature pyramid
backbone_shapes = modellib.compute_backbone_shapes(config, config.IMAGE_SHAPE)
anchors = utils.generate_pyramid_anchors(
    config.RPN_ANCHOR_SCALES,
    config.RPN_ANCHOR_RATIOS,
    backbone_shapes,
    config.BACKBONE_STRIDES,
    config.RPN_ANCHOR_STRIDE)

# Print a summary of the anchor layout
num_levels = len(backbone_shapes)
anchors_per_cell = len(config.RPN_ANCHOR_RATIOS)
for label, value in (
        ("Count: ", anchors.shape[0]),
        ("Scales: ", config.RPN_ANCHOR_SCALES),
        ("ratios: ", config.RPN_ANCHOR_RATIOS),
        ("Anchors per Cell: ", anchors_per_cell),
        ("Levels: ", num_levels)):
    print(label, value)

# Anchors per level: anchors per cell times the number of feature-map cells,
# reduced by the squared anchor stride (anchors are generated every
# RPN_ANCHOR_STRIDE cells along each axis)
anchors_per_level = []
for l in range(num_levels):
    level_shape = backbone_shapes[l]
    num_cells = level_shape[0] * level_shape[1]
    level_count = anchors_per_cell * num_cells // config.RPN_ANCHOR_STRIDE**2
    anchors_per_level.append(level_count)
    print("Anchors in Level {}: {}".format(l, level_count))

Count:  65472
Scales:  (32, 64, 128, 256, 512)
ratios:  [0.5, 1, 2]
Anchors per Cell:  3
Levels:  5
Anchors in Level 0: 49152
Anchors in Level 1: 12288
Anchors in Level 2: 3072
Anchors in Level 3: 768
Anchors in Level 4: 192

##选定一个特定层次的位于feature map中间的一个cell内的anchors

# Load a random image and show it; the anchors of the centre cell of every
# pyramid level are drawn on top of it
image_id = np.random.choice(dataset.image_ids, 1)[0]
image, image_meta, _, _, _ = modellib.load_image_gt(dataset, config, image_id)
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.imshow(image)
levels = len(backbone_shapes)
# Build the palette once, outside the loop: regenerating it for every level is
# redundant work and, the palette being random, could hand the same colour to
# two different levels
colors = visualize.random_colors(levels)

for level in range(levels):
    # This level's anchors follow the anchors of all previous levels
    level_start = sum(anchors_per_level[:level])
    level_anchors = anchors[level_start:level_start+anchors_per_level[level]]
    print("Level {}. Anchors: {:6}  Feature map Shape: {}".format(level, level_anchors.shape[0], 
                                                                  backbone_shapes[level]))
    # Offset, within this level, of the first anchor of the centre cell.
    # Rows and columns are both thinned by the anchor stride, hence the
    # division by stride**2 for the row term and by stride for the column term.
    center_cell = backbone_shapes[level] // 2
    center_anchor = anchors_per_cell * (
        (center_cell[0] * backbone_shapes[level][1] / config.RPN_ANCHOR_STRIDE**2)
        + center_cell[1] / config.RPN_ANCHOR_STRIDE)
    level_center = int(center_anchor)

    # Draw the anchors of that cell, from the darkest to the brightest shade
    # of the level's colour
    for i, rect in enumerate(level_anchors[level_center:level_center+anchors_per_cell]):
        y1, x1, y2, x2 = rect
        p = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=2, facecolor='none',
                              edgecolor=(i+1)*np.array(colors[level]) / anchors_per_cell)
        ax.add_patch(p)

九、Data Generator

# Create a data generator that also produces random ROIs and detection targets
random_rois = 2000
g = modellib.data_generator(
    dataset, config, shuffle=True, random_rois=random_rois,
    batch_size=4,
    detection_targets=True)

# Fetch the next batch from the generator
if random_rois:
    [normalized_images, image_meta, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks, rpn_rois, rois], \
    [mrcnn_class_ids, mrcnn_bbox, mrcnn_mask] = next(g)

    # ROIs and detection targets are only available in this branch
    log("rois", rois)
    log("mrcnn_class_ids", mrcnn_class_ids)
    log("mrcnn_bbox", mrcnn_bbox)
    log("mrcnn_mask", mrcnn_mask)
else:
    # gt_class_ids has to be unpacked here as well: it is logged below in
    # both cases and would otherwise be undefined.
    # NOTE(review): assumes the generator yields gt_class_ids between rpn_bbox
    # and gt_boxes, as in the branch above — confirm against data_generator.
    [normalized_images, image_meta, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks], _ = next(g)

# Summarise the ground truth and the RPN targets of the batch
log("gt_class_ids", gt_class_ids)
log("gt_boxes", gt_boxes)
log("gt_masks", gt_masks)
log("rpn_match", rpn_match)
log("rpn_bbox", rpn_bbox)
# Report the first image of the batch
image_id = modellib.parse_image_meta(image_meta)["image_id"][0]
print("image_id: ", image_id, dataset.image_reference(image_id))

# Drop the last dim of mrcnn_class_ids; it is only there to satisfy Keras'
# restriction on target shapes. mrcnn_class_ids is bound only when random ROIs
# were requested, so guard it the same way the ROI cells below do.
if random_rois:
    mrcnn_class_ids = mrcnn_class_ids[:,:,0]

# Index of the batch item inspected in the following cells
b = 0

# Restore the original image (undo the normalisation)
sample_image = modellib.unmold_image(normalized_images[b], config)

# Apply the RPN deltas (scaled by RPN_BBOX_STD_DEV) to the positive anchors;
# the first len(indices) rows of rpn_bbox[b] are paired with those anchors
indices = np.where(rpn_match[b] == 1)[0]
refined_anchors = utils.apply_box_deltas(anchors[indices], rpn_bbox[b, :len(indices)] * config.RPN_BBOX_STD_DEV)
log("anchors", anchors)
log("refined_anchors", refined_anchors)

# Split the anchors by their rpn_match value: 1 positive, -1 negative, 0 neutral
positive_anchor_ids = np.where(rpn_match[b] == 1)[0]
print("Positive anchors: {}".format(len(positive_anchor_ids)))
negative_anchor_ids = np.where(rpn_match[b] == -1)[0]
print("Negative anchors: {}".format(len(negative_anchor_ids)))
neutral_anchor_ids = np.where(rpn_match[b] == 0)[0]
print("Neutral anchors: {}".format(len(neutral_anchor_ids)))

# Break the ROIs down by class (needs the detection targets, hence the guard)
if random_rois:
    for c, n in zip(dataset.class_names, np.bincount(mrcnn_class_ids[b].flatten())):
        if n:
            print("{:23}: {}".format(c[:20], n))

# Show the positive anchors together with their refined boxes
visualize.draw_boxes(sample_image, boxes=anchors[positive_anchor_ids], 
                     refined_boxes=refined_anchors)
# Show the negative anchors
visualize.draw_boxes(sample_image, boxes=anchors[negative_anchor_ids])
# Show a random sample of 100 neutral anchors; they are not used for training
visualize.draw_boxes(sample_image, boxes=anchors[np.random.choice(neutral_anchor_ids, 100)])

十、ROIS

# Inspect the ROIs of batch item b; only possible when random ROIs were generated
if random_rois:
    # Class-specific boxes: for every ROI keep the deltas of its assigned class
    bbox_specific = mrcnn_bbox[b, np.arange(mrcnn_bbox.shape[1]), mrcnn_class_ids[b], :]

    # Refine the ROIs by applying those deltas, scaled by BBOX_STD_DEV
    refined_rois = utils.apply_box_deltas(rois[b].astype(np.float32), bbox_specific[:,:4] * config.BBOX_STD_DEV)

    # Class-specific masks: for every ROI keep the mask of its assigned class
    mask_specific = mrcnn_mask[b, np.arange(mrcnn_mask.shape[1]), :, :, mrcnn_class_ids[b]]

    visualize.draw_rois(sample_image, rois[b], refined_rois, mask_specific, mrcnn_class_ids[b], dataset.class_names)
    
    # Are there duplicate ROIs? View every ROI row as one opaque (void) item
    # so that np.unique compares whole rows instead of single coordinates
    rows = np.ascontiguousarray(rois[b]).view(np.dtype((np.void, rois.dtype.itemsize * rois.shape[-1])))
    _, idx = np.unique(rows, return_index=True)
    print("Unique ROIs: {} out of {}".format(len(idx), rois.shape[1]))

if random_rois:
    # Show 8 distinct, randomly picked ROIs next to their masks
    ids = random.sample(range(rois.shape[1]), 8)

    images = []
    titles = []
    for i in ids:
        # Draw the original ROI ([255, 0, 0]) and its refined box ([0, 255, 0])
        # on a copy so sample_image itself stays untouched
        image = visualize.draw_box(sample_image.copy(), rois[b,i,:4].astype(np.int32), [255, 0, 0])
        image = visualize.draw_box(image, refined_rois[i].astype(np.int64), [0, 255, 0])
        images.append(image)
        titles.append("ROI {}".format(i))
        # Pair every ROI image with its class-specific mask, scaled to 0..255,
        # titled with the class name cut to 20 characters
        images.append(mask_specific[i] * 255)
        titles.append(dataset.class_names[mrcnn_class_ids[b,i]][:20])

    display_images(images, titles, cols=4, cmap="Blues", interpolation="none")

# Measure the share of positive ROIs over a handful of images
if random_rois:
    limit = 10
    temp_g = modellib.data_generator(
        dataset, config, shuffle=True, random_rois=10000,
        batch_size=1, detection_targets=True)
    positive_counts = []
    for i in range(limit):
        # Only the class ids of the detection targets are needed here
        _, [ids, _, _] = next(temp_g)
        # A ROI counts as positive when its class id is greater than 0
        positive_rois = np.sum(ids[0] > 0)
        positive_counts.append(positive_rois)
        ratio = positive_rois / ids.shape[1]
        print("{:5} {:5.2f}".format(positive_rois, ratio))
    total = sum(positive_counts)
    average = total / (limit * ids.shape[1])
    print("Average percent: {:.2f}".format(average))

Mask R-CNN（九）：代码理解inspect_data.ipynb