RetinaNet structure: (architecture diagram omitted)
train.py
import argparse
import collections
import numpy as np
import torch
import torch.optim as optim
from torchvision import transforms
from retinanet import model
from retinanet.dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, \
Normalizer
from torch.utils.data import DataLoader
from retinanet import coco_eval
from retinanet import csv_eval
# Requires PyTorch 1.x
assert torch.__version__.split('.')[0] == '1'
print('CUDA available: {}'.format(torch.cuda.is_available()))
def main(args=None):
parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')
parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.',default='csv')
parser.add_argument('--coco_path', help='Path to COCO directory',default = 'F:/datasets/coco/coco2017')
parser.add_argument('--csv_train', help='Path to file containing training annotations (see readme)',default='./datasets/train.csv')
parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)',default='./datasets/class.csv')
parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)',default='./datasets/val.csv')
parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=18)
parser.add_argument('--epochs', help='Number of epochs', type=int, default=100)
parser.add_argument('--resume', help='resume or not', type=bool, default=True)
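    # Note: argparse's type=bool treats any non-empty string as True, so --resume False still resumes; pass an empty string to disable.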
parser.add_argument('--pretrain_model_path', help='if resume, pretrain_model_path', type=str, default='./model/csv_retinanet_pretrain.pt')
parser = parser.parse_args(args)
    # Create the data loaders: pick CocoDataset or CSVDataset based on --dataset
if parser.dataset == 'coco':
if parser.coco_path is None:
            raise ValueError('Must provide --coco_path when training on COCO.')
dataset_train = CocoDataset(parser.coco_path, set_name='train2017',
transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
dataset_val = CocoDataset(parser.coco_path, set_name='val2017',
transform=transforms.Compose([Normalizer(), Resizer()]))
elif parser.dataset == 'csv':
if parser.csv_train is None:
            raise ValueError('Must provide --csv_train when training on CSV.')
if parser.csv_classes is None:
            raise ValueError('Must provide --csv_classes when training on CSV.')
dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes,
transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
if parser.csv_val is None:
dataset_val = None
print('No validation annotations provided.')
else:
dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes,
transform=transforms.Compose([Normalizer(), Resizer()]))
else:
raise ValueError('Dataset type not understood (must be csv or coco), exiting.')
sampler = AspectRatioBasedSampler(dataset_train, batch_size=6, drop_last=False)
dataloader_train = DataLoader(dataset_train, num_workers=2, collate_fn=collater, batch_sampler=sampler)
if dataset_val is not None:
sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=6, drop_last=False)
dataloader_val = DataLoader(dataset_val, num_workers=2, collate_fn=collater, batch_sampler=sampler_val)
    # Create the model for the requested ResNet depth
if parser.depth == 18:
retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True)
elif parser.depth == 34:
retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True)
elif parser.depth == 50:
retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True)
elif parser.depth == 101:
retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True)
elif parser.depth == 152:
retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True)
else:
raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152')
if parser.resume:
print("pretrain model path:",parser.pretrain_model_path)
print('loading pretrain model...')
retinanet = torch.load(parser.pretrain_model_path)
use_gpu = True
    # Move the model to the GPU and wrap it in DataParallel
if use_gpu:
if torch.cuda.is_available():
retinanet = retinanet.cuda()
if torch.cuda.is_available():
retinanet = torch.nn.DataParallel(retinanet).cuda()
else:
retinanet = torch.nn.DataParallel(retinanet)
retinanet.training = True
optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)
    # ReduceLROnPlateau: lower the learning rate once the monitored loss stops improving
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
    # Deque holding the most recent 500 loss values for the running-loss printout
loss_hist = collections.deque(maxlen=500)
retinanet.train()
retinanet.module.freeze_bn()
print('Num training images: {}'.format(len(dataset_train)))
for epoch_num in range(parser.epochs):
retinanet.train()
retinanet.module.freeze_bn()
epoch_loss = []
for iter_num, data in enumerate(dataloader_train):
try:
optimizer.zero_grad()
if torch.cuda.is_available():
classification_loss, regression_loss = retinanet([data['img'].cuda().float(), data['annot']])
else:
classification_loss, regression_loss = retinanet([data['img'].float(), data['annot']])
classification_loss = classification_loss.mean()
regression_loss = regression_loss.mean()
loss = classification_loss + regression_loss
if bool(loss == 0):
continue
loss.backward()
torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)
optimizer.step()
loss_hist.append(float(loss))
epoch_loss.append(float(loss))
print(
'Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'.format(
epoch_num, iter_num, float(classification_loss), float(regression_loss), np.mean(loss_hist)))
del classification_loss
del regression_loss
except Exception as e:
print(e)
continue
torch.save(retinanet.module, '{}_retinanet_{}.pt'.format('./model/'+parser.dataset, epoch_num))
print('{}_retinanet_{}.pt saved in ./model'.format(parser.dataset, epoch_num))
if parser.dataset == 'coco':
print('Evaluating dataset')
coco_eval.evaluate_coco(dataset_val, retinanet)
elif parser.dataset == 'csv' and parser.csv_val is not None:
print('Evaluating dataset')
#mAP = csv_eval.evaluate(dataset_val, retinanet)
scheduler.step(np.mean(epoch_loss))
torch.save(retinanet.module, '{}_retinanet_{}.pt'.format(parser.dataset, epoch_num))
retinanet.eval()
torch.save(retinanet, 'model_final.pt')
if __name__ == '__main__':
main()
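
As a usage sketch (the CSV paths below are just the illustrative defaults from the argument parser above): because main(args=None) forwards args straight to parse_args, the script can be driven from Python exactly as from the command line.

from train import main  # assuming the listing above is saved as train.py

main(['--dataset', 'csv',
      '--csv_train', './datasets/train.csv',
      '--csv_classes', './datasets/class.csv',
      '--csv_val', './datasets/val.csv',
      '--depth', '18'])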
model.py
import torch.nn as nn
import torch
import math
import torch.utils.model_zoo as model_zoo
from torchvision.ops import nms
from retinanet.utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes
from retinanet.anchors import Anchors
from retinanet import losses
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
class PyramidFeatures(nn.Module):
def __init__(self, C3_size, C4_size, C5_size, feature_size=256):
super(PyramidFeatures, self).__init__()
        # P5: 1x1 lateral conv on C5, then upsample for the top-down pathway
self.P5_1 = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0)
self.P5_upsampled = nn.Upsample(scale_factor=2, mode='nearest')
self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
# add P5 elementwise to C4
self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0)
self.P4_upsampled = nn.Upsample(scale_factor=2, mode='nearest')
self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
# add P4 elementwise to C3
self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0)
self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1)
# "P6 is obtained via a 3x3 stride-2 conv on C5"
self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1)
# "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6"
self.P7_1 = nn.ReLU()
self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1)
def forward(self, inputs):
C3, C4, C5 = inputs
        # 1x1 conv adjusts C5's channels, giving M5
        P5_x = self.P5_1(C5)
        # upsample M5 so it can be merged with C4
        P5_upsampled_x = self.P5_upsampled(P5_x)
        # 3x3 conv smooths M5 into P5
        P5_x = self.P5_2(P5_x)
        # 1x1 conv adjusts C4's channels
        P4_x = self.P4_1(C4)
        # merge the upsampled M5 with C4, giving M4
        P4_x = P5_upsampled_x + P4_x
        # upsample M4 so it can be merged with C3
        P4_upsampled_x = self.P4_upsampled(P4_x)
        # 3x3 conv smooths M4 into P4
        P4_x = self.P4_2(P4_x)
        # 1x1 conv adjusts C3's channels
        P3_x = self.P3_1(C3)
        # merge the upsampled M4 with C3, giving M3
        P3_x = P3_x + P4_upsampled_x
        # 3x3 conv smooths M3 into P3
        P3_x = self.P3_2(P3_x)
        # P6: stride-2 3x3 conv on C5
        P6_x = self.P6(C5)
        # P7: ReLU followed by a stride-2 3x3 conv on P6
        P7_x = self.P7_1(P6_x)
        P7_x = self.P7_2(P7_x)
return [P3_x, P4_x, P5_x, P6_x, P7_x]
class RegressionModel(nn.Module):
def __init__(self, num_features_in, num_anchors=9, feature_size=256):
super(RegressionModel, self).__init__()
self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
self.act1 = nn.ReLU()
self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act2 = nn.ReLU()
self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act3 = nn.ReLU()
self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act4 = nn.ReLU()
self.output = nn.Conv2d(feature_size, num_anchors * 4, kernel_size=3, padding=1)
def forward(self, x):
out = self.conv1(x)
out = self.act1(out)
out = self.conv2(out)
out = self.act2(out)
out = self.conv3(out)
out = self.act3(out)
out = self.conv4(out)
out = self.act4(out)
#B x C x W x H
out = self.output(out)
# out is B x C x W x H, with C = 4*num_anchors --> B W H C
out = out.permute(0, 2, 3, 1)
return out.contiguous().view(out.shape[0], -1, 4)
class ClassificationModel(nn.Module):
def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256):
super(ClassificationModel, self).__init__()
self.num_classes = num_classes
self.num_anchors = num_anchors
self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1)
self.act1 = nn.ReLU()
self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act2 = nn.ReLU()
self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act3 = nn.ReLU()
self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1)
self.act4 = nn.ReLU()
self.output = nn.Conv2d(feature_size, num_anchors * num_classes, kernel_size=3, padding=1)
self.output_act = nn.Sigmoid()
def forward(self, x):
out = self.conv1(x)
out = self.act1(out)
out = self.conv2(out)
out = self.act2(out)
out = self.conv3(out)
out = self.act3(out)
out = self.conv4(out)
out = self.act4(out)
out = self.output(out)
out = self.output_act(out)
# out is B x C x W x H, with C = n_classes * n_anchors
out1 = out.permute(0, 2, 3, 1)
batch_size, width, height, channels = out1.shape
out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes)
return out2.contiguous().view(x.shape[0], -1, self.num_classes)
class ResNet(nn.Module):
def __init__(self, num_classes, block, layers):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # e.g. layers = [2, 2, 2, 2] for ResNet-18
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
if block == BasicBlock:
fpn_sizes = [self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels,
self.layer4[layers[3] - 1].conv2.out_channels]
elif block == Bottleneck:
fpn_sizes = [self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels,
self.layer4[layers[3] - 1].conv3.out_channels]
else:
raise ValueError(f"Block type {block} not understood")
self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2])
self.regressionModel = RegressionModel(256)
self.classificationModel = ClassificationModel(256, num_classes=num_classes)
self.anchors = Anchors()
self.regressBoxes = BBoxTransform()
self.clipBoxes = ClipBoxes()
self.focalLoss = losses.FocalLoss()
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
prior = 0.01
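        # Bias init makes the sigmoid output equal `prior` (~0.01) for every anchor at
        # the start of training: sigmoid(-log((1 - 0.01) / 0.01)) = 0.01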
self.classificationModel.output.weight.data.fill_(0)
self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior))
self.regressionModel.output.weight.data.fill_(0)
self.regressionModel.output.bias.data.fill_(0)
self.freeze_bn()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = [block(self.inplanes, planes, stride, downsample)]
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def freeze_bn(self):
'''Freeze BatchNorm layers.'''
for layer in self.modules():
if isinstance(layer, nn.BatchNorm2d):
layer.eval()
def forward(self, inputs):
        # Backbone: downsample the image and extract the multi-scale features C3, C4, C5
if self.training:
img_batch, annotations = inputs
else:
img_batch = inputs
        # img_batch: [batch, 3, H, W]
x = self.conv1(img_batch)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
        x1 = self.layer1(x)   # output: [batch, 64, H//4, W//4]
        x2 = self.layer2(x1)  # output: [batch, 128, H//8, W//8]
        x3 = self.layer3(x2)  # output: [batch, 256, H//16, W//16]
        x4 = self.layer4(x3)  # output: [batch, 512, H//32, W//32]
        # FPN: fuse C3, C4, C5 top-down into the five pyramid levels P3..P7
features = self.fpn([x2, x3, x4])
        # Run the regression and classification heads over each of P3..P7 and concatenate
regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1)
classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1)
        # Generate anchors for this input image
anchors = self.anchors(img_batch)
if self.training:
return self.focalLoss(classification, regression, anchors, annotations)
else:
transformed_anchors = self.regressBoxes(anchors, regression)
transformed_anchors = self.clipBoxes(transformed_anchors, img_batch)
finalResult = [[], [], []]
finalScores = torch.Tensor([])
finalAnchorBoxesIndexes = torch.Tensor([]).long()
finalAnchorBoxesCoordinates = torch.Tensor([])
if torch.cuda.is_available():
finalScores = finalScores.cuda()
finalAnchorBoxesIndexes = finalAnchorBoxesIndexes.cuda()
finalAnchorBoxesCoordinates = finalAnchorBoxesCoordinates.cuda()
for i in range(classification.shape[2]):
scores = torch.squeeze(classification[:, :, i])
scores_over_thresh = (scores > 0.05)
if scores_over_thresh.sum() == 0:
# no boxes to NMS, just continue
continue
scores = scores[scores_over_thresh]
anchorBoxes = torch.squeeze(transformed_anchors)
anchorBoxes = anchorBoxes[scores_over_thresh]
anchors_nms_idx = nms(anchorBoxes, scores, 0.5)
finalResult[0].extend(scores[anchors_nms_idx])
finalResult[1].extend(torch.tensor([i] * anchors_nms_idx.shape[0]))
finalResult[2].extend(anchorBoxes[anchors_nms_idx])
finalScores = torch.cat((finalScores, scores[anchors_nms_idx]))
finalAnchorBoxesIndexesValue = torch.tensor([i] * anchors_nms_idx.shape[0])
if torch.cuda.is_available():
finalAnchorBoxesIndexesValue = finalAnchorBoxesIndexesValue.cuda()
finalAnchorBoxesIndexes = torch.cat((finalAnchorBoxesIndexes, finalAnchorBoxesIndexesValue))
finalAnchorBoxesCoordinates = torch.cat((finalAnchorBoxesCoordinates, anchorBoxes[anchors_nms_idx]))
return [finalScores, finalAnchorBoxesIndexes, finalAnchorBoxesCoordinates]
def resnet18(num_classes, pretrained=False, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(num_classes, BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='.'), strict=False)
return model
def resnet34(num_classes, pretrained=False, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(num_classes, BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='.'), strict=False)
return model
# ResNet-50 variant
def resnet50(num_classes, pretrained=False, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
    # Built from Bottleneck blocks; stage depths are [3, 4, 6, 3]
    # The returned module is the full RetinaNet (backbone + FPN + heads)
model = ResNet(num_classes, Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
        # If pretrained, download the ImageNet-trained ResNet-50 backbone weights; strict=False skips the RetinaNet-specific layers
model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='.'), strict=False)
return model
def resnet101(num_classes, pretrained=False, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(num_classes, Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='.'), strict=False)
return model
def resnet152(num_classes, pretrained=False, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(num_classes, Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='.'), strict=False)
return model
if __name__ == '__main__':
retinanet = resnet18(num_classes=20, pretrained=True)
print(retinanet)
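
To make the P3..P7 shapes concrete, here is a minimal standalone shape check of the FPN (the 128/256/512 channel sizes match the BasicBlock C3/C4/C5 outputs; the 512-pixel input size is arbitrary):

import torch

fpn = PyramidFeatures(C3_size=128, C4_size=256, C5_size=512)
C3 = torch.randn(1, 128, 64, 64)   # stride 8 w.r.t. a 512x512 input
C4 = torch.randn(1, 256, 32, 32)   # stride 16
C5 = torch.randn(1, 512, 16, 16)   # stride 32
for level, p in zip(range(3, 8), fpn([C3, C4, C5])):
    print('P{}: {}'.format(level, tuple(p.shape)))
# P3: (1, 256, 64, 64), halving each level down to P7: (1, 256, 4, 4)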
utils.py
import torch
import torch.nn as nn
import numpy as np
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class BBoxTransform(nn.Module):
def __init__(self, mean=None, std=None):
super(BBoxTransform, self).__init__()
if mean is None:
if torch.cuda.is_available():
self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)).cuda()
else:
self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32))
else:
self.mean = mean
if std is None:
if torch.cuda.is_available():
self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)).cuda()
else:
self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32))
else:
self.std = std
def forward(self, boxes, deltas):
widths = boxes[:, :, 2] - boxes[:, :, 0]
heights = boxes[:, :, 3] - boxes[:, :, 1]
ctr_x = boxes[:, :, 0] + 0.5 * widths
ctr_y = boxes[:, :, 1] + 0.5 * heights
dx = deltas[:, :, 0] * self.std[0] + self.mean[0]
dy = deltas[:, :, 1] * self.std[1] + self.mean[1]
dw = deltas[:, :, 2] * self.std[2] + self.mean[2]
dh = deltas[:, :, 3] * self.std[3] + self.mean[3]
pred_ctr_x = ctr_x + dx * widths
pred_ctr_y = ctr_y + dy * heights
pred_w = torch.exp(dw) * widths
pred_h = torch.exp(dh) * heights
pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w
pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h
pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w
pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h
pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2)
return pred_boxes
class ClipBoxes(nn.Module):
def __init__(self, width=None, height=None):
super(ClipBoxes, self).__init__()
def forward(self, boxes, img):
batch_size, num_channels, height, width = img.shape
boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0)
boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width)
boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height)
return boxes
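
A quick sanity check of BBoxTransform (a sketch): with all-zero deltas, dx = dy = 0 and exp(0) = 1, so decoding returns the input boxes unchanged.

import torch

boxes = torch.tensor([[[10., 20., 50., 80.]]])   # (B, N, 4) as x1, y1, x2, y2
deltas = torch.zeros(1, 1, 4)                    # zero offsets
print(BBoxTransform()(boxes, deltas))            # tensor([[[10., 20., 50., 80.]]])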
anchors.py
import numpy as np
import torch
import torch.nn as nn
class Anchors(nn.Module):
def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None):
super(Anchors, self).__init__()
if pyramid_levels is None:
self.pyramid_levels = [3, 4, 5, 6, 7]
if strides is None:
            # strides: [8, 16, 32, 64, 128]
self.strides = [2 ** x for x in self.pyramid_levels]
if sizes is None:
            # base sizes: [32, 64, 128, 256, 512]
self.sizes = [2 ** (x + 2) for x in self.pyramid_levels]
if ratios is None:
            # aspect ratios: [0.5, 1, 2]
self.ratios = np.array([0.5, 1, 2])
if scales is None:
            # scales: [2^0, 2^(1/3), 2^(2/3)] ~= [1, 1.26, 1.59]
self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
def forward(self, image):
image_shape = image.shape[2:]
image_shape = np.array(image_shape)
        # Compute the (H, W) of each of the five feature maps P3..P7 for this image
image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels]
# compute anchors over all pyramid levels
all_anchors = np.zeros((0, 4)).astype(np.float32)
for idx, p in enumerate(self.pyramid_levels):
            # The 9 base anchors centred at (0, 0), as (x1, y1, x2, y2)
anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales)
shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors)
            # Append this level's shifted anchors
all_anchors = np.append(all_anchors, shifted_anchors, axis=0)
        # Add a batch dimension
all_anchors = np.expand_dims(all_anchors, axis=0)
if torch.cuda.is_available():
return torch.from_numpy(all_anchors.astype(np.float32)).cuda()
else:
return torch.from_numpy(all_anchors.astype(np.float32))
def generate_anchors(base_size=32, ratios=None, scales=None):
"""
Generate anchor (reference) windows by enumerating aspect ratios X
scales w.r.t. a reference window.
"""
if ratios is None:
ratios = np.array([0.5, 1, 2])
if scales is None:
scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
num_anchors = len(ratios) * len(scales)
    # Initialise the anchors array, shape (9, 4)
anchors = np.zeros((num_anchors, 4))
# scale base_size
anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T
# compute areas of anchors
areas = anchors[:, 2] * anchors[:, 3]
# correct for ratios
anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales)))
anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales))
# transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2)
anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
return anchors
def compute_shape(image_shape, pyramid_levels):
"""Compute shapes based on pyramid levels.
:param image_shape:
:param pyramid_levels:
:return:
"""
image_shape = np.array(image_shape[:2])
image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels]
return image_shapes
def anchors_for_shape(
image_shape,
pyramid_levels=None,
ratios=None,
scales=None,
strides=None,
sizes=None,
shapes_callback=None,
):
image_shapes = compute_shape(image_shape, pyramid_levels)
# compute anchors over all pyramid levels
all_anchors = np.zeros((0, 4))
for idx, p in enumerate(pyramid_levels):
anchors = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales)
shifted_anchors = shift(image_shapes[idx], strides[idx], anchors)
all_anchors = np.append(all_anchors, shifted_anchors, axis=0)
return all_anchors
def shift(shape, stride, anchors):
shift_x = (np.arange(0, shape[1]) + 0.5) * stride
shift_y = (np.arange(0, shape[0]) + 0.5) * stride
shift_x, shift_y = np.meshgrid(shift_x, shift_y)
shifts = np.vstack((
shift_x.ravel(), shift_y.ravel(),
shift_x.ravel(), shift_y.ravel()
)).transpose()
# add A anchors (1, A, 4) to
# cell K shifts (K, 1, 4) to get
# shift anchors (K, A, 4)
# reshape to (K*A, 4) shifted anchors
A = anchors.shape[0]
K = shifts.shape[0]
all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
all_anchors = all_anchors.reshape((K * A, 4))
return all_anchors
if __name__ == '__main__':
image = np.random.rand(6,3,600,800)
anchor = Anchors()
anchor(image)
#generate_anchors(base_size=32, ratios=None, scales=None)
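
A small check of generate_anchors: for base_size=32 it returns 9 boxes centred on the origin, and the ratio correction preserves each anchor's area of (32 * scale)^2.

import numpy as np

a = generate_anchors(base_size=32)
print(a.shape)                 # (9, 4): one row per (scale, ratio) pair
areas = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
print(np.round(areas, 1))      # cycles through (32 * scale)^2: 1024.0, 1625.5, 2580.3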
coco_eval.py
from pycocotools.cocoeval import COCOeval
import json
import torch
def evaluate_coco(dataset, model, threshold=0.05):
model.eval()
with torch.no_grad():
# start collecting results
results = []
image_ids = []
for index in range(len(dataset)):
data = dataset[index]
scale = data['scale']
# run network
if torch.cuda.is_available():
scores, labels, boxes = model(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0))
else:
scores, labels, boxes = model(data['img'].permute(2, 0, 1).float().unsqueeze(dim=0))
scores = scores.cpu()
labels = labels.cpu()
boxes = boxes.cpu()
# correct boxes for image scale
boxes /= scale
if boxes.shape[0] > 0:
# change to (x, y, w, h) (MS COCO standard)
boxes[:, 2] -= boxes[:, 0]
boxes[:, 3] -= boxes[:, 1]
# compute predicted labels and scores
#for box, score, label in zip(boxes[0], scores[0], labels[0]):
for box_id in range(boxes.shape[0]):
score = float(scores[box_id])
label = int(labels[box_id])
box = boxes[box_id, :]
# scores are sorted, so we can break
if score < threshold:
break
# append detection for each positively labeled class
image_result = {
'image_id' : dataset.image_ids[index],
'category_id' : dataset.label_to_coco_label(label),
'score' : float(score),
'bbox' : box.tolist(),
}
# append detection to results
results.append(image_result)
# append image to list of processed images
image_ids.append(dataset.image_ids[index])
# print progress
print('{}/{}'.format(index, len(dataset)), end='\r')
if not len(results):
return
# write output
json.dump(results, open('{}_bbox_results.json'.format(dataset.set_name), 'w'), indent=4)
# load results in COCO evaluation tool
coco_true = dataset.coco
coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(dataset.set_name))
# run COCO evaluation
coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
model.train()
return
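
The in-place subtraction above converts (x1, y1, x2, y2) boxes to COCO's (x, y, width, height) format; a minimal check of that step:

import torch

boxes = torch.tensor([[10., 20., 50., 80.]])
boxes[:, 2] -= boxes[:, 0]
boxes[:, 3] -= boxes[:, 1]
print(boxes)  # tensor([[10., 20., 40., 60.]]) -- x, y, w, h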
csv_eval.py
from __future__ import print_function
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import torch
def compute_overlap(a, b):
"""
Parameters
----------
a: (N, 4) ndarray of float
b: (K, 4) ndarray of float
Returns
-------
overlaps: (N, K) ndarray of overlap between boxes and query_boxes
"""
area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])
ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])
iw = np.maximum(iw, 0)
ih = np.maximum(ih, 0)
ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih
ua = np.maximum(ua, np.finfo(float).eps)
intersection = iw * ih
return intersection / ua
def _compute_ap(recall, precision):
""" Compute the average precision, given the recall and precision curves.
Code originally from https://github.com/rbgirshick/py-faster-rcnn.
# Arguments
recall: The recall curve (list).
precision: The precision curve (list).
# Returns
The average precision as computed in py-faster-rcnn.
"""
# correct AP calculation
# first append sentinel values at the end
mrec = np.concatenate(([0.], recall, [1.]))
mpre = np.concatenate(([0.], precision, [0.]))
# compute the precision envelope
for i in range(mpre.size - 1, 0, -1):
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None):
""" Get the detections from the retinanet using the generator.
The result is a list of lists such that the size is:
all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes]
# Arguments
dataset : The generator used to run images through the retinanet.
retinanet : The retinanet to run on the images.
score_threshold : The score confidence threshold to use.
max_detections : The maximum number of detections to use per image.
save_path : The path to save the images with visualized detections to.
# Returns
A list of lists containing the detections for each image in the generator.
"""
all_detections = [[None for i in range(dataset.num_classes())] for j in range(len(dataset))]
retinanet.eval()
with torch.no_grad():
for index in range(len(dataset)):
data = dataset[index]
scale = data['scale']
# run network
if torch.cuda.is_available():
scores, labels, boxes = retinanet(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0))
else:
scores, labels, boxes = retinanet(data['img'].permute(2, 0, 1).float().unsqueeze(dim=0))
scores = scores.cpu().numpy()
labels = labels.cpu().numpy()
boxes = boxes.cpu().numpy()
# correct boxes for image scale
boxes /= scale
# select indices which have a score above the threshold
indices = np.where(scores > score_threshold)[0]
if indices.shape[0] > 0:
# select those scores
scores = scores[indices]
# find the order with which to sort the scores
scores_sort = np.argsort(-scores)[:max_detections]
# select detections
image_boxes = boxes[indices[scores_sort], :]
image_scores = scores[scores_sort]
image_labels = labels[indices[scores_sort]]
image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)
# copy detections to all_detections
for label in range(dataset.num_classes()):
all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1]
else:
# copy detections to all_detections
for label in range(dataset.num_classes()):
all_detections[index][label] = np.zeros((0, 5))
print('{}/{}'.format(index + 1, len(dataset)), end='\r')
return all_detections
def _get_annotations(generator):
""" Get the ground truth annotations from the generator.
The result is a list of lists such that the size is:
all_detections[num_images][num_classes] = annotations[num_detections, 5]
# Arguments
generator : The generator used to retrieve ground truth annotations.
# Returns
A list of lists containing the annotations for each image in the generator.
"""
all_annotations = [[None for i in range(generator.num_classes())] for j in range(len(generator))]
for i in range(len(generator)):
# load the annotations
annotations = generator.load_annotations(i)
# copy detections to all_annotations
for label in range(generator.num_classes()):
all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()
print('{}/{}'.format(i + 1, len(generator)), end='\r')
return all_annotations
def evaluate(
generator,
retinanet,
iou_threshold=0.5,
score_threshold=0.05,
max_detections=100,
save_path=None
):
""" Evaluate a given dataset using a given retinanet.
# Arguments
generator : The generator that represents the dataset to evaluate.
retinanet : The retinanet to evaluate.
iou_threshold : The threshold used to consider when a detection is positive or negative.
score_threshold : The score confidence threshold to use for detections.
max_detections : The maximum number of detections to use per image.
save_path : The path to save precision recall curve of each label.
# Returns
A dict mapping class names to mAP scores.
"""
# gather all detections and annotations
all_detections = _get_detections(generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path)
all_annotations = _get_annotations(generator)
average_precisions = {}
for label in range(generator.num_classes()):
false_positives = np.zeros((0,))
true_positives = np.zeros((0,))
scores = np.zeros((0,))
num_annotations = 0.0
for i in range(len(generator)):
detections = all_detections[i][label]
annotations = all_annotations[i][label]
num_annotations += annotations.shape[0]
detected_annotations = []
for d in detections:
scores = np.append(scores, d[4])
if annotations.shape[0] == 0:
false_positives = np.append(false_positives, 1)
true_positives = np.append(true_positives, 0)
continue
overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
assigned_annotation = np.argmax(overlaps, axis=1)
max_overlap = overlaps[0, assigned_annotation]
if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
false_positives = np.append(false_positives, 0)
true_positives = np.append(true_positives, 1)
detected_annotations.append(assigned_annotation)
else:
false_positives = np.append(false_positives, 1)
true_positives = np.append(true_positives, 0)
# no annotations -> AP for this class is 0 (is this correct?)
if num_annotations == 0:
average_precisions[label] = 0, 0
continue
# sort by score
indices = np.argsort(-scores)
false_positives = false_positives[indices]
true_positives = true_positives[indices]
# compute false positives and true positives
false_positives = np.cumsum(false_positives)
true_positives = np.cumsum(true_positives)
# compute recall and precision
recall = true_positives / num_annotations
precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)
# compute average precision
average_precision = _compute_ap(recall, precision)
average_precisions[label] = average_precision, num_annotations
print('\nmAP:')
for label in range(generator.num_classes()):
label_name = generator.label_to_name(label)
print('{}: {}'.format(label_name, average_precisions[label][0]))
print("Precision: ",precision[-1])
print("Recall: ",recall[-1])
        if save_path is not None:
plt.plot(recall,precision)
# naming the x axis
plt.xlabel('Recall')
# naming the y axis
plt.ylabel('Precision')
# giving a title to my graph
plt.title('Precision Recall curve')
# function to show the plot
plt.savefig(save_path+'/'+label_name+'_precision_recall.jpg')
return average_precisions
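
A tiny worked example of _compute_ap (a sketch): three detections sorted by score, of which the first and third are true positives against two ground-truth boxes.

import numpy as np

recall    = np.array([0.5, 0.5, 1.0])        # cumulative TP / num_annotations
precision = np.array([1.0, 0.5, 2.0 / 3.0])  # cumulative TP / (TP + FP)
print(_compute_ap(recall, precision))        # 0.8333... = 0.5 * 1.0 + 0.5 * (2/3)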
dataloader.py
from __future__ import print_function, division
import sys
import os
import torch
import numpy as np
import random
import csv
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils.data.sampler import Sampler
from pycocotools.coco import COCO
import skimage.io
import skimage.transform
import skimage.color
import skimage
from PIL import Image
class CocoDataset(Dataset):
"""Coco dataset."""
def __init__(self, root_dir, set_name='train2017', transform=None):
"""
Args:
root_dir (string): COCO directory.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.root_dir = root_dir
self.set_name = set_name
self.transform = transform
self.coco = COCO(os.path.join(self.root_dir, 'annotations', 'instances_' + self.set_name + '.json'))
self.image_ids = self.coco.getImgIds()
self.load_classes()
def load_classes(self):
# load class names (name -> label)
categories = self.coco.loadCats(self.coco.getCatIds())
categories.sort(key=lambda x: x['id'])
self.classes = {}
self.coco_labels = {}
self.coco_labels_inverse = {}
for c in categories:
self.coco_labels[len(self.classes)] = c['id']
self.coco_labels_inverse[c['id']] = len(self.classes)
self.classes[c['name']] = len(self.classes)
# also load the reverse (label -> name)
self.labels = {}
for key, value in self.classes.items():
self.labels[value] = key
def __len__(self):
return len(self.image_ids)
def __getitem__(self, idx):
img = self.load_image(idx)
annot = self.load_annotations(idx)
sample = {'img': img, 'annot': annot}
if self.transform:
sample = self.transform(sample)
return sample
def load_image(self, image_index):
image_info = self.coco.loadImgs(self.image_ids[image_index])[0]
path = os.path.join(self.root_dir, 'images', self.set_name, image_info['file_name'])
img = skimage.io.imread(path)
if len(img.shape) == 2:
img = skimage.color.gray2rgb(img)
return img.astype(np.float32)/255.0
def load_annotations(self, image_index):
# get ground truth annotations
annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False)
annotations = np.zeros((0, 5))
# some images appear to miss annotations (like image with id 257034)
if len(annotations_ids) == 0:
return annotations
# parse annotations
coco_annotations = self.coco.loadAnns(annotations_ids)
for idx, a in enumerate(coco_annotations):
# some annotations have basically no width / height, skip them
if a['bbox'][2] < 1 or a['bbox'][3] < 1:
continue
annotation = np.zeros((1, 5))
annotation[0, :4] = a['bbox']
annotation[0, 4] = self.coco_label_to_label(a['category_id'])
annotations = np.append(annotations, annotation, axis=0)
# transform from [x, y, w, h] to [x1, y1, x2, y2]
annotations[:, 2] = annotations[:, 0] + annotations[:, 2]
annotations[:, 3] = annotations[:, 1] + annotations[:, 3]
return annotations
def coco_label_to_label(self, coco_label):
return self.coco_labels_inverse[coco_label]
def label_to_coco_label(self, label):
return self.coco_labels[label]
def image_aspect_ratio(self, image_index):
image = self.coco.loadImgs(self.image_ids[image_index])[0]
return float(image['width']) / float(image['height'])
def num_classes(self):
return 80
class CSVDataset(Dataset):
"""CSV dataset."""
def __init__(self, train_file, class_list, transform=None):
"""
        Args:
            train_file (string): CSV file with the training annotations
            class_list (string): CSV file with the class list
            transform (callable, optional): optional transform applied to each sample
"""
self.train_file = train_file
self.class_list = class_list
self.transform = transform
        # Parse the CSV file listing the classes
try:
with self._open_for_csv(self.class_list) as file:
self.classes = self.load_classes(csv.reader(file, delimiter=','))
except ValueError as e:
raise(ValueError('invalid CSV class file: {}: {}'.format(self.class_list, e)))
self.labels = {}
for key, value in self.classes.items():
self.labels[value] = key
        # Parse the annotations CSV; row format: img_path, x1, y1, x2, y2, class_name
try:
with self._open_for_csv(self.train_file) as file:
                # Build {image_path: [annotation, ...]} via _read_annotations
self.image_data = self._read_annotations(csv.reader(file, delimiter=','), self.classes)
except ValueError as e:
raise(ValueError('invalid CSV annotations file: {}: {}'.format(self.train_file, e)))
self.image_names = list(self.image_data.keys())
def _parse(self, value, function, fmt):
"""
Parse a string into a value, and format a nice ValueError if it fails.
Returns `function(value)`.
        Any `ValueError` raised is caught and a new `ValueError` is raised
        with message `fmt.format(e)`, where `e` is the caught `ValueError`.
"""
try:
return function(value)
except ValueError as e:
            raise ValueError(fmt.format(e)) from None
def _open_for_csv(self, path):
"""
Open a file with flags suitable for csv.reader.
        On Python 2 this means mode 'rb'; on Python 3 it means mode 'r'
        with newline='' (universal newlines).
"""
if sys.version_info[0] < 3:
return open(path, 'rb')
else:
return open(path, 'r', newline='')
def load_classes(self, csv_reader):
result = {}
for line, row in enumerate(csv_reader):
line += 1
try:
class_name, class_id = row
except ValueError:
raise(ValueError('line {}: format should be \'class_name,class_id\''.format(line)))
            class_id = self._parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line))
if class_name in result:
raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name))
result[class_name] = class_id
return result
def __len__(self):
return len(self.image_names)
def __getitem__(self, idx):
img = self.load_image(idx)
annot = self.load_annotations(idx)
sample = {'img': img, 'annot': annot}
if self.transform:
sample = self.transform(sample)
return sample
def load_image(self, image_index):
img = skimage.io.imread(self.image_names[image_index])
if len(img.shape) == 2:
img = skimage.color.gray2rgb(img)
return img.astype(np.float32)/255.0
def load_annotations(self, image_index):
# get ground truth annotations
annotation_list = self.image_data[self.image_names[image_index]]
annotations = np.zeros((0, 5))
# some images appear to miss annotations (like image with id 257034)
if len(annotation_list) == 0:
return annotations
# parse annotations
for idx, a in enumerate(annotation_list):
# some annotations have basically no width / height, skip them
x1 = a['x1']
x2 = a['x2']
y1 = a['y1']
y2 = a['y2']
if (x2-x1) < 1 or (y2-y1) < 1:
continue
annotation = np.zeros((1, 5))
annotation[0, 0] = x1
annotation[0, 1] = y1
annotation[0, 2] = x2
annotation[0, 3] = y2
annotation[0, 4] = self.name_to_label(a['class'])
annotations = np.append(annotations, annotation, axis=0)
return annotations
def _read_annotations(self, csv_reader, classes):
result = {}
for line, row in enumerate(csv_reader):
line += 1
try:
img_file, x1, y1, x2, y2, class_name = row[:6]
except ValueError:
                raise ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)) from None
if img_file not in result:
result[img_file] = []
# If a row contains only an image path, it's an image without annotations.
if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''):
continue
            x1 = self._parse(x1, int, 'line {}: malformed x1: {{}}'.format(line))
            y1 = self._parse(y1, int, 'line {}: malformed y1: {{}}'.format(line))
            x2 = self._parse(x2, int, 'line {}: malformed x2: {{}}'.format(line))
            y2 = self._parse(y2, int, 'line {}: malformed y2: {{}}'.format(line))
# Check that the bounding box is valid.
if x2 <= x1:
raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
if y2 <= y1:
raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))
# check if the current class name is correctly present
if class_name not in classes:
raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes))
result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name})
return result
def name_to_label(self, name):
return self.classes[name]
def label_to_name(self, label):
return self.labels[label]
def num_classes(self):
return max(self.classes.values()) + 1
def image_aspect_ratio(self, image_index):
image = Image.open(self.image_names[image_index])
return float(image.width) / float(image.height)
def collater(data):
imgs = [s['img'] for s in data]
annots = [s['annot'] for s in data]
scales = [s['scale'] for s in data]
widths = [int(s.shape[0]) for s in imgs]
heights = [int(s.shape[1]) for s in imgs]
batch_size = len(imgs)
max_width = np.array(widths).max()
max_height = np.array(heights).max()
padded_imgs = torch.zeros(batch_size, max_width, max_height, 3)
for i in range(batch_size):
img = imgs[i]
padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img
    max_num_annots = max(annot.shape[0] for annot in annots)
    if max_num_annots > 0:
        annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1
        for idx, annot in enumerate(annots):
            if annot.shape[0] > 0:
                annot_padded[idx, :annot.shape[0], :] = annot
    else:
        annot_padded = torch.ones((len(annots), 1, 5)) * -1
    padded_imgs = padded_imgs.permute(0, 3, 1, 2)
    return {'img': padded_imgs, 'annot': annot_padded, 'scale': scales}
class Resizer(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, sample, min_side=608, max_side=1024):
image, annots = sample['img'], sample['annot']
rows, cols, cns = image.shape
smallest_side = min(rows, cols)
        # Scale so that the shortest side becomes min_side (608)
        scale = min_side / smallest_side
        # If the longest side would then exceed max_side (1024), rescale using the long side:
        # e.g. a 460x900 image gives scale = 608/460 ~ 1.32, making the long side 1190 > 1024,
        # so scale is recomputed as 1024/900 ~ 1.14 (the short side ends up ~523)
largest_side = max(rows, cols)
if largest_side * scale > max_side:
scale = max_side / largest_side
        # Resize both dimensions with the same scale
image = skimage.transform.resize(image, (int(round(rows*scale)), int(round((cols*scale)))))
rows, cols, cns = image.shape
        # Pad height and width up to multiples of 32 (no-op when already divisible)
        pad_w = (32 - rows % 32) % 32
        pad_h = (32 - cols % 32) % 32
new_image = np.zeros((rows + pad_w, cols + pad_h, cns)).astype(np.float32)
new_image[:rows, :cols, :] = image.astype(np.float32)
annots[:, :4] *= scale
return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale}
class Augmenter(object):
"""Convert ndarrays in sample to Tensors."""
def __call__(self, sample, flip_x=0.5):
if np.random.rand() < flip_x:
image, annots = sample['img'], sample['annot']
image = image[:, ::-1, :]
rows, cols, channels = image.shape
x1 = annots[:, 0].copy()
x2 = annots[:, 2].copy()
x_tmp = x1.copy()
annots[:, 0] = cols - x2
annots[:, 2] = cols - x_tmp
sample = {'img': image, 'annot': annots}
return sample
class Normalizer(object):
def __init__(self):
self.mean = np.array([[[0.485, 0.456, 0.406]]])
self.std = np.array([[[0.229, 0.224, 0.225]]])
def __call__(self, sample):
image, annots = sample['img'], sample['annot']
return {'img':((image.astype(np.float32)-self.mean)/self.std), 'annot': annots}
class UnNormalizer(object):
def __init__(self, mean=None, std=None):
        if mean is None:
self.mean = [0.485, 0.456, 0.406]
else:
self.mean = mean
        if std is None:
self.std = [0.229, 0.224, 0.225]
else:
self.std = std
def __call__(self, tensor):
"""
Args:
tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
Returns:
Tensor: Normalized image.
"""
for t, m, s in zip(tensor, self.mean, self.std):
t.mul_(s).add_(m)
return tensor
class AspectRatioBasedSampler(Sampler):
def __init__(self, data_source, batch_size, drop_last):
self.data_source = data_source
self.batch_size = batch_size
self.drop_last = drop_last
self.groups = self.group_images()
def __iter__(self):
random.shuffle(self.groups)
for group in self.groups:
yield group
def __len__(self):
if self.drop_last:
return len(self.data_source) // self.batch_size
else:
return (len(self.data_source) + self.batch_size - 1) // self.batch_size
def group_images(self):
# determine the order of the images
order = list(range(len(self.data_source)))
order.sort(key=lambda x: self.data_source.image_aspect_ratio(x))
# divide into groups, one group = one batch
return [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)]
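
A sketch of what collater produces: two differently sized HWC images are zero-padded to a common canvas and returned as a single BCHW tensor, with annotations padded to the longest list using -1.

import torch

s1 = {'img': torch.zeros(480, 640, 3), 'annot': torch.zeros(2, 5), 'scale': 1.0}
s2 = {'img': torch.zeros(608, 512, 3), 'annot': torch.zeros(0, 5), 'scale': 1.0}
batch = collater([s1, s2])
print(batch['img'].shape)    # torch.Size([2, 3, 608, 640])
print(batch['annot'].shape)  # torch.Size([2, 2, 5]); s2's rows are all -1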
losses.py
import numpy as np
import torch
import torch.nn as nn
def calc_iou(a, b):
area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0])
ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1])
iw = torch.clamp(iw, min=0)
ih = torch.clamp(ih, min=0)
ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih
ua = torch.clamp(ua, min=1e-8)
intersection = iw * ih
IoU = intersection / ua
return IoU
class FocalLoss(nn.Module):
def forward(self, classifications, regressions, anchors, annotations):
alpha = 0.25
gamma = 2.0
batch_size = classifications.shape[0]
classification_losses = []
regression_losses = []
anchor = anchors[0, :, :]
anchor_widths = anchor[:, 2] - anchor[:, 0]
anchor_heights = anchor[:, 3] - anchor[:, 1]
anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths
anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights
for j in range(batch_size):
classification = classifications[j, :, :]
regression = regressions[j, :, :]
bbox_annotation = annotations[j, :, :]
bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]
classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)
if bbox_annotation.shape[0] == 0:
if torch.cuda.is_available():
alpha_factor = torch.ones(classification.shape).cuda() * alpha
alpha_factor = 1. - alpha_factor
focal_weight = classification
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
bce = -(torch.log(1.0 - classification))
# cls_loss = focal_weight * torch.pow(bce, gamma)
cls_loss = focal_weight * bce
classification_losses.append(cls_loss.sum())
regression_losses.append(torch.tensor(0).float().cuda())
else:
alpha_factor = torch.ones(classification.shape) * alpha
alpha_factor = 1. - alpha_factor
focal_weight = classification
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
bce = -(torch.log(1.0 - classification))
# cls_loss = focal_weight * torch.pow(bce, gamma)
cls_loss = focal_weight * bce
classification_losses.append(cls_loss.sum())
regression_losses.append(torch.tensor(0).float())
continue
IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations
IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1
            # Compute the classification loss
targets = torch.ones(classification.shape) * -1
if torch.cuda.is_available():
targets = targets.cuda()
            # Anchors with IoU < 0.4 are negatives: all class targets set to 0
targets[torch.lt(IoU_max, 0.4), :] = 0
            # Anchors with IoU >= 0.5 are positives
positive_indices = torch.ge(IoU_max, 0.5)
num_positive_anchors = positive_indices.sum()
assigned_annotations = bbox_annotation[IoU_argmax, :]
            # One-hot targets for positives: zero everything, then set the assigned class to 1
targets[positive_indices, :] = 0
targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1
if torch.cuda.is_available():
alpha_factor = torch.ones(targets.shape).cuda() * alpha
else:
alpha_factor = torch.ones(targets.shape) * alpha
            # alpha where the target is 1, (1 - alpha) otherwise
alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor)
focal_weight = torch.where(torch.eq(targets, 1.), 1. - classification, classification)
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))
            # Focal loss: FL = -alpha_t * (1 - p_t)^gamma * log(p_t)
# cls_loss = focal_weight * torch.pow(bce, gamma)
cls_loss = focal_weight * bce
if torch.cuda.is_available():
cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda())
else:
cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape))
classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0))
            # Compute the regression loss over the positive anchors
if positive_indices.sum() > 0:
assigned_annotations = assigned_annotations[positive_indices, :]
anchor_widths_pi = anchor_widths[positive_indices]
anchor_heights_pi = anchor_heights[positive_indices]
anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
anchor_ctr_y_pi = anchor_ctr_y[positive_indices]
gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0]
gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths
gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights
# clip widths to 1
gt_widths = torch.clamp(gt_widths, min=1)
gt_heights = torch.clamp(gt_heights, min=1)
targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
targets_dw = torch.log(gt_widths / anchor_widths_pi)
targets_dh = torch.log(gt_heights / anchor_heights_pi)
targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh))
targets = targets.t()
if torch.cuda.is_available():
targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda()
else:
targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]])
                # Smooth L1 loss on the regression residuals (beta = 1/9)
                regression_diff = torch.abs(targets - regression[positive_indices, :])
regression_loss = torch.where(
torch.le(regression_diff, 1.0 / 9.0),
0.5 * 9.0 * torch.pow(regression_diff, 2),
regression_diff - 0.5 / 9.0
)
regression_losses.append(regression_loss.mean())
else:
if torch.cuda.is_available():
regression_losses.append(torch.tensor(0).float().cuda())
else:
regression_losses.append(torch.tensor(0).float())
return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True)
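
To see the focal weighting in numbers (alpha = 0.25, gamma = 2): an easy positive with p = 0.9 contributes almost nothing, while a hard one with p = 0.1 keeps most of its cross-entropy.

import torch

alpha, gamma = 0.25, 2.0
for p in (0.9, 0.1):
    p_t = torch.tensor(p)                    # predicted probability of the true class
    weight = alpha * (1.0 - p_t) ** gamma    # focal down-weighting factor
    bce = -torch.log(p_t)                    # plain cross-entropy term
    print(p, round(float(weight * bce), 5))  # 0.9 -> 0.00026, 0.1 -> 0.46627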
oid_dataset.py
from __future__ import print_function, division
import csv
import json
import os
import warnings
import numpy as np
import skimage
import skimage.color
import skimage.io
import skimage.transform
from PIL import Image
from torch.utils.data import Dataset
def get_labels(metadata_dir, version='v4'):
if version == 'v4' or version == 'challenge2018':
csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv'
boxable_classes_descriptions = os.path.join(metadata_dir, csv_file)
id_to_labels = {}
cls_index = {}
i = 0
with open(boxable_classes_descriptions) as f:
for row in csv.reader(f):
# make sure the csv row is not empty (usually the last one)
if len(row):
label = row[0]
description = row[1].replace("\"", "").replace("'", "").replace('`', '')
id_to_labels[i] = description
cls_index[label] = i
i += 1
else:
trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt')
description_path = os.path.join(metadata_dir, 'class-descriptions.csv')
description_table = {}
with open(description_path) as f:
for row in csv.reader(f):
# make sure the csv row is not empty (usually the last one)
if len(row):
description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '')
        with open(trainable_classes_path, 'r') as f:
            trainable_classes = f.read().split('\n')
id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)])
cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)])
return id_to_labels, cls_index
def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'):
validation_image_ids = {}
if version == 'v4':
annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset))
elif version == 'challenge2018':
validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv')
with open(validation_image_ids_path, 'r') as csv_file:
reader = csv.DictReader(csv_file, fieldnames=['ImageID'])
            next(reader)
for line, row in enumerate(reader):
image_id = row['ImageID']
validation_image_ids[image_id] = True
annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv')
else:
annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv')
fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence',
'XMin', 'XMax', 'YMin', 'YMax',
'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside']
id_annotations = dict()
with open(annotations_path, 'r') as csv_file:
reader = csv.DictReader(csv_file, fieldnames=fieldnames)
next(reader)
images_sizes = {}
for line, row in enumerate(reader):
frame = row['ImageID']
if version == 'challenge2018':
if subset == 'train':
if frame in validation_image_ids:
continue
elif subset == 'validation':
if frame not in validation_image_ids:
continue
else:
raise NotImplementedError('This generator handles only the train and validation subsets')
class_name = row['LabelName']
if class_name not in cls_index:
continue
cls_id = cls_index[class_name]
if version == 'challenge2018':
# We recommend participants to use the provided subset of the training set as a validation set.
# This is preferable over using the V4 val/test sets, as the training set is more densely annotated.
img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg')
else:
img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg')
if frame in images_sizes:
width, height = images_sizes[frame]
else:
try:
with Image.open(img_path) as img:
width, height = img.width, img.height
images_sizes[frame] = (width, height)
except Exception as ex:
if version == 'challenge2018':
raise ex
continue
x1 = float(row['XMin'])
x2 = float(row['XMax'])
y1 = float(row['YMin'])
y2 = float(row['YMax'])
x1_int = int(round(x1 * width))
x2_int = int(round(x2 * width))
y1_int = int(round(y1 * height))
y2_int = int(round(y2 * height))
# Check that the bounding box is valid.
if x2 <= x1:
raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
if y2 <= y1:
raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))
if y2_int == y1_int:
warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1))
continue
if x2_int == x1_int:
warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1))
continue
img_id = row['ImageID']
annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2}
if img_id in id_annotations:
annotations = id_annotations[img_id]
annotations['boxes'].append(annotation)
else:
id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]}
return id_annotations
class OidDataset(Dataset):
"""Oid dataset."""
def __init__(self, main_dir, subset, version='v4', annotation_cache_dir='.', transform=None):
if version == 'v4':
metadata = '2018_04'
elif version == 'challenge2018':
metadata = 'challenge2018'
elif version == 'v3':
metadata = '2017_11'
else:
raise NotImplementedError('There is currently no implementation for versions older than v3')
self.transform = transform
if version == 'challenge2018':
self.base_dir = os.path.join(main_dir, 'images', 'train')
else:
self.base_dir = os.path.join(main_dir, 'images', subset)
metadata_dir = os.path.join(main_dir, metadata)
annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json')
self.id_to_labels, cls_index = get_labels(metadata_dir, version=version)
if os.path.exists(annotation_cache_json):
with open(annotation_cache_json, 'r') as f:
self.annotations = json.loads(f.read())
else:
self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index,
version=version)
json.dump(self.annotations, open(annotation_cache_json, "w"))
self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)])
# (label -> name)
self.labels = self.id_to_labels
def __len__(self):
return len(self.annotations)
def __getitem__(self, idx):
img = self.load_image(idx)
annot = self.load_annotations(idx)
sample = {'img': img, 'annot': annot}
if self.transform:
sample = self.transform(sample)
return sample
def image_path(self, image_index):
path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg')
return path
def load_image(self, image_index):
path = self.image_path(image_index)
img = skimage.io.imread(path)
if len(img.shape) == 1:
img = img[0]
if len(img.shape) == 2:
img = skimage.color.gray2rgb(img)
try:
return img.astype(np.float32) / 255.0
except Exception:
            print(path)
exit(0)
def load_annotations(self, image_index):
# get ground truth annotations
image_annotations = self.annotations[self.id_to_image_id[image_index]]
labels = image_annotations['boxes']
height, width = image_annotations['h'], image_annotations['w']
boxes = np.zeros((len(labels), 5))
for idx, ann in enumerate(labels):
cls_id = ann['cls_id']
x1 = ann['x1'] * width
x2 = ann['x2'] * width
y1 = ann['y1'] * height
y2 = ann['y2'] * height
boxes[idx, 0] = x1
boxes[idx, 1] = y1
boxes[idx, 2] = x2
boxes[idx, 3] = y2
boxes[idx, 4] = cls_id
return boxes
def image_aspect_ratio(self, image_index):
img_annotations = self.annotations[self.id_to_image_id[image_index]]
height, width = img_annotations['h'], img_annotations['w']
return float(width) / float(height)
def num_classes(self):
return len(self.id_to_labels)