一、网络部分
Vgg网络
代码实现如下:
# Fragment of the VGG backbone builder (the enclosing `def` is not shown in
# this excerpt; `i`, `cfg` and `batch_norm` come from its signature).
layers = []
# i = 3: the input image has 3 (RGB) channels
in_channels = i
# cfg: [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M',
#       512, 512, 512]
for v in cfg:
    if v == 'M':
        layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
    elif v == 'C':
        # 'C' marks a ceil-mode pooling: the output size is rounded up
        layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
    else:
        # 3x3 conv with padding=1 and the default stride=1, so the spatial
        # size of the feature map is unchanged whatever in_channels is
        conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
        if batch_norm:
            layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
        else:
            layers += [conv2d, nn.ReLU(inplace=True)]
        in_channels = v
# Before pool5 the feature map is 19x19x512.
# pool5 uses stride=1 with padding=1, so it stays 19x19x512.
pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
# dilated (atrous) convolution: output is 19x19x1024
conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
# plain 1x1 convolution: output stays 19x19x1024
conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
layers += [pool5, conv6,
           nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
return layers
Extra Layer
Conv7_fc是19x19x1024,后接红色区域模块,那么如何实现这三个模块的构建呢?
红色区域代码如下:
# Fragment of the extra-layer builder: appends the three BasicRFB modules
# that follow conv7_fc (the enclosing `def` is not shown in this excerpt;
# `i`, `cfg` and `size` come from its signature).
layers = []
# initialise in_channels; here i = 1024 (channels of the conv7_fc output)
in_channels = i
# cfg: [1024, 'S', 512, 'S', 256]
for k, v in enumerate(cfg):
    # only build a layer when the previous cfg entry was not 'S'
    # (an 'S' marks that the *next* value is a stride-2 layer's output width)
    if in_channels != 'S':
        if v == 'S':
            if in_channels == 256 and size == 512:
                # 3: BasicRFB(512, 256, stride=2, scale=1.0, visual=1)
                layers += [BasicRFB(in_channels, cfg[k+1], stride=2, scale = 1.0, visual=1)]
            else:
                # 2: BasicRFB(1024, 512, stride=2, scale=1.0, visual=2)
                layers += [BasicRFB(in_channels, cfg[k+1], stride=2, scale = 1.0, visual=2)]
        else:
            # 1: BasicRFB(1024, 1024, scale=1.0, visual=2)
            layers += [BasicRFB(in_channels, v, scale = 1.0, visual=2)]
    in_channels = v
这段代码会依次将3个BasicRFB模块添加到layers中,即上图所示红色框部分。
1: BasicRFB(1024, 1024, scale=1.0, visual=2)
2: BasicRFB(1024, 512, stride=2, scale=1.0, visual=2)
3: BasicRFB(512, 256, stride=2, scale=1.0, visual=1)
蓝色区域的构造代码如下:
# Fragment: this `elif` belongs to a larger if/elif over `size` inside the
# extra-layer builder; for the 300-input model two 1x1-reduce / 3x3-conv
# pairs (the "blue" blocks in the figure) are appended after the RFB stack.
elif size ==300:
    layers += [BasicConv(256,128,kernel_size=1,stride=1)]
    layers += [BasicConv(128,256,kernel_size=3,stride=1)]
    layers += [BasicConv(256,128,kernel_size=1,stride=1)]
    layers += [BasicConv(128,256,kernel_size=3,stride=1)]
那么BasicRFB又是如何构造的呢?
代码实现如下:
def __init__(self, in_planes, out_planes, stride=1, scale = 0.1, visual = 1):
    """Build a BasicRFB module: three parallel convolution branches with
    increasing dilation, a 1x1 ConvLinear projection for their concatenated
    outputs, and a 1x1 shortcut projection of the input."""
    super(BasicRFB, self).__init__()
    self.scale = scale
    self.out_channels = out_planes
    # Internal channel budget is 1/8 of the input channels
    # (e.g. 128 for in_planes=1024, 64 for in_planes=512).
    mid = in_planes // 8
    wide = 2 * mid  # every branch ends with this many channels
    # branch0: 1x1 reduce -> 3x3 dilated conv (dilation = visual)
    self.branch0 = nn.Sequential(
        BasicConv(in_planes, wide, kernel_size=1, stride=stride),
        BasicConv(wide, wide, kernel_size=3, stride=1, padding=visual, dilation=visual, relu=False),
    )
    # branch1: 1x1 reduce -> 3x3 conv -> 3x3 dilated conv (dilation = visual + 1)
    self.branch1 = nn.Sequential(
        BasicConv(in_planes, mid, kernel_size=1, stride=1),
        BasicConv(mid, wide, kernel_size=(3,3), stride=stride, padding=(1,1)),
        BasicConv(wide, wide, kernel_size=3, stride=1, padding=visual + 1, dilation=visual + 1, relu=False),
    )
    # branch2: 1x1 reduce -> two stacked 3x3 convs -> 3x3 dilated conv
    # (dilation = 2 * visual + 1)
    self.branch2 = nn.Sequential(
        BasicConv(in_planes, mid, kernel_size=1, stride=1),
        BasicConv(mid, (mid // 2) * 3, kernel_size=3, stride=1, padding=1),
        BasicConv((mid // 2) * 3, wide, kernel_size=3, stride=stride, padding=1),
        BasicConv(wide, wide, kernel_size=3, stride=1, padding=2 * visual + 1, dilation=2 * visual + 1, relu=False),
    )
    # 1x1 projection of the concatenated branches (3 * wide = 6 * mid channels)
    self.ConvLinear = BasicConv(6 * mid, out_planes, kernel_size=1, stride=1, relu=False)
    # identity path, matched in channels and stride for the residual combine
    self.shortcut = BasicConv(in_planes, out_planes, kernel_size=1, stride=stride, relu=False)
    self.relu = nn.ReLU(inplace=False)
现在,已经构造完了如下图所示绿色框内的所有层。
计算anchor所对应分类卷积层和位置卷积层
每个feature map所对应的anchor数:[6, 6, 6, 6, 4, 4]
代码如下:
# Fragment of the multibox-head builder (the enclosing `def` is not shown;
# here `cfg` is the per-feature-map anchor-count list [6, 6, 6, 6, 4, 4],
# and `vgg`, `extra_layers`, `num_classes`, `size` come from the caller).
loc_layers = []
conf_layers = []
# index -2 selects conv7 from the vgg layer list (it ends [..., conv7, ReLU])
vgg_source = [-2]
for k, v in enumerate(vgg_source):
    if k == 0:
        # heads attached behind the RFB-s branch: 512 input channels
        loc_layers += [nn.Conv2d(512,
                                 cfg[k] * 4, kernel_size=3, padding=1)]
        conf_layers +=[nn.Conv2d(512,
                                 cfg[k] * num_classes, kernel_size=3, padding=1)]
    else:
        loc_layers += [nn.Conv2d(vgg[v].out_channels,
                                 cfg[k] * 4, kernel_size=3, padding=1)]
        conf_layers += [nn.Conv2d(vgg[v].out_channels,
                                  cfg[k] * num_classes, kernel_size=3, padding=1)]
i = 1
indicator = 0
if size == 300:
    indicator = 3
elif size == 512:
    indicator = 5
else:
    print("Error: Sorry only RFBNet300 and RFBNet512 are supported!")
    return
for k, v in enumerate(extra_layers):
    # heads for the extra layers: only layers that produce a source feature
    # map get one (the first `indicator` layers, then every second layer)
    if k < indicator or k%2== 0:
        loc_layers += [nn.Conv2d(v.out_channels, cfg[i]
                                 * 4, kernel_size=3, padding=1)]
        conf_layers += [nn.Conv2d(v.out_channels, cfg[i]
                                  * num_classes, kernel_size=3, padding=1)]
        i +=1
return vgg, extra_layers, (loc_layers, conf_layers)
那么如何才能把这分散的各个卷积层给连接成一个整体呢?
组装成RFB-Net网络
代码如下
构建各个层:
def __init__(self, phase, size, base, extras, head, num_classes):
    """Assemble the RFB-Net from prebuilt parts.

    base   -- list of VGG backbone layers
    extras -- list of extra feature layers
    head   -- tuple (loc_layers, conf_layers) of prediction heads
    """
    super(RFBNet, self).__init__()
    self.phase = phase
    self.num_classes = num_classes
    self.size = size
    # number of leading extra layers whose outputs feed the heads,
    # keyed by supported input size
    indicator_by_size = {300: 3, 512: 5}
    if size not in indicator_by_size:
        print("Error: Sorry only SSD300 and SSD512 are supported!")
        return
    self.indicator = indicator_by_size[size]
    # VGG backbone
    self.base = nn.ModuleList(base)
    # RFB-s module applied to the conv4_3 feature map
    self.Norm = BasicRFB_a(512,512,stride = 1,scale=1.0)
    self.extras = nn.ModuleList(extras)
    self.loc = nn.ModuleList(head[0])
    self.conf = nn.ModuleList(head[1])
    if self.phase == 'test':
        # class scores are normalised only at inference time
        self.softmax = nn.Softmax(dim=-1)
将各个层结合起来:
# Fragment of RFBNet.forward() (the enclosing `def forward(self, x)` is not
# shown in this excerpt).
sources = list()   # feature maps that feed the prediction heads
loc = list()       # per-source localisation outputs
conf = list()      # per-source classification outputs
# apply vgg up to the conv4_3 relu; the comment's module count is
# 2*2+1+2*2+1+3*2+1+3*2 = 23 (original note attributes this to batch
# normalisation after each conv -- TODO confirm, the sum matches
# conv+ReLU pairs plus poolings)
for k in range(23):
    x = self.base[k](x)
# RFB-s branch on the conv4_3 feature map is the first source
s = self.Norm(x)
sources.append(s)
# apply vgg up to fc7
for k in range(23, len(self.base)):
    x = self.base[k](x)
# apply extra layers and cache source layer outputs
for k, v in enumerate(self.extras):
    x = v(x)
    if k < self.indicator or k%2 ==0:
        # only the diagrammed source layers are collected,
        # not the intermediate convolutions
        sources.append(x)
# apply multibox head to source layers
# `sources` holds the feature maps on which the anchors' class scores and
# box offsets are predicted
for (x, l, c) in zip(sources, self.loc, self.conf):
    # permute to channels-last; .contiguous() lays the permuted tensor out
    # contiguously in memory so the .view() below is valid
    loc.append(l(x).permute(0, 2, 3, 1).contiguous())
    conf.append(c(x).permute(0, 2, 3, 1).contiguous())
#print([o.size() for o in loc])
# flatten each per-source tensor and concatenate across sources
loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
if self.phase == "test":
    output = (
        loc.view(loc.size(0), -1, 4), # loc preds
        self.softmax(conf.view(-1, self.num_classes)), # conf preds
    )
else:
    output = (
        loc.view(loc.size(0), -1, 4),
        conf.view(conf.size(0), -1, self.num_classes),
    )
return output
构建BasicRFB-s模块:
其中BasicRFB-s如下图所示:
代码实现如下:
def __init__(self, in_planes, out_planes, stride=1, scale = 0.1):
    """Build the RFB-s variant: four parallel branches using thin (1x3/3x1)
    convolutions and dilated 3x3 convolutions, whose concatenation is
    projected by a 1x1 ConvLinear, plus a 1x1 shortcut projection."""
    super(BasicRFB_a, self).__init__()
    self.scale = scale
    self.out_channels = out_planes
    # each branch works with a quarter of the input channels
    mid = in_planes // 4
    # branch0: 1x1 reduce -> plain 3x3
    self.branch0 = nn.Sequential(
        BasicConv(in_planes, mid, kernel_size=1, stride=1),
        BasicConv(mid, mid, kernel_size=3, stride=1, padding=1,relu=False),
    )
    # branch1: 1x1 reduce -> 3x1 -> dilated 3x3 (dilation 3)
    self.branch1 = nn.Sequential(
        BasicConv(in_planes, mid, kernel_size=1, stride=1),
        BasicConv(mid, mid, kernel_size=(3,1), stride=1, padding=(1,0)),
        BasicConv(mid, mid, kernel_size=3, stride=1, padding=3, dilation=3, relu=False),
    )
    # branch2: 1x1 reduce -> 1x3 -> dilated 3x3 (dilation 3)
    self.branch2 = nn.Sequential(
        BasicConv(in_planes, mid, kernel_size=1, stride=1),
        BasicConv(mid, mid, kernel_size=(1,3), stride=stride, padding=(0,1)),
        BasicConv(mid, mid, kernel_size=3, stride=1, padding=3, dilation=3, relu=False),
    )
    # branch3: 1x1 reduce -> 1x3 -> 3x1 -> dilated 3x3 (dilation 5)
    self.branch3 = nn.Sequential(
        BasicConv(in_planes, mid // 2, kernel_size=1, stride=1),
        BasicConv(mid // 2, (mid // 4) * 3, kernel_size=(1,3), stride=1, padding=(0,1)),
        BasicConv((mid // 4) * 3, mid, kernel_size=(3,1), stride=stride, padding=(1,0)),
        BasicConv(mid, mid, kernel_size=3, stride=1, padding=5, dilation=5, relu=False),
    )
    # 1x1 projection of the concatenated branches (4 * mid channels)
    self.ConvLinear = BasicConv(4 * mid, out_planes, kernel_size=1, stride=1, relu=False)
    self.shortcut = BasicConv(in_planes, out_planes, kernel_size=1, stride=stride, relu=False)
    self.relu = nn.ReLU(inplace=False)
二、训练部分
从此代码入手,彻底搞清楚整个训练流程。
1.首先会接收一系列输入或者直接是默认值。如dataset选VOC或者COCO。但这只是一些简要的表示,并不能直接用于程序使用,所以需要后续变量来配合。
def str2bool(v):
    """Parse a command-line boolean flag.

    argparse's `type=bool` is a well-known trap: any non-empty string is
    truthy, so `--cuda False` would still yield True.  This helper accepts
    the usual spellings of true/false and raises on anything else.
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


# Command-line interface for RFB-Net training; defaults match the paper setup.
parser = argparse.ArgumentParser(
    description='Receptive Field Block Net Training')
parser.add_argument('-v', '--version', default='RFB_vgg',
                    help='RFB_vgg ,RFB_E_vgg or RFB_mobile version.')
parser.add_argument('-s', '--size', default='300',
                    help='300 or 512 input size.')
parser.add_argument('-d', '--dataset', default='VOC',
                    help='VOC or COCO dataset')
parser.add_argument(
    '--basenet', default='./weights/vgg16_reducedfc.pth', help='pretrained base model')
parser.add_argument('--jaccard_threshold', default=0.5,
                    type=float, help='Min Jaccard index for matching')
parser.add_argument('-b', '--batch_size', default=32,
                    type=int, help='Batch size for training')
parser.add_argument('--num_workers', default=8,
                    type=int, help='Number of workers used in dataloading')
# was `type=bool`, which made every non-empty string (e.g. "False") True
parser.add_argument('--cuda', default=True,
                    type=str2bool, help='Use cuda to train model')
parser.add_argument('--ngpu', default=1, type=int, help='gpus')
parser.add_argument('--lr', '--learning-rate',
                    default=4e-3, type=float, help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, help='momentum')
parser.add_argument(
    '--resume_net', default=None, help='resume net for retraining')
parser.add_argument('--resume_epoch', default=0,
                    type=int, help='resume iter for retraining')
parser.add_argument('-max','--max_epoch', default=300,
                    type=int, help='max epoch for retraining')
parser.add_argument('--weight_decay', default=5e-4,
                    type=float, help='Weight decay for SGD')
parser.add_argument('--gamma', default=0.1,
                    type=float, help='Gamma update for SGD')
# was `type=bool` as well; see str2bool above
parser.add_argument('--log_iters', default=True,
                    type=str2bool, help='Print the loss at each iteration')
parser.add_argument('--save_folder', default='./weights/',
                    help='Location to save checkpoint models')
args = parser.parse_args()
2.会根据args设置一些变量,供以后使用。如根据VOC还是COCO设置参数。
# Create the checkpoint directory if it does not exist yet.
if not os.path.exists(args.save_folder):
    os.mkdir(args.save_folder)
# ---- dataset selection ----
if args.dataset == 'VOC':
    # training splits
    train_sets = [('2007', 'trainval'), ('2012', 'trainval')]
    # size-dependent configuration parameters
    cfg = VOC_512 if args.size == '512' else VOC_300
else:
    train_sets = [('2014', 'train'), ('2014', 'valminusminival')]
    cfg = COCO_512 if args.size == '512' else COCO_300
# ---- model-version selection ----
if args.version == 'RFB_vgg':
    from models.RFB_Net_vgg import build_net
elif args.version == 'RFB_E_vgg':
    from models.RFB_Net_E_vgg import build_net
elif args.version == 'RFB_mobile':
    from models.RFB_Net_mobile import build_net
    # the mobile variant only ships a 300-input COCO configuration
    cfg = COCO_mobile_300
else:
    # NOTE(review): execution continues with build_net undefined after this
    # message; consider exiting here.
    print('Unknown version!')
img_dim = 512 if args.size == '512' else 300
# per-channel image means subtracted during preprocessing
# (presumably BGR channel order for the VGG models -- TODO confirm)
rgb_means = (103.94, 116.78, 123.68) if args.version == 'RFB_mobile' else (104, 117, 123)
# parameter handed to the augmentation pipeline (pipeline not visible here)
p = 0.2 if args.version == 'RFB_mobile' else 0.6
# 20 classes + background for VOC, 80 + background for COCO
num_classes = 81 if args.dataset == 'COCO' else 21
batch_size = args.batch_size
weight_decay = 0.0005
gamma = 0.1
momentum = 0.9
3.构建模型、加载预训练模型、把模型转成gpu支持tensor。
# ---- build the network and initialise / restore its weights ----
net = build_net('train', img_dim, num_classes)
print(net)

if args.resume_net is None:  # was `== None`; identity test is the idiom
    # fresh training: load the pretrained VGG backbone
    # (args.basenet defaults to './weights/vgg16_reducedfc.pth')
    base_weights = torch.load(args.basenet)
    print('Loading base network...')
    net.base.load_state_dict(base_weights)

    def xavier(param):
        # in-place Xavier init; the trailing-underscore form replaces the
        # deprecated init.xavier_uniform and matches kaiming_normal_ below
        init.xavier_uniform_(param)

    def weights_init(m):
        # conv weights: Kaiming-normal; batch-norm scales: 1; biases: 0
        for key in m.state_dict():
            if key.split('.')[-1] == 'weight':
                if 'conv' in key:
                    init.kaiming_normal_(m.state_dict()[key], mode='fan_out')
                if 'bn' in key:
                    m.state_dict()[key][...] = 1
            elif key.split('.')[-1] == 'bias':
                m.state_dict()[key][...] = 0

    print('Initializing weights...')
    # initialize newly added layers' weights with kaiming_normal method
    net.extras.apply(weights_init)
    net.loc.apply(weights_init)
    net.conf.apply(weights_init)
    net.Norm.apply(weights_init)
    if args.version == 'RFB_E_vgg':
        net.reduce.apply(weights_init)
        net.up_reduce.apply(weights_init)
else:
    # resume training from a checkpoint
    print('Loading resume network...')
    state_dict = torch.load(args.resume_net)
    # create new OrderedDict without a possible DataParallel `module.` prefix
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:] if k[:7] == 'module.' else k
        new_state_dict[name] = v
    net.load_state_dict(new_state_dict)

if args.ngpu > 1:
    net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu)))
if args.cuda:
    net.cuda()
    cudnn.benchmark = True

# ---- optimiser, loss and anchor boxes ----
optimizer = optim.SGD(net.parameters(), lr=args.lr,
                      momentum=args.momentum, weight_decay=args.weight_decay)
#optimizer = optim.RMSprop(net.parameters(), lr=args.lr,alpha = 0.9, eps=1e-08,
#                      momentum=args.momentum, weight_decay=args.weight_decay)
criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False)
priorbox = PriorBox(cfg)
with torch.no_grad():
    # anchor (prior) boxes are fixed, so no gradients are needed
    priors = priorbox.forward()
    if args.cuda:
        priors = priors.cuda()
定义优化器、损失函数、定义锚框
4.之前都是一些准备工作,没有进行正式训练。
接下来正式进行训练。
(1)加载数据集