代码主要是参考bubbliiing的github YOLOv3的代码:github.com/bubbliiiing…
对于源代码的解读
训练部分
dataloader.py文件
class YoloDataset(Dataset):
def __init__(self, annotation_lines, input_shape, num_classes, train):
super(YoloDataset, self).__init__()
self.annotation_lines = annotation_lines
self.input_shape = input_shape
self.num_classes = num_classes
self.length = len(self.annotation_lines)
self.train = train
def __len__(self):
return self.length
def __getitem__(self, index):
index = index % self.length
#---------------------------------------------------#
# 训练时进行数据的随机增强
# 验证时不进行数据的随机增强
#---------------------------------------------------#
image, box = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2], random = self.train)
image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
box = np.array(box, dtype=np.float32)
if len(box) != 0:
box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
return image, box
def rand(self, a=0, b=1):
return np.random.rand()*(b-a) + a
def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
line = annotation_line.split()
#------------------------------#
# 读取图像并转换成RGB图像
#------------------------------#
image = Image.open(line[0])
image = cvtColor(image)
#------------------------------#
# 获得图像的高宽与目标高宽
#------------------------------#
iw, ih = image.size
h, w = input_shape
#------------------------------#
# 获得预测框
#------------------------------#
box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
if not random:
scale = min(w/iw, h/ih)
nw = int(iw*scale)
nh = int(ih*scale)
dx = (w-nw)//2
dy = (h-nh)//2
#---------------------------------#
# 将图像多余的部分加上灰条
#---------------------------------#
image = image.resize((nw,nh), Image.BICUBIC)
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image, np.float32)
#---------------------------------#
# 对真实框进行调整
#---------------------------------#
if len(box)>0:
np.random.shuffle(box)
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
return image_data, box
#------------------------------------------#
# 对图像进行缩放并且进行长和宽的扭曲
#------------------------------------------#
new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
scale = self.rand(.25, 2)
if new_ar < 1:
nh = int(scale*h)
nw = int(nh*new_ar)
else:
nw = int(scale*w)
nh = int(nw/new_ar)
image = image.resize((nw,nh), Image.BICUBIC)
#------------------------------------------#
# 将图像多余的部分加上灰条
#------------------------------------------#
dx = int(self.rand(0, w-nw))
dy = int(self.rand(0, h-nh))
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image = new_image
#------------------------------------------#
# 翻转图像
#------------------------------------------#
flip = self.rand()<.5
if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
image_data = np.array(image, np.uint8)
#---------------------------------#
# 对图像进行色域变换
# 计算色域变换的参数
#---------------------------------#
r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
#---------------------------------#
# 将图像转到HSV上
#---------------------------------#
hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
dtype = image_data.dtype
#---------------------------------#
# 应用变换
#---------------------------------#
x = np.arange(0, 256, dtype=r.dtype)
lut_hue = ((x * r[0]) % 180).astype(dtype)
lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
#---------------------------------#
# 对真实框进行调整
#---------------------------------#
if len(box)>0:
np.random.shuffle(box)
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
if flip: box[:, [0,2]] = w - box[:, [2,0]]
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)]
return image_data, box
复制代码
- 对代码进行详细解读
dataset的三部件
# 初始化 def __init__(self, annotation_lines, input_shape, num_classes, train): super(YoloDataset, self).__init__() # 从txt中读出来的数据列表 self.annotation_lines = annotation_lines # [416,416] self.input_shape = input_shape # voc 20 self.num_classes = num_classes # 训练数据总量 self.length = len(self.annotation_lines) # 是train还是val, train需要backward反向传播,val不用 self.train = train # 返回图片(训练数据)总量 def __len__(self): return self.length # 根据index获取数据 def __getitem__(self, index): index = index % self.length #---------------------------------------------------# # 训练时进行数据的随机增强 # 验证时不进行数据的随机增强 #---------------------------------------------------# # 根据是训练或者是测试阶段,看是否对图像进行数据增强和框的变化 image, box = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2], random = self.train) # 对于图片先转化成nd.array,然后在进行prepreocess处理,因为pytorch需要[channel,w,h]的输入格式,所以直接transpose转化维度 image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1)) # 先将box加载成ndarray, box = np.array(box, dtype=np.float32) if len(box) != 0: # 在这个部分我们将我们的真实宽高x1,x2,y1,y2转化成归一化坐标 box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1] box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0] # 将归一化的x1,x2,y1,y2坐标转化成[中心点坐标x,y,w,h]的形式 box[:, 2:4] = box[:, 2:4] - box[:, 0:2] box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2 # 返回image数据和label数据 return image, box 复制代码
- 调用
get_random_data
方法
我们看下变化前我们的框的数据是集# 读入的信息如下C:\Users\xxx\yolo3-pytorch-master\VOCdevkit/VOC2007/JPEGImages/000072.jpg 40,71,333,473,13 def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True): # 对读入的数据进行分割 line = annotation_line.split() #------------------------------# # 读取图像并转换成RGB图像 #------------------------------# # 我们读取图像并转化成RGB的格式 image = Image.open(line[0]) image = cvtColor(image) #------------------------------# # 获得图像的高宽与目标高宽 #------------------------------# # 获取真实狂傲 iw, ih = image.size # 指定宽高[416,416] h, w = input_shape #------------------------------# # 获得预测框 #------------------------------# # 我们获取box的数据[[40,71,333,473,13],[89,31,323,73,11]...],注意他的复合写法,可学习 box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]]) # 这个random变量就是train or val if not random: # 是val的话,不进行图像增强,直接进行框的变化即可 scale = min(w/iw, h/ih) nw = int(iw*scale) nh = int(ih*scale) # 我们要得到的是填充的宽度和高度 dx = (w-nw)//2 dy = (h-nh)//2 #---------------------------------# # 将图像多余的部分加上灰条 #---------------------------------# # 对图像进行填充,得到不失真的填充图片 image = image.resize((nw,nh), Image.BICUBIC) new_image = Image.new('RGB', (w,h), (128,128,128)) new_image.paste(image, (dx, dy)) # 返回图片的ndarray数据 image_data = np.array(new_image, np.float32) #---------------------------------# # 对真实框进行调整 #---------------------------------# # 真的有真实框的话 if len(box)>0: # 对框的顺序进行打乱 np.random.shuffle(box) # 计算我们的坐标在填充后的真实坐标 box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy box[:, 0:2][box[:, 0:2]<0] = 0 box[:, 2][box[:, 2]>w] = w box[:, 3][box[:, 3]>h] = h box_w = box[:, 2] - box[:, 0] box_h = box[:, 3] - box[:, 1] box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box return image_data, box #------------------------------------------# # 对图像进行缩放并且进行长和宽的扭曲 #------------------------------------------# # 是train的话,进行图像增强(以下都是图像增强的手段[随机缩放、扭曲、翻转、RGB-HSV-RGB等],不进行详细解读) new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter) scale = self.rand(.25, 2) if new_ar < 1: nh = int(scale*h) nw = int(nh*new_ar) else: nw = int(scale*w) nh = int(nw/new_ar) image = image.resize((nw,nh), Image.BICUBIC) #------------------------------------------# # 将图像多余的部分加上灰条 #------------------------------------------# dx = int(self.rand(0, w-nw)) dy = int(self.rand(0, h-nh)) new_image = Image.new('RGB', (w,h), (128,128,128)) new_image.paste(image, (dx, dy)) image = new_image #------------------------------------------# # 翻转图像 #------------------------------------------# flip = self.rand()<.5 if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT) image_data = np.array(image, np.uint8) #---------------------------------# # 对图像进行色域变换 # 计算色域变换的参数 #---------------------------------# r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1 #---------------------------------# # 将图像转到HSV上 #---------------------------------# hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV)) dtype = image_data.dtype #---------------------------------# # 应用变换 #---------------------------------# x = np.arange(0, 256, dtype=r.dtype) lut_hue = ((x * r[0]) % 180).astype(dtype) lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) lut_val = np.clip(x * r[2], 0, 255).astype(dtype) image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))) image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB) #---------------------------------# # 对真实框进行调整 #---------------------------------# if len(box)>0: np.random.shuffle(box) box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy if flip: box[:, [0,2]] = w - box[:, [2,0]] box[:, 0:2][box[:, 0:2]<0] = 0 box[:, 2][box[:, 2]>w] = w box[:, 3][box[:, 3]>h] = h box_w = box[:, 2] - box[:, 0] box_h = box[:, 3] - box[:, 1] box = box[np.logical_and(box_w>1, box_h>1)] return image_data, box 复制代码
经过数据增强变化后的数据是
\