请关注我的个人公众号
用户数据转换示例
第一种方法
第一步:按照下面的形式,组织自己的数据
├── annotations
│ ├── road0.xml
│ ├── road1.xml
│ ├── road10.xml
│ | ...
├── images
│ ├── road0.png
│ ├── road1.png
│ ├── road2.png
│ | ...
第二步,将数据划分为训练集、验证集和测试集
# 生成 label_list.txt 文件
>>echo -e "speedlimit\ncrosswalk\ntrafficlight\nstop" > label_list.txt
# 生成 train.txt、valid.txt和test.txt列表文件
>>ls images/*.png | shuf > all_image_list.txt
>>awk -F"/" '{print $2}' all_image_list.txt | awk -F".png" '{print $1}' | awk -F"\t" '{print "images/"$1".png annotations/"$1".xml"}' > all_list.txt
# 训练集、验证集、测试集比例分别约80%、10%、10%。
>>head -n 88 all_list.txt > test.txt
>>head -n 176 all_list.txt | tail -n 88 > valid.txt
>>tail -n 701 all_list.txt > train.txt
# 删除不用文件
>>rm -rf all_image_list.txt all_list.txt
最终数据集文件组织结构为:
├── annotations
│ ├── road0.xml
│ ├── road1.xml
│ ├── road10.xml
│ | ...
├── images
│ ├── road0.png
│ ├── road1.png
│ ├── road2.png
│ | ...
├── label_list.txt
├── test.txt
├── train.txt
└── valid.txt
# label_list.txt 是类别名称列表,文件名必须是 label_list.txt
>>cat label_list.txt
crosswalk
speedlimit
stop
trafficlight
# train.txt 是训练数据集文件列表,每一行是一张图像路径和对应标注文件路径,以空格分开。注意这里的路径是数据集文件夹内的相对路径。
>>cat train.txt
./images/road839.png ./annotations/road839.xml
./images/road363.png ./annotations/road363.xml
...
# valid.txt 是验证数据集文件列表,每一行是一张图像路径和对应标注文件路径,以空格分开。注意这里的路径是数据集文件夹内的相对路径。
>>cat valid.txt
./images/road218.png ./annotations/road218.xml
./images/road681.png ./annotations/road681.xml
第二种方法
VOC数据集格式
文件夹目录如下:
---VOC
------creat_txt.py
------txt_write.py
------Annotations
---------n个xml文件
------ImageSets
---------Main
------------trainval.txt
------------train.txt
------------test.txt
------------val.txt
------JPEGImages
---------n个img文件
生成VOC数据集的txt文件
分别将这两个python文件放在voc文件夹下。
生成Main文件夹下的txt文件
运行creat_txt.py将会生成Main文件夹下的trainval.txt、train.txt、val.txt、test.txt四个txt文件。
creat_txt.py
import os
import random
# Adjust the settings below to match your own directory layout.
trainval_percent = 0.95  # fraction of all samples assigned to train + val
train_percent = 0.9  # fraction of the trainval samples assigned to train
# Paths are built with os.path.join so that the script is not tied to the
# Windows path separator, and all four list files share one output directory.
xmlfilepath = os.path.join('VOC2007', 'Annotations')
txtsavepath = os.path.join('VOC2007', 'ImageSets', 'Main')


def split_names(xml_names, trainval_percent, train_percent):
    """Randomly split annotation file names into trainval/train/val/test.

    Args:
        xml_names: sequence of annotation file names (e.g. ``road0.xml``).
        trainval_percent: fraction of all names that go to trainval; the
            remainder goes to test.
        train_percent: fraction of the trainval names that go to train; the
            remainder of trainval goes to val.

    Returns:
        A tuple ``(trainval, train, val, test)`` of lists holding the names
        with their extension removed, each in the order of ``xml_names``.
    """
    num = len(xml_names)
    tv = int(num * trainval_percent)
    tr = int(tv * train_percent)
    # Sample from the list first, then convert to sets: membership tests in
    # the loop below are then O(1) instead of O(n).
    trainval_picked = random.sample(range(num), tv)
    train_idx = set(random.sample(trainval_picked, tr))
    trainval_idx = set(trainval_picked)

    trainval, train, val, test = [], [], [], []
    for i, xml_name in enumerate(xml_names):
        name = os.path.splitext(xml_name)[0]
        if i in trainval_idx:
            trainval.append(name)
            if i in train_idx:
                train.append(name)
            else:
                val.append(name)
        else:
            test.append(name)
    return trainval, train, val, test


def create_split_files(xml_dir, save_dir, trainval_percent, train_percent):
    """Write trainval.txt, train.txt, val.txt and test.txt into ``save_dir``.

    Only entries of ``xml_dir`` whose name ends in ``.xml`` are considered,
    so stray files in the annotation directory do not end up in the lists.
    ``save_dir`` is created when it does not exist yet.

    Returns:
        A list of ``(subset_name, names)`` pairs in the order they were
        written.
    """
    xml_names = sorted(
        entry for entry in os.listdir(xml_dir)
        if entry.lower().endswith('.xml')
    )
    subsets = list(zip(
        ('trainval', 'train', 'val', 'test'),
        split_names(xml_names, trainval_percent, train_percent),
    ))
    os.makedirs(save_dir, exist_ok=True)
    for subset, names in subsets:
        # "with" guarantees the file is closed even if a write fails.
        with open(os.path.join(save_dir, subset + '.txt'), 'w') as list_file:
            list_file.writelines(name + '\n' for name in names)
    return subsets


if __name__ == '__main__':
    create_split_files(xmlfilepath, txtsavepath,
                       trainval_percent, train_percent)
生成包含jpg和xml信息的txt文件
运行txt_write.py根据在Main文件夹中划分好的数据集进行位置索引,生成含有图像及对应的XML文件的地址信息的文件。
txt_write.py(不需要修改路径)
import os
import re
import random
# devkit_dir: root that holds ImageSets/Main, Annotations and JPEGImages.
# output_dir: directory that receives the generated image/annotation lists.
# Both default to the current working directory.
devkit_dir, output_dir = './', './'
def get_dir(devkit_dir, type):
    """Return the path of the ``type`` sub-directory inside ``devkit_dir``."""
    sub_dir = os.path.join(devkit_dir, type)
    return sub_dir
def walk_dir(devkit_dir):
    """Collect (image path, annotation path) pairs for every split file.

    Walks ``<devkit_dir>/ImageSets/Main`` and reads the files named exactly
    ``trainval.txt``, ``train.txt``, ``val.txt`` and ``test.txt``. The first
    whitespace-separated token of each non-blank line is taken as a sample
    name and mapped to ``<devkit_dir>/JPEGImages/<name>.jpg`` and
    ``<devkit_dir>/Annotations/<name>.xml``.

    Args:
        devkit_dir: dataset root directory.

    Returns:
        A tuple ``(trainval_list, train_list, val_list, test_list)``; each
        element is a list of ``(img_path, ann_path)`` tuples.

    Raises:
        AssertionError: if an image or annotation file named by a split
            file does not exist.
    """
    filelist_dir = get_dir(devkit_dir, 'ImageSets/Main')
    annotation_dir = get_dir(devkit_dir, 'Annotations')
    img_dir = get_dir(devkit_dir, 'JPEGImages')

    # Exact file-name lookup: a regex prefix match would also accept names
    # such as "train.txt.bak" and add their samples a second time.
    lists_by_file = {
        'trainval.txt': [],
        'train.txt': [],
        'val.txt': [],
        'test.txt': [],
    }

    for root, _, files in os.walk(filelist_dir):
        for fname in files:
            img_ann_list = lists_by_file.get(fname)
            if img_ann_list is None:
                continue
            print(fname)
            # Join with the directory os.walk is currently visiting so that
            # split files found in sub-directories resolve correctly.
            with open(os.path.join(root, fname)) as split_file:
                for line in split_file:
                    fields = line.split()
                    if not fields:
                        continue  # tolerate blank lines
                    name_prefix = fields[0]
                    # Joined with '/' rather than os.path.join; presumably so
                    # the generated lists use forward slashes on every OS.
                    ann_path = annotation_dir + '/' + name_prefix + '.xml'
                    img_path = img_dir + '/' + name_prefix + '.jpg'
                    # Raised explicitly (not via "assert") so the check
                    # still runs under "python -O".
                    for path in (ann_path, img_path):
                        if not os.path.isfile(path):
                            raise AssertionError(
                                'file %s not found.' % path)
                    img_ann_list.append((img_path, ann_path))

    return (
        lists_by_file['trainval.txt'],
        lists_by_file['train.txt'],
        lists_by_file['val.txt'],
        lists_by_file['test.txt'],
    )
def prepare_filelist(devkit_dir, output_dir):
    """Write the image/annotation list files for every dataset split.

    Calls ``walk_dir(devkit_dir)`` and writes its four results to
    ``trainval.txt``, ``train.txt``, ``val.txt`` and ``test.txt`` inside
    ``output_dir``. Each output line has the form
    ``<image path> <annotation path>``.

    Args:
        devkit_dir: dataset root directory passed on to ``walk_dir``.
        output_dir: directory in which the four list files are created.
    """
    trainval, train, val, test = walk_dir(devkit_dir)
    outputs = (
        ('trainval.txt', list(trainval)),
        ('train.txt', list(train)),
        ('val.txt', list(val)),
        ('test.txt', list(test)),
    )
    # Files are written one after another in the order listed above.
    for filename, pairs in outputs:
        with open(os.path.join(output_dir, filename), 'w') as out_file:
            for pair in pairs:
                out_file.write(pair[0] + ' ' + pair[1] + '\n')
if __name__ == '__main__':
    # Script entry point: build the list files using the module-level paths.
    prepare_filelist(devkit_dir=devkit_dir, output_dir=output_dir)
创建label_list
修改预训练模型的Configs配置
在paddleDetection文件夹中操作。
修改num_classes
修改所用模型ssd_mobilenet_v1_voc.yml配置文件,在PaddleDetection-release-0.2\configs\ssd路径下找到配置文件,修改为自己数据集的num_classes:
num_classes = (label_class) + 1(background)
修改pascalvoc_label
修改voc.py运行文件:在PaddleDetection-release-0.2\ppdet\data\source路径下找到该文件,
将其中的pascalvoc_label修改为自己模型的label:
参考链接
- https://blog.csdn.net/qq_45779334/article/details/106026210
- https://blog.csdn.net/qq_45779334/article/details/106052958