版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/lingyunxianhe/article/details/82913915
上一篇blog记录了把训练样本按指定比例随机分配的学习过程,具体见:
https://blog.csdn.net/lingyunxianhe/article/details/81837978
这样做的前提是你的类别在样本中是随机分布的,或者更科学地说是均匀分布的,而不是某个类别集中于连续的某一段数据中;否则随机抽取样本就有可能使得train val test分配得很不好
因为数据是自己手动标记的,有时为了方便标记,同一个类别的图片可能比较集中。我这里就有同一个类别的样本在连续超过500张图片中占到80%以上的情况。因此,为了在分配train val test时更合理,在此把连续多张图片中某个类别的样本量占很大比重的图片记录在一个txt文件中(用个程序把图片名写入txt文件即可),然后对这个txt记录的小集合按train val比例(我这里test是固定的)单独分配;如果test不固定,那就要按train val test比例分配这个小集合。最后把这些小集合整合到一个大集合中去即可,具体代码如下:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 2018/08/11 by DQ
import os
import random
# Name of the project directory (under /home/KingMe/project) that holds the data.
MidFolder = 'py-faster-rcnn'

# Directory that receives the train/val/trainval image-id list files.
MainFolder = os.path.join(
    '/home/KingMe/project',
    MidFolder,
    'data/FABdevkit2017/FAB2017/ImageSets/Main'
)

# Directory of annotation files; its entry count is used as the image count.
AnotFolder = os.path.join(
    '/home/KingMe/project',
    MidFolder,
    'data/FABdevkit2017/FAB2017/Annotations'
)

# Image ids are zero-padded to this many digits when written to a list file.
fileIdLen = 6

# Number of directory entries in AnotFolder, evaluated once when the module
# is loaded.  Image ids are later assumed to run from 1 to CurImNum.
CurImNum = len(os.listdir(AnotFolder))
######################last start#############################
def CreateImIdTxt(ImIdS, FilePath):
    """Write image ids to ``FilePath``, one zero-padded id per line.

    Any existing file at ``FilePath`` is removed first and a new file is
    created.  Each id is converted with ``str`` and left-padded with zeros
    to ``fileIdLen`` characters.

    Args:
        ImIdS: Iterable of image ids (anything ``str()`` can render).
        FilePath: Destination path of the list file.
    """
    # Drop a stale list file so the new one is created from scratch.
    if os.path.exists(FilePath):
        os.remove(FilePath)

    with open(FilePath, 'w') as out_file:
        # The generator keeps the write lazy: ids are rendered one at a time.
        out_file.writelines(
            str(im_id).zfill(fileIdLen) + '\n' for im_id in ImIdS
        )
def GetPointTxtImIdSet(FilePath):
    """Read the image ids recorded in a text file.

    Each non-blank line is expected to start with an integer image id
    (zero-padded ids such as ``000123`` are accepted); anything after the
    first whitespace-separated token on a line is ignored.  Blank or
    whitespace-only lines are skipped rather than raising ``IndexError``.

    Args:
        FilePath: Path of the id list file.

    Returns:
        A list of ``int`` ids in file order, or an empty list when
        ``FilePath`` does not exist.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    ImIdSet = []
    # A missing file is treated as "no ids recorded", not as an error.
    if not os.path.exists(FilePath):
        return ImIdSet

    with open(FilePath) as FId:
        for TxtStr in FId:
            Tokens = TxtStr.split()
            if not Tokens:
                # Tolerate empty lines (e.g. a trailing newline at EOF).
                continue
            ImIdSet.append(int(Tokens[0]))
    return ImIdSet
def AssignImIdSetAsRatio(ImIdSet, TrainR):
    """Randomly split image ids into a train part and a val part.

    ``int(TrainR * len(ImIdSet))`` ids (clamped to the valid range) are drawn
    at random for training; every remaining id goes to validation.

    Args:
        ImIdSet: Sequence of hashable image ids.  It is not modified.
        TrainR: Fraction of the ids to assign to training, e.g. ``0.7``.

    Returns:
        A tuple ``(TrainImId, ValImId)`` of two lists.  ``TrainImId`` is in
        shuffled order; ``ValImId`` holds the distinct ids not chosen for
        training, in no particular order.
    """
    # Shuffle a private copy so the caller's sequence keeps its order and so
    # non-list inputs (tuple, range) work as well.
    Shuffled = list(ImIdSet)
    random.shuffle(Shuffled)

    ImNum = len(Shuffled)
    # Clamp so that a ratio outside [0, 1] cannot produce a negative or
    # oversized slice bound.
    TrainNum = max(0, min(ImNum, int(TrainR * ImNum)))

    # Take exactly TrainNum ids.  The former ``[:TrainNum-1]`` dropped one
    # train id and, for TrainNum == 0, turned into ``[:-1]`` which put nearly
    # everything into train.
    TrainImId = Shuffled[:TrainNum]
    ValImId = list(set(Shuffled).difference(TrainImId))
    return TrainImId, ValImId
def WriteImIdSet2TrainValTxt(TrainImId, ValImId, TrainValImId):
    """Write the train, val and trainval id lists into ``MainFolder``.

    Each collection is written in ascending order to ``train.txt``,
    ``val.txt`` and ``trainval.txt`` respectively via ``CreateImIdTxt``.
    A progress line is printed before each file is created.

    Args:
        TrainImId: Iterable of image ids for the training split.
        ValImId: Iterable of image ids for the validation split.
        TrainValImId: Iterable of image ids for the combined split.

    Note:
        The inputs are sorted into new lists; the caller's collections are
        left untouched.
    """
    # (split name, list file name, ids) in a fixed, readable output order.
    Splits = (
        ('train', 'train.txt', TrainImId),
        ('val', 'val.txt', ValImId),
        ('trainval', 'trainval.txt', TrainValImId),
    )
    for Key, FileName, ImIdS in Splits:
        # Parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('start create ' + Key + ' ImSet')
        FilePath = os.path.join(MainFolder, FileName)
        CreateImIdTxt(sorted(ImIdS), FilePath)
def FixTestDeassignTrainVal(TrainR=0.7,
                            SubFolder='TestSetOrOtherBackup',
                            TestFileName='test.txt',
                            ClusterFileNames=('7480_8594ManyBlis.txt',
                                              '8594-8879ManyBreak.txt')):
    """Rebuild train/val/trainval lists while keeping the test set fixed.

    The full id range is ``1..CurImNum``.  Ids listed in the fixed test file
    are removed to form the trainval pool.  Each "cluster" file lists images
    from a run of consecutive pictures dominated by one class; every cluster
    is split into train/val on its own with ratio ``TrainR`` so that the
    class is spread over both splits.  The ids left over are split with the
    same ratio, and all parts are merged and written out.

    Cluster ids are restricted to the trainval pool and made mutually
    disjoint (an id claimed by an earlier cluster is not reused), so a test
    id can never leak into train/val and no id is assigned twice.

    Args:
        TrainR: Fraction of each group assigned to training.
        SubFolder: Sub-directory of ``MainFolder`` holding the test file and
            the cluster files.
        TestFileName: Name of the fixed test id list.
        ClusterFileNames: Names of the cluster id lists, processed in order.
    """
    BackupFolder = os.path.join(MainFolder, SubFolder)

    # The test set is fixed: take it out of the full id range first.
    TestImIdSet = set(GetPointTxtImIdSet(os.path.join(BackupFolder, TestFileName)))
    TrainValPool = set(range(1, CurImNum + 1)).difference(TestImIdSet)

    # Ids not yet claimed by any cluster.
    Remaining = set(TrainValPool)
    TrainImId = []
    ValImId = []

    for FileName in ClusterFileNames:
        Recorded = GetPointTxtImIdSet(os.path.join(BackupFolder, FileName))
        # Keep only ids that are still available: this drops test ids, ids
        # outside 1..CurImNum and ids already taken by a previous cluster.
        ClusterIds = Remaining.intersection(Recorded)
        Remaining.difference_update(ClusterIds)
        # Split each cluster separately by the train ratio.
        SubTrain, SubVal = AssignImIdSetAsRatio(sorted(ClusterIds), TrainR)
        TrainImId.extend(SubTrain)
        ValImId.extend(SubVal)

    # Split everything that is not part of a cluster by the same ratio.
    SubTrain, SubVal = AssignImIdSetAsRatio(sorted(Remaining), TrainR)
    TrainImId.extend(SubTrain)
    ValImId.extend(SubVal)

    # trainval is the whole pool, i.e. every id that is not a test id.
    TrainValImId = sorted(TrainValPool)
    WriteImIdSet2TrainValTxt(TrainImId, ValImId, TrainValImId)
######################last end#############################
# Script entry point: regenerate the train/val/trainval list files.
# This call sits at module top level, so it also runs when the file is imported.
FixTestDeassignTrainVal()