FCN训练自己数据集（person-segmentation）、SIFT-FLOW、SBD和VOC实验总结

最近花了将近一周的时间，基于提供的源码，通过参考网上的博客，跑通了FCN在三个数据集上的训练以及测试。在这里写下总结，即是记录，又希望能够对其他刚刚接触FCN的人有所帮助。

FCN的源码地址：https://github.com/shelhamer/fcn.berkeleyvision.org

训练自己的数据集：

1.首先准备自己的数据集：

我的数据集是对人体轮廓进行分割。

最开始的标签是这样的：

但是这个标签是有问题的，最后会说明。

2.对源码进行修改

将工程所要用到的所有py文件放在一个目录下，否则会报错。

首先是solve.py文件

#!usr/bin/python
# -*- coding: utf-8 -*-

import caffe
import surgery, score

import numpy as np
import os
import sys
sys.path.append('/home/panyun/sourcecode/caffe/python')	#引入sys库并且增加需要的caffe的python路径

try:
    import setproctitle
    setproctitle.setproctitle(os.path.basename(os.getcwd()))
except:
    pass

# init
# caffe.set_device(int(sys.argv[0]))	# 这里设置使用GPU编号 先注释掉
caffe.set_device(int(0))
caffe.set_mode_gpu()

solver = caffe.SGDSolver('/home/zhaoys/myf/fcn/people-fcn8s/solver.prototxt')	# 设置solver.prototxt
weights = '/home/zhaoys/myf/fcn/people-fcn8s/caffemodel/fcn8s-heavy-pascal.caffemodel'	# 设置下载好的用来finetune的模型
deploy_proto = '/home/zhaoys/myf/fcn/people-fcn8s/deploy.prototxt'
# solver.net.copy_from(weights)
fcn8snet = caffe.Net(deploy_proto, weights, caffe.TRAIN)
surgery.transplant(solver.net, fcn8snet)
del fcn8snet

# surgeries
interp_layers = [k for k in solver.net.params.keys() if 'up' in k]	#修改层名称的时候不要把反卷积层的name中的"up"去掉
surgery.interp(solver.net, interp_layers)

# scoring
val = np.loadtxt('/home/zhaoys/myf/dataset/people_segmentation/test.txt', dtype=str)	# 传入已经写好的训练txt文件

for _ in range(25):	# 设置epoch数和每个epoch中的step数
    solver.step(4000)
    score.seg_tests(solver, False, val, layer='score')

首先，在代码里面对路径进行了修改。其次，在fcn8s的模型上面进行了fine-tune，由于网络模型和我自己数据集输入的图片大小不同，这里采用了移植的方式。最后，设置epoch数和每个epoch中进行迭代的次数。

train_net: "/home/zhaoys/myf/fcn/people-fcn8s/train.prototxt"
test_net: "/home/zhaoys/myf/fcn/people-fcn8s/val.prototxt"

test_iter: 736
# make test net, but don't invoke it from the solver itself
test_interval: 999999999
display: 40
average_loss: 20
lr_policy: "fixed"
# lr for unnormalized softmax
base_lr: 1e-12
# high momentum
momentum: 0.99
# no gradient accumulation
iter_size: 1
max_iter: 100000
weight_decay: 0.0005
snapshot: 4000
snapshot_prefix: "/home/zhaoys/myf/fcn/people-fcn8s/snapshot/train"
test_initialization: false
debug_info: false

solve.prototxt文件设置。

layer {
  name: "data"
  type: "Python"
  top: "data"
  top: "label"
  python_param {
    module: "voc_layers"
    layer: "SBDDSegDataLayer"
    param_str: "{\'sbdd_dir\': \'/home/zhaoys/myf/dataset/people_segmentation\', \'seed\': 1337, \'split\': \'train\', \'mean\': (184.17504, 193.82224, 204.57951)}"
  }
}
layer {
  name: "conv1_1"
  type: "Convolution"
  bottom: "data"
  top: "conv1_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 100
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu1_1"
  type: "ReLU"
  bottom: "conv1_1"
  top: "conv1_1"
}
layer {
  name: "conv1_2"
  type: "Convolution"
  bottom: "conv1_1"
  top: "conv1_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu1_2"
  type: "ReLU"
  bottom: "conv1_2"
  top: "conv1_2"
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1_2"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv2_1"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu2_1"
  type: "ReLU"
  bottom: "conv2_1"
  top: "conv2_1"
}
layer {
  name: "conv2_2"
  type: "Convolution"
  bottom: "conv2_1"
  top: "conv2_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu2_2"
  type: "ReLU"
  bottom: "conv2_2"
  top: "conv2_2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2_2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv3_1"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_1"
  type: "ReLU"
  bottom: "conv3_1"
  top: "conv3_1"
}
layer {
  name: "conv3_2"
  type: "Convolution"
  bottom: "conv3_1"
  top: "conv3_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_2"
  type: "ReLU"
  bottom: "conv3_2"
  top: "conv3_2"
}
layer {
  name: "conv3_3"
  type: "Convolution"
  bottom: "conv3_2"
  top: "conv3_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_3"
  type: "ReLU"
  bottom: "conv3_3"
  top: "conv3_3"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "conv3_3"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv4_1"
  type: "Convolution"
  bottom: "pool3"
  top: "conv4_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_1"
  type: "ReLU"
  bottom: "conv4_1"
  top: "conv4_1"
}
layer {
  name: "conv4_2"
  type: "Convolution"
  bottom: "conv4_1"
  top: "conv4_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_2"
  type: "ReLU"
  bottom: "conv4_2"
  top: "conv4_2"
}
layer {
  name: "conv4_3"
  type: "Convolution"
  bottom: "conv4_2"
  top: "conv4_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_3"
  type: "ReLU"
  bottom: "conv4_3"
  top: "conv4_3"
}
layer {
  name: "pool4"
  type: "Pooling"
  bottom: "conv4_3"
  top: "pool4"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv5_1"
  type: "Convolution"
  bottom: "pool4"
  top: "conv5_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_1"
  type: "ReLU"
  bottom: "conv5_1"
  top: "conv5_1"
}
layer {
  name: "conv5_2"
  type: "Convolution"
  bottom: "conv5_1"
  top: "conv5_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_2"
  type: "ReLU"
  bottom: "conv5_2"
  top: "conv5_2"
}
layer {
  name: "conv5_3"
  type: "Convolution"
  bottom: "conv5_2"
  top: "conv5_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_3"
  type: "ReLU"
  bottom: "conv5_3"
  top: "conv5_3"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5_3"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "fc6"
  type: "Convolution"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 4096
    pad: 0
    kernel_size: 7
    stride: 1
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc7"
  type: "Convolution"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 4096
    pad: 0
    kernel_size: 1
    stride: 1
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "score_fr_n"
  type: "Convolution"
  bottom: "fc7"
  top: "score_fr"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 2
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "upscore2_n"
  type: "Deconvolution"
  bottom: "score_fr"
  top: "upscore2"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 2
    bias_term: false
    kernel_size: 4
    stride: 2
  }
}
layer {
  name: "score_pool4_n"
  type: "Convolution"
  bottom: "pool4"
  top: "score_pool4"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 2
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "score_pool4c"
  type: "Crop"
  bottom: "score_pool4"
  bottom: "upscore2"
  top: "score_pool4c"
  crop_param {
    axis: 2
    offset: 5
  }
}
layer {
  name: "fuse_pool4"
  type: "Eltwise"
  bottom: "upscore2"
  bottom: "score_pool4c"
  top: "fuse_pool4"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "upscore_pool4_n"
  type: "Deconvolution"
  bottom: "fuse_pool4"
  top: "upscore_pool4"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 2
    bias_term: false
    kernel_size: 4
    stride: 2
  }
}
layer {
  name: "score_pool3_n"
  type: "Convolution"
  bottom: "pool3"
  top: "score_pool3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 2
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "score_pool3c"
  type: "Crop"
  bottom: "score_pool3"
  bottom: "upscore_pool4"
  top: "score_pool3c"
  crop_param {
    axis: 2
    offset: 9
  }
}
layer {
  name: "fuse_pool3"
  type: "Eltwise"
  bottom: "upscore_pool4"
  bottom: "score_pool3c"
  top: "fuse_pool3"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "upscore8_n"
  type: "Deconvolution"
  bottom: "fuse_pool3"
  top: "upscore8"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 2
    bias_term: false
    kernel_size: 16
    stride: 8
  }
}
layer {
  name: "score"
  type: "Crop"
  bottom: "upscore8"
  bottom: "data"
  top: "score"
  crop_param {
    axis: 2
    offset: 31
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "score"
  bottom: "label"
  top: "loss"
  loss_param {
    normalize: false
  }
}

train.prototxt文件设置。

里面进行的改动主要有：1，输入数据层的改动。2.out_num由原来的21改为2，由于笔者进行的是二分类。3.将2中对应层的name属性最后加"_n"，不然会和solve.py中移植过来的参数冲突。4.将loss层中的parameter中的忽略label值删掉。因为原来使用的voc数据集中的mask中有白边。并且笔者最后的标签中的像素值也并没有255。只有0，1。

要说明的一点是：nomalize是进行正则化，我设置为了默认的false，这样的话训练出来的loss值非常的大，4，5位数是常有的事，不过笔者认为这样是正常的。因为测试出来的指标如mIOU比较不错，还是以指标为准。后来我将其改为true，发现loss为很小的值，但是指标却不太好。

test.prototxt中设置和这个类似。

import caffe

import numpy as np
from PIL import Image

import random
import cv2

class VOCSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from PASCAL VOC
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - voc_dir: path to PASCAL VOC year dir
        - split: train / val / test
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for PASCAL VOC semantic segmentation.

        example

        params = dict(voc_dir="/path/to/PASCAL/VOC2011",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="val")
        """
        # config
        params = eval(self.param_str)
        self.voc_dir = params['voc_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels
        split_f  = '{}/{}.txt'.format(self.voc_dir,
                self.split)
        self.indices = open(split_f, 'r').read().splitlines()
        self.idx = 0

        # make eval deterministic
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices)-1)


    def reshape(self, bottom, top):
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)


    def forward(self, bottom, top):
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0


    def backward(self, top, propagate_down, bottom):
        pass


    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:
        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/images/{}.jpg'.format(self.voc_dir, idx))
	im = im.resize((500, 500))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:,:,::-1]
        in_ -= self.mean
        in_ = in_.transpose((2,0,1))
        return in_


    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.
        The leading singleton dimension is required by the loss.
        """
        im = Image.open('{}/masks/{}-profile.png'.format(self.voc_dir, idx))
	im = im.resize((500, 500))
        label = np.array(im, dtype=np.uint8)
        label = label[np.newaxis, ...]
        return label


class SBDDSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from the SBDD extended labeling
    of PASCAL VOC for semantic segmentation
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - sbdd_dir: path to SBDD `dataset` dir
        - split: train / seg11valid
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for SBDD semantic segmentation.

        N.B.segv11alid is the set of segval11 that does not intersect with SBDD.
        Find it here: https://gist.github.com/shelhamer/edb330760338892d511e.

        example

        params = dict(sbdd_dir="/path/to/SBDD/dataset",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="valid")
        """
        # config
        params = eval(self.param_str)
        self.sbdd_dir = params['sbdd_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels
        split_f  = '{}/{}.txt'.format(self.sbdd_dir,
                self.split)
        self.indices = open(split_f, 'r').read().splitlines()
        self.idx = 0

        # make eval deterministic
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices)-1)


    def reshape(self, bottom, top):
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)


    def forward(self, bottom, top):
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0


    def backward(self, top, propagate_down, bottom):
        pass


    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:
        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/images/{}.jpg'.format(self.sbdd_dir, idx))
	im = im.resize((500, 500))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:,:,::-1]
        in_ -= self.mean
        in_ = in_.transpose((2,0,1))
        return in_


    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.
        The leading singleton dimension is required by the loss.
        """
        im = Image.open('{}/masks/{}-profile.png'.format(self.sbdd_dir, idx))
	im = im.resize((500, 500))
	label = np.array(im, dtype=np.uint8)
	label = label[np.newaxis, ...]
	return label

这里对voc_layers.py数据层进行了修改。

需要说明的是：最好使用源码提供的python类型的数据层，不要将数据集改为LMDB格式进行训练，因为这样会漏掉python函数中一些比较必要的功能函数，导致最后训练不出来。拿到自己的数据集，将其套用进fcn8s例子中的训练、测试层即可。

这里笔者的训练集是jpg格式的，训练集是png格式的。

对输入的图像进行了resize，是因为，输入的图像的大小超出了caffe最终blob的限制。

至此已经基本完成了文件的配置。

cd到文件目录下进行训练，输入python solve.py进行训练。

出现了下面的报错。

产生这个问题的原因是标签的值和我们输出的类别不匹配，即不为2。

我们的数据集明明是黑白的，为什么不为2呢？

于是就对数据集进行了考察。

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
im = Image.open('/home/zhaoys/myf/dataset/people_segmentation/masks/00001-profile.png')
img = np.array(im)
print img
print img.size
print img.dtype
print img.shape


print img[540, 360]
print np.unique(im)
print im

打印图片中所有不同像素值的种类输出的结果为：

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39 217 218 219 220 221 222 224 225 226 227 228 229 230 231
 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
 250 251 252 253 254 255]

为什么有这么多种的像素值？不管了先进行二值化处理，变为2类吧。执行下面的脚本文件。

import os
import cv2

path  = "/home/zhaoys/myf/dataset/people_segmentation/bad_masks"
os.chdir(path) 
imgList = os.listdir(path)

for img in imgList:
    img_name = img
    print str(img_name)

    img = cv2.imread(img)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
    ret, img = cv2.threshold(img, 127, 1, cv2.THRESH_BINARY)    

    cv2.imwrite("/home/zhaoys/myf/dataset/people_segmentation/masks/" + img_name[0:-3] + "png", img)	
print ("done!")

这样就可以了。所有正常的标签图片都保存在了masks里面。

这里需要说明的是，千万不能保存为比较坑的.jpg文件格式，因为这种格式的图片文件是有损压缩的，就算进行二值化，还是会有其他的像素值。这里保存为了.png格式。

再次执行上上面的脚本文件，输出结果为[0,1]，此时图片中只有两种像素类型，即0和1，这样就满足了二分类标签的要求。

另外读取图片的时候要通过灰度图读取进来，这样最后生成的图片是8位的，不然就是24位的，这样就跟网络中的参数不匹配了。

最后笔者产生的数据集中的标签图片是这样的。

没错，看起来全部是黑的。因为灰度是0和1。

图片的位宽是8位的。

执行下面的脚本文件转化为索引图。

from random import randint
from PIL import Image
import os
import cv2

path  = "/home/zhaoys/myf/dataset/people_segmentation/masks"
os.chdir(path) 
imgList = os.listdir(path) 

for pic in imgList:
	img = Image.open(pic)
    img.putpalette([0,0,0,255,0,0])
    img.save(pic)
	
print ("done!")

得到的图片是这样的。

3.进行最终的训练

训练是基于上面全黑的masks训练的，不过通过索引图训练应该也是可以的。

测试的指标如下：

到这里已经训练成功了。

测试的infer文件。

import numpy as np
from PIL import Image

import caffe
import vis
import cv2

# the demo image is "2007_000129" from PASCAL VOC

# load image, switch to BGR, subtract mean, and make dims C x H x W for Caffe
im = Image.open('/home/zhaoys/myf/fcn/people-fcn8s/demo/image.jpg')
in_ = np.array(im, dtype=np.float32)
in_ = in_[:,:,::-1]
in_ -= np.array((184.17504, 193.82224, 204.57951))
in_ = in_.transpose((2,0,1))

# load net
net = caffe.Net('/home/zhaoys/myf/fcn/people-fcn8s/infer.prototxt', '/home/zhaoys/myf/fcn/people-fcn8s/snapshot/train_iter_4000.caffemodel', caffe.TEST)
# shape for input (data blob is N x C x H x W), set data
net.blobs['data'].reshape(1, *in_.shape)
net.blobs['data'].data[...] = in_
# run net and take argmax for prediction
net.forward()

'''
out = net.blobs['score'].data[0].argmax(axis=0)
image_0 = net.blobs['data'].data[0,0]
image_1 = net.blobs['conv3_1'].data[0,0]
image_2 = net.blobs['conv5_3'].data[0,1]
image_3 = net.blobs['fuse_pool4'].data[0,1]


cv2.imshow('1', out)
cv2.imshow('2', image_0)
cv2.imshow('3', image_1)
cv2.imshow('4', image_2)
cv2.imshow('5', image_3)
cv2.waitKey(0)

img = np.array(out)
print img
print img.size
print img.dtype
print img.shape
print img[540, 360]
print np.unique(img)
'''

# visualize segmentation in PASCAL VOC colors
voc_palette = vis.make_palette(2)
out_im = Image.fromarray(vis.color_seg(out, voc_palette))
out_im.save('/home/zhaoys/myf/fcn/people-fcn8s/demo/output.png')
masked_im = Image.fromarray(vis.vis_seg(im, out, voc_palette))
masked_im.save('/home/zhaoys/myf/fcn/people-fcn8s/demo/visualization.jpg')

测试的infer.prototxt网络结构文件。

layer {
  name: "input"
  type: "Input"
  top: "data"
  input_param {
    shape { dim: 1 dim: 3 dim: 500 dim: 500 }
  }
}
layer {
  name: "conv1_1"
  type: "Convolution"
  bottom: "data"
  top: "conv1_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 100
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu1_1"
  type: "ReLU"
  bottom: "conv1_1"
  top: "conv1_1"
}
layer {
  name: "conv1_2"
  type: "Convolution"
  bottom: "conv1_1"
  top: "conv1_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu1_2"
  type: "ReLU"
  bottom: "conv1_2"
  top: "conv1_2"
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1_2"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv2_1"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu2_1"
  type: "ReLU"
  bottom: "conv2_1"
  top: "conv2_1"
}
layer {
  name: "conv2_2"
  type: "Convolution"
  bottom: "conv2_1"
  top: "conv2_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu2_2"
  type: "ReLU"
  bottom: "conv2_2"
  top: "conv2_2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2_2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv3_1"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_1"
  type: "ReLU"
  bottom: "conv3_1"
  top: "conv3_1"
}
layer {
  name: "conv3_2"
  type: "Convolution"
  bottom: "conv3_1"
  top: "conv3_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_2"
  type: "ReLU"
  bottom: "conv3_2"
  top: "conv3_2"
}
layer {
  name: "conv3_3"
  type: "Convolution"
  bottom: "conv3_2"
  top: "conv3_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu3_3"
  type: "ReLU"
  bottom: "conv3_3"
  top: "conv3_3"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "conv3_3"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv4_1"
  type: "Convolution"
  bottom: "pool3"
  top: "conv4_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_1"
  type: "ReLU"
  bottom: "conv4_1"
  top: "conv4_1"
}
layer {
  name: "conv4_2"
  type: "Convolution"
  bottom: "conv4_1"
  top: "conv4_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_2"
  type: "ReLU"
  bottom: "conv4_2"
  top: "conv4_2"
}
layer {
  name: "conv4_3"
  type: "Convolution"
  bottom: "conv4_2"
  top: "conv4_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu4_3"
  type: "ReLU"
  bottom: "conv4_3"
  top: "conv4_3"
}
layer {
  name: "pool4"
  type: "Pooling"
  bottom: "conv4_3"
  top: "pool4"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv5_1"
  type: "Convolution"
  bottom: "pool4"
  top: "conv5_1"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_1"
  type: "ReLU"
  bottom: "conv5_1"
  top: "conv5_1"
}
layer {
  name: "conv5_2"
  type: "Convolution"
  bottom: "conv5_1"
  top: "conv5_2"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_2"
  type: "ReLU"
  bottom: "conv5_2"
  top: "conv5_2"
}
layer {
  name: "conv5_3"
  type: "Convolution"
  bottom: "conv5_2"
  top: "conv5_3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 512
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "relu5_3"
  type: "ReLU"
  bottom: "conv5_3"
  top: "conv5_3"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5_3"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "fc6"
  type: "Convolution"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 4096
    pad: 0
    kernel_size: 7
    stride: 1
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc7"
  type: "Convolution"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 4096
    pad: 0
    kernel_size: 1
    stride: 1
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "score_fr_n"
  type: "Convolution"
  bottom: "fc7"
  top: "score_fr"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 2
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "upscore2_n"
  type: "Deconvolution"
  bottom: "score_fr"
  top: "upscore2"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 2
    bias_term: false
    kernel_size: 4
    stride: 2
  }
}
layer {
  name: "score_pool4_n"
  type: "Convolution"
  bottom: "pool4"
  top: "score_pool4"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 2
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "score_pool4c"
  type: "Crop"
  bottom: "score_pool4"
  bottom: "upscore2"
  top: "score_pool4c"
  crop_param {
    axis: 2
    offset: 5
  }
}
layer {
  name: "fuse_pool4"
  type: "Eltwise"
  bottom: "upscore2"
  bottom: "score_pool4c"
  top: "fuse_pool4"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "upscore_pool4_n"
  type: "Deconvolution"
  bottom: "fuse_pool4"
  top: "upscore_pool4"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 2
    bias_term: false
    kernel_size: 4
    stride: 2
  }
}
layer {
  name: "score_pool3_n"
  type: "Convolution"
  bottom: "pool3"
  top: "score_pool3"
  param {
    lr_mult: 1.0
    decay_mult: 1.0
  }
  param {
    lr_mult: 2.0
    decay_mult: 0.0
  }
  convolution_param {
    num_output: 2
    pad: 0
    kernel_size: 1
  }
}
layer {
  name: "score_pool3c"
  type: "Crop"
  bottom: "score_pool3"
  bottom: "upscore_pool4"
  top: "score_pool3c"
  crop_param {
    axis: 2
    offset: 9
  }
}
layer {
  name: "fuse_pool3"
  type: "Eltwise"
  bottom: "upscore_pool4"
  bottom: "score_pool3c"
  top: "fuse_pool3"
  eltwise_param {
    operation: SUM
  }
}
layer {
  name: "upscore8_n"
  type: "Deconvolution"
  bottom: "fuse_pool3"
  top: "upscore8"
  param {
    lr_mult: 0.0
  }
  convolution_param {
    num_output: 2
    bias_term: false
    kernel_size: 16
    stride: 8
  }
}
layer {
  name: "score"
  type: "Crop"
  bottom: "upscore8"
  bottom: "data"
  top: "score"
  crop_param {
    axis: 2
    offset: 31
  }
}

输出的图片。