FCN源码解读之voc_layers.py

71 阅读 0 评论 47 点赞

我是靠谱客的博主舒适白云，最近开发中收集的这篇文章主要介绍FCN源码解读之voc_layers.py，觉得挺不错的，现在分享给大家，希望可以做个参考。

概述

voc_layers.py是FCN中利用python写的数据层（即使用caffe的Python API 写的数据输入层），其格式是相对固定的，包含setup()、reshape()、forward()、backward()四个必要函数。

其源码如下：

import caffe
import numpy as np
from PIL import Image
import random
class VOCSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from PASCAL VOC
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - voc_dir: path to PASCAL VOC year dir
        - split: train / val / test
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for PASCAL VOC semantic segmentation.

        example

        params = dict(voc_dir="/path/to/PASCAL/VOC2011",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="val")

        Raises Exception unless the layer has exactly two tops and no bottom.
        """
        # config
        # NOTE(review): eval() executes whatever expression param_str holds,
        # so only load net definitions that come from a trusted source.
        params = eval(self.param_str)
        self.voc_dir = params['voc_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels
        split_f = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,
                                                            self.split)
        # Read inside a with-block so the file handle is closed right away.
        with open(split_f, 'r') as f:
            self.indices = f.read().splitlines()
        self.idx = 0

        # make eval deterministic
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices)-1)

    def reshape(self, bottom, top):
        """Load the pair selected by self.idx and resize both tops to fit it."""
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)

    def forward(self, bottom, top):
        """Copy the loaded pair into the tops, then choose the next index."""
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            # sequential order, wrapping around at the end of the list
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0

    def backward(self, top, propagate_down, bottom):
        """Nothing to back-propagate: the layer has no bottoms."""
        pass

    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:

        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:,:,::-1]
        in_ -= self.mean
        in_ = in_.transpose((2,0,1))
        return in_

    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.

        The leading singleton dimension is required by the loss.
        """
        im = Image.open('{}/SegmentationClass/{}.png'.format(self.voc_dir, idx))
        label = np.array(im, dtype=np.uint8)
        label = label[np.newaxis, ...]
        return label
class SBDDSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from the SBDD extended labeling
    of PASCAL VOC for semantic segmentation
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - sbdd_dir: path to SBDD `dataset` dir
        - split: train / seg11valid
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for SBDD semantic segmentation.

        N.B. seg11valid is the set of segval11 that does not intersect with SBDD.
        Find it here: https://gist.github.com/shelhamer/edb330760338892d511e.

        example

        params = dict(sbdd_dir="/path/to/SBDD/dataset",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="valid")

        Raises Exception unless the layer has exactly two tops and no bottom.
        """
        # config
        # NOTE(review): eval() executes whatever expression param_str holds,
        # so only load net definitions that come from a trusted source.
        params = eval(self.param_str)
        self.sbdd_dir = params['sbdd_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels
        split_f = '{}/{}.txt'.format(self.sbdd_dir,
                                     self.split)
        # Read inside a with-block so the file handle is closed right away.
        with open(split_f, 'r') as f:
            self.indices = f.read().splitlines()
        self.idx = 0

        # make eval deterministic
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices)-1)

    def reshape(self, bottom, top):
        """Load the pair selected by self.idx and resize both tops to fit it."""
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)

    def forward(self, bottom, top):
        """Copy the loaded pair into the tops, then choose the next index."""
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            # sequential order, wrapping around at the end of the list
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0

    def backward(self, top, propagate_down, bottom):
        """Nothing to back-propagate: the layer has no bottoms."""
        pass

    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:

        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:,:,::-1]
        in_ -= self.mean
        in_ = in_.transpose((2,0,1))
        return in_

    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.

        The leading singleton dimension is required by the loss.
        """
        import scipy.io
        mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))
        label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)
        label = label[np.newaxis, ...]
        return label

详细代码解读如下

1.VOCSegDataLayer类（也即net.py中声明的测试时的输入层pylayer）

此类对应于val.prototxt中的输入层，即：

# Input layer of val.prototxt: a Python layer with no bottom and two tops.
layer {
name: "data"
type: "Python"
top: "data"
top: "label"
python_param {
# Python module and class that implement this layer.
module: "voc_layers"
layer: "VOCSegDataLayer"
# Handed to the layer as a string; setup() evaluates it into a dict of options
# (voc_dir, seed, split, mean).
param_str: "{'voc_dir': '../data/VOC2012', 'seed': 1337, 'split': 'seg11valid', 'mean': (104.00699, 116.66877, 122.67892)}"
}
}

具体源码解读如下

# Data layer used at test time (test or validation set).  The class implements
# the methods required by Caffe's Python layer interface; for details see
# https://chrischoy.github.io/research/caffe-python-layer/
class VOCSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from PASCAL VOC
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    # setup(): configure the data layer from its parameters
    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - voc_dir: path to PASCAL VOC year dir (where the test or validation
          data lives)
        - split: train / val / test (any of the three, so results on the
          training set can be inspected too)
        - mean: tuple of mean values to subtract (the per-channel mean;
          subtracting it is meant to help training converge)
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for PASCAL VOC semantic segmentation.

        example

        params = dict(voc_dir="/path/to/PASCAL/VOC2011",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="val")

        Raises Exception unless the layer has exactly two tops and no bottom.
        """
        # config: read the parameters
        # NOTE(review): eval() executes whatever expression param_str holds,
        # so only load net definitions that come from a trusted source.
        params = eval(self.param_str)
        self.voc_dir = params['voc_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        # (check that the outputs are exactly the data and the label)
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        # (a data layer must not be given an input)
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels
        # The '<split>.txt' file lists one image id per line; the ids are in
        # fact the image file names.  The first {} is self.voc_dir and the
        # second {} is self.split.
        split_f = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,
                                                            self.split)
        # splitlines() splits on line boundaries ('\r', '\r\n', '\n') and
        # returns the lines as a list, so indices is the list of all image ids.
        # Read inside a with-block so the file handle is closed right away.
        with open(split_f, 'r') as f:
            self.indices = f.read().splitlines()
        # idx is the cursor into indices; it starts at 0
        self.idx = 0

        # make eval deterministic
        # (no random order when split is test or val)
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            # random integer index in the range 0 .. len(self.indices)-1
            self.idx = random.randint(0, len(self.indices)-1)

    # reshape(): load the pair selected by idx and resize the tops to match
    def reshape(self, bottom, top):
        """Load the pair selected by self.idx and resize both tops to fit it."""
        # load image + label image pair
        # (load_image() and load_label() are defined further below)
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        #
        # The size of the data blobs may change from one iteration to the
        # next, because it does not affect the sizes of the layer parameters.
        # Caffe stores blobs as N x C x H x W: N is the batch size, C the
        # number of channels, H and W the height and width.  The 1 here is the
        # batch size, i.e. one image per SGD iteration as described in the
        # FCN paper.
        top[0].reshape(1, *self.data.shape)   # data
        top[1].reshape(1, *self.label.shape)  # label

    # forward(): a data layer does not transform anything, it only outputs
    # the data it loaded
    def forward(self, bottom, top):
        """Copy the loaded pair into the tops, then choose the next index."""
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        # (while emitting this pair, choose the index for the next iteration)
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            # sequential order, wrapping around at the end of the list
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0

    # a data layer needs no backward pass
    def backward(self, top, propagate_down, bottom):
        """Nothing to back-propagate: the layer has no bottoms."""
        pass

    # load the image selected by idx
    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:

        - cast to float
        - switch channels RGB -> BGR (swap the R and B channels; presumably
          because Caffe follows OpenCV's BGR channel order)
        - subtract mean
        - transpose to channel x height x width order (Caffe's blob layout)
        """
        im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        # step -1 walks the last axis backwards, which swaps R and B
        in_ = in_[:,:,::-1]
        # subtract the per-channel mean
        in_ -= self.mean
        # move the channel axis to the front
        in_ = in_.transpose((2,0,1))
        return in_

python中用PIL读取的三通道彩色图片转换为numpy数组后是按H*W*C存放的，且三通道的顺序是标准的RGB顺序，输入到caffe中处理前，需要进行相应的转换。

caffe中的数据存储方式是N*C*H*W，且是按BGR顺序存放三通道的。

所以需要先将RGB转换为BGR，可直接使用 in_ = in_[:,:,::-1] 语句实现，具体理解可参见以下例子（其中a的第三维可看成是C，且按RGB顺序存放，前两维可看成是H和W）：

import numpy as np

# Toy "image" of shape (H=3, W=2, C=3); the last axis holds (R, G, B).
a = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
    [[13, 14, 15], [16, 17, 18]],
])
print(a.shape)
print(a)

# Walk the channel axis backwards: (R, G, B) -> (B, G, R).
# The transpose step is left out here on purpose; only the flip is shown.
a = a[:, :, ::-1]
print(a.shape)
print(a)

以这个例子来说，第一行第一列所在位置的像素点的像素值分别为：R=1，G=2，B=3

运行结果为（可以看出第一行第一列所在位置的像素点的像素值分别为：B=3，G=2，R=1）：

(3L, 2L, 3L)
[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]

 [[13 14 15]
  [16 17 18]]]
(3L, 2L, 3L)
[[[ 3  2  1]
  [ 6  5  4]]

 [[ 9  8  7]
  [12 11 10]]

 [[15 14 13]
  [18 17 16]]]

在此基础上还需要减去各个通道的均值（即做均值中心化），这样有助于加快训练的收敛速度。

最后按照caffe的存储数据的格式将通道数放在前面，即利用python中的transpose()函数进行转置操作，具体理解参见以下例子：

import numpy as np

# Toy "image" of shape (H=3, W=2, C=3); the last axis holds (R, G, B).
a = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
    [[13, 14, 15], [16, 17, 18]],
])
print(a.shape)
print(a)

# Flip the channel axis (RGB -> BGR), then move it to the front:
# H x W x C becomes C x H x W.
bgr = a[:, :, ::-1]
a = bgr.transpose((2, 0, 1))
print(a.shape)
print(a)

运行结果如下（由此可看出原来的第三维变到了第一维）：

(3L, 2L, 3L)
[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]

 [[13 14 15]
  [16 17 18]]]
(3L, 3L, 2L)
[[[ 3  6]
  [ 9 12]
  [15 18]]

 [[ 2  5]
  [ 8 11]
  [14 17]]

 [[ 1  4]
  [ 7 10]
  [13 16]]]

即，例如结果中的

[[ 3  6]
 [ 9 12]
 [15 18]]

表示的是所有像素点的B通道的像素数值，也即表示的是原图像的B通道。


# Load the label image that belongs to the entry idx of the split file.
def load_label(self, idx):
    """
    Load label image as 1 x height x width integer array of label indices.

    The leading singleton dimension is required by the loss.
    """
    im = Image.open('{}/SegmentationClass/{}.png'.format(self.voc_dir, idx))
    # assumes the label PNG is single-channel, giving an H x W array
    label = np.array(im, dtype=np.uint8)
    # np.newaxis inserts a new leading axis: H x W becomes 1 x H x W
    label = label[np.newaxis, ...]
    return label

2.SBDDSegDataLayer类（也即net.py中声明的训练时的输入层pylayer）

此类对应于train.prototxt中的输入层，即：

# Input layer of train.prototxt: a Python layer with no bottom and two tops.
layer {
name: "data"
type: "Python"
top: "data"
top: "label"
python_param {
# Python module and class that implement this layer.
module: "voc_layers"
layer: "SBDDSegDataLayer"
# Handed to the layer as a string; setup() evaluates it into a dict of options
# (sbdd_dir, seed, split, mean).
param_str: "{'sbdd_dir': '../data/VOC2012', 'seed': 1337, 'split': 'train', 'mean': (104.00699, 116.66877, 122.67892)}"
}
}

SBDDSegDataLayer类的代码和VOCSegDataLayer类类似，在此不再重复解读，就其中的一小点进行说明。

# Data layer used at training time (training set).  The class implements the
# methods required by Caffe's Python layer interface.
class SBDDSegDataLayer(caffe.Layer):
    """
    Load (input image, label image) pairs from the SBDD extended labeling
    of PASCAL VOC for semantic segmentation
    one-at-a-time while reshaping the net to preserve dimensions.

    Use this to feed data to a fully convolutional network.
    """

    def setup(self, bottom, top):
        """
        Setup data layer according to parameters:

        - sbdd_dir: path to SBDD `dataset` dir
        - split: train / seg11valid
        - mean: tuple of mean values to subtract
        - randomize: load in random order (default: True)
        - seed: seed for randomization (default: None / current time)

        for SBDD semantic segmentation.

        N.B. seg11valid is the set of segval11 that does not intersect with SBDD.
        Find it here: https://gist.github.com/shelhamer/edb330760338892d511e.

        example

        params = dict(sbdd_dir="/path/to/SBDD/dataset",
            mean=(104.00698793, 116.66876762, 122.67891434),
            split="valid")

        Raises Exception unless the layer has exactly two tops and no bottom.
        """
        # config
        # NOTE(review): eval() executes whatever expression param_str holds,
        # so only load net definitions that come from a trusted source.
        params = eval(self.param_str)
        self.sbdd_dir = params['sbdd_dir']
        self.split = params['split']
        self.mean = np.array(params['mean'])
        self.random = params.get('randomize', True)
        self.seed = params.get('seed', None)

        # two tops: data and label
        if len(top) != 2:
            raise Exception("Need to define two tops: data and label.")
        # data layers have no bottoms
        if len(bottom) != 0:
            raise Exception("Do not define a bottom.")

        # load indices for images and labels
        split_f = '{}/{}.txt'.format(self.sbdd_dir,
                                     self.split)
        # Read inside a with-block so the file handle is closed right away.
        with open(split_f, 'r') as f:
            self.indices = f.read().splitlines()
        self.idx = 0

        # make eval deterministic
        if 'train' not in self.split:
            self.random = False

        # randomization: seed and pick
        if self.random:
            random.seed(self.seed)
            self.idx = random.randint(0, len(self.indices)-1)

    def reshape(self, bottom, top):
        """Load the pair selected by self.idx and resize both tops to fit it."""
        # load image + label image pair
        self.data = self.load_image(self.indices[self.idx])
        self.label = self.load_label(self.indices[self.idx])
        # reshape tops to fit (leading 1 is for batch dimension)
        top[0].reshape(1, *self.data.shape)
        top[1].reshape(1, *self.label.shape)

    def forward(self, bottom, top):
        """Copy the loaded pair into the tops, then choose the next index."""
        # assign output
        top[0].data[...] = self.data
        top[1].data[...] = self.label

        # pick next input
        if self.random:
            self.idx = random.randint(0, len(self.indices)-1)
        else:
            # sequential order, wrapping around at the end of the list
            self.idx += 1
            if self.idx == len(self.indices):
                self.idx = 0

    def backward(self, top, propagate_down, bottom):
        """Nothing to back-propagate: the layer has no bottoms."""
        pass

    def load_image(self, idx):
        """
        Load input image and preprocess for Caffe:

        - cast to float
        - switch channels RGB -> BGR
        - subtract mean
        - transpose to channel x height x width order
        """
        im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))
        in_ = np.array(im, dtype=np.float32)
        in_ = in_[:,:,::-1]
        in_ -= self.mean
        in_ = in_.transpose((2,0,1))
        return in_

    def load_label(self, idx):
        """
        Load label image as 1 x height x width integer array of label indices.

        The leading singleton dimension is required by the loss.
        """
        import scipy.io
        # the training labels are stored as MATLAB .mat files
        mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))
        label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)
        label = label[np.newaxis, ...]
        return label

SBDDSegDataLayer类所加载的训练样本的标记图片是按Matlab的mat进行存储的，但实际使用时，我们没有必要按照mat格式来加载标记图片，可参见VOCSegDataLayer类直接读取.png或.jpg格式的标记图片，即可将这个load_label()函数修改为：


def load_label(self, idx):
    """
    Load label image as 1 x height x width integer array of label indices.

    The leading singleton dimension is required by the loss.

    This variant reads a PNG from <sbdd_dir>/SegmentationClass instead of
    a .mat file.
    """
    im = Image.open('{}/SegmentationClass/{}.png'.format(self.sbdd_dir, idx))
    # assumes the label PNG is single-channel, giving an H x W array
    label = np.array(im, dtype=np.uint8)
    # add the leading singleton axis: H x W becomes 1 x H x W
    label = label[np.newaxis, ...]
    return label