Permute层是SSD(Single Shot MultiBox Detector)中用于置换索引轴顺序的,与matlab中的permute()函数实现类似的功能,首先我们看一下caffe.proto中关于该层参数的说明:
optional PermuteParameter permute_param = 202;
message PermuteParameter {
  // The new orders of the axes of data. Notice it should be with
  // in the same range as the input data, and it starts from 0.
  // Do not provide repeated order.
  repeated uint32 order = 1;
}
permute_param {
  order: 1
  order: 0
  order: 2
  order: 3
}
permute_param {
  order: 1
  order: 0
}
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
namespace caffe {
/**
 * @brief Permute the input blob by changing the memory order of the data.
 *
 * TODO(weiliu89): thorough documentation for Forward, Backward, and proto params.
 */
// The main function which does the permute.
//
// Iterates over `count` linear indices of the permuted layout. Each index is
// decomposed axis by axis with `new_steps`, and the matching linear index in
// the original layout is rebuilt with `old_steps[permute_order[j]]`.
// When `forward` is true the element is copied from `bottom_data` (original
// layout) to `top_data` (permuted layout); when false it is copied from
// `top_data` back into `bottom_data`.
// `permute_order`, `old_steps` and `new_steps` are each read at indices
// 0 .. num_axes - 1.
template <typename Dtype>
void Permute(const int count, Dtype* bottom_data, const bool forward,
const int* permute_order, const int* old_steps, const int* new_steps,
const int num_axes, Dtype* top_data);
/**
 * @brief Layer that reorders the axes of its single bottom blob.
 *
 * Takes exactly one bottom blob and produces exactly one top blob. The axis
 * order is read from the layer's permute_param.
 */
template <typename Dtype>
class PermuteLayer : public Layer<Dtype> {
 public:
  explicit PermuteLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Permute"; }
  // Exactly one input blob.
  virtual inline int ExactNumBottomBlobs() const { return 1; }
  // Exactly one output blob.
  virtual inline int ExactNumTopBlobs() const { return 1; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  // Number of axes (dimensions) of the input blob.
  int num_axes_;
  // Whether the requested order differs from the natural order 0, 1, 2, ...
  bool need_permute_;

  // Use Blob because it is convenient to be accessible in .cu file.
  // permute_order_[j]: input axis that ends up at position j of the output.
  Blob<int> permute_order_;
  // old_steps_[i]: number of elements spanned by the input axes after axis i
  // (1 for the last axis).
  Blob<int> old_steps_;
  // new_steps_[i]: number of elements spanned by the output axes after axis i
  // (1 for the last axis).
  Blob<int> new_steps_;
};
} // namespace caffe
#include <vector>
#include "caffe/layers/permute_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
/**
 * @brief Copies elements between a buffer and its axis-permuted counterpart.
 *
 * For every linear index i of the permuted layout, the index is decomposed
 * axis by axis with new_steps, and the matching linear index of the original
 * layout is rebuilt with old_steps.
 *
 * @param count          number of elements to process
 * @param bottom_data    buffer in the original layout; read when forward is
 *                       true, written when forward is false
 * @param forward        true:  top_data[i] = bottom_data[old_idx];
 *                       false: bottom_data[old_idx] = top_data[i]
 * @param permute_order  permute_order[j] is the original axis placed at
 *                       position j of the permuted layout
 * @param old_steps      per-axis step of the original layout
 * @param new_steps      per-axis step of the permuted layout
 * @param num_axes       number of axes
 * @param top_data       buffer in the permuted layout; written when forward
 *                       is true, read when forward is false
 */
template <typename Dtype>
void Permute(const int count, Dtype* bottom_data, const bool forward,
    const int* permute_order, const int* old_steps, const int* new_steps,
    const int num_axes, Dtype* top_data) {
  for (int i = 0; i < count; ++i) {
    // old_idx: linear index in the original layout that corresponds to the
    // linear index i in the permuted layout.
    int old_idx = 0;
    int idx = i;
    for (int j = 0; j < num_axes; ++j) {
      const int order = permute_order[j];
      // The quotient is the coordinate along permuted axis j; scale it by the
      // step of the original axis it came from. The remainder is carried on
      // to the next axis.
      old_idx += (idx / new_steps[j]) * old_steps[order];
      idx %= new_steps[j];
    }
    if (forward) {
      top_data[i] = bottom_data[old_idx];
    } else {
      bottom_data[old_idx] = top_data[i];
    }
  }
}
/**
 * @brief Reads the requested axis order and prepares the layer.
 *
 * Validates the orders listed in permute_param, appends any axes that were
 * not listed in their natural order, decides whether a permutation is needed
 * at all, stores the final order in permute_order_ and shapes the top blob.
 *
 * Fails a CHECK if an order is not smaller than the number of input axes, and
 * logs FATAL if an order is listed twice.
 */
template <typename Dtype>
void PermuteLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  PermuteParameter permute_param = this->layer_param_.permute_param();
  CHECK_EQ(bottom.size(), 1);
  num_axes_ = bottom[0]->num_axes();  // number of axes of the input blob
  vector<int> orders;
  // Push the specified new orders.
  for (int i = 0; i < permute_param.order_size(); ++i) {
    int order = permute_param.order(i);
    CHECK_LT(order, num_axes_)
        << "order should be less than the input dimension.";
    if (std::find(orders.begin(), orders.end(), order) != orders.end()) {
      LOG(FATAL) << "there are duplicate orders";
    }
    orders.push_back(order);
  }
  // Push the rest orders. And save original step sizes for each axis.
  for (int i = 0; i < num_axes_; ++i) {
    if (std::find(orders.begin(), orders.end(), i) == orders.end()) {
      orders.push_back(i);
    }
  }
  CHECK_EQ(num_axes_, orders.size());
  // Check if we need to reorder the data or keep it.
  need_permute_ = false;
  for (int i = 0; i < num_axes_; ++i) {
    if (orders[i] != i) {
      // As long as there is one order which is different from the natural order
      // of the data, we need to permute. Otherwise, we share the data and diff.
      need_permute_ = true;
      break;
    }
  }

  // Shape of the output blob after the axes have been reordered.
  vector<int> top_shape(num_axes_, 1);
  // One entry per axis in each of the bookkeeping blobs.
  permute_order_.Reshape(num_axes_, 1, 1, 1);
  old_steps_.Reshape(num_axes_, 1, 1, 1);
  new_steps_.Reshape(num_axes_, 1, 1, 1);
  for (int i = 0; i < num_axes_; ++i) {
    // Record the final order, and size output axis i like the input axis it
    // is taken from.
    permute_order_.mutable_cpu_data()[i] = orders[i];
    top_shape[i] = bottom[0]->shape(orders[i]);
  }
  top[0]->Reshape(top_shape);
}
/**
 * @brief Shapes the top blob and recomputes the per-axis steps.
 *
 * old_steps_[i] / new_steps_[i] hold the number of elements spanned by the
 * axes after axis i in the bottom / top blob; the last axis always has
 * step 1.
 */
template <typename Dtype>
void PermuteLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  vector<int> top_shape;
  for (int i = 0; i < num_axes_; ++i) {
    if (i == num_axes_ - 1) {
      old_steps_.mutable_cpu_data()[i] = 1;
    } else {
      // count(start_axis) is the number of elements from start_axis onwards.
      old_steps_.mutable_cpu_data()[i] = bottom[0]->count(i + 1);
    }
    // Output axis i takes its size from input axis permute_order_[i].
    top_shape.push_back(bottom[0]->shape(permute_order_.cpu_data()[i]));
  }
  // The top blob is shaped here as well as in LayerSetUp so that it follows
  // the current bottom shape; new_steps_ below depends on it.
  top[0]->Reshape(top_shape);

  for (int i = 0; i < num_axes_; ++i) {
    if (i == num_axes_ - 1) {
      new_steps_.mutable_cpu_data()[i] = 1;
    } else {
      new_steps_.mutable_cpu_data()[i] = top[0]->count(i + 1);
    }
  }
}
/**
 * @brief CPU forward pass: fills top[0] with the axis-permuted bottom[0] data,
 *        or shares the bottom data when no permutation is needed.
 */
template <typename Dtype>
void PermuteLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  if (need_permute_) {
    // Permute() takes a non-const bottom pointer because the same routine
    // also serves the backward direction, hence mutable_cpu_data() here.
    Dtype* bottom_data = bottom[0]->mutable_cpu_data();
    Dtype* top_data = top[0]->mutable_cpu_data();
    const int top_count = top[0]->count();
    const int* permute_order = permute_order_.cpu_data();
    const int* old_steps = old_steps_.cpu_data();
    const int* new_steps = new_steps_.cpu_data();
    const bool forward = true;
    Permute(top_count, bottom_data, forward, permute_order, old_steps,
        new_steps, num_axes_, top_data);
  } else {
    // If there is no need to permute, we share data to save memory.
    top[0]->ShareData(*bottom[0]);
  }
}
/**
 * @brief CPU backward pass: scatters the top[0] diff back into bottom[0] diff
 *        using the inverse of the forward index mapping, or shares the diff
 *        when no permutation is needed.
 */
template <typename Dtype>
void PermuteLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (need_permute_) {
    Dtype* top_diff = top[0]->mutable_cpu_diff();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const int top_count = top[0]->count();
    const int* permute_order = permute_order_.cpu_data();
    const int* old_steps = old_steps_.cpu_data();
    const int* new_steps = new_steps_.cpu_data();
    // forward == false makes Permute() write bottom_diff[old_idx] = top_diff[i].
    const bool forward = false;
    Permute(top_count, bottom_diff, forward, permute_order, old_steps,
        new_steps, num_axes_, top_diff);
  } else {
    // If there is no need to permute, we share diff to save memory.
    bottom[0]->ShareDiff(*top[0]);
  }
}
#ifdef CPU_ONLY
} // namespace caffe
old_steps_[0] = channel×height×width = 12
old_steps_[1] = height×width = 6
old_steps_[2] = width = 2
old_steps_[3] = 1(无论输入为什么,均为1)
new_steps_[0] = channel×width×height= 12
new_steps_[1] = width×height = 6
new_steps_[2] = height = 3
new_steps_[3] = 1(无论输入为什么,均为1)
则在调用input_ = input.mutable_cpu_data()或input_ = input.mutable_cpu_diff()得到的是序列化后的数据(按0000-1121依次增大的顺序序列化),即:
input_ [0]=0
input_ [1]=1
input_ [23]=23
// Excerpt of the index computation in Permute(): i walks the linear indices
// of the permuted layout.
for (int i = 0; i < count; ++i) {
int old_idx = 0;
int idx = i;
for (int j = 0; j < num_axes; ++j) {
int order = permute_order[j];
old_idx += (idx / new_steps[j]) * old_steps[order]; // old_idx: index in the original data that corresponds to the current i
idx %= new_steps[j];
第一个for循环依次遍历置换后各元素在数组中的线性索引i(idx的初值即为i);第二个for循环计算该索引在原数据中对应的索引old_idx,做法是逐轴对idx求商和取余:商乘以原数据对应轴的步长后累加到old_idx,余数留给下一个轴继续分解。
假设idx=1,则old_idx = (1 / 12)*12 + ((1 % 12) / 6)*6 + (((1 % 12) % 6) / 3)*1 + ((((1 % 12) % 6) % 3) / 1)*2 = 2
#include <algorithm>
#include <cfloat>
#include <vector>
#include "caffe/layers/permute_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
/**
 * @brief CUDA kernel doing the same index mapping as the CPU Permute().
 *
 * For each index of the permuted layout the matching index of the original
 * layout is rebuilt from new_steps / old_steps. When forward is true the
 * element is copied bottom -> top, otherwise top -> bottom.
 */
template <typename Dtype>
__global__ void PermuteKernel(const int nthreads,
    Dtype* const bottom_data, const bool forward, const int* permute_order,
    const int* old_steps, const int* new_steps, const int num_axes,
    Dtype* const top_data) {
  // CUDA_KERNEL_LOOP plays the role of a for loop over index whose iterations
  // are handled by multiple threads.
  CUDA_KERNEL_LOOP(index, nthreads) {
    int temp_idx = index;
    int old_idx = 0;
    for (int i = 0; i < num_axes; ++i) {
      const int order = permute_order[i];
      old_idx += (temp_idx / new_steps[i]) * old_steps[order];
      temp_idx %= new_steps[i];
    }
    if (forward) {
      top_data[index] = bottom_data[old_idx];
    } else {
      bottom_data[old_idx] = top_data[index];
    }
  }
}
/**
 * @brief GPU forward pass: launches PermuteKernel to fill top[0] with the
 *        axis-permuted bottom[0] data, or shares the bottom data when no
 *        permutation is needed.
 */
template <typename Dtype>
void PermuteLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  if (need_permute_) {
    // The kernel takes a non-const bottom pointer because it also serves the
    // backward direction, hence mutable_gpu_data() here.
    Dtype* bottom_data = bottom[0]->mutable_gpu_data();
    Dtype* top_data = top[0]->mutable_gpu_data();
    const int count = top[0]->count();
    const int* permute_order = permute_order_.gpu_data();
    const int* new_steps = new_steps_.gpu_data();
    const int* old_steps = old_steps_.gpu_data();
    const bool forward = true;
    // NOLINT_NEXT_LINE(whitespace/operators)
    PermuteKernel<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, bottom_data, forward, permute_order, old_steps, new_steps,
        num_axes_, top_data);
  } else {
    // If there is no need to permute, we share data to save memory.
    top[0]->ShareData(*bottom[0]);
  }
}
/**
 * @brief GPU backward pass: launches PermuteKernel in the reverse direction
 *        to scatter the top[0] diff into bottom[0] diff, or shares the diff
 *        when no permutation is needed.
 */
template <typename Dtype>
void PermuteLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (need_permute_) {
    Dtype* top_diff = top[0]->mutable_gpu_diff();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    // The kernel iterates over indices of the permuted (top) layout, so the
    // count is taken from top[0] as in the other passes; a permutation keeps
    // the element count unchanged.
    const int count = top[0]->count();
    const int* permute_order = permute_order_.gpu_data();
    const int* new_steps = new_steps_.gpu_data();
    const int* old_steps = old_steps_.gpu_data();
    // forward == false makes the kernel write bottom_diff[old_idx] = top_diff[index].
    const bool forward = false;
    // NOLINT_NEXT_LINE(whitespace/operators)
    PermuteKernel<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, bottom_diff, forward, permute_order, old_steps, new_steps,
        num_axes_, top_diff);
  } else {
    // If there is no need to permute, we share diff to save memory.
    bottom[0]->ShareDiff(*top[0]);
  }
}
} // namespace caffe
发表评论 取消回复