Overview
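The code below is the batched NMS routine of a TensorRT YOLO plugin, YoloLayerPlugin::nms_fun. It follows the usual CUB two-phase convention: called with a null workspace it only reports how many scratch bytes it needs; called with a real workspace it flags non-zero scores, compacts and sorts the candidates, runs a greedy NMS kernel, and writes at most detections_per_im detections (scores, boxes, classes, landmark points) per image. As a quick orientation, a hypothetical call sequence from inside the plugin's enqueue() might look like the sketch below; the variable names and the explicit cudaMalloc are illustrative and not part of the original code (a real TensorRT plugin would normally get the workspace via getWorkspaceSize()/enqueue()):

// Phase 1: query the scratch size by passing a null workspace (illustrative only).
size_t ws_size = nms_fun(batch_size, nullptr, nullptr, count, detections_per_im,
                         nms_thresh, nullptr, 0, stream);
// Phase 2: run batched NMS with a real scratch buffer.
void *scratch = nullptr;
cudaMalloc(&scratch, ws_size);
nms_fun(batch_size, inputs, outputs, count, detections_per_im,
        nms_thresh, scratch, ws_size, stream);
cudaFree(scratch);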
// Headers this snippet relies on (normally included at the top of the plugin source):
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>
#include <cub/cub.cuh>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/transform.h>

int YoloLayerPlugin::nms_fun(int batch_size, void **inputs, void *const *outputs,
                             size_t count, int detections_per_im, float nms_thresh,
                             void *workspace, size_t workspace_size, cudaStream_t stream) const {
  if (!workspace || !workspace_size) {
    // Called without a workspace: return the required scratch-space size, CUB style.
    workspace_size  = get_size_aligned<bool>(count);   // flags
    workspace_size += get_size_aligned<int>(count);    // indices
    workspace_size += get_size_aligned<int>(count);    // indices_sorted
    workspace_size += get_size_aligned<float>(count);  // scores
    workspace_size += get_size_aligned<float>(count);  // scores_sorted

    // Ask CUB how much temporary storage its select/sort primitives need.
    size_t temp_size_flag = 0;
    cub::DeviceSelect::Flagged((void *)nullptr, temp_size_flag,
                               cub::CountingInputIterator<int>(count),
                               (bool *)nullptr, (int *)nullptr, (int *)nullptr, count);
    size_t temp_size_sort = 0;
    cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
                                              (float *)nullptr, (float *)nullptr,
                                              (int *)nullptr, (int *)nullptr, count);
    workspace_size += std::max(temp_size_flag, temp_size_sort);
    return workspace_size;
  }
  auto on_stream = thrust::cuda::par.on(stream);

  // Carve the per-buffer pointers out of the workspace; what remains is CUB temp storage.
  auto flags          = get_next_ptr<bool>(count, workspace, workspace_size);
  auto indices        = get_next_ptr<int>(count, workspace, workspace_size);
  auto indices_sorted = get_next_ptr<int>(count, workspace, workspace_size);
  auto scores         = get_next_ptr<float>(count, workspace, workspace_size);
  auto scores_sorted  = get_next_ptr<float>(count, workspace, workspace_size);
  // printf("nms batch %d\n", batch_size);

  for (int batch = 0; batch < batch_size; batch++) {
    auto in_scores  = static_cast<const float *>(inputs[0])  + batch * count;
    auto in_boxes   = static_cast<const float4 *>(inputs[1]) + batch * count;
    auto in_classes = static_cast<const float *>(inputs[2])  + batch * count;
    // Cast to float4 so the gather into out_points below is type-correct.
    auto in_points  = static_cast<const float4 *>(inputs[3]) + batch * count;

    auto out_scores  = static_cast<float *>(outputs[0])  + batch * detections_per_im;
    auto out_boxes   = static_cast<float4 *>(outputs[1]) + batch * detections_per_im;
    auto out_classes = static_cast<float *>(outputs[2])  + batch * detections_per_im;
    auto out_points  = static_cast<float4 *>(outputs[3]) + batch * detections_per_im;
    // cudaMemcpyAsync(tmp, out_scores, 10 * sizeof(float), cudaMemcpyDeviceToHost, stream);
    // printf("output %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f\n",
    //        tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], tmp[7], tmp[8], tmp[9]);

    // Flag candidates with a positive score; everything else is discarded.
    thrust::transform(on_stream, in_scores, in_scores + count, flags,
                      thrust::placeholders::_1 > 0.0f);

    // Compact the indices of the flagged candidates (indices_sorted temporarily
    // holds the selected-count before the sort overwrites it).
    int *num_selected = reinterpret_cast<int *>(indices_sorted);
    cub::DeviceSelect::Flagged(workspace, workspace_size, cub::CountingInputIterator<int>(0),
                               flags, indices, num_selected, count, stream);
    cudaStreamSynchronize(stream);
    int num_detections = *thrust::device_pointer_cast(num_selected);

    // Sort the surviving scores (and their indices) in descending order.
    thrust::gather(on_stream, indices, indices + num_detections, in_scores, scores);
    cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
                                              scores, scores_sorted, indices, indices_sorted,
                                              num_detections, 0, sizeof(*scores) * 8, stream);

    // Launch the actual NMS kernel: one block, each thread handling num_per_thread detections.
    const int max_threads = 1024;
    int num_per_thread = ceil((float)num_detections / max_threads);
    nms_kernel<<<1, max_threads, 0, stream>>>(num_per_thread, nms_thresh, num_detections,
                                              indices_sorted, scores_sorted, in_classes, in_boxes);

    // Re-sort with the updated scores (suppressed boxes now have score 0).
    cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
                                              scores_sorted, scores, indices_sorted, indices,
                                              num_detections, 0, sizeof(*scores) * 8, stream);

    // Gather the filtered scores, boxes, classes, and points into the outputs.
    num_detections = min(detections_per_im, num_detections);
    cudaMemcpyAsync(out_scores, scores, num_detections * sizeof *scores,
                    cudaMemcpyDeviceToDevice, stream);
    if (num_detections < detections_per_im) {
      // Zero-pad the unused output slots.
      thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
    }
    thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes);
    thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes);
    thrust::gather(on_stream, indices, indices + num_detections, in_points, out_points);

    // Debug dump of the first few output landmark values; synchronize so the
    // host buffer is valid before printing.
    float tmp[10];
    cudaMemcpyAsync(tmp, out_points, 10 * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    printf("out_points %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f\n",
           tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], tmp[7], tmp[8], tmp[9]);
// printf("num_detections %d n", num_detections);
// cudaMemcpyAsync(tmp, out_scores, 10 * sizeof(float), cudaMemcpyDeviceToHost, stream);
// printf("output %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2f %0.2fn", tmp[0],tmp[1],tmp[2],tmp[3],tmp[4],tmp[5],tmp[6],tmp[7],tmp[8],tmp[9]);
  }
  return 0;
}
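The listing also depends on two small helpers, get_size_aligned<T> and get_next_ptr<T>, which the post does not show. A plausible sketch, assuming the common pattern of rounding each sub-buffer up to a fixed alignment and carving it out of the single workspace allocation (the 256-byte alignment and the overflow check are assumptions, not taken from the original source):

#include <cstdint>
#include <stdexcept>

constexpr size_t kCudaAlign = 256;  // assumed alignment for workspace sub-buffers

// Bytes needed for num_elem elements of T, padded up to kCudaAlign.
template <typename T>
size_t get_size_aligned(size_t num_elem) {
  size_t size = num_elem * sizeof(T);
  size_t remainder = size % kCudaAlign;
  return remainder ? size + (kCudaAlign - remainder) : size;
}

// Hand out the next aligned chunk of the workspace, advancing the pointer and
// shrinking the remaining size so later calls (and CUB) use what is left.
template <typename T>
T *get_next_ptr(size_t num_elem, void *&workspace, size_t &workspace_size) {
  size_t size = get_size_aligned<T>(num_elem);
  if (size > workspace_size) {
    throw std::runtime_error("workspace is too small");
  }
  T *ptr = reinterpret_cast<T *>(workspace);
  workspace = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(workspace) + size);
  workspace_size -= size;
  return ptr;
}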
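nms_kernel is not included in the post either. Judging from the launch configuration (a single block, each thread covering num_per_thread of the score-sorted detections) and the argument list, a sketch in the style of the widely used single-block greedy NMS kernel could look like the following; the per-class suppression, the IoU formula, and the (x1, y1, x2, y2) box layout are assumptions rather than the author's original kernel:

__global__ void nms_kernel(const int num_per_thread, const float threshold,
                           const int num_detections, const int *indices,
                           float *scores, const float *classes, const float4 *boxes) {
  // Greedy NMS over detections already sorted by descending score: every
  // surviving detection m suppresses lower-ranked overlapping boxes by
  // zeroing their (sorted) scores.
  for (int m = 0; m < num_detections; m++) {
    for (int n = 0; n < num_per_thread; n++) {
      int i = threadIdx.x * num_per_thread + n;
      if (i < num_detections && i > m && scores[m] > 0.0f) {
        int idx = indices[i];
        int max_idx = indices[m];
        if (classes[idx] == classes[max_idx]) {  // assumption: suppress within the same class only
          float4 b = boxes[idx];
          float4 mb = boxes[max_idx];
          // Boxes assumed to be (x1, y1, x2, y2).
          float x1 = max(b.x, mb.x);
          float y1 = max(b.y, mb.y);
          float x2 = min(b.z, mb.z);
          float y2 = min(b.w, mb.w);
          float inter = max(0.0f, x2 - x1) * max(0.0f, y2 - y1);
          float area_i = (b.z - b.x) * (b.w - b.y);
          float area_m = (mb.z - mb.x) * (mb.w - mb.y);
          float iou = inter / (area_i + area_m - inter);
          if (iou > threshold) {
            scores[i] = 0.0f;  // suppressed; the re-sort afterwards pushes it to the back
          }
        }
      }
    }
    // Make the updated scores visible to all threads before the next anchor detection.
    __syncthreads();
  }
}

After the kernel runs, the caller re-sorts by the updated scores, so the suppressed (zero-score) boxes fall past the detections_per_im cutoff.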