GitHub Repository: hukaixuan19970627/yolov5_obb
Path: blob/master/utils/nms_rotated/src/nms_rotated_cuda.cu
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/nms_rotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <cstring> // memset
#include <vector>  // std::vector
#include "box_iou_rotated_utils.h"

int const threadsPerBlock = sizeof(unsigned long long) * 8;
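// 64 threads per block: each thread handles one box, so one box's overlap
// decisions against a 64-box block pack exactly into the bits of a single
// unsigned long long mask word.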

template <typename T>
__global__ void nms_rotated_cuda_kernel(
    const int n_boxes,
    const float iou_threshold,
    const T* dev_boxes,
    unsigned long long* dev_mask) {
  // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel

  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Compared to nms_cuda_kernel, where each box is represented with 4 values
  // (x1, y1, x2, y2), each rotated box is represented with 5 values
  // (x_center, y_center, width, height, angle_degrees) here.
  __shared__ T block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
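  // Each thread above staged one box of the column block in shared memory;
  // the barrier below makes the full tile visible before any IoU tests run.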
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const T* cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      // Instead of devIoU used by original horizontal nms, here
      // we use the single_box_iou_rotated function from box_iou_rotated_utils.h
      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
          iou_threshold) {
        t |= 1ULL << i;
      }
    }
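    // Bit i of t records that box i of this column block suppresses the
    // current row box (IoU above the threshold).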
    const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

at::Tensor nms_rotated_cuda(
    // input must be contiguous
    const at::Tensor& dets,
    const at::Tensor& scores,
    float iou_threshold) {
  // using scalar_t = float;
  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(dets.device());
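
  // Sort by descending score up front, so the greedy sweep at the end keeps
  // the highest-scoring box of each overlapping cluster.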
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto dets_sorted = dets.index_select(0, order_t);

  auto dets_num = dets.size(0);

  const int col_blocks =
      at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);
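
  // One 64-bit suppression word per (box, 64-box block) pair: the kernel
  // fills a dets_num x col_blocks table, stored as int64 here and
  // reinterpreted as unsigned long long at the launch site.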
  at::Tensor mask =
      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES(
      dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num,
            iou_threshold,
            dets_sorted.data_ptr<scalar_t>(),
            (unsigned long long*)mask.data_ptr<int64_t>());
      });

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();
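
  // Greedy sweep in score order: keep box i unless an already-kept,
  // higher-scoring box has marked it, then OR its mask row into remv so
  // everything box i suppresses is skipped in later iterations.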
  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_CUDA_CHECK(cudaGetLastError());
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}
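
// Usage sketch (not part of the original file): a hypothetical C++ caller,
// assuming the declaration `at::Tensor nms_rotated_cuda(const at::Tensor&,
// const at::Tensor&, float);` is visible. dets is (N, 5) with rows of
// (x_center, y_center, width, height, angle_degrees); scores is (N,).
//
//   auto opts = at::TensorOptions().device(at::kCUDA).dtype(at::kFloat);
//   at::Tensor dets = at::rand({100, 5}, opts);
//   at::Tensor scores = at::rand({100}, opts);
//   // Indices into dets of boxes surviving NMS at IoU 0.5, ordered by
//   // descending score; dets must be contiguous (see the comment above).
//   at::Tensor keep = nms_rotated_cuda(dets.contiguous(), scores, 0.5f);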