CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hukaixuan19970627

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: hukaixuan19970627/yolov5_obb
Path: blob/master/DOTA_devkit/poly_nms_gpu/poly_nms_kernel.cu
Views: 475
1
2
#include "poly_nms.hpp"
3
#include <vector>
4
#include <iostream>
5
#include <cmath>
6
#include <cstdio>
7
#include<algorithm>
8
9
using namespace std;
10
11
//##define CUDA_CHECK(condition)\
12
//
13
// do {
14
// cudaError_t error = condition;
15
// if (error != cudaSuccess) {
16
//
17
// }
18
// }
19
20
#define CUDA_CHECK(condition) \
21
/* Code block avoids redefinition of cudaError_t error */ \
22
do { \
23
cudaError_t error = condition; \
24
if (error != cudaSuccess) { \
25
std::cout << cudaGetErrorString(error) << std::endl; \
26
} \
27
} while (0)
28
29
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
30
int const threadsPerBlock = sizeof(unsigned long long) * 8;
31
32
33
#define maxn 51
34
const double eps=1E-8;
35
36
__device__ inline int sig(float d){
37
return(d>eps)-(d<-eps);
38
}
39
// struct Point{
40
// double x,y; Point(){}
41
// Point(double x,double y):x(x),y(y){}
42
// bool operator==(const Point&p)const{
43
// return sig(x-p.x)==0&&sig(y-p.y)==0;
44
// }
45
// };
46
47
__device__ inline int point_eq(const float2 a, const float2 b) {
48
return sig(a.x - b.x) == 0 && sig(a.y - b.y)==0;
49
}
50
51
__device__ inline void point_swap(float2 *a, float2 *b) {
52
float2 temp = *a;
53
*a = *b;
54
*b = temp;
55
}
56
57
__device__ inline void point_reverse(float2 *first, float2* last)
58
{
59
while ((first!=last)&&(first!=--last)) {
60
point_swap (first,last);
61
++first;
62
}
63
}
64
// void point_reverse(Point* first, Point* last)
65
// {
66
// while ((first!=last)&&(first!=--last)) {
67
// point_swap (first,last);
68
// ++first;
69
// }
70
// }
71
72
73
__device__ inline float cross(float2 o,float2 a,float2 b){ //叉积
74
return(a.x-o.x)*(b.y-o.y)-(b.x-o.x)*(a.y-o.y);
75
}
76
__device__ inline float area(float2* ps,int n){
77
ps[n]=ps[0];
78
float res=0;
79
for(int i=0;i<n;i++){
80
res+=ps[i].x*ps[i+1].y-ps[i].y*ps[i+1].x;
81
}
82
return res/2.0;
83
}
84
__device__ inline int lineCross(float2 a,float2 b,float2 c,float2 d,float2&p){
85
float s1,s2;
86
s1=cross(a,b,c);
87
s2=cross(a,b,d);
88
if(sig(s1)==0&&sig(s2)==0) return 2;
89
if(sig(s2-s1)==0) return 0;
90
p.x=(c.x*s2-d.x*s1)/(s2-s1);
91
p.y=(c.y*s2-d.y*s1)/(s2-s1);
92
return 1;
93
}
94
95
96
97
//多边形切割
98
//用直线ab切割多边形p,切割后的在向量(a,b)的左侧,并原地保存切割结果
99
//如果退化为一个点,也会返回去,此时n为1
100
// __device__ inline void polygon_cut(float2*p,int&n,float2 a,float2 b){
101
// static float2 pp[maxn];
102
// int m=0;p[n]=p[0];
103
// for(int i=0;i<n;i++){
104
// if(sig(cross(a,b,p[i]))>0) pp[m++]=p[i];
105
// if(sig(cross(a,b,p[i]))!=sig(cross(a,b,p[i+1])))
106
// lineCross(a,b,p[i],p[i+1],pp[m++]);
107
// }
108
// n=0;
109
// for(int i=0;i<m;i++)
110
// if(!i||!(point_eq(pp[i], pp[i-1])))
111
// p[n++]=pp[i];
112
// // while(n>1&&p[n-1]==p[0])n--;
113
// while(n>1&&point_eq(p[n-1], p[0]))n--;
114
// }
115
116
__device__ inline void polygon_cut(float2*p,int&n,float2 a,float2 b, float2* pp){
117
118
int m=0;p[n]=p[0];
119
for(int i=0;i<n;i++){
120
if(sig(cross(a,b,p[i]))>0) pp[m++]=p[i];
121
if(sig(cross(a,b,p[i]))!=sig(cross(a,b,p[i+1])))
122
lineCross(a,b,p[i],p[i+1],pp[m++]);
123
}
124
n=0;
125
for(int i=0;i<m;i++)
126
if(!i||!(point_eq(pp[i], pp[i-1])))
127
p[n++]=pp[i];
128
// while(n>1&&p[n-1]==p[0])n--;
129
while(n>1&&point_eq(p[n-1], p[0]))n--;
130
}
131
132
//---------------华丽的分隔线-----------------//
133
//返回三角形oab和三角形ocd的有向交面积,o是原点//
134
__device__ inline float intersectArea(float2 a,float2 b,float2 c,float2 d){
135
float2 o = make_float2(0,0);
136
int s1=sig(cross(o,a,b));
137
int s2=sig(cross(o,c,d));
138
if(s1==0||s2==0)return 0.0;//退化,面积为0
139
// if(s1==-1) swap(a,b);
140
// if(s2==-1) swap(c,d);
141
if (s1 == -1) point_swap(&a, &b);
142
if (s2 == -1) point_swap(&c, &d);
143
float2 p[10]={o,a,b};
144
int n=3;
145
float2 pp[maxn];
146
polygon_cut(p,n,o,c,pp);
147
polygon_cut(p,n,c,d,pp);
148
polygon_cut(p,n,d,o,pp);
149
float res=fabs(area(p,n));
150
if(s1*s2==-1) res=-res;return res;
151
}
152
//求两多边形的交面积
153
__device__ inline float intersectArea(float2*ps1,int n1,float2*ps2,int n2){
154
if(area(ps1,n1)<0) point_reverse(ps1,ps1+n1);
155
if(area(ps2,n2)<0) point_reverse(ps2,ps2+n2);
156
ps1[n1]=ps1[0];
157
ps2[n2]=ps2[0];
158
float res=0;
159
for(int i=0;i<n1;i++){
160
for(int j=0;j<n2;j++){
161
res+=intersectArea(ps1[i],ps1[i+1],ps2[j],ps2[j+1]);
162
}
163
}
164
return res;//assumeresispositive!
165
}
166
167
168
169
170
//__device__ inline double iou_poly(vector<double> p, vector<double> q) {
171
// Point ps1[maxn],ps2[maxn];
172
// int n1 = 4;
173
// int n2 = 4;
174
// for (int i = 0; i < 4; i++) {
175
// ps1[i].x = p[i * 2];
176
// ps1[i].y = p[i * 2 + 1];
177
//
178
// ps2[i].x = q[i * 2];
179
// ps2[i].y = q[i * 2 + 1];
180
// }
181
// double inter_area = intersectArea(ps1, n1, ps2, n2);
182
// double union_area = fabs(area(ps1, n1)) + fabs(area(ps2, n2)) - inter_area;
183
// double iou = inter_area / union_area;
184
//
185
//// cout << "inter_area:" << inter_area << endl;
186
//// cout << "union_area:" << union_area << endl;
187
//// cout << "iou:" << iou << endl;
188
//
189
// return iou;
190
//}
191
192
__device__ inline float devPolyIoU(float const * const p, float const * const q) {
193
float2 ps1[maxn], ps2[maxn];
194
int n1 = 4;
195
int n2 = 4;
196
for (int i = 0; i < 4; i++) {
197
ps1[i].x = p[i * 2];
198
ps1[i].y = p[i * 2 + 1];
199
200
ps2[i].x = q[i * 2];
201
ps2[i].y = q[i * 2 + 1];
202
}
203
float inter_area = intersectArea(ps1, n1, ps2, n2);
204
float union_area = fabs(area(ps1, n1)) + fabs(area(ps2, n2)) - inter_area;
205
float iou = 0;
206
if (union_area == 0) {
207
iou = (inter_area + 1) / (union_area + 1);
208
} else {
209
iou = inter_area / union_area;
210
}
211
return iou;
212
}
213
214
__global__ void poly_nms_kernel(const int n_polys, const float nms_overlap_thresh,
215
const float *dev_polys, unsigned long long *dev_mask) {
216
const int row_start = blockIdx.y;
217
const int col_start = blockIdx.x;
218
219
const int row_size =
220
min(n_polys - row_start * threadsPerBlock, threadsPerBlock);
221
const int cols_size =
222
min(n_polys - col_start * threadsPerBlock, threadsPerBlock);
223
224
__shared__ float block_polys[threadsPerBlock * 9];
225
if (threadIdx.x < cols_size) {
226
block_polys[threadIdx.x * 9 + 0] =
227
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0];
228
block_polys[threadIdx.x * 9 + 1] =
229
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1];
230
block_polys[threadIdx.x * 9 + 2] =
231
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2];
232
block_polys[threadIdx.x * 9 + 3] =
233
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3];
234
block_polys[threadIdx.x * 9 + 4] =
235
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4];
236
block_polys[threadIdx.x * 9 + 5] =
237
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5];
238
block_polys[threadIdx.x * 9 + 6] =
239
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6];
240
block_polys[threadIdx.x * 9 + 7] =
241
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7];
242
block_polys[threadIdx.x * 9 + 8] =
243
dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 8];
244
}
245
__syncthreads();
246
247
if (threadIdx.x < row_size) {
248
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
249
const float *cur_box = dev_polys + cur_box_idx * 9;
250
int i = 0;
251
unsigned long long t = 0;
252
int start = 0;
253
if (row_start == col_start) {
254
start = threadIdx.x + 1;
255
}
256
for (i = start; i < cols_size; i++) {
257
if (devPolyIoU(cur_box, block_polys + i * 9) > nms_overlap_thresh) {
258
t |= 1ULL << i;
259
}
260
}
261
const int col_blocks = DIVUP(n_polys, threadsPerBlock);
262
dev_mask[cur_box_idx * col_blocks + col_start] = t;
263
}
264
}
265
266
void _set_device(int device_id) {
267
int current_device;
268
CUDA_CHECK(cudaGetDevice(&current_device));
269
if (current_device == device_id) {
270
return;
271
}
272
// The call to cudaSetDevice must come before any calls to Get, which
273
// may perform initailization using the GPU.
274
CUDA_CHECK(cudaSetDevice(device_id));
275
}
276
277
void _poly_nms(int* keep_out, int* num_out, const float* polys_host, int polys_num,
278
int polys_dim, float nms_overlap_thresh, int device_id) {
279
float* polys_dev = NULL;
280
unsigned long long* mask_dev = NULL;
281
const int col_blocks = DIVUP(polys_num, threadsPerBlock);
282
283
CUDA_CHECK(cudaMalloc(&polys_dev,
284
polys_num * polys_dim * sizeof(float)));
285
CUDA_CHECK(cudaMemcpy(polys_dev,
286
polys_host,
287
polys_num * polys_dim * sizeof(float),
288
cudaMemcpyHostToDevice));
289
290
CUDA_CHECK(cudaMalloc(&mask_dev,
291
polys_num * col_blocks * sizeof(unsigned long long)));
292
293
dim3 blocks(DIVUP(polys_num, threadsPerBlock),
294
DIVUP(polys_num, threadsPerBlock));
295
dim3 threads(threadsPerBlock);
296
// __global__ void poly_nms_kernel(const int n_polys, const float nms_overlap_thresh,
297
// const float *dev_polys, unsigned long long *dev_mask)
298
poly_nms_kernel<<<blocks, threads>>>(polys_num,
299
nms_overlap_thresh,
300
polys_dev,
301
mask_dev);
302
303
std::vector<unsigned long long> mask_host(polys_num * col_blocks);
304
CUDA_CHECK(cudaMemcpy(&mask_host[0],
305
mask_dev,
306
sizeof(unsigned long long) * polys_num * col_blocks,
307
cudaMemcpyDeviceToHost));
308
309
std::vector<unsigned long long> remv(col_blocks);
310
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
311
// TODO: figure out it
312
int num_to_keep = 0;
313
for (int i = 0; i < polys_num; i++) {
314
int nblock = i / threadsPerBlock;
315
int inblock = i % threadsPerBlock;
316
317
if (!(remv[nblock] & (1ULL << inblock))) {
318
keep_out[num_to_keep++] = i;
319
unsigned long long *p = &mask_host[0] + i * col_blocks;
320
for (int j = nblock; j < col_blocks; j++) {
321
remv[j] |= p[j];
322
}
323
}
324
}
325
*num_out = num_to_keep;
326
327
CUDA_CHECK(cudaFree(polys_dev));
328
CUDA_CHECK(cudaFree(mask_dev));
329
}
330
331
//
332
//int main(){
333
// double p[8] = {0, 0, 1, 0, 1, 1, 0, 1};
334
// double q[8] = {0.5, 0.5, 1.5, 0.5, 1.5, 1.5, 0.5, 1.5};
335
// vector<double> P(p, p + 8);
336
// vector<double> Q(q, q + 8);
337
// iou_poly(P, Q);
338
// return 0;
339
//}
340
341
//int main(){
342
// double p[8] = {0, 0, 1, 0, 1, 1, 0, 1};
343
// double q[8] = {0.5, 0.5, 1.5, 0.5, 1.5, 1.5, 0.5, 1.5};
344
// iou_poly(p, q);
345
// return 0;
346
//}
347