CoCalc -- poly_nms

hukaixuan19970627
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: hukaixuan19970627/yolov5_obb
Path: blob/master/DOTA_devkit/poly_nms_gpu/poly_nms_kernel.cu
Views: ⁴⁷⁵
1

2
#include "poly_nms.hpp"
3
#include <vector>
4
#include <iostream>
5
#include <cmath>
6
#include <cstdio>
7
#include<algorithm>
8

9
using namespace std;
10

11
//##define CUDA_CHECK(condition)\
12
//
13
//  do {
14
//    cudaError_t error = condition;
15
//    if (error != cudaSuccess) {
16
//
17
//    }
18
//  }
19

20
#define CUDA_CHECK(condition) \
21
  /* Code block avoids redefinition of cudaError_t error */ \
22
  do { \
23
    cudaError_t error = condition; \
24
    if (error != cudaSuccess) { \
25
      std::cout << cudaGetErrorString(error) << std::endl; \
26
    } \
27
  } while (0)
28

29
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
30
int const threadsPerBlock = sizeof(unsigned long long) * 8;
31

32

33
#define maxn 51
34
const double eps=1E-8;
35

36
__device__ inline int sig(float d){
37
    return(d>eps)-(d<-eps);
38
}
39
// struct Point{
40
//     double x,y; Point(){}
41
//     Point(double x,double y):x(x),y(y){}
42
//     bool operator==(const Point&p)const{
43
//         return sig(x-p.x)==0&&sig(y-p.y)==0;
44
//     }
45
// };
46

47
__device__ inline int point_eq(const float2 a, const float2 b) {
48
    return sig(a.x - b.x) == 0 && sig(a.y - b.y)==0;
49
}
50

51
__device__ inline void point_swap(float2 *a, float2 *b) {
52
    float2 temp = *a;
53
    *a = *b;
54
    *b = temp;
55
}
56

57
__device__ inline void point_reverse(float2 *first, float2* last)
58
{
59
    while ((first!=last)&&(first!=--last)) {
60
        point_swap (first,last);
61
        ++first;
62
    }
63
}
64
// void point_reverse(Point* first, Point* last)
65
// {
66
//     while ((first!=last)&&(first!=--last)) {
67
//         point_swap (first,last);
68
//         ++first;
69
//     }
70
// }
71

72

73
__device__ inline float cross(float2 o,float2 a,float2 b){  //叉积
74
    return(a.x-o.x)*(b.y-o.y)-(b.x-o.x)*(a.y-o.y);
75
}
76
__device__ inline float area(float2* ps,int n){
77
    ps[n]=ps[0];
78
    float res=0;
79
    for(int i=0;i<n;i++){
80
        res+=ps[i].x*ps[i+1].y-ps[i].y*ps[i+1].x;
81
    }
82
    return res/2.0;
83
}
84
__device__ inline int lineCross(float2 a,float2 b,float2 c,float2 d,float2&p){
85
    float s1,s2;
86
    s1=cross(a,b,c);
87
    s2=cross(a,b,d);
88
    if(sig(s1)==0&&sig(s2)==0) return 2;
89
    if(sig(s2-s1)==0) return 0;
90
    p.x=(c.x*s2-d.x*s1)/(s2-s1);
91
    p.y=(c.y*s2-d.y*s1)/(s2-s1);
92
    return 1;
93
}
94

95

96

97
//多边形切割
98
//用直线ab切割多边形p，切割后的在向量(a,b)的左侧，并原地保存切割结果
99
//如果退化为一个点，也会返回去,此时n为1
100
// __device__ inline void polygon_cut(float2*p,int&n,float2 a,float2 b){
101
//     static float2 pp[maxn];
102
//     int m=0;p[n]=p[0];
103
//     for(int i=0;i<n;i++){
104
//         if(sig(cross(a,b,p[i]))>0) pp[m++]=p[i];
105
//         if(sig(cross(a,b,p[i]))!=sig(cross(a,b,p[i+1])))
106
//             lineCross(a,b,p[i],p[i+1],pp[m++]);
107
//     }
108
//     n=0;
109
//     for(int i=0;i<m;i++)
110
//         if(!i||!(point_eq(pp[i], pp[i-1])))
111
//             p[n++]=pp[i];
112
//     // while(n>1&&p[n-1]==p[0])n--;
113
//     while(n>1&&point_eq(p[n-1], p[0]))n--;
114
// }
115

116
__device__ inline void polygon_cut(float2*p,int&n,float2 a,float2 b, float2* pp){
117

118
    int m=0;p[n]=p[0];
119
    for(int i=0;i<n;i++){
120
        if(sig(cross(a,b,p[i]))>0) pp[m++]=p[i];
121
        if(sig(cross(a,b,p[i]))!=sig(cross(a,b,p[i+1])))
122
            lineCross(a,b,p[i],p[i+1],pp[m++]);
123
    }
124
    n=0;
125
    for(int i=0;i<m;i++)
126
        if(!i||!(point_eq(pp[i], pp[i-1])))
127
            p[n++]=pp[i];
128
    // while(n>1&&p[n-1]==p[0])n--;
129
    while(n>1&&point_eq(p[n-1], p[0]))n--;
130
}
131

132
//---------------华丽的分隔线-----------------//
133
//返回三角形oab和三角形ocd的有向交面积,o是原点//
134
__device__ inline float intersectArea(float2 a,float2 b,float2 c,float2 d){
135
    float2 o = make_float2(0,0);
136
    int s1=sig(cross(o,a,b));
137
    int s2=sig(cross(o,c,d));
138
    if(s1==0||s2==0)return 0.0;//退化，面积为0
139
    // if(s1==-1) swap(a,b);
140
    // if(s2==-1) swap(c,d);
141
    if (s1 == -1) point_swap(&a, &b);
142
    if (s2 == -1) point_swap(&c, &d);
143
    float2 p[10]={o,a,b};
144
    int n=3;
145
    float2 pp[maxn];
146
    polygon_cut(p,n,o,c,pp);
147
    polygon_cut(p,n,c,d,pp);
148
    polygon_cut(p,n,d,o,pp);
149
    float res=fabs(area(p,n));
150
    if(s1*s2==-1) res=-res;return res;
151
}
152
//求两多边形的交面积
153
__device__ inline float intersectArea(float2*ps1,int n1,float2*ps2,int n2){
154
    if(area(ps1,n1)<0) point_reverse(ps1,ps1+n1);
155
    if(area(ps2,n2)<0) point_reverse(ps2,ps2+n2);
156
    ps1[n1]=ps1[0];
157
    ps2[n2]=ps2[0];
158
    float res=0;
159
    for(int i=0;i<n1;i++){
160
        for(int j=0;j<n2;j++){
161
            res+=intersectArea(ps1[i],ps1[i+1],ps2[j],ps2[j+1]);
162
        }
163
    }
164
    return res;//assumeresispositive!
165
}
166

167

168

169

170
//__device__ inline double iou_poly(vector<double> p, vector<double> q) {
171
//    Point ps1[maxn],ps2[maxn];
172
//    int n1 = 4;
173
//    int n2 = 4;
174
//    for (int i = 0; i < 4; i++) {
175
//        ps1[i].x = p[i * 2];
176
//        ps1[i].y = p[i * 2 + 1];
177
//
178
//        ps2[i].x = q[i * 2];
179
//        ps2[i].y = q[i * 2 + 1];
180
//    }
181
//    double inter_area = intersectArea(ps1, n1, ps2, n2);
182
//    double union_area = fabs(area(ps1, n1)) + fabs(area(ps2, n2)) - inter_area;
183
//    double iou = inter_area / union_area;
184
//
185
////    cout << "inter_area:" << inter_area << endl;
186
////    cout << "union_area:" << union_area << endl;
187
////    cout << "iou:" << iou << endl;
188
//
189
//    return iou;
190
//}
191

192
__device__ inline float devPolyIoU(float const * const p, float const * const q) {
193
    float2 ps1[maxn], ps2[maxn];
194
    int n1 = 4;
195
    int n2 = 4;
196
    for (int i = 0; i < 4; i++) {
197
        ps1[i].x = p[i * 2];
198
        ps1[i].y = p[i * 2 + 1];
199

200
        ps2[i].x = q[i * 2];
201
        ps2[i].y = q[i * 2 + 1];
202
    }
203
    float inter_area = intersectArea(ps1, n1, ps2, n2);
204
    float union_area = fabs(area(ps1, n1)) + fabs(area(ps2, n2)) - inter_area;
205
    float iou = 0;
206
    if (union_area == 0) {
207
        iou = (inter_area + 1) / (union_area + 1);
208
    } else {
209
        iou = inter_area / union_area;
210
    }
211
    return iou;
212
}
213

214
__global__ void poly_nms_kernel(const int n_polys, const float nms_overlap_thresh,
215
                            const float *dev_polys, unsigned long long *dev_mask) {
216
    const int row_start = blockIdx.y;
217
    const int col_start = blockIdx.x;
218

219
    const int row_size =
220
            min(n_polys - row_start * threadsPerBlock, threadsPerBlock);
221
    const int cols_size =
222
            min(n_polys - col_start * threadsPerBlock, threadsPerBlock);
223

224
    __shared__ float block_polys[threadsPerBlock * 9];
225
    if (threadIdx.x < cols_size) {
226
        block_polys[threadIdx.x * 9 + 0] =
227
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0];
228
        block_polys[threadIdx.x * 9 + 1] =
229
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1];
230
        block_polys[threadIdx.x * 9 + 2] =
231
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2];
232
        block_polys[threadIdx.x * 9 + 3] =
233
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3];
234
        block_polys[threadIdx.x * 9 + 4] =
235
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4];
236
        block_polys[threadIdx.x * 9 + 5] =
237
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5];
238
        block_polys[threadIdx.x * 9 + 6] =
239
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6];
240
        block_polys[threadIdx.x * 9 + 7] =
241
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7];
242
        block_polys[threadIdx.x * 9 + 8] =
243
            dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 8];
244
    }
245
    __syncthreads();
246

247
    if (threadIdx.x < row_size) {
248
        const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
249
        const float *cur_box = dev_polys + cur_box_idx * 9;
250
        int i = 0;
251
        unsigned long long t = 0;
252
        int start = 0;
253
        if (row_start == col_start) {
254
            start = threadIdx.x + 1;
255
        }
256
        for (i = start; i < cols_size; i++) {
257
            if (devPolyIoU(cur_box, block_polys + i * 9) > nms_overlap_thresh) {
258
                t |= 1ULL << i;
259
            }
260
        }
261
        const int col_blocks = DIVUP(n_polys, threadsPerBlock);
262
        dev_mask[cur_box_idx * col_blocks + col_start] = t;
263
    }
264
}
265

266
void _set_device(int device_id) {
267
    int current_device;
268
    CUDA_CHECK(cudaGetDevice(&current_device));
269
    if (current_device == device_id) {
270
        return;
271
    }
272
   // The call to cudaSetDevice must come before any calls to Get, which
273
   // may perform initailization using the GPU.
274
   CUDA_CHECK(cudaSetDevice(device_id));
275
}
276

277
void _poly_nms(int* keep_out, int* num_out, const float* polys_host, int polys_num,
278
            int polys_dim, float nms_overlap_thresh, int device_id) {
279
    float* polys_dev = NULL;
280
    unsigned long long* mask_dev = NULL;
281
    const int col_blocks = DIVUP(polys_num, threadsPerBlock);
282

283
    CUDA_CHECK(cudaMalloc(&polys_dev,
284
                        polys_num * polys_dim * sizeof(float)));
285
    CUDA_CHECK(cudaMemcpy(polys_dev,
286
                          polys_host,
287
                          polys_num * polys_dim * sizeof(float),
288
                          cudaMemcpyHostToDevice));
289

290
    CUDA_CHECK(cudaMalloc(&mask_dev,
291
                          polys_num * col_blocks * sizeof(unsigned long long)));
292

293
    dim3 blocks(DIVUP(polys_num, threadsPerBlock),
294
                DIVUP(polys_num, threadsPerBlock));
295
    dim3 threads(threadsPerBlock);
296
//    __global__ void poly_nms_kernel(const int n_polys, const float nms_overlap_thresh,
297
//                            const float *dev_polys, unsigned long long *dev_mask)
298
    poly_nms_kernel<<<blocks, threads>>>(polys_num,
299
                                           nms_overlap_thresh,
300
                                           polys_dev,
301
                                           mask_dev);
302

303
    std::vector<unsigned long long> mask_host(polys_num * col_blocks);
304
    CUDA_CHECK(cudaMemcpy(&mask_host[0],
305
                           mask_dev,
306
                           sizeof(unsigned long long) * polys_num * col_blocks,
307
                           cudaMemcpyDeviceToHost));
308

309
    std::vector<unsigned long long> remv(col_blocks);
310
    memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
311
    // TODO: figure out it
312
    int num_to_keep = 0;
313
    for (int i = 0; i < polys_num; i++) {
314
        int nblock = i / threadsPerBlock;
315
        int inblock = i % threadsPerBlock;
316

317
        if (!(remv[nblock] & (1ULL << inblock))) {
318
            keep_out[num_to_keep++] = i;
319
            unsigned long long *p = &mask_host[0] + i * col_blocks;
320
            for (int j = nblock; j < col_blocks; j++) {
321
                remv[j] |= p[j];
322
            }
323
        }
324
    }
325
    *num_out = num_to_keep;
326

327
    CUDA_CHECK(cudaFree(polys_dev));
328
    CUDA_CHECK(cudaFree(mask_dev));
329
}
330

331
//
332
//int main(){
333
//    double p[8] = {0, 0, 1, 0, 1, 1, 0, 1};
334
//    double q[8] = {0.5, 0.5, 1.5, 0.5, 1.5, 1.5, 0.5, 1.5};
335
//    vector<double> P(p, p + 8);
336
//    vector<double> Q(q, q + 8);
337
//    iou_poly(P, Q);
338
//    return 0;
339
//}
340

341
//int main(){
342
//    double p[8] = {0, 0, 1, 0, 1, 1, 0, 1};
343
//    double q[8] = {0.5, 0.5, 1.5, 0.5, 1.5, 1.5, 0.5, 1.5};
344
//    iou_poly(p, q);
345
//    return 0;
346
//}
347
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place. Commercial Alternative to JupyterHub.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.