Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/master/DOTA_devkit/poly_nms_gpu/poly_nms_kernel.cu
Views: 475
1#include "poly_nms.hpp"2#include <vector>3#include <iostream>4#include <cmath>5#include <cstdio>6#include<algorithm>78using namespace std;910//##define CUDA_CHECK(condition)\11//12// do {13// cudaError_t error = condition;14// if (error != cudaSuccess) {15//16// }17// }1819#define CUDA_CHECK(condition) \20/* Code block avoids redefinition of cudaError_t error */ \21do { \22cudaError_t error = condition; \23if (error != cudaSuccess) { \24std::cout << cudaGetErrorString(error) << std::endl; \25} \26} while (0)2728#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))29int const threadsPerBlock = sizeof(unsigned long long) * 8;303132#define maxn 5133const double eps=1E-8;3435__device__ inline int sig(float d){36return(d>eps)-(d<-eps);37}38// struct Point{39// double x,y; Point(){}40// Point(double x,double y):x(x),y(y){}41// bool operator==(const Point&p)const{42// return sig(x-p.x)==0&&sig(y-p.y)==0;43// }44// };4546__device__ inline int point_eq(const float2 a, const float2 b) {47return sig(a.x - b.x) == 0 && sig(a.y - b.y)==0;48}4950__device__ inline void point_swap(float2 *a, float2 *b) {51float2 temp = *a;52*a = *b;53*b = temp;54}5556__device__ inline void point_reverse(float2 *first, float2* last)57{58while ((first!=last)&&(first!=--last)) {59point_swap (first,last);60++first;61}62}63// void point_reverse(Point* first, Point* last)64// {65// while ((first!=last)&&(first!=--last)) {66// point_swap (first,last);67// ++first;68// }69// }707172__device__ inline float cross(float2 o,float2 a,float2 b){ //叉积73return(a.x-o.x)*(b.y-o.y)-(b.x-o.x)*(a.y-o.y);74}75__device__ inline float area(float2* ps,int n){76ps[n]=ps[0];77float res=0;78for(int i=0;i<n;i++){79res+=ps[i].x*ps[i+1].y-ps[i].y*ps[i+1].x;80}81return res/2.0;82}83__device__ inline int lineCross(float2 a,float2 b,float2 c,float2 d,float2&p){84float s1,s2;85s1=cross(a,b,c);86s2=cross(a,b,d);87if(sig(s1)==0&&sig(s2)==0) return 2;88if(sig(s2-s1)==0) return 0;89p.x=(c.x*s2-d.x*s1)/(s2-s1);90p.y=(c.y*s2-d.y*s1)/(s2-s1);91return 1;92}93949596//多边形切割97//用直线ab切割多边形p,切割后的在向量(a,b)的左侧,并原地保存切割结果98//如果退化为一个点,也会返回去,此时n为199// __device__ inline void polygon_cut(float2*p,int&n,float2 a,float2 b){100// static float2 pp[maxn];101// int m=0;p[n]=p[0];102// for(int i=0;i<n;i++){103// if(sig(cross(a,b,p[i]))>0) pp[m++]=p[i];104// if(sig(cross(a,b,p[i]))!=sig(cross(a,b,p[i+1])))105// lineCross(a,b,p[i],p[i+1],pp[m++]);106// }107// n=0;108// for(int i=0;i<m;i++)109// if(!i||!(point_eq(pp[i], pp[i-1])))110// p[n++]=pp[i];111// // while(n>1&&p[n-1]==p[0])n--;112// while(n>1&&point_eq(p[n-1], p[0]))n--;113// }114115__device__ inline void polygon_cut(float2*p,int&n,float2 a,float2 b, float2* pp){116117int m=0;p[n]=p[0];118for(int i=0;i<n;i++){119if(sig(cross(a,b,p[i]))>0) pp[m++]=p[i];120if(sig(cross(a,b,p[i]))!=sig(cross(a,b,p[i+1])))121lineCross(a,b,p[i],p[i+1],pp[m++]);122}123n=0;124for(int i=0;i<m;i++)125if(!i||!(point_eq(pp[i], pp[i-1])))126p[n++]=pp[i];127// while(n>1&&p[n-1]==p[0])n--;128while(n>1&&point_eq(p[n-1], p[0]))n--;129}130131//---------------华丽的分隔线-----------------//132//返回三角形oab和三角形ocd的有向交面积,o是原点//133__device__ inline float intersectArea(float2 a,float2 b,float2 c,float2 d){134float2 o = make_float2(0,0);135int s1=sig(cross(o,a,b));136int s2=sig(cross(o,c,d));137if(s1==0||s2==0)return 0.0;//退化,面积为0138// if(s1==-1) swap(a,b);139// if(s2==-1) swap(c,d);140if (s1 == -1) point_swap(&a, &b);141if (s2 == -1) point_swap(&c, &d);142float2 p[10]={o,a,b};143int n=3;144float2 pp[maxn];145polygon_cut(p,n,o,c,pp);146polygon_cut(p,n,c,d,pp);147polygon_cut(p,n,d,o,pp);148float res=fabs(area(p,n));149if(s1*s2==-1) res=-res;return res;150}151//求两多边形的交面积152__device__ inline float intersectArea(float2*ps1,int n1,float2*ps2,int n2){153if(area(ps1,n1)<0) point_reverse(ps1,ps1+n1);154if(area(ps2,n2)<0) point_reverse(ps2,ps2+n2);155ps1[n1]=ps1[0];156ps2[n2]=ps2[0];157float res=0;158for(int i=0;i<n1;i++){159for(int j=0;j<n2;j++){160res+=intersectArea(ps1[i],ps1[i+1],ps2[j],ps2[j+1]);161}162}163return res;//assumeresispositive!164}165166167168169//__device__ inline double iou_poly(vector<double> p, vector<double> q) {170// Point ps1[maxn],ps2[maxn];171// int n1 = 4;172// int n2 = 4;173// for (int i = 0; i < 4; i++) {174// ps1[i].x = p[i * 2];175// ps1[i].y = p[i * 2 + 1];176//177// ps2[i].x = q[i * 2];178// ps2[i].y = q[i * 2 + 1];179// }180// double inter_area = intersectArea(ps1, n1, ps2, n2);181// double union_area = fabs(area(ps1, n1)) + fabs(area(ps2, n2)) - inter_area;182// double iou = inter_area / union_area;183//184//// cout << "inter_area:" << inter_area << endl;185//// cout << "union_area:" << union_area << endl;186//// cout << "iou:" << iou << endl;187//188// return iou;189//}190191__device__ inline float devPolyIoU(float const * const p, float const * const q) {192float2 ps1[maxn], ps2[maxn];193int n1 = 4;194int n2 = 4;195for (int i = 0; i < 4; i++) {196ps1[i].x = p[i * 2];197ps1[i].y = p[i * 2 + 1];198199ps2[i].x = q[i * 2];200ps2[i].y = q[i * 2 + 1];201}202float inter_area = intersectArea(ps1, n1, ps2, n2);203float union_area = fabs(area(ps1, n1)) + fabs(area(ps2, n2)) - inter_area;204float iou = 0;205if (union_area == 0) {206iou = (inter_area + 1) / (union_area + 1);207} else {208iou = inter_area / union_area;209}210return iou;211}212213__global__ void poly_nms_kernel(const int n_polys, const float nms_overlap_thresh,214const float *dev_polys, unsigned long long *dev_mask) {215const int row_start = blockIdx.y;216const int col_start = blockIdx.x;217218const int row_size =219min(n_polys - row_start * threadsPerBlock, threadsPerBlock);220const int cols_size =221min(n_polys - col_start * threadsPerBlock, threadsPerBlock);222223__shared__ float block_polys[threadsPerBlock * 9];224if (threadIdx.x < cols_size) {225block_polys[threadIdx.x * 9 + 0] =226dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0];227block_polys[threadIdx.x * 9 + 1] =228dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1];229block_polys[threadIdx.x * 9 + 2] =230dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2];231block_polys[threadIdx.x * 9 + 3] =232dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3];233block_polys[threadIdx.x * 9 + 4] =234dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4];235block_polys[threadIdx.x * 9 + 5] =236dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5];237block_polys[threadIdx.x * 9 + 6] =238dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6];239block_polys[threadIdx.x * 9 + 7] =240dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7];241block_polys[threadIdx.x * 9 + 8] =242dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 8];243}244__syncthreads();245246if (threadIdx.x < row_size) {247const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;248const float *cur_box = dev_polys + cur_box_idx * 9;249int i = 0;250unsigned long long t = 0;251int start = 0;252if (row_start == col_start) {253start = threadIdx.x + 1;254}255for (i = start; i < cols_size; i++) {256if (devPolyIoU(cur_box, block_polys + i * 9) > nms_overlap_thresh) {257t |= 1ULL << i;258}259}260const int col_blocks = DIVUP(n_polys, threadsPerBlock);261dev_mask[cur_box_idx * col_blocks + col_start] = t;262}263}264265void _set_device(int device_id) {266int current_device;267CUDA_CHECK(cudaGetDevice(¤t_device));268if (current_device == device_id) {269return;270}271// The call to cudaSetDevice must come before any calls to Get, which272// may perform initailization using the GPU.273CUDA_CHECK(cudaSetDevice(device_id));274}275276void _poly_nms(int* keep_out, int* num_out, const float* polys_host, int polys_num,277int polys_dim, float nms_overlap_thresh, int device_id) {278float* polys_dev = NULL;279unsigned long long* mask_dev = NULL;280const int col_blocks = DIVUP(polys_num, threadsPerBlock);281282CUDA_CHECK(cudaMalloc(&polys_dev,283polys_num * polys_dim * sizeof(float)));284CUDA_CHECK(cudaMemcpy(polys_dev,285polys_host,286polys_num * polys_dim * sizeof(float),287cudaMemcpyHostToDevice));288289CUDA_CHECK(cudaMalloc(&mask_dev,290polys_num * col_blocks * sizeof(unsigned long long)));291292dim3 blocks(DIVUP(polys_num, threadsPerBlock),293DIVUP(polys_num, threadsPerBlock));294dim3 threads(threadsPerBlock);295// __global__ void poly_nms_kernel(const int n_polys, const float nms_overlap_thresh,296// const float *dev_polys, unsigned long long *dev_mask)297poly_nms_kernel<<<blocks, threads>>>(polys_num,298nms_overlap_thresh,299polys_dev,300mask_dev);301302std::vector<unsigned long long> mask_host(polys_num * col_blocks);303CUDA_CHECK(cudaMemcpy(&mask_host[0],304mask_dev,305sizeof(unsigned long long) * polys_num * col_blocks,306cudaMemcpyDeviceToHost));307308std::vector<unsigned long long> remv(col_blocks);309memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);310// TODO: figure out it311int num_to_keep = 0;312for (int i = 0; i < polys_num; i++) {313int nblock = i / threadsPerBlock;314int inblock = i % threadsPerBlock;315316if (!(remv[nblock] & (1ULL << inblock))) {317keep_out[num_to_keep++] = i;318unsigned long long *p = &mask_host[0] + i * col_blocks;319for (int j = nblock; j < col_blocks; j++) {320remv[j] |= p[j];321}322}323}324*num_out = num_to_keep;325326CUDA_CHECK(cudaFree(polys_dev));327CUDA_CHECK(cudaFree(mask_dev));328}329330//331//int main(){332// double p[8] = {0, 0, 1, 0, 1, 1, 0, 1};333// double q[8] = {0.5, 0.5, 1.5, 0.5, 1.5, 1.5, 0.5, 1.5};334// vector<double> P(p, p + 8);335// vector<double> Q(q, q + 8);336// iou_poly(P, Q);337// return 0;338//}339340//int main(){341// double p[8] = {0, 0, 1, 0, 1, 1, 0, 1};342// double q[8] = {0.5, 0.5, 1.5, 0.5, 1.5, 1.5, 0.5, 1.5};343// iou_poly(p, q);344// return 0;345//}346347