Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/master/utils/nms_rotated/src/poly_nms_cuda.cu
Views: 475
#include <ATen/ATen.h>1#include <ATen/cuda/CUDAContext.h>23#include <THC/THC.h>4#include <THC/THCDeviceUtils.cuh>56#include <vector>7#include <iostream>89#define CUDA_CHECK(condition) \10/* Code block avoids redefinition of cudaError_t error */ \11do { \12cudaError_t error = condition; \13if (error != cudaSuccess) { \14std::cout << cudaGetErrorString(error) << std::endl; \15} \16} while (0)1718#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))19int const threadsPerBlock = sizeof(unsigned long long) * 8;202122#define maxn 1023const double eps=1E-8;2425__device__ inline int sig(float d){26return(d>eps)-(d<-eps);27}2829__device__ inline int point_eq(const float2 a, const float2 b) {30return sig(a.x - b.x) == 0 && sig(a.y - b.y)==0;31}3233__device__ inline void point_swap(float2 *a, float2 *b) {34float2 temp = *a;35*a = *b;36*b = temp;37}3839__device__ inline void point_reverse(float2 *first, float2* last)40{41while ((first!=last)&&(first!=--last)) {42point_swap (first,last);43++first;44}45}4647__device__ inline float cross(float2 o,float2 a,float2 b){ //叉积48return(a.x-o.x)*(b.y-o.y)-(b.x-o.x)*(a.y-o.y);49}50__device__ inline float area(float2* ps,int n){51ps[n]=ps[0];52float res=0;53for(int i=0;i<n;i++){54res+=ps[i].x*ps[i+1].y-ps[i].y*ps[i+1].x;55}56return res/2.0;57}58__device__ inline int lineCross(float2 a,float2 b,float2 c,float2 d,float2&p){59float s1,s2;60s1=cross(a,b,c);61s2=cross(a,b,d);62if(sig(s1)==0&&sig(s2)==0) return 2;63if(sig(s2-s1)==0) return 0;64p.x=(c.x*s2-d.x*s1)/(s2-s1);65p.y=(c.y*s2-d.y*s1)/(s2-s1);66return 1;67}6869__device__ inline void polygon_cut(float2*p,int&n,float2 a,float2 b, float2* pp){7071int m=0;p[n]=p[0];72for(int i=0;i<n;i++){73if(sig(cross(a,b,p[i]))>0) pp[m++]=p[i];74if(sig(cross(a,b,p[i]))!=sig(cross(a,b,p[i+1])))75lineCross(a,b,p[i],p[i+1],pp[m++]);76}77n=0;78for(int i=0;i<m;i++)79if(!i||!(point_eq(pp[i], pp[i-1])))80p[n++]=pp[i];81// while(n>1&&p[n-1]==p[0])n--;82while(n>1&&point_eq(p[n-1], p[0]))n--;83}8485//---------------华丽的分隔线-----------------//86//返回三角形oab和三角形ocd的有向交面积,o是原点//87__device__ inline float intersectArea(float2 a,float2 b,float2 c,float2 d){88float2 o = make_float2(0,0);89int s1=sig(cross(o,a,b));90int s2=sig(cross(o,c,d));91if(s1==0||s2==0)return 0.0;//退化,面积为092// if(s1==-1) swap(a,b);93// if(s2==-1) swap(c,d);94if (s1 == -1) point_swap(&a, &b);95if (s2 == -1) point_swap(&c, &d);96float2 p[10]={o,a,b};97int n=3;98float2 pp[maxn];99polygon_cut(p,n,o,c,pp);100polygon_cut(p,n,c,d,pp);101polygon_cut(p,n,d,o,pp);102float res=fabs(area(p,n));103if(s1*s2==-1) res=-res;return res;104}105//求两多边形的交面积106__device__ inline float intersectArea(float2*ps1,int n1,float2*ps2,int n2){107if(area(ps1,n1)<0) point_reverse(ps1,ps1+n1);108if(area(ps2,n2)<0) point_reverse(ps2,ps2+n2);109ps1[n1]=ps1[0];110ps2[n2]=ps2[0];111float res=0;112for(int i=0;i<n1;i++){113for(int j=0;j<n2;j++){114res+=intersectArea(ps1[i],ps1[i+1],ps2[j],ps2[j+1]);115}116}117return res;//assumeresispositive!118}119120// TODO: optimal if by first calculate the iou between two hbbs121__device__ inline float devPolyIoU(float const * const p, float const * const q) {122float2 ps1[maxn], ps2[maxn];123int n1 = 4;124int n2 = 4;125for (int i = 0; i < 4; i++) {126ps1[i].x = p[i * 2];127ps1[i].y = p[i * 2 + 1];128129ps2[i].x = q[i * 2];130ps2[i].y = q[i * 2 + 1];131}132float inter_area = intersectArea(ps1, n1, ps2, n2);133float union_area = fabs(area(ps1, n1)) + fabs(area(ps2, n2)) - inter_area;134float iou = 0;135if (union_area == 0) {136iou = (inter_area + 1) / (union_area + 1);137} else {138iou = inter_area / union_area;139}140return iou;141}142143__global__ void poly_nms_kernel(const int n_polys, const float nms_overlap_thresh,144const float *dev_polys, unsigned long long *dev_mask) {145const int row_start = blockIdx.y;146const int col_start = blockIdx.x;147148const int row_size =149min(n_polys - row_start * threadsPerBlock, threadsPerBlock);150const int cols_size =151min(n_polys - col_start * threadsPerBlock, threadsPerBlock);152153__shared__ float block_polys[threadsPerBlock * 9];154if (threadIdx.x < cols_size) {155block_polys[threadIdx.x * 9 + 0] =156dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0];157block_polys[threadIdx.x * 9 + 1] =158dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1];159block_polys[threadIdx.x * 9 + 2] =160dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2];161block_polys[threadIdx.x * 9 + 3] =162dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3];163block_polys[threadIdx.x * 9 + 4] =164dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4];165block_polys[threadIdx.x * 9 + 5] =166dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5];167block_polys[threadIdx.x * 9 + 6] =168dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6];169block_polys[threadIdx.x * 9 + 7] =170dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7];171block_polys[threadIdx.x * 9 + 8] =172dev_polys[(threadsPerBlock * col_start + threadIdx.x) * 9 + 8];173}174__syncthreads();175176if (threadIdx.x < row_size) {177const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;178const float *cur_box = dev_polys + cur_box_idx * 9;179int i = 0;180unsigned long long t = 0;181int start = 0;182if (row_start == col_start) {183start = threadIdx.x + 1;184}185for (i = start; i < cols_size; i++) {186if (devPolyIoU(cur_box, block_polys + i * 9) > nms_overlap_thresh) {187t |= 1ULL << i;188}189}190const int col_blocks = THCCeilDiv(n_polys, threadsPerBlock);191dev_mask[cur_box_idx * col_blocks + col_start] = t;192}193}194195// boxes is a N x 9 tensor196at::Tensor poly_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {197198at::DeviceGuard guard(boxes.device());199200using scalar_t = float;201AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");202auto scores = boxes.select(1, 8);203auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));204auto boxes_sorted = boxes.index_select(0, order_t);205206int boxes_num = boxes.size(0);207208const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);209210scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();211212THCState *state = at::globalContext().lazyInitCUDA();213214unsigned long long* mask_dev = NULL;215216mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));217218dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),219THCCeilDiv(boxes_num, threadsPerBlock));220dim3 threads(threadsPerBlock);221poly_nms_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(boxes_num,222nms_overlap_thresh,223boxes_dev,224mask_dev);225226std::vector<unsigned long long> mask_host(boxes_num * col_blocks);227THCudaCheck(cudaMemcpyAsync(228&mask_host[0],229mask_dev,230sizeof(unsigned long long) * boxes_num * col_blocks,231cudaMemcpyDeviceToHost,232at::cuda::getCurrentCUDAStream()233));234235std::vector<unsigned long long> remv(col_blocks);236memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);237238at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));239int64_t* keep_out = keep.data_ptr<int64_t>();240241int num_to_keep = 0;242for (int i = 0; i < boxes_num; i++) {243int nblock = i / threadsPerBlock;244int inblock = i % threadsPerBlock;245246if (!(remv[nblock] & (1ULL << inblock))) {247keep_out[num_to_keep++] = i;248unsigned long long *p = &mask_host[0] + i * col_blocks;249for (int j = nblock; j < col_blocks; j++) {250remv[j] |= p[j];251}252}253}254255THCudaFree(state, mask_dev);256257return order_t.index({258keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(259order_t.device(), keep.scalar_type())});260}261262263264