Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/master/utils/nms_rotated/src/nms_rotated_cuda.cu
Views: 475
// Modified from1// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/nms_rotated2// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved3#include <ATen/ATen.h>4#include <ATen/cuda/CUDAContext.h>5#include <c10/cuda/CUDAGuard.h>6#include <ATen/cuda/CUDAApplyUtils.cuh>7#include "box_iou_rotated_utils.h"89int const threadsPerBlock = sizeof(unsigned long long) * 8;1011template <typename T>12__global__ void nms_rotated_cuda_kernel(13const int n_boxes,14const float iou_threshold,15const T* dev_boxes,16unsigned long long* dev_mask) {17// nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel1819const int row_start = blockIdx.y;20const int col_start = blockIdx.x;2122// if (row_start > col_start) return;2324const int row_size =25min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);26const int col_size =27min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);2829// Compared to nms_cuda_kernel, where each box is represented with 4 values30// (x1, y1, x2, y2), each rotated box is represented with 5 values31// (x_center, y_center, width, height, angle_degrees) here.32__shared__ T block_boxes[threadsPerBlock * 5];33if (threadIdx.x < col_size) {34block_boxes[threadIdx.x * 5 + 0] =35dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];36block_boxes[threadIdx.x * 5 + 1] =37dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];38block_boxes[threadIdx.x * 5 + 2] =39dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];40block_boxes[threadIdx.x * 5 + 3] =41dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];42block_boxes[threadIdx.x * 5 + 4] =43dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];44}45__syncthreads();4647if (threadIdx.x < row_size) {48const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;49const T* cur_box = dev_boxes + cur_box_idx * 5;50int i = 0;51unsigned long long t = 0;52int start = 0;53if (row_start == col_start) {54start = threadIdx.x + 1;55}56for (i = start; i < col_size; i++) {57// Instead of devIoU used by original horizontal nms, here58// we use the single_box_iou_rotated function from box_iou_rotated_utils.h59if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >60iou_threshold) {61t |= 1ULL << i;62}63}64const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);65dev_mask[cur_box_idx * col_blocks + col_start] = t;66}67}686970at::Tensor nms_rotated_cuda(71// input must be contiguous72const at::Tensor& dets,73const at::Tensor& scores,74float iou_threshold) {75// using scalar_t = float;76AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");77AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");78at::cuda::CUDAGuard device_guard(dets.device());7980auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));81auto dets_sorted = dets.index_select(0, order_t);8283auto dets_num = dets.size(0);8485const int col_blocks =86at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);8788at::Tensor mask =89at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));9091dim3 blocks(col_blocks, col_blocks);92dim3 threads(threadsPerBlock);93cudaStream_t stream = at::cuda::getCurrentCUDAStream();9495AT_DISPATCH_FLOATING_TYPES(96dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {97nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(98dets_num,99iou_threshold,100dets_sorted.data_ptr<scalar_t>(),101(unsigned long long*)mask.data_ptr<int64_t>());102});103104at::Tensor mask_cpu = mask.to(at::kCPU);105unsigned long long* mask_host =106(unsigned long long*)mask_cpu.data_ptr<int64_t>();107108std::vector<unsigned long long> remv(col_blocks);109memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);110111at::Tensor keep =112at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));113int64_t* keep_out = keep.data_ptr<int64_t>();114115int num_to_keep = 0;116for (int i = 0; i < dets_num; i++) {117int nblock = i / threadsPerBlock;118int inblock = i % threadsPerBlock;119120if (!(remv[nblock] & (1ULL << inblock))) {121keep_out[num_to_keep++] = i;122unsigned long long* p = mask_host + i * col_blocks;123for (int j = nblock; j < col_blocks; j++) {124remv[j] |= p[j];125}126}127}128129AT_CUDA_CHECK(cudaGetLastError());130return order_t.index(131{keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)132.to(order_t.device(), keep.scalar_type())});133}134135136