// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*
3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#if defined(WITH_GPU)

#include <cuda.h>
#include <cuda_runtime_api.h>

#include "iou3d_nms.h"

namespace ultra_infer {
namespace paddle_custom_ops {

#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")

// #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
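// Integer ceiling division: the number of n-sized blocks needed to cover m.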
static inline int DIVUP(const int m, const int n) {
  return ((m) / (n) + ((m) % (n) > 0));
}

#define CHECK_ERROR(ans)                                                       \
  { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort)
      exit(code);
  }
}

#define D(x)                                                                   \
  PD_THROW('\n', x,                                                            \
           "\n--------------------------------- where is the error ? "        \
           "---------------------------------------\n");
static const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;

void boxesoverlapLauncher(const int num_a, const float *boxes_a,
                          const int num_b, const float *boxes_b,
                          float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a,
                         const int num_b, const float *boxes_b,
                         float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
                 float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
                       int boxes_num, float nms_overlap_thresh);
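// The launchers declared above are implemented in the accompanying CUDA
// kernel source.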
int boxes_overlap_bev_gpu(paddle::Tensor boxes_a, paddle::Tensor boxes_b,
                          paddle::Tensor ans_overlap) {
  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
  // params ans_overlap: (N, M)
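  // ans_overlap receives the pairwise overlap area of the rotated boxes in
  // the bird's-eye view.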
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_overlap);

  int num_a = boxes_a.shape()[0];
  int num_b = boxes_b.shape()[0];

  const float *boxes_a_data = boxes_a.data<float>();
  const float *boxes_b_data = boxes_b.data<float>();
  float *ans_overlap_data = ans_overlap.data<float>();

  boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
                       ans_overlap_data);

  return 1;
}
int boxes_iou_bev_gpu(paddle::Tensor boxes_a, paddle::Tensor boxes_b,
                      paddle::Tensor ans_iou) {
  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
  // params ans_iou: (N, M)
  CHECK_INPUT(boxes_a);
  CHECK_INPUT(boxes_b);
  CHECK_INPUT(ans_iou);

  int num_a = boxes_a.shape()[0];
  int num_b = boxes_b.shape()[0];

  const float *boxes_a_data = boxes_a.data<float>();
  const float *boxes_b_data = boxes_b.data<float>();
  float *ans_iou_data = ans_iou.data<float>();

  boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);

  return 1;
}
std::vector<paddle::Tensor> nms_gpu(const paddle::Tensor &boxes,
                                    float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)
  CHECK_INPUT(boxes);
  // CHECK_CONTIGUOUS(keep);

  auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
                            paddle::CPUPlace());
  auto num_to_keep_tensor =
      paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
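  // keep and num_to_keep are CPU tensors: the suppression masks are copied
  // back to the host and the final gather loop below runs there.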
  int *num_to_keep_data = num_to_keep_tensor.data<int>();

  int boxes_num = boxes.shape()[0];
  const float *boxes_data = boxes.data<float>();
  int *keep_data = keep.data<int>();

  int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
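  // Mask layout: one 64-bit word per (box, block-of-64-boxes) pair. Bit j of
  // mask[i * col_blocks + b] is set when box (b * 64 + j) overlaps box i by
  // more than nms_overlap_thresh.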
  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  // WARN(qiuyanjun): the variable-length array below fails to compile on
  // Windows with MSVC, so we use a std::vector to store the result instead.
  // unsigned long long remv_cpu[col_blocks];
  // memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));
  std::vector<unsigned long long> remv_cpu(col_blocks, 0);

  int num_to_keep = 0;
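  // Greedy gather on the CPU: boxes are assumed to arrive already sorted by
  // score. A box is kept if no previously kept box has flagged it in
  // remv_cpu; its own suppression row is then OR-ed into remv_cpu so the
  // lower-scoring boxes it overlaps are skipped.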
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  num_to_keep_data[0] = num_to_keep;

  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err)
    fprintf(stderr, "CUDA error in nms_gpu: %s\n", cudaGetErrorString(err));

  return {keep, num_to_keep_tensor};
}
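// Same greedy NMS gather as nms_gpu above; only the device launcher differs
// in how pairwise box overlap is computed.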
int nms_normal_gpu(paddle::Tensor boxes, paddle::Tensor keep,
                   float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params keep: (N)
  CHECK_INPUT(boxes);
  // CHECK_CONTIGUOUS(keep);

  int boxes_num = boxes.shape()[0];
  const float *boxes_data = boxes.data<float>();
// WARN(qiuyanjun): the long instantiation of the Tensor::data() API is not
// exported by Paddle, which causes a link error on Windows with MSVC. Please
// check:
// https://github.com/PaddlePaddle/Paddle/blob/release/2.5/paddle/phi/api/lib/tensor.cc
#if defined(_WIN32)
  int *keep_data = keep.data<int>();
#else
  long *keep_data = keep.data<long>();
#endif
  int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  unsigned long long *mask_data = NULL;
  CHECK_ERROR(cudaMalloc((void **)&mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long)));
  nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);

  // unsigned long long mask_cpu[boxes_num * col_blocks];
  // unsigned long long *mask_cpu = new unsigned long long [boxes_num *
  // col_blocks];
  std::vector<unsigned long long> mask_cpu(boxes_num * col_blocks);

  // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
  CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data,
                         boxes_num * col_blocks * sizeof(unsigned long long),
                         cudaMemcpyDeviceToHost));

  cudaFree(mask_data);

  // WARN(qiuyanjun): the variable-length array below fails to compile on
  // Windows with MSVC, so we use a std::vector to store the result instead.
  // unsigned long long remv_cpu[col_blocks];
  // memset(remv_cpu, 0, col_blocks * sizeof(unsigned long long));
  std::vector<unsigned long long> remv_cpu(col_blocks, 0);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      unsigned long long *p = &mask_cpu[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }

  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err)
    fprintf(stderr, "CUDA error in nms_normal_gpu: %s\n",
            cudaGetErrorString(err));

  return num_to_keep;
}

} // namespace paddle_custom_ops
} // namespace ultra_infer

#endif