- // Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- /*
- 3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
- Written by Shaoshuai Shi
- All Rights Reserved 2019-2020.
- */
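- // This file provides the host-side glue for four GPU ops: pairwise BEV
- // overlap and IoU matrices between two box sets, plus two NMS variants.
- // The CUDA kernels are invoked through the *Launcher functions declared
- // below.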
- #include "iou3d_nms.h"
- #include <cuda.h>
- #include <cuda_runtime_api.h>
- #include <paddle/extension.h>
- #include <vector>
- #include <cstdio>   // printf
- #include <cstring>  // memset
- #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
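- // DIVUP is ceiling division: DIVUP(64, 64) == 1 and DIVUP(65, 64) == 2.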
- const int THREADS_PER_BLOCK_NMS = sizeof(int64_t) * 8;
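- // = 64: each int64_t word of the NMS mask carries one suppression bit per
- // box in a 64-box block, so the thread-block size matches the word width.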
- void BoxesOverlapLauncher(const cudaStream_t &stream, const int num_a,
- const float *boxes_a, const int num_b,
- const float *boxes_b, float *ans_overlap);
- void BoxesIouBevLauncher(const cudaStream_t &stream, const int num_a,
- const float *boxes_a, const int num_b,
- const float *boxes_b, float *ans_iou);
- void NmsLauncher(const cudaStream_t &stream, const float *boxes, int64_t *mask,
- int boxes_num, float nms_overlap_thresh);
- void NmsNormalLauncher(const cudaStream_t &stream, const float *boxes,
- int64_t *mask, int boxes_num, float nms_overlap_thresh);
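- // The launchers above are implemented in the companion CUDA source; each
- // enqueues its kernel on the stream supplied by the caller.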
- std::vector<paddle::Tensor>
- boxes_overlap_bev_gpu(const paddle::Tensor &boxes_a,
- const paddle::Tensor &boxes_b) {
- // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
- // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
- // params ans_overlap: (N, M)
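- // ans_overlap[i][j] is the pairwise BEV overlap of boxes_a[i] and
- // boxes_b[j] (the intersection area, per the launcher's name, not the IoU).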
- int num_a = boxes_a.shape()[0];
- int num_b = boxes_b.shape()[0];
- const float *boxes_a_data = boxes_a.data<float>();
- const float *boxes_b_data = boxes_b.data<float>();
- auto ans_overlap = paddle::empty({num_a, num_b}, paddle::DataType::FLOAT32,
- paddle::GPUPlace());
- float *ans_overlap_data = ans_overlap.data<float>();
- BoxesOverlapLauncher(boxes_a.stream(), num_a, boxes_a_data, num_b,
- boxes_b_data, ans_overlap_data);
- return {ans_overlap};
- }
- std::vector<paddle::Tensor>
- boxes_iou_bev_gpu(const paddle::Tensor &boxes_a_tensor,
- const paddle::Tensor &boxes_b_tensor) {
- // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
- // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
- // params ans_iou: (N, M)
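- // Unlike boxes_overlap_bev_gpu above, this op returns intersection over
- // union rather than the raw overlap area.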
- int num_a = boxes_a_tensor.shape()[0];
- int num_b = boxes_b_tensor.shape()[0];
- const float *boxes_a_data = boxes_a_tensor.data<float>();
- const float *boxes_b_data = boxes_b_tensor.data<float>();
- auto ans_iou_tensor = paddle::empty({num_a, num_b}, paddle::DataType::FLOAT32,
- paddle::GPUPlace());
- float *ans_iou_data = ans_iou_tensor.data<float>();
- BoxesIouBevLauncher(boxes_a_tensor.stream(), num_a, boxes_a_data, num_b,
- boxes_b_data, ans_iou_data);
- return {ans_iou_tensor};
- }
- std::vector<paddle::Tensor> nms_gpu(const paddle::Tensor &boxes,
- float nms_overlap_thresh) {
- // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
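- // params keep: (N)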
- auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
- paddle::CPUPlace());
- auto num_to_keep_tensor =
- paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
- int *num_to_keep_data = num_to_keep_tensor.data<int>();
- int boxes_num = boxes.shape()[0];
- const float *boxes_data = boxes.data<float>();
- int *keep_data = keep.data<int>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
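- // e.g. boxes_num = 100 gives col_blocks = DIVUP(100, 64) = 2, so the mask
- // below holds 100 * 2 int64_t words (two words of suppression bits per box).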
- // Allocate the suppression mask as a GPU tensor (col_blocks int64_t words
- // of suppression bits per box); the tensor frees its own memory, so no
- // cudaMalloc / cudaFree pair is needed.
- auto mask = paddle::empty({boxes_num * col_blocks}, paddle::DataType::INT64,
- paddle::GPUPlace());
- int64_t *mask_data = mask.data<int64_t>();
- NmsLauncher(boxes.stream(), boxes_data, mask_data, boxes_num,
- nms_overlap_thresh);
- // Copy the mask back to the host; `true` requests a blocking (synchronous)
- // copy, so the kernel has finished before the bits are read below.
- const paddle::Tensor mask_cpu_tensor = mask.copy_to(paddle::CPUPlace(), true);
- const int64_t *mask_cpu = mask_cpu_tensor.data<int64_t>();
- std::vector<int64_t> remv_cpu(col_blocks, 0);  // one suppression word per block
- int num_to_keep = 0;
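- // Decode the pairwise mask on the CPU. Boxes are assumed to arrive sorted
- // by descending score (the usual contract for this kernel): box i is kept
- // iff no earlier kept box has set bit `inblock` of remv_cpu[nblock], and a
- // kept box then ORs its own suppression row into remv_cpu.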
- for (int i = 0; i < boxes_num; i++) {
- int nblock = i / THREADS_PER_BLOCK_NMS;
- int inblock = i % THREADS_PER_BLOCK_NMS;
- if (!(remv_cpu[nblock] & (1ULL << inblock))) {
- keep_data[num_to_keep++] = i;
- const int64_t *p = &mask_cpu[0] + i * col_blocks;
- for (int j = nblock; j < col_blocks; j++) {
- remv_cpu[j] |= p[j];
- }
- }
- }
- num_to_keep_data[0] = num_to_keep;
- cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess) {
- printf("CUDA error in nms_gpu: %s\n", cudaGetErrorString(err));
- }
- return {keep, num_to_keep_tensor};
- }
- std::vector<paddle::Tensor> nms_normal_gpu(const paddle::Tensor &boxes,
- float nms_overlap_thresh) {
- // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
- // params keep: (N)
- auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
- paddle::CPUPlace());
- auto num_to_keep_tensor =
- paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
- int *num_to_keep_data = num_to_keep_tensor.data<int>();
- int boxes_num = boxes.shape()[0];
- const float *boxes_data = boxes.data<float>();
- int *keep_data = keep.data<int>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
- // Allocate the suppression mask as a GPU tensor, as in nms_gpu above.
- auto mask = paddle::empty({boxes_num * col_blocks}, paddle::DataType::INT64,
- paddle::GPUPlace());
- int64_t *mask_data = mask.data<int64_t>();
- NmsNormalLauncher(boxes.stream(), boxes_data, mask_data, boxes_num,
- nms_overlap_thresh);
- // Copy the mask back to the host with a blocking copy, as in nms_gpu above.
- const paddle::Tensor mask_cpu_tensor = mask.copy_to(paddle::CPUPlace(), true);
- const int64_t *mask_cpu = mask_cpu_tensor.data<int64_t>();
- std::vector<int64_t> remv_cpu(col_blocks, 0);  // one suppression word per block
- int num_to_keep = 0;
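- // Same greedy bitmask decode as in nms_gpu above; only the launcher (and
- // hence the overlap criterion) differs.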
- for (int i = 0; i < boxes_num; i++) {
- int nblock = i / THREADS_PER_BLOCK_NMS;
- int inblock = i % THREADS_PER_BLOCK_NMS;
- if (!(remv_cpu[nblock] & (1ULL << inblock))) {
- keep_data[num_to_keep++] = i;
- const int64_t *p = &mask_cpu[0] + i * col_blocks;
- for (int j = nblock; j < col_blocks; j++) {
- remv_cpu[j] |= p[j];
- }
- }
- }
- num_to_keep_data[0] = num_to_keep;
- cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess) {
- printf("CUDA error in nms_normal_gpu: %s\n", cudaGetErrorString(err));
- }
- return {keep, num_to_keep_tensor};
- }
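- // A minimal registration sketch, assuming these functions are exposed as
- // Paddle custom ops via PD_BUILD_OP (the actual registration may live in
- // another file; the op and tensor names below are illustrative):
- //
- // PD_BUILD_OP(nms_gpu)
- //     .Inputs({"boxes"})
- //     .Outputs({"keep", "num_to_keep"})
- //     .Attrs({"nms_overlap_thresh: float"})
- //     .SetKernelFn(PD_KERNEL(nms_gpu));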
|