iou3d_nms.cpp

// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include "iou3d_nms.h"

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <paddle/extension.h>

#include <cstdio>
#include <vector>
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

const int THREADS_PER_BLOCK_NMS = sizeof(int64_t) * 8;
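
// DIVUP is ceiling division: e.g. DIVUP(100, 64) == 2, so a row of the NMS
// mask for 100 boxes needs two 64-bit words. THREADS_PER_BLOCK_NMS is 64
// because each NMS thread block handles exactly as many boxes as one int64_t
// has bits, letting a single word encode a whole block's suppression flags.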
// CUDA kernel launchers, implemented in the companion .cu file.
void BoxesOverlapLauncher(const cudaStream_t &stream, const int num_a,
                          const float *boxes_a, const int num_b,
                          const float *boxes_b, float *ans_overlap);

void BoxesIouBevLauncher(const cudaStream_t &stream, const int num_a,
                         const float *boxes_a, const int num_b,
                         const float *boxes_b, float *ans_iou);

void NmsLauncher(const cudaStream_t &stream, const float *boxes, int64_t *mask,
                 int boxes_num, float nms_overlap_thresh);

void NmsNormalLauncher(const cudaStream_t &stream, const float *boxes,
                       int64_t *mask, int boxes_num, float nms_overlap_thresh);
std::vector<paddle::Tensor> boxes_overlap_bev_gpu(
    const paddle::Tensor &boxes_a, const paddle::Tensor &boxes_b) {
  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
  // return ans_overlap: (N, M) pairwise BEV overlap areas
  int num_a = boxes_a.shape()[0];
  int num_b = boxes_b.shape()[0];

  const float *boxes_a_data = boxes_a.data<float>();
  const float *boxes_b_data = boxes_b.data<float>();

  auto ans_overlap = paddle::empty({num_a, num_b}, paddle::DataType::FLOAT32,
                                   paddle::GPUPlace());
  float *ans_overlap_data = ans_overlap.data<float>();

  // Launch on the input tensor's stream so the op serializes correctly with
  // surrounding work.
  BoxesOverlapLauncher(boxes_a.stream(), num_a, boxes_a_data, num_b,
                       boxes_b_data, ans_overlap_data);

  return {ans_overlap};
}
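
// Minimal usage sketch (illustrative values; assumes a CUDA build of Paddle
// and that paddle::full is available in the extension API):
//   auto a = paddle::full({2, 7}, 0.f, paddle::DataType::FLOAT32,
//                         paddle::GPUPlace());
//   auto b = paddle::full({3, 7}, 0.f, paddle::DataType::FLOAT32,
//                         paddle::GPUPlace());
//   auto overlap = boxes_overlap_bev_gpu(a, b)[0];  // shape (2, 3)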
std::vector<paddle::Tensor> boxes_iou_bev_gpu(
    const paddle::Tensor &boxes_a_tensor,
    const paddle::Tensor &boxes_b_tensor) {
  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
  // return ans_iou: (N, M) pairwise BEV IoU
  int num_a = boxes_a_tensor.shape()[0];
  int num_b = boxes_b_tensor.shape()[0];

  const float *boxes_a_data = boxes_a_tensor.data<float>();
  const float *boxes_b_data = boxes_b_tensor.data<float>();

  auto ans_iou_tensor = paddle::empty({num_a, num_b}, paddle::DataType::FLOAT32,
                                      paddle::GPUPlace());
  float *ans_iou_data = ans_iou_tensor.data<float>();

  BoxesIouBevLauncher(boxes_a_tensor.stream(), num_a, boxes_a_data, num_b,
                      boxes_b_data, ans_iou_data);

  return {ans_iou_tensor};
}
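
// For reference, BEV IoU follows from the BEV overlap area:
//   iou = overlap / (area_a + area_b - overlap), with area = dx * dy per box
// (assuming the launcher uses the standard rotated-IoU formulation; the
// actual geometry lives in the companion .cu kernels).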
std::vector<paddle::Tensor> nms_gpu(const paddle::Tensor &boxes,
                                    float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // return keep: (N) indices of kept boxes, num_to_keep: (1)
  auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
                            paddle::CPUPlace());
  auto num_to_keep_tensor =
      paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
  int *num_to_keep_data = num_to_keep_tensor.data<int>();

  int boxes_num = boxes.shape()[0];
  const float *boxes_data = boxes.data<float>();
  int *keep_data = keep.data<int>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  // The device-side suppression mask is allocated as a Paddle tensor so its
  // lifetime is managed automatically (no raw cudaMalloc / cudaFree).
  auto mask = paddle::empty({boxes_num * col_blocks}, paddle::DataType::INT64,
                            paddle::GPUPlace());
  int64_t *mask_data = mask.data<int64_t>();

  NmsLauncher(boxes.stream(), boxes_data, mask_data, boxes_num,
              nms_overlap_thresh);

  // Blocking copy of the mask back to the host for the greedy scan below.
  const paddle::Tensor mask_cpu_tensor = mask.copy_to(paddle::CPUPlace(), true);
  const int64_t *mask_cpu = mask_cpu_tensor.data<int64_t>();

  // remv_cpu[j] accumulates the suppression bits for boxes 64*j .. 64*j+63.
  std::vector<int64_t> remv_cpu(col_blocks, 0);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      const int64_t *p = mask_cpu + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  num_to_keep_data[0] = num_to_keep;

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("CUDA error in nms_gpu: %s\n", cudaGetErrorString(err));
  }

  return {keep, num_to_keep_tensor};
}
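
// Mask layout consumed above: row i of `mask` holds col_blocks 64-bit words,
// and bit k of word j is set when IoU(box_i, box_{j*64+k}) exceeds
// nms_overlap_thresh. Assuming the caller pre-sorts boxes by descending
// score, the greedy scan keeps box i only if no previously kept box has set
// its bit, then ORs row i into remv_cpu to suppress i's own overlaps.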
std::vector<paddle::Tensor> nms_normal_gpu(const paddle::Tensor &boxes,
                                           float nms_overlap_thresh) {
  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
  // return keep: (N) indices of kept boxes, num_to_keep: (1)
  // Same greedy scheme as nms_gpu, but backed by NmsNormalLauncher.
  auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
                            paddle::CPUPlace());
  auto num_to_keep_tensor =
      paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
  int *num_to_keep_data = num_to_keep_tensor.data<int>();

  int boxes_num = boxes.shape()[0];
  const float *boxes_data = boxes.data<float>();
  int *keep_data = keep.data<int>();

  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);

  // As in nms_gpu, the device-side mask is a Paddle tensor rather than a raw
  // cudaMalloc'd buffer.
  auto mask = paddle::empty({boxes_num * col_blocks}, paddle::DataType::INT64,
                            paddle::GPUPlace());
  int64_t *mask_data = mask.data<int64_t>();

  NmsNormalLauncher(boxes.stream(), boxes_data, mask_data, boxes_num,
                    nms_overlap_thresh);

  // Blocking copy of the mask back to the host for the greedy scan below.
  const paddle::Tensor mask_cpu_tensor = mask.copy_to(paddle::CPUPlace(), true);
  const int64_t *mask_cpu = mask_cpu_tensor.data<int64_t>();

  // remv_cpu[j] accumulates the suppression bits for boxes 64*j .. 64*j+63.
  std::vector<int64_t> remv_cpu(col_blocks, 0);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / THREADS_PER_BLOCK_NMS;
    int inblock = i % THREADS_PER_BLOCK_NMS;

    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
      keep_data[num_to_keep++] = i;
      const int64_t *p = mask_cpu + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv_cpu[j] |= p[j];
      }
    }
  }
  num_to_keep_data[0] = num_to_keep;

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("CUDA error in nms_normal_gpu: %s\n", cudaGetErrorString(err));
  }

  return {keep, num_to_keep_tensor};
}
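
// A minimal registration sketch, assuming these wrappers are exposed as
// Paddle custom operators (the project's actual registration may live
// elsewhere and use different op or attribute names):
//
// PD_BUILD_OP(nms_gpu)
//     .Inputs({"boxes"})
//     .Outputs({"keep", "num_to_keep"})
//     .Attrs({"nms_overlap_thresh: float"})
//     .SetKernelFn(PD_KERNEL(nms_gpu));
//
// PD_BUILD_OP(boxes_iou_bev_gpu)
//     .Inputs({"boxes_a", "boxes_b"})
//     .Outputs({"ans_iou"})
//     .SetKernelFn(PD_KERNEL(boxes_iou_bev_gpu));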