9 月之前 · ed4da1acc0
--- a/docs/module_usage/tutorials/cv_modules/3d_bev_detection.md
+++ b/docs/module_usage/tutorials/cv_modules/3d_bev_detection.md
@@ -52,35 +52,7 @@ for res in output:
 
				 ```bash
			
 
				 {"res":
			
 
				   {
			
 
				-    'input_path': 'samples/LIDAR_TOP/n015-2018-10-08-15-36-50+0800__LIDAR_TOP__1538984253447765.pcd.bin',
			
 
				-    'sample_id': 'b4ff30109dd14c89b24789dc5713cf8c',
			
 
				-    'input_img_paths': [
			
 
				-      'samples/CAM_FRONT_LEFT/n015-2018-10-08-15-36-50+0800__CAM_FRONT_LEFT__1538984253404844.jpg',
			
 
				-      'samples/CAM_FRONT/n015-2018-10-08-15-36-50+0800__CAM_FRONT__1538984253412460.jpg',
			
 
				-      'samples/CAM_FRONT_RIGHT/n015-2018-10-08-15-36-50+0800__CAM_FRONT_RIGHT__1538984253420339.jpg',
			
 
				-      'samples/CAM_BACK_RIGHT/n015-2018-10-08-15-36-50+0800__CAM_BACK_RIGHT__1538984253427893.jpg',
			
 
				-      'samples/CAM_BACK/n015-2018-10-08-15-36-50+0800__CAM_BACK__1538984253437525.jpg',
			
 
				-      'samples/CAM_BACK_LEFT/n015-2018-10-08-15-36-50+0800__CAM_BACK_LEFT__1538984253447423.jpg'
			
 
				-    ]
			
 
				-    "boxes_3d": [
			
 
				-        [
			
 
				-            14.5425386428833,
			
 
				-            22.142045974731445,
			
 
				-            -1.2903141975402832,
			
 
				-            1.8441576957702637,
			
 
				-            4.433370113372803,
			
 
				-            1.7367216348648071,
			
 
				-            6.367165565490723,
			
 
				-            0.0036598597653210163,
			
 
				-            -0.013568558730185032
			
 
				-        ]
			
 
				-    ],
			
 
				-    "labels_3d": [
			
 
				-        0
			
 
				-    ],
			
 
				-    "scores_3d": [
			
 
				-        0.9920279383659363
			
 
				-    ]
			
 
				+    "input_path": "./data/nuscenes/samples/LIDAR_TOP/n008-2018-08-01-15-16-36-0400__LIDAR_TOP__1533151616947490.pcd.bin", "input_img_paths": ["./data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_LEFT__1533151616904806.jpg", "./data/nuscenes/samples/CAM_FRONT/n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151616912404.jpg", "./data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_RIGHT__1533151616920482.jpg", "./data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_BACK_RIGHT__1533151616928113.jpg", "./data/nuscenes/samples/CAM_BACK/n008-2018-08-01-15-16-36-0400__CAM_BACK__1533151616937558.jpg", "./data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-08-01-15-16-36-0400__CAM_BACK_LEFT__1533151616947405.jpg"], "sample_id": "cc57c1ea80fe46a7abddfdb15654c872", "boxes_3d": [[-8.913962364196777, 13.30993366241455, -1.7353310585021973, 1.9886571168899536, 4.886075019836426, 1.877254605293274, 6.317165374755859, -0.00018131558317691088, 0.022375036031007767]], "labels_3d": [0], "scores_3d": [0.9951273202896118]
			
 
				   }
			
 
				 }
			
 
				 ```
			
--- a/paddlex/configs/modules/3d_bev_detection/BEVFusion.yaml
+++ b/paddlex/configs/modules/3d_bev_detection/BEVFusion.yaml
@@ -32,7 +32,7 @@ Export:
 
				 
			
 
				 Predict:
			
 
				   batch_size: 1
			
 
				-  model_dir: "output_bevfusion"
			
 
				+  model_dir: "output/best_model/inference"
			
 
				   input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/det_3d/demo_det_3d/nuscenes_infos_val.pkl"
			
 
				   kernel_option:
			
 
				     run_mode: paddle
			
--- a/paddlex/inference/common/batch_sampler/__init__.py
+++ b/paddlex/inference/common/batch_sampler/__init__.py
@@ -16,4 +16,5 @@ from .base_batch_sampler import BaseBatchSampler
 
				 from .image_batch_sampler import ImageBatchSampler
			
 
				 from .video_batch_sampler import VideoBatchSampler
			
 
				 from .ts_batch_sampler import TSBatchSampler
			
 
				+from .det_3d_batch_sampler import Det3DBatchSampler
			
 
				 from .audio_batch_sampler import AudioBatchSampler
			
--- a/paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py
+++ b/paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py
@@ -0,0 +1,100 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+import os
			
 
				+import ast
			
 
				+from pathlib import Path
			
 
				+import numpy as np
			
 
				+import pickle
			
 
				+from typing import Any, Dict, List, Optional, Union
			
 
				+
			
 
				+from ....utils import logging
			
 
				+from ....utils.download import download
			
 
				+from ....utils.cache import CACHE_DIR
			
 
				+from .base_batch_sampler import BaseBatchSampler
			
 
				+
			
 
				+
			
 
				+class Det3DBatchSampler(BaseBatchSampler):
			
 
				+
			
 
				+    # XXX: auto download for url
			
 
				+    def _download_from_url(self, in_path: str) -> str:
			
 
				+        file_name = Path(in_path).name
			
 
				+        save_path = Path(CACHE_DIR) / "predict_input" / file_name
			
 
				+        download(in_path, save_path, overwrite=True)
			
 
				+        return save_path.as_posix()
			
 
				+
			
 
				+    @property
			
 
				+    def batch_size(self) -> int:
			
 
				+        """Gets the batch size."""
			
 
				+        return self._batch_size
			
 
				+
			
 
				+    @batch_size.setter
			
 
				+    def batch_size(self, batch_size: int) -> None:
			
 
				+        """Sets the batch size.
			
 
				+
			
 
				+        Args:
			
 
				+            batch_size (int): The batch size to set.
			
 
				+        """
			
 
				+        if batch_size != 1:
			
 
				+            logging.warning(
			
 
				+                "inference for 3D models only support batch_size equal to 1"
			
 
				+            )
			
 
				+        self._batch_size = batch_size
			
 
				+
			
 
				+    def load_annotations(self, ann_file: str) -> List[Dict]:
			
 
				+        """Load annotations from ann_file.
			
 
				+
			
 
				+        Args:
			
 
				+            ann_file (str): Path of the annotation file.
			
 
				+
			
 
				+        Returns:
			
 
				+            list[dict]: List of annotations sorted by timestamps.
			
 
				+        """
			
 
				+        data = pickle.load(open(ann_file, "rb"))
			
 
				+        data_infos = list(sorted(data["infos"], key=lambda e: e["timestamp"]))
			
 
				+        return data_infos
			
 
				+
			
 
				+    def sample(self, inputs: Union[List[str], str]):
			
 
				+        if not isinstance(inputs, list):
			
 
				+            inputs = [inputs]
			
 
				+
			
 
				+        sample_set = []
			
 
				+        for input in inputs:
			
 
				+            if isinstance(input, str):
			
 
				+                ann_path = (
			
 
				+                    self._download_from_url(input)
			
 
				+                    if input.startswith("http")
			
 
				+                    else input
			
 
				+                )
			
 
				+            else:
			
 
				+                logging.warning(
			
 
				+                    f"Not supported input data type! Only `str` is supported! So has been ignored: {input}."
			
 
				+                )
			
 
				+            self.data_infos = self.load_annotations(ann_path)
			
 
				+            sample_set.extend(self.data_infos)
			
 
				+
			
 
				+        batch = []
			
 
				+        for sample in sample_set:
			
 
				+            batch.append(sample)
			
 
				+            if len(batch) == self.batch_size:
			
 
				+                yield batch
			
 
				+                batch = []
			
 
				+
			
 
				+        if len(batch) > 0:
			
 
				+            yield batch
			
 
				+
			
 
				+    def _rand_batch(self, data_size: int) -> List[Any]:
			
 
				+        raise NotImplementedError(
			
 
				+            "rand batch is not supported for 3D detection annotation data"
			
 
				+        )
			
--- a/paddlex/inference/common/reader/__init__.py
+++ b/paddlex/inference/common/reader/__init__.py
@@ -16,3 +16,4 @@ from .image_reader import ReadImage
 
				 from .audio_reader import ReadAudio
			
 
				 from .video_reader import ReadVideo
			
 
				 from .ts_reader import ReadTS
			
 
				+from .det_3d_reader import ReadNuscenesData
			
--- a/paddlex/inference/common/reader/det_3d_reader.py
+++ b/paddlex/inference/common/reader/det_3d_reader.py
@@ -0,0 +1,239 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+import numpy as np
			
 
				+import cv2
			
 
				+import os
			
 
				+from typing import Generic, List, Optional, Any, Dict
			
 
				+import pickle
			
 
				+
			
 
				+
			
 
				+class _EasyDict(dict):
			
 
				+    def __getattr__(self, key: str):
			
 
				+        if key in self:
			
 
				+            return self[key]
			
 
				+        return super().__getattr__(self, key)
			
 
				+
			
 
				+    def __setattr__(self, key: str, value: Generic):
			
 
				+        self[key] = value
			
 
				+
			
 
				+
			
 
				+class SampleMeta(_EasyDict):
			
 
				+
			
 
				+    # yapf: disable
			
 
				+    __slots__ = [
			
 
				+        "camera_intrinsic",
			
 
				+        # bgr or rgb
			
 
				+        "image_format",
			
 
				+        # pillow or cv2
			
 
				+        "image_reader",
			
 
				+        # chw or hwc
			
 
				+        "channel_order",
			
 
				+        # Unique ID of the sample
			
 
				+        "id",
			
 
				+        "time_lag",
			
 
				+        "ref_from_curr"
			
 
				+    ]
			
 
				+    # yapf: enable
			
 
				+
			
 
				+    def __init__(self, **kwargs):
			
 
				+        for key, value in kwargs.items():
			
 
				+            setattr(self, key, value)
			
 
				+
			
 
				+
			
 
				+class Sample(_EasyDict):
			
 
				+    """Data structure containing sample data information"""
			
 
				+
			
 
				+    _VALID_MODALITIES = ["image", "lidar", "radar", "multimodal", "multiview"]
			
 
				+
			
 
				+    def __init__(self, path: str, modality: str):
			
 
				+        if modality not in self._VALID_MODALITIES:
			
 
				+            raise ValueError(
			
 
				+                "Only modality {} is supported, but got {}".format(
			
 
				+                    self._VALID_MODALITIES, modality
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        self.meta = SampleMeta()
			
 
				+
			
 
				+        self.path = path
			
 
				+        self.data = None
			
 
				+        self.modality = modality.lower()
			
 
				+
			
 
				+        self.bboxes_2d = None
			
 
				+        self.bboxes_3d = None
			
 
				+        self.labels = None
			
 
				+
			
 
				+        self.sweeps = []
			
 
				+        self.attrs = None
			
 
				+
			
 
				+
			
 
				+class ReadNuscenesData:
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        dataset_root="",
			
 
				+        load_interval=1,
			
 
				+        noise_sensor_type="camera",
			
 
				+        drop_frames=False,
			
 
				+        drop_set=[0, "discrete"],
			
 
				+        modality="multimodal",
			
 
				+        extrinsics_noise=False,
			
 
				+        extrinsics_noise_type="single",
			
 
				+    ):
			
 
				+
			
 
				+        self.load_interval = load_interval
			
 
				+        self.noise_data = None
			
 
				+        self.noise_sensor_type = noise_sensor_type
			
 
				+        self.drop_frames = drop_frames
			
 
				+        self.drop_ratio = drop_set[0]
			
 
				+        self.drop_type = drop_set[1]
			
 
				+        self.modality = modality
			
 
				+        self.extrinsics_noise = extrinsics_noise
			
 
				+        self.extrinsics_noise_type = extrinsics_noise_type
			
 
				+        self.dataset_root = dataset_root
			
 
				+
			
 
				+    def get_data_info(self, info):
			
 
				+        """Get data info.
			
 
				+
			
 
				+        Returns:
			
 
				+            dict: Data information that will be passed to the data \
			
 
				+                preprocessing pipelines. It includes the following keys:
			
 
				+
			
 
				+                - sample_idx (str): Sample index.
			
 
				+                - pts_filename (str): Filename of point clouds.
			
 
				+                - sweeps (list[dict]): Infos of sweeps.
			
 
				+                - timestamp (float): Sample timestamp.
			
 
				+                - img_filename (str, optional): Image filename.
			
 
				+                - lidar2img (list[np.ndarray], optional): Transformations \
			
 
				+                    from lidar to different cameras.
			
 
				+                - ann_info (dict): Annotation info.
			
 
				+        """
			
 
				+        sample = Sample(path=None, modality=self.modality)
			
 
				+        sample.sample_idx = info["token"]
			
 
				+        sample.meta.id = info["token"]
			
 
				+        sample.pts_filename = os.path.join(self.dataset_root, info["lidar_path"])
			
 
				+        sample.sweeps = info["sweeps"]
			
 
				+        sample.timestamp = info["timestamp"] / 1e6
			
 
				+
			
 
				+        if self.noise_sensor_type == "lidar":
			
 
				+            if self.drop_frames:
			
 
				+                pts_filename = sample.pts_filename
			
 
				+                file_name = pts_filename.split("/")[-1]
			
 
				+
			
 
				+                if self.noise_data[file_name]["noise"]["drop_frames"][self.drop_ratio][
			
 
				+                    self.drop_type
			
 
				+                ]["stuck"]:
			
 
				+                    replace_file = self.noise_data[file_name]["noise"]["drop_frames"][
			
 
				+                        self.drop_ratio
			
 
				+                    ][self.drop_type]["replace"]
			
 
				+                    if replace_file != "":
			
 
				+                        pts_filename = pts_filename.replace(file_name, replace_file)
			
 
				+
			
 
				+                        sample.pts_filename = pts_filename
			
 
				+                        sample.sweeps = self.noise_data[replace_file]["mmdet_info"][
			
 
				+                            "sweeps"
			
 
				+                        ]
			
 
				+                        sample.timestamp = (
			
 
				+                            self.noise_data[replace_file]["mmdet_info"]["timestamp"]
			
 
				+                            / 1e6
			
 
				+                        )
			
 
				+
			
 
				+        cam_orders = [
			
 
				+            "CAM_FRONT_LEFT",
			
 
				+            "CAM_FRONT",
			
 
				+            "CAM_FRONT_RIGHT",
			
 
				+            "CAM_BACK_RIGHT",
			
 
				+            "CAM_BACK",
			
 
				+            "CAM_BACK_LEFT",
			
 
				+        ]
			
 
				+        if self.modality == "multiview" or self.modality == "multimodal":
			
 
				+            image_paths = []
			
 
				+            lidar2img_rts = []
			
 
				+            caminfos = []
			
 
				+            for cam_type in cam_orders:
			
 
				+                cam_info = info["cams"][cam_type]
			
 
				+
			
 
				+                cam_data_path = cam_info["data_path"]
			
 
				+                cam_data_path = os.path.join(self.dataset_root, cam_data_path)
			
 
				+                file_name = cam_data_path.split("/")[-1]
			
 
				+                if self.noise_sensor_type == "camera":
			
 
				+                    if self.drop_frames:
			
 
				+                        if self.noise_data[file_name]["noise"]["drop_frames"][
			
 
				+                            self.drop_ratio
			
 
				+                        ][self.drop_type]["stuck"]:
			
 
				+                            replace_file = self.noise_data[file_name]["noise"][
			
 
				+                                "drop_frames"
			
 
				+                            ][self.drop_ratio][self.drop_type]["replace"]
			
 
				+                            if replace_file != "":
			
 
				+                                cam_data_path = cam_data_path.replace(
			
 
				+                                    file_name, replace_file
			
 
				+                                )
			
 
				+
			
 
				+                image_paths.append(cam_data_path)
			
 
				+                # obtain lidar to image transformation matrix
			
 
				+                if self.extrinsics_noise:
			
 
				+                    sensor2lidar_rotation = self.noise_data[file_name]["noise"][
			
 
				+                        "extrinsics_noise"
			
 
				+                    ][f"{self.extrinsics_noise_type}_noise_sensor2lidar_rotation"]
			
 
				+                    sensor2lidar_translation = self.noise_data[file_name]["noise"][
			
 
				+                        "extrinsics_noise"
			
 
				+                    ][f"{self.extrinsics_noise_type}_noise_sensor2lidar_translation"]
			
 
				+                else:
			
 
				+                    sensor2lidar_rotation = cam_info["sensor2lidar_rotation"]
			
 
				+                    sensor2lidar_translation = cam_info["sensor2lidar_translation"]
			
 
				+
			
 
				+                lidar2cam_r = np.linalg.inv(sensor2lidar_rotation)
			
 
				+                lidar2cam_t = sensor2lidar_translation @ lidar2cam_r.T
			
 
				+                lidar2cam_rt = np.eye(4)
			
 
				+                lidar2cam_rt[:3, :3] = lidar2cam_r.T
			
 
				+                lidar2cam_rt[3, :3] = -lidar2cam_t
			
 
				+                intrinsic = cam_info["cam_intrinsic"]
			
 
				+                viewpad = np.eye(4)
			
 
				+                viewpad[: intrinsic.shape[0], : intrinsic.shape[1]] = intrinsic
			
 
				+                lidar2img_rt = viewpad @ lidar2cam_rt.T
			
 
				+                lidar2img_rts.append(lidar2img_rt)
			
 
				+                caminfos.append(
			
 
				+                    {
			
 
				+                        "sensor2lidar_translation": sensor2lidar_translation,
			
 
				+                        "sensor2lidar_rotation": sensor2lidar_rotation,
			
 
				+                        "cam_intrinsic": cam_info["cam_intrinsic"],
			
 
				+                    }
			
 
				+                )
			
 
				+
			
 
				+            sample.update(
			
 
				+                dict(
			
 
				+                    img_filename=image_paths, lidar2img=lidar2img_rts, caminfo=caminfos
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        return sample
			
 
				+
			
 
				+    def prepare_test_data(self, info):
			
 
				+        sample = self.get_data_info(info)
			
 
				+        sample = self.add_new_fields(sample)
			
 
				+        return sample
			
 
				+
			
 
				+    def add_new_fields(self, sample):
			
 
				+        sample["img_fields"] = []
			
 
				+        sample["bbox3d_fields"] = []
			
 
				+        sample["pts_mask_fields"] = []
			
 
				+        sample["pts_seg_fields"] = []
			
 
				+        sample["bbox_fields"] = []
			
 
				+        sample["mask_fields"] = []
			
 
				+        sample["seg_fields"] = []
			
 
				+        return sample
			
 
				+
			
 
				+    def __call__(self, batch_data):
			
 
				+        return [self.prepare_test_data(data_info) for data_info in batch_data]
			
--- a/paddlex/inference/models/3d_bev_detection/__init__.py
+++ b/paddlex/inference/models/3d_bev_detection/__init__.py
@@ -0,0 +1,15 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+from .predictor import BEVDet3DPredictor
			
--- a/paddlex/inference/models/3d_bev_detection/predictor.py
+++ b/paddlex/inference/models/3d_bev_detection/predictor.py
@@ -0,0 +1,281 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+from typing import Any, Union, Dict, List, Tuple
			
 
				+from importlib import import_module
			
 
				+import lazy_paddle
			
 
				+
			
 
				+if lazy_paddle.is_compiled_with_cuda() and not lazy_paddle.is_compiled_with_rocm():
			
 
				+    from ....ops.voxelize import hard_voxelize
			
 
				+    from ....ops.iou3d_nms import nms_gpu
			
 
				+else:
			
 
				+    from ....utils import logging
			
 
				+
			
 
				+    logging.error("3D BEVFusion custom ops only support GPU platform!")
			
 
				+from ....utils.func_register import FuncRegister
			
 
				+
			
 
				+module_3d_bev_detection = import_module(".3d_bev_detection", "paddlex.modules")
			
 
				+module_3d_model_list = getattr(module_3d_bev_detection, "model_list")
			
 
				+MODELS = getattr(module_3d_model_list, "MODELS")
			
 
				+from ...common.batch_sampler import Det3DBatchSampler
			
 
				+from ...common.reader import ReadNuscenesData
			
 
				+from ..common import StaticInfer
			
 
				+from ..base import BasicPredictor
			
 
				+from .processors import (
			
 
				+    LoadPointsFromFile,
			
 
				+    LoadPointsFromMultiSweeps,
			
 
				+    LoadMultiViewImageFromFiles,
			
 
				+    ResizeImage,
			
 
				+    NormalizeImage,
			
 
				+    PadImage,
			
 
				+    SampleFilterByKey,
			
 
				+    GetInferInput,
			
 
				+)
			
 
				+from .result import BEV3DDetResult
			
 
				+
			
 
				+
			
 
				+class BEVDet3DPredictor(BasicPredictor):
			
 
				+    """BEVDet3DPredictor that inherits from BasicPredictor."""
			
 
				+
			
 
				+    entities = MODELS
			
 
				+
			
 
				+    _FUNC_MAP = {}
			
 
				+    register = FuncRegister(_FUNC_MAP)
			
 
				+
			
 
				+    def __init__(self, *args: List, **kwargs: Dict) -> None:
			
 
				+        """Initializes BEVDet3DPredictor.
			
 
				+
			
 
				+        Args:
			
 
				+            *args: Arbitrary positional arguments passed to the superclass.
			
 
				+            **kwargs: Arbitrary keyword arguments passed to the superclass.
			
 
				+        """
			
 
				+        super().__init__(*args, **kwargs)
			
 
				+        self.pre_tfs, self.infer = self._build()
			
 
				+
			
 
				+    def _build_batch_sampler(self) -> Det3DBatchSampler:
			
 
				+        """Builds and returns an Det3DBatchSampler instance.
			
 
				+
			
 
				+        Returns:
			
 
				+            Det3DBatchSampler: An instance of Det3DBatchSampler.
			
 
				+        """
			
 
				+        return Det3DBatchSampler()
			
 
				+
			
 
				+    def _get_result_class(self) -> type:
			
 
				+        """Returns the result class, BEV3DDetResult.
			
 
				+
			
 
				+        Returns:
			
 
				+            type: The BEV3DDetResult class.
			
 
				+        """
			
 
				+        return BEV3DDetResult
			
 
				+
			
 
				+    def _build(self) -> Tuple:
			
 
				+        """Build the preprocessors and inference engine based on the configuration.
			
 
				+
			
 
				+        Returns:
			
 
				+            tuple: A tuple containing the preprocessors and inference engine.
			
 
				+        """
			
 
				+        pre_tfs = {"Read": ReadNuscenesData()}
			
 
				+        for cfg in self.config["PreProcess"]["transform_ops"]:
			
 
				+            tf_key = list(cfg.keys())[0]
			
 
				+            func = self._FUNC_MAP[tf_key]
			
 
				+            args = cfg.get(tf_key, {})
			
 
				+            name, op = func(self, **args) if args else func(self)
			
 
				+            if op:
			
 
				+                pre_tfs[name] = op
			
 
				+
			
 
				+        infer = StaticInfer(
			
 
				+            model_dir=self.model_dir,
			
 
				+            model_prefix=self.MODEL_FILE_PREFIX,
			
 
				+            option=self.pp_option,
			
 
				+        )
			
 
				+
			
 
				+        return pre_tfs, infer
			
 
				+
			
 
				+    def _format_output(
			
 
				+        self, infer_input: List[Any], outs: List[Any], img_metas: Dict[str, Any]
			
 
				+    ) -> Dict[str, Any]:
			
 
				+        """format inference input and output into predict result
			
 
				+
			
 
				+        Args:
			
 
				+            infer_input(List): Model infer inputs with list containing images, points and lidar2img matrix.
			
 
				+            outs(List): Model infer output containing bboxes, scores, labels result.
			
 
				+            img_metas(Dict): Image metas info of input sample.
			
 
				+
			
 
				+        Returns:
			
 
				+            Dict: A Dict containing formatted inference output results.
			
 
				+        """
			
 
				+        input_lidar_path = img_metas["input_lidar_path"]
			
 
				+        input_img_paths = img_metas["input_img_paths"]
			
 
				+        sample_id = img_metas["sample_id"]
			
 
				+        results = {}
			
 
				+        out_bboxes_3d = []
			
 
				+        out_scores_3d = []
			
 
				+        out_labels_3d = []
			
 
				+        input_imgs = []
			
 
				+        input_points = []
			
 
				+        input_lidar2imgs = []
			
 
				+        input_ids = []
			
 
				+        input_lidar_path_list = []
			
 
				+        input_img_paths_list = []
			
 
				+        out_bboxes_3d.append(outs[0])
			
 
				+        out_labels_3d.append(outs[1])
			
 
				+        out_scores_3d.append(outs[2])
			
 
				+        input_imgs.append(infer_input[1])
			
 
				+        input_points.append(infer_input[0])
			
 
				+        input_lidar2imgs.append(infer_input[2])
			
 
				+        input_ids.append(sample_id)
			
 
				+        input_lidar_path_list.append(input_lidar_path)
			
 
				+        input_img_paths_list.append(input_img_paths)
			
 
				+        results["input_path"] = input_lidar_path_list
			
 
				+        results["input_img_paths"] = input_img_paths_list
			
 
				+        results["sample_id"] = input_ids
			
 
				+        results["boxes_3d"] = out_bboxes_3d
			
 
				+        results["labels_3d"] = out_labels_3d
			
 
				+        results["scores_3d"] = out_scores_3d
			
 
				+        return results
			
 
				+
			
 
				+    def process(self, batch_data: List[str]) -> Dict[str, Any]:
			
 
				+        """
			
 
				+        Process a batch of data through the preprocessing and inference.
			
 
				+
			
 
				+        Args:
			
 
				+            batch_data (List[str]): A batch of input data (e.g., sample anno file paths).
			
 
				+
			
 
				+        Returns:
			
 
				+            dict: A dictionary containing the input path, input img, input points, input lidar2img, output bboxes, output labels, output scores and label names. Keys include 'input_path', 'input_img', 'input_points', 'input_lidar2img', 'boxes_3d', 'labels_3d' and 'scores_3d'.
			
 
				+        """
			
 
				+        sample = self.pre_tfs["Read"](batch_data=batch_data)
			
 
				+        sample = self.pre_tfs["LoadPointsFromFile"](results=sample[0])
			
 
				+        sample = self.pre_tfs["LoadPointsFromMultiSweeps"](results=sample)
			
 
				+        sample = self.pre_tfs["LoadMultiViewImageFromFiles"](sample=sample)
			
 
				+        sample = self.pre_tfs["ResizeImage"](results=sample)
			
 
				+        sample = self.pre_tfs["NormalizeImage"](results=sample)
			
 
				+        sample = self.pre_tfs["PadImage"](results=sample)
			
 
				+        sample = self.pre_tfs["SampleFilterByKey"](sample=sample)
			
 
				+        infer_input, img_metas = self.pre_tfs["GetInferInput"](sample=sample)
			
 
				+        infer_output = self.infer(x=infer_input)
			
 
				+        results = self._format_output(infer_input, infer_output, img_metas)
			
 
				+        return results
			
 
				+
			
 
				+    @register("LoadPointsFromFile")
			
 
				+    def build_load_img_from_file(
			
 
				+        self, load_dim=6, use_dim=[0, 1, 2], shift_height=False, use_color=False
			
 
				+    ):
			
 
				+        return "LoadPointsFromFile", LoadPointsFromFile(
			
 
				+            load_dim=load_dim,
			
 
				+            use_dim=use_dim,
			
 
				+            shift_height=shift_height,
			
 
				+            use_color=use_color,
			
 
				+        )
			
 
				+
			
 
				+    @register("LoadPointsFromMultiSweeps")
			
 
				+    def build_load_points_from_multi_sweeps(
			
 
				+        self,
			
 
				+        sweeps_num=10,
			
 
				+        load_dim=5,
			
 
				+        use_dim=[0, 1, 2, 4],
			
 
				+        pad_empty_sweeps=False,
			
 
				+        remove_close=False,
			
 
				+        test_mode=False,
			
 
				+        point_cloud_angle_range=None,
			
 
				+    ):
			
 
				+        return "LoadPointsFromMultiSweeps", LoadPointsFromMultiSweeps(
			
 
				+            sweeps_num=sweeps_num,
			
 
				+            load_dim=load_dim,
			
 
				+            use_dim=use_dim,
			
 
				+            pad_empty_sweeps=pad_empty_sweeps,
			
 
				+            remove_close=remove_close,
			
 
				+            test_mode=test_mode,
			
 
				+            point_cloud_angle_range=point_cloud_angle_range,
			
 
				+        )
			
 
				+
			
 
				+    @register("LoadMultiViewImageFromFiles")
			
 
				+    def build_load_multi_view_image_from_files(
			
 
				+        self,
			
 
				+        to_float32=False,
			
 
				+        project_pts_to_img_depth=False,
			
 
				+        cam_depth_range=[4.0, 45.0, 1.0],
			
 
				+        constant_std=0.5,
			
 
				+        imread_flag=-1,
			
 
				+    ):
			
 
				+        return "LoadMultiViewImageFromFiles", LoadMultiViewImageFromFiles(
			
 
				+            to_float32=to_float32,
			
 
				+            project_pts_to_img_depth=project_pts_to_img_depth,
			
 
				+            cam_depth_range=cam_depth_range,
			
 
				+            constant_std=constant_std,
			
 
				+            imread_flag=imread_flag,
			
 
				+        )
			
 
				+
			
 
				+    @register("ResizeImage")
			
 
				+    def build_resize_image(
			
 
				+        self,
			
 
				+        img_scale=None,
			
 
				+        multiscale_mode="range",
			
 
				+        ratio_range=None,
			
 
				+        keep_ratio=True,
			
 
				+        bbox_clip_border=True,
			
 
				+        backend="cv2",
			
 
				+        override=False,
			
 
				+    ):
			
 
				+        return "ResizeImage", ResizeImage(
			
 
				+            img_scale=img_scale,
			
 
				+            multiscale_mode=multiscale_mode,
			
 
				+            ratio_range=ratio_range,
			
 
				+            keep_ratio=keep_ratio,
			
 
				+            bbox_clip_border=bbox_clip_border,
			
 
				+            backend=backend,
			
 
				+            override=override,
			
 
				+        )
			
 
				+
			
 
				+    @register("NormalizeImage")
			
 
				+    def build_normalize_image(self, mean, std, to_rgb=True):
			
 
				+        return "NormalizeImage", NormalizeImage(mean=mean, std=std, to_rgb=to_rgb)
			
 
				+
			
 
				+    @register("PadImage")
			
 
				+    def build_pad_image(self, size=None, size_divisor=None, pad_val=0):
			
 
				+        return "PadImage", PadImage(
			
 
				+            size=size, size_divisor=size_divisor, pad_val=pad_val
			
 
				+        )
			
 
				+
			
 
				+    @register("SampleFilterByKey")
			
 
				+    def build_sample_filter_by_key(
			
 
				+        self,
			
 
				+        keys,
			
 
				+        meta_keys=(
			
 
				+            "filename",
			
 
				+            "ori_shape",
			
 
				+            "img_shape",
			
 
				+            "lidar2img",
			
 
				+            "depth2img",
			
 
				+            "cam2img",
			
 
				+            "pad_shape",
			
 
				+            "scale_factor",
			
 
				+            "flip",
			
 
				+            "pcd_horizontal_flip",
			
 
				+            "pcd_vertical_flip",
			
 
				+            "box_type_3d",
			
 
				+            "img_norm_cfg",
			
 
				+            "pcd_trans",
			
 
				+            "sample_idx",
			
 
				+            "pcd_scale_factor",
			
 
				+            "pcd_rotation",
			
 
				+            "pts_filename",
			
 
				+            "transformation_3d_flow",
			
 
				+        ),
			
 
				+    ):
			
 
				+        return "SampleFilterByKey", SampleFilterByKey(keys=keys, meta_keys=meta_keys)
			
 
				+
			
 
				+    @register("GetInferInput")
			
 
				+    def build_get_infer_input(self):
			
 
				+        return "GetInferInput", GetInferInput()
			
--- a/paddlex/inference/models/3d_bev_detection/processors.py
+++ b/paddlex/inference/models/3d_bev_detection/processors.py
@@ -0,0 +1,978 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+
			
 
				+import numbers
			
 
				+import cv2
			
 
				+import numpy as np
			
 
				+from typing import Generic, List, Optional
			
 
				+import lazy_paddle as paddle
			
 
				+
			
 
				+from ...utils.io import ImageReader
			
 
				+from ....utils import logging
			
 
				+from ...common.reader.det_3d_reader import Sample
			
 
				+
			
 
				+
			
 
				+cv2_interp_codes = {
			
 
				+    "nearest": cv2.INTER_NEAREST,
			
 
				+    "bilinear": cv2.INTER_LINEAR,
			
 
				+    "bicubic": cv2.INTER_CUBIC,
			
 
				+    "area": cv2.INTER_AREA,
			
 
				+    "lanczos": cv2.INTER_LANCZOS4,
			
 
				+}
			
 
				+
			
 
				+
			
 
				+class LoadPointsFromFile:
			
 
				+    """Load points from a file and process them according to specified parameters."""
			
 
				+
			
 
				+    def __init__(
			
 
				+        self, load_dim=6, use_dim=[0, 1, 2], shift_height=False, use_color=False
			
 
				+    ):
			
 
				+        """Initializes the LoadPointsFromFile object.
			
 
				+
			
 
				+        Args:
			
 
				+            load_dim (int): Dimensions loaded in points.
			
 
				+            use_dim (list or int): Dimensions used in points. If int, will use a range from 0 to use_dim (exclusive).
			
 
				+            shift_height (bool): Whether to shift height values.
			
 
				+            use_color (bool): Whether to include color attributes in the loaded points.
			
 
				+        """
			
 
				+        self.shift_height = shift_height
			
 
				+        self.use_color = use_color
			
 
				+        if isinstance(use_dim, int):
			
 
				+            use_dim = list(range(use_dim))
			
 
				+        assert (
			
 
				+            max(use_dim) < load_dim
			
 
				+        ), f"Expect all used dimensions < {load_dim}, got {use_dim}"
			
 
				+
			
 
				+        self.load_dim = load_dim
			
 
				+        self.use_dim = use_dim
			
 
				+
			
 
				+    def _load_points(self, pts_filename):
			
 
				+        """Private function to load point clouds data from a file.
			
 
				+
			
 
				+        Args:
			
 
				+            pts_filename (str): Path to the point cloud file.
			
 
				+
			
 
				+        Returns:
			
 
				+            numpy.ndarray: Loaded point cloud data.
			
 
				+        """
			
 
				+        points = np.fromfile(pts_filename, dtype=np.float32)
			
 
				+        return points
			
 
				+
			
 
				+    def __call__(self, results):
			
 
				+        """Call function to load points data from file and process it.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): Dictionary containing the 'pts_filename' key with the path to the point cloud file.
			
 
				+
			
 
				+        Returns:
			
 
				+            dict: Updated results dictionary with 'points' key added.
			
 
				+        """
			
 
				+        pts_filename = results["pts_filename"]
			
 
				+        points = self._load_points(pts_filename)
			
 
				+        points = points.reshape(-1, self.load_dim)
			
 
				+        points = points[:, self.use_dim]
			
 
				+        attribute_dims = None
			
 
				+
			
 
				+        if self.shift_height:
			
 
				+            floor_height = np.percentile(points[:, 2], 0.99)
			
 
				+            height = points[:, 2] - floor_height
			
 
				+            points = np.concatenate(
			
 
				+                [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1
			
 
				+            )
			
 
				+            attribute_dims = dict(height=3)
			
 
				+
			
 
				+        if self.use_color:
			
 
				+            assert len(self.use_dim) >= 6
			
 
				+            if attribute_dims is None:
			
 
				+                attribute_dims = dict()
			
 
				+            attribute_dims.update(
			
 
				+                dict(
			
 
				+                    color=[
			
 
				+                        points.shape[1] - 3,
			
 
				+                        points.shape[1] - 2,
			
 
				+                        points.shape[1] - 1,
			
 
				+                    ]
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        results["points"] = points
			
 
				+
			
 
				+        return results
			
 
				+
			
 
				+
			
 
				+class LoadPointsFromMultiSweeps(object):
			
 
				+    """Load points from multiple sweeps.This is usually used for nuScenes dataset to utilize previous sweeps."""
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        sweeps_num=10,
			
 
				+        load_dim=5,
			
 
				+        use_dim=[0, 1, 2, 4],
			
 
				+        pad_empty_sweeps=False,
			
 
				+        remove_close=False,
			
 
				+        test_mode=False,
			
 
				+        point_cloud_angle_range=None,
			
 
				+    ):
			
 
				+        """Initializes the LoadPointsFromMultiSweeps object
			
 
				+        Args:
			
 
				+            sweeps_num (int): Number of sweeps. Defaults to 10.
			
 
				+            load_dim (int): Dimension number of the loaded points. Defaults to 5.
			
 
				+            use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4].
			
 
				+                for more details. Defaults to dict(backend='disk').
			
 
				+            pad_empty_sweeps (bool): Whether to repeat keyframe when
			
 
				+                sweeps is empty. Defaults to False.
			
 
				+            remove_close (bool): Whether to remove close points.
			
 
				+                Defaults to False.
			
 
				+            test_mode (bool): If test_model=True used for testing, it will not
			
 
				+                randomly sample sweeps but select the nearest N frames.
			
 
				+                Defaults to False.
			
 
				+        """
			
 
				+        self.load_dim = load_dim
			
 
				+        self.sweeps_num = sweeps_num
			
 
				+        self.use_dim = use_dim
			
 
				+        self.pad_empty_sweeps = pad_empty_sweeps
			
 
				+        self.remove_close = remove_close
			
 
				+        self.test_mode = test_mode
			
 
				+
			
 
				+        if point_cloud_angle_range is not None:
			
 
				+            self.filter_by_angle = True
			
 
				+            self.point_cloud_angle_range = point_cloud_angle_range
			
 
				+            print(point_cloud_angle_range)
			
 
				+        else:
			
 
				+            self.filter_by_angle = False
			
 
				+            # self.point_cloud_angle_range = point_cloud_angle_range
			
 
				+
			
 
				+    def _load_points(self, pts_filename):
			
 
				+        """Private function to load point clouds data.
			
 
				+
			
 
				+        Args:
			
 
				+            pts_filename (str): Filename of point clouds data.
			
 
				+
			
 
				+        Returns:
			
 
				+            np.ndarray: An array containing point clouds data.
			
 
				+        """
			
 
				+        points = np.fromfile(pts_filename, dtype=np.float32)
			
 
				+        return points
			
 
				+
			
 
				+    def _remove_close(self, points, radius=1.0):
			
 
				+        """Removes point too close within a certain radius from origin.
			
 
				+
			
 
				+        Args:
			
 
				+            points (np.ndarray): Sweep points.
			
 
				+            radius (float): Radius below which points are removed.
			
 
				+                Defaults to 1.0.
			
 
				+
			
 
				+        Returns:
			
 
				+            np.ndarray: Points after removing.
			
 
				+        """
			
 
				+        if isinstance(points, np.ndarray):
			
 
				+            points_numpy = points
			
 
				+        else:
			
 
				+            raise NotImplementedError
			
 
				+        x_filt = np.abs(points_numpy[:, 0]) < radius
			
 
				+        y_filt = np.abs(points_numpy[:, 1]) < radius
			
 
				+        not_close = np.logical_not(np.logical_and(x_filt, y_filt))
			
 
				+        return points[not_close]
			
 
				+
			
 
				+    def filter_point_by_angle(self, points):
			
 
				+        """
			
 
				+        Filters points based on their angle in relation to the origin.
			
 
				+
			
 
				+        Args:
			
 
				+            points (np.ndarray): An array of points with shape (N, 2), where each row
			
 
				+                is a point in 2D space.
			
 
				+
			
 
				+        Returns:
			
 
				+            np.ndarray: A filtered array of points that fall within the specified
			
 
				+                angle range.
			
 
				+        """
			
 
				+        if isinstance(points, np.ndarray):
			
 
				+            points_numpy = points
			
 
				+        else:
			
 
				+            raise NotImplementedError
			
 
				+        pts_phi = (
			
 
				+            np.arctan(points_numpy[:, 0] / points_numpy[:, 1])
			
 
				+            + (points_numpy[:, 1] < 0) * np.pi
			
 
				+            + np.pi * 2
			
 
				+        ) % (np.pi * 2)
			
 
				+
			
 
				+        pts_phi[pts_phi > np.pi] -= np.pi * 2
			
 
				+        pts_phi = pts_phi / np.pi * 180
			
 
				+
			
 
				+        assert np.all(-180 <= pts_phi) and np.all(pts_phi <= 180)
			
 
				+
			
 
				+        filt = np.logical_and(
			
 
				+            pts_phi >= self.point_cloud_angle_range[0],
			
 
				+            pts_phi <= self.point_cloud_angle_range[1],
			
 
				+        )
			
 
				+        return points[filt]
			
 
				+
			
 
				+    def __call__(self, results):
			
 
				+        """Call function to load multi-sweep point clouds from files.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): Result dict containing multi-sweep point cloud \
			
 
				+                filenames.
			
 
				+
			
 
				+        Returns:
			
 
				+            dict: The result dict containing the multi-sweep points data. \
			
 
				+                Added key and value are described below.
			
 
				+
			
 
				+                - points (np.ndarray): Multi-sweep point cloud arrays.
			
 
				+        """
			
 
				+        points = results["points"]
			
 
				+        points[:, 4] = 0
			
 
				+        sweep_points_list = [points]
			
 
				+        ts = results["timestamp"]
			
 
				+        if self.pad_empty_sweeps and len(results["sweeps"]) == 0:
			
 
				+            for i in range(self.sweeps_num):
			
 
				+                if self.remove_close:
			
 
				+                    sweep_points_list.append(self._remove_close(points))
			
 
				+                else:
			
 
				+                    sweep_points_list.append(points)
			
 
				+        else:
			
 
				+            if len(results["sweeps"]) <= self.sweeps_num:
			
 
				+                choices = np.arange(len(results["sweeps"]))
			
 
				+            elif self.test_mode:
			
 
				+                choices = np.arange(self.sweeps_num)
			
 
				+            else:
			
 
				+                choices = np.random.choice(
			
 
				+                    len(results["sweeps"]), self.sweeps_num, replace=False
			
 
				+                )
			
 
				+            for idx in choices:
			
 
				+                sweep = results["sweeps"][idx]
			
 
				+                points_sweep = self._load_points(sweep["data_path"])
			
 
				+                points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim)
			
 
				+                if self.remove_close:
			
 
				+                    points_sweep = self._remove_close(points_sweep)
			
 
				+                sweep_ts = sweep["timestamp"] / 1e6
			
 
				+                points_sweep[:, :3] = (
			
 
				+                    points_sweep[:, :3] @ sweep["sensor2lidar_rotation"].T
			
 
				+                )
			
 
				+                points_sweep[:, :3] += sweep["sensor2lidar_translation"]
			
 
				+                points_sweep[:, 4] = ts - sweep_ts
			
 
				+                # points_sweep = points.new_point(points_sweep)
			
 
				+                sweep_points_list.append(points_sweep)
			
 
				+
			
 
				+        points = np.concatenate(sweep_points_list, axis=0)
			
 
				+        if self.filter_by_angle:
			
 
				+            points = self.filter_point_by_angle(points)
			
 
				+
			
 
				+        points = points[:, self.use_dim]
			
 
				+        results["points"] = points
			
 
				+        return results
			
 
				+
			
 
				+
			
 
				+class LoadMultiViewImageFromFiles:
			
 
				+    """Load multi-view images from files."""
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        to_float32=False,
			
 
				+        project_pts_to_img_depth=False,
			
 
				+        cam_depth_range=[4.0, 45.0, 1.0],
			
 
				+        constant_std=0.5,
			
 
				+        imread_flag=-1,
			
 
				+    ):
			
 
				+        """
			
 
				+        Initializes the LoadMultiViewImageFromFiles object.
			
 
				+        Args:
			
 
				+            to_float32 (bool): Whether to convert the loaded images to float32. Default: False.
			
 
				+            project_pts_to_img_depth (bool): Whether to project points to image depth. Default: False.
			
 
				+            cam_depth_range (list): Camera depth range in the format [min, max, focal]. Default: [4.0, 45.0, 1.0].
			
 
				+            constant_std (float): Constant standard deviation for normalization. Default: 0.5.
			
 
				+            imread_flag (int): Flag determining the color type of the loaded image.
			
 
				+                - -1: cv2.IMREAD_UNCHANGED
			
 
				+                -  0: cv2.IMREAD_GRAYSCALE
			
 
				+                -  1: cv2.IMREAD_COLOR
			
 
				+                Default: -1.
			
 
				+        """
			
 
				+        self.to_float32 = to_float32
			
 
				+        self.project_pts_to_img_depth = project_pts_to_img_depth
			
 
				+        self.cam_depth_range = cam_depth_range
			
 
				+        self.constant_std = constant_std
			
 
				+        self.imread_flag = imread_flag
			
 
				+
			
 
				+    def __call__(self, sample):
			
 
				+        """
			
 
				+        Call method to load multi-view image from files and update the sample dictionary.
			
 
				+
			
 
				+        Args:
			
 
				+            sample (dict): Dictionary containing the image filename key.
			
 
				+
			
 
				+        Returns:
			
 
				+            dict: Updated sample dictionary with loaded images and additional information.
			
 
				+        """
			
 
				+        filename = sample["img_filename"]
			
 
				+
			
 
				+        img = np.stack(
			
 
				+            [cv2.imread(name, self.imread_flag) for name in filename], axis=-1
			
 
				+        )
			
 
				+        if self.to_float32:
			
 
				+            img = img.astype(np.float32)
			
 
				+        sample["filename"] = filename
			
 
				+
			
 
				+        sample["img"] = [img[..., i] for i in range(img.shape[-1])]
			
 
				+        sample["img_shape"] = img.shape
			
 
				+        sample["ori_shape"] = img.shape
			
 
				+
			
 
				+        sample["pad_shape"] = img.shape
			
 
				+        # sample['scale_factor'] = 1.0
			
 
				+        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
			
 
				+
			
 
				+        sample["img_norm_cfg"] = dict(
			
 
				+            mean=np.zeros(num_channels, dtype=np.float32),
			
 
				+            std=np.ones(num_channels, dtype=np.float32),
			
 
				+            to_rgb=False,
			
 
				+        )
			
 
				+        sample["img_fields"] = ["img"]
			
 
				+        return sample
			
 
				+
			
 
				+
			
 
				+class ResizeImage:
			
 
				+    """Resize images & bbox & mask."""
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        img_scale=None,
			
 
				+        multiscale_mode="range",
			
 
				+        ratio_range=None,
			
 
				+        keep_ratio=True,
			
 
				+        bbox_clip_border=True,
			
 
				+        backend="cv2",
			
 
				+        override=False,
			
 
				+    ):
			
 
				+        """Initializes the ResizeImage object.
			
 
				+
			
 
				+        Args:
			
 
				+            img_scale (list or int, optional): The scale of the image. If a single integer is provided, it will be converted to a list. Defaults to None.
			
 
				+            multiscale_mode (str): The mode for multiscale resizing. Can be "value" or "range". Defaults to "range".
			
 
				+            ratio_range (list, optional): The range of image aspect ratios. Only used when img_scale is a single value. Defaults to None.
			
 
				+            keep_ratio (bool): Whether to keep the aspect ratio when resizing. Defaults to True.
			
 
				+            bbox_clip_border (bool): Whether to clip the bounding box to the image border. Defaults to True.
			
 
				+            backend (str): The backend to use for image resizing. Can be "cv2". Defaults to "cv2".
			
 
				+            override (bool): Whether to override certain resize parameters. Note: This option needs refactoring. Defaults to False.
			
 
				+        """
			
 
				+        if img_scale is None:
			
 
				+            self.img_scale = None
			
 
				+        else:
			
 
				+            if isinstance(img_scale, list):
			
 
				+                self.img_scale = img_scale
			
 
				+            else:
			
 
				+                self.img_scale = [img_scale]
			
 
				+
			
 
				+        if ratio_range is not None:
			
 
				+            # mode 1: given a scale and a range of image ratio
			
 
				+            assert len(self.img_scale) == 1
			
 
				+        else:
			
 
				+            # mode 2: given multiple scales or a range of scales
			
 
				+            assert multiscale_mode in ["value", "range"]
			
 
				+
			
 
				+        self.backend = backend
			
 
				+        self.multiscale_mode = multiscale_mode
			
 
				+        self.ratio_range = ratio_range
			
 
				+        self.keep_ratio = keep_ratio
			
 
				+        # TODO: refactor the override option in Resize
			
 
				+        self.override = override
			
 
				+        self.bbox_clip_border = bbox_clip_border
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def random_select(img_scales):
			
 
				+        """Randomly select an img_scale from the given list of candidates.
			
 
				+
			
 
				+        Args:
			
 
				+            img_scales (list): A list of image scales to choose from.
			
 
				+
			
 
				+        Returns:
			
 
				+            tuple: A tuple containing the selected image scale and its index in the list.
			
 
				+        """
			
 
				+        scale_idx = np.random.randint(len(img_scales))
			
 
				+        img_scale = img_scales[scale_idx]
			
 
				+        return img_scale, scale_idx
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def random_sample(img_scales):
			
 
				+        """
			
 
				+        Randomly sample an img_scale when `multiscale_mode` is set to 'range'.
			
 
				+
			
 
				+        Args:
			
 
				+            img_scales (list of tuples): A list of tuples, where each tuple contains
			
 
				+                the minimum and maximum scale dimensions for an image.
			
 
				+
			
 
				+        Returns:
			
 
				+            tuple: A tuple containing the randomly sampled img_scale (long_edge, short_edge)
			
 
				+                and None (to maintain function signature compatibility).
			
 
				+        """
			
 
				+        img_scale_long = [max(s) for s in img_scales]
			
 
				+        img_scale_short = [min(s) for s in img_scales]
			
 
				+        long_edge = np.random.randint(min(img_scale_long), max(img_scale_long) + 1)
			
 
				+        short_edge = np.random.randint(min(img_scale_short), max(img_scale_short) + 1)
			
 
				+        img_scale = (long_edge, short_edge)
			
 
				+        return img_scale, None
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def random_sample_ratio(img_scale, ratio_range):
			
 
				+        """
			
 
				+        Randomly sample an img_scale based on the specified ratio_range.
			
 
				+
			
 
				+        Args:
			
 
				+            img_scale (list): A list of two integers representing the minimum and maximum
			
 
				+                scale for the image.
			
 
				+            ratio_range (tuple): A tuple of two floats representing the minimum and maximum
			
 
				+                ratio for sampling the img_scale.
			
 
				+
			
 
				+        Returns:
			
 
				+            tuple: A tuple containing the sampled scale (as a tuple of two integers)
			
 
				+                and None.
			
 
				+        """
			
 
				+
			
 
				+        assert isinstance(img_scale, list) and len(img_scale) == 2
			
 
				+        min_ratio, max_ratio = ratio_range
			
 
				+        assert min_ratio <= max_ratio
			
 
				+        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
			
 
				+        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
			
 
				+        return scale, None
			
 
				+
			
 
				+    def _random_scale(self, results):
			
 
				+        """Randomly sample an img_scale according to `ratio_range` and `multiscale_mode`.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): A dictionary to store the sampled scale and its index.
			
 
				+
			
 
				+        Returns:
			
 
				+            None. The sampled scale and its index are stored in `results` dictionary.
			
 
				+        """
			
 
				+
			
 
				+        if self.ratio_range is not None:
			
 
				+            scale, scale_idx = self.random_sample_ratio(
			
 
				+                self.img_scale[0], self.ratio_range
			
 
				+            )
			
 
				+        elif len(self.img_scale) == 1:
			
 
				+            scale, scale_idx = self.img_scale[0], 0
			
 
				+        elif self.multiscale_mode == "range":
			
 
				+            scale, scale_idx = self.random_sample(self.img_scale)
			
 
				+        elif self.multiscale_mode == "value":
			
 
				+            scale, scale_idx = self.random_select(self.img_scale)
			
 
				+        else:
			
 
				+            raise NotImplementedError
			
 
				+
			
 
				+        results["scale"] = scale
			
 
				+        results["scale_idx"] = scale_idx
			
 
				+
			
 
				+    def _resize_img(self, results):
			
 
				+        """Resize images based on the scale factor provided in ``results['scale']`` while maintaining the aspect ratio if ``self.keep_ratio`` is True.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): A dictionary containing image fields and their corresponding scales.
			
 
				+
			
 
				+        Returns:
			
 
				+            None. The ``results`` dictionary is modified in place with resized images and additional fields like `img_shape`, `pad_shape`, `scale_factor`, and `keep_ratio`.
			
 
				+        """
			
 
				+        for key in results.get("img_fields", ["img"]):
			
 
				+            for idx in range(len(results["img"])):
			
 
				+                if self.keep_ratio:
			
 
				+                    img, scale_factor = self.imrescale(
			
 
				+                        results[key][idx],
			
 
				+                        results["scale"],
			
 
				+                        interpolation="bilinear" if key == "img" else "nearest",
			
 
				+                        return_scale=True,
			
 
				+                        backend=self.backend,
			
 
				+                    )
			
 
				+                    new_h, new_w = img.shape[:2]
			
 
				+                    h, w = results[key][idx].shape[:2]
			
 
				+                    w_scale = new_w / w
			
 
				+                    h_scale = new_h / h
			
 
				+                else:
			
 
				+                    raise NotImplementedError
			
 
				+                results[key][idx] = img
			
 
				+
			
 
				+            scale_factor = np.array(
			
 
				+                [w_scale, h_scale, w_scale, h_scale], dtype=np.float32
			
 
				+            )
			
 
				+            results["img_shape"] = img.shape
			
 
				+            # in case that there is no padding
			
 
				+            results["pad_shape"] = img.shape
			
 
				+            results["scale_factor"] = scale_factor
			
 
				+            results["keep_ratio"] = self.keep_ratio
			
 
				+
			
 
				+    def rescale_size(self, old_size, scale, return_scale=False):
			
 
				+        """
			
 
				+        Calculate the new size to be rescaled to based on the given scale.
			
 
				+
			
 
				+        Args:
			
 
				+            old_size (tuple): A tuple containing the width and height of the original size.
			
 
				+            scale (float, int, or list of int): The scale factor or a list of integers representing the maximum and minimum allowed size.
			
 
				+            return_scale (bool): Whether to return the scale factor along with the new size.
			
 
				+
			
 
				+        Returns:
			
 
				+            tuple: A tuple containing the new size and optionally the scale factor if return_scale is True.
			
 
				+
			
 
				+        """
			
 
				+        w, h = old_size
			
 
				+        if isinstance(scale, (float, int)):
			
 
				+            if scale <= 0:
			
 
				+                raise ValueError(f"Invalid scale {scale}, must be positive.")
			
 
				+            scale_factor = scale
			
 
				+        elif isinstance(scale, list):
			
 
				+            max_long_edge = max(scale)
			
 
				+            max_short_edge = min(scale)
			
 
				+            scale_factor = min(max_long_edge / max(h, w), max_short_edge / min(h, w))
			
 
				+        else:
			
 
				+            raise TypeError(
			
 
				+                f"Scale must be a number or list of int, but got {type(scale)}"
			
 
				+            )
			
 
				+
			
 
				+        def _scale_size(size, scale):
			
 
				+            if isinstance(scale, (float, int)):
			
 
				+                scale = (scale, scale)
			
 
				+            w, h = size
			
 
				+            return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
			
 
				+
			
 
				+        new_size = _scale_size((w, h), scale_factor)
			
 
				+
			
 
				+        if return_scale:
			
 
				+            return new_size, scale_factor
			
 
				+        else:
			
 
				+            return new_size
			
 
				+
			
 
				+    def imrescale(
			
 
				+        self, img, scale, return_scale=False, interpolation="bilinear", backend=None
			
 
				+    ):
			
 
				+        """Resize image while keeping the aspect ratio.
			
 
				+
			
 
				+        Args:
			
 
				+            img (numpy.ndarray): The input image.
			
 
				+            scale (float): The scaling factor.
			
 
				+            return_scale (bool): Whether to return the scaling factor along with the resized image.
			
 
				+            interpolation (str): The interpolation method to use. Defaults to 'bilinear'.
			
 
				+            backend (str): The backend to use for resizing. Defaults to None.
			
 
				+
			
 
				+        Returns:
			
 
				+            tuple or numpy.ndarray: The resized image, and optionally the scaling factor.
			
 
				+        """
			
 
				+        h, w = img.shape[:2]
			
 
				+        new_size, scale_factor = self.rescale_size((w, h), scale, return_scale=True)
			
 
				+        rescaled_img = self.imresize(
			
 
				+            img, new_size, interpolation=interpolation, backend=backend
			
 
				+        )
			
 
				+        if return_scale:
			
 
				+            return rescaled_img, scale_factor
			
 
				+        else:
			
 
				+            return rescaled_img
			
 
				+
			
 
				+    def imresize(
			
 
				+        self,
			
 
				+        img,
			
 
				+        size,
			
 
				+        return_scale=False,
			
 
				+        interpolation="bilinear",
			
 
				+        out=None,
			
 
				+        backend=None,
			
 
				+    ):
			
 
				+        """Resize an image to a given size.
			
 
				+
			
 
				+        Args:
			
 
				+            img (numpy.ndarray): The input image to be resized.
			
 
				+            size (tuple): The new size for the image as (height, width).
			
 
				+            return_scale (bool): Whether to return the scaling factors along with the resized image.
			
 
				+            interpolation (str): The interpolation method to use. Default is 'bilinear'.
			
 
				+            out (numpy.ndarray, optional): Output array. If provided, it must have the same shape and dtype as the output array.
			
 
				+            backend (str, optional): The backend to use for resizing. Supported backends are 'cv2' and 'pillow'.
			
 
				+
			
 
				+        Returns:
			
 
				+            numpy.ndarray or tuple: The resized image. If return_scale is True, returns a tuple containing the resized image and the scaling factors (w_scale, h_scale).
			
 
				+        """
			
 
				+        h, w = img.shape[:2]
			
 
				+        if backend not in ["cv2", "pillow"]:
			
 
				+            raise ValueError(
			
 
				+                f"backend: {backend} is not supported for resize."
			
 
				+                f"Supported backends are 'cv2', 'pillow'"
			
 
				+            )
			
 
				+
			
 
				+        if backend == "pillow":
			
 
				+            raise NotImplementedError
			
 
				+        else:
			
 
				+            resized_img = cv2.resize(
			
 
				+                img, size, dst=out, interpolation=cv2_interp_codes[interpolation]
			
 
				+            )
			
 
				+        if not return_scale:
			
 
				+            return resized_img
			
 
				+        else:
			
 
				+            w_scale = size[0] / w
			
 
				+            h_scale = size[1] / h
			
 
				+            return resized_img, w_scale, h_scale
			
 
				+
			
 
				+    def _resize_bboxes(self, results):
			
 
				+        """Resize bounding boxes with `results['scale_factor']`.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): A dictionary containing the bounding boxes and other related information.
			
 
				+        """
			
 
				+        for key in results.get("bbox_fields", []):
			
 
				+            bboxes = results[key] * results["scale_factor"]
			
 
				+            if self.bbox_clip_border:
			
 
				+                img_shape = results["img_shape"]
			
 
				+                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
			
 
				+                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
			
 
				+            results[key] = bboxes
			
 
				+
			
 
				+    def _resize_masks(self, results):
			
 
				+        """Resize masks with ``results['scale']``"""
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    def _resize_seg(self, results):
			
 
				+        """Resize semantic segmentation map with ``results['scale']``."""
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    def __call__(self, results):
			
 
				+        """Call function to resize images, bounding boxes, masks, and semantic segmentation maps according to the provided scale or scale factor.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): A dictionary containing the input data, including 'img', 'scale', and optionally 'scale_factor'.
			
 
				+
			
 
				+        Returns:
			
 
				+            dict: A dictionary with the resized data.
			
 
				+        """
			
 
				+        if "scale" not in results:
			
 
				+            if "scale_factor" in results:
			
 
				+                img_shape = results["img"][0].shape[:2]
			
 
				+                scale_factor = results["scale_factor"]
			
 
				+                assert isinstance(scale_factor, float)
			
 
				+                results["scale"] = list(
			
 
				+                    [int(x * scale_factor) for x in img_shape][::-1]
			
 
				+                )
			
 
				+            else:
			
 
				+                self._random_scale(results)
			
 
				+        else:
			
 
				+            if not self.override:
			
 
				+                assert (
			
 
				+                    "scale_factor" not in results
			
 
				+                ), "scale and scale_factor cannot be both set."
			
 
				+            else:
			
 
				+                results.pop("scale")
			
 
				+                if "scale_factor" in results:
			
 
				+                    results.pop("scale_factor")
			
 
				+                self._random_scale(results)
			
 
				+
			
 
				+        self._resize_img(results)
			
 
				+        self._resize_bboxes(results)
			
 
				+        return results
			
 
				+
			
 
				+
			
 
				+class NormalizeImage:
			
 
				+    """Normalize the image."""
			
 
				+
			
 
				+    """Normalize an image by subtracting the mean and dividing by the standard deviation.
			
 
				+
			
 
				+    Args:
			
 
				+        mean (list or tuple): Mean values for each channel.
			
 
				+        std (list or tuple): Standard deviation values for each channel.
			
 
				+        to_rgb (bool): Whether to convert the image from BGR to RGB.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, mean, std, to_rgb=True):
			
 
				+        """Initializes the NormalizeImage class with mean, std, and to_rgb parameters."""
			
 
				+        self.mean = np.array(mean, dtype=np.float32)
			
 
				+        self.std = np.array(std, dtype=np.float32)
			
 
				+        self.to_rgb = to_rgb
			
 
				+
			
 
				+    def _imnormalize(self, img, mean, std, to_rgb=True):
			
 
				+        """Normalize the given image inplace.
			
 
				+
			
 
				+        Args:
			
 
				+            img (numpy.ndarray): The image to normalize.
			
 
				+            mean (numpy.ndarray): Mean values for normalization.
			
 
				+            std (numpy.ndarray): Standard deviation values for normalization.
			
 
				+            to_rgb (bool): Whether to convert the image from BGR to RGB.
			
 
				+
			
 
				+        Returns:
			
 
				+            numpy.ndarray: The normalized image.
			
 
				+        """
			
 
				+        img = img.copy().astype(np.float32)
			
 
				+        mean = np.float64(mean.reshape(1, -1))
			
 
				+        stdinv = 1 / np.float64(std.reshape(1, -1))
			
 
				+        if to_rgb:
			
 
				+            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace
			
 
				+        cv2.subtract(img, mean, img)  # inplace
			
 
				+        cv2.multiply(img, stdinv, img)  # inplace
			
 
				+        return img
			
 
				+
			
 
				+    def __call__(self, results):
			
 
				+        """Call method to normalize images in the results dictionary.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): A dictionary containing image fields to normalize.
			
 
				+
			
 
				+        Returns:
			
 
				+            dict: The results dictionary with normalized images.
			
 
				+        """
			
 
				+        for key in results.get("img_fields", ["img"]):
			
 
				+            if key == "img_depth":
			
 
				+                continue
			
 
				+            for idx in range(len(results["img"])):
			
 
				+                results[key][idx] = self._imnormalize(
			
 
				+                    results[key][idx], self.mean, self.std, self.to_rgb
			
 
				+                )
			
 
				+        results["img_norm_cfg"] = dict(mean=self.mean, std=self.std, to_rgb=self.to_rgb)
			
 
				+        return results
			
 
				+
			
 
				+
			
 
				+class PadImage(object):
			
 
				+    """Pad the image & mask."""
			
 
				+
			
 
				+    def __init__(self, size=None, size_divisor=None, pad_val=0):
			
 
				+        self.size = size
			
 
				+        self.size_divisor = size_divisor
			
 
				+        self.pad_val = pad_val
			
 
				+        # only one of size and size_divisor should be valid
			
 
				+        assert size is not None or size_divisor is not None
			
 
				+        assert size is None or size_divisor is None
			
 
				+
			
 
				+    def impad(
			
 
				+        self, img, *, shape=None, padding=None, pad_val=0, padding_mode="constant"
			
 
				+    ):
			
 
				+        """Pad the given image to a certain shape or pad on all sides
			
 
				+
			
 
				+        Args:
			
 
				+            img (numpy.ndarray): The input image to be padded.
			
 
				+            shape (tuple, optional): Desired output shape in the form (height, width). One of shape or padding must be specified.
			
 
				+            padding (int, tuple, optional): Number of pixels to pad on each side of the image. If a single int is provided this
			
 
				+                is used to pad all sides with this value. If a tuple of length 2 is provided this is interpreted as (top_bottom, left_right).
			
 
				+                If a tuple of length 4 is provided this is interpreted as (top, right, bottom, left).
			
 
				+            pad_val (int, list, optional): Pixel value used for padding. If a list is provided, it must have the same length as the
			
 
				+                last dimension of the input image. Defaults to 0.
			
 
				+            padding_mode (str, optional): Padding mode to use. One of 'constant', 'edge', 'reflect', 'symmetric'.
			
 
				+                Defaults to 'constant'.
			
 
				+
			
 
				+        Returns:
			
 
				+            numpy.ndarray: The padded image.
			
 
				+
			
 
				+        """
			
 
				+
			
 
				+        assert (shape is not None) ^ (padding is not None)
			
 
				+        if shape is not None:
			
 
				+            padding = [0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]]
			
 
				+
			
 
				+        # check pad_val
			
 
				+        if isinstance(pad_val, list):
			
 
				+            assert len(pad_val) == img.shape[-1]
			
 
				+        elif not isinstance(pad_val, numbers.Number):
			
 
				+            raise TypeError(
			
 
				+                "pad_val must be a int or a list. " f"But received {type(pad_val)}"
			
 
				+            )
			
 
				+
			
 
				+        # check padding
			
 
				+        if isinstance(padding, list) and len(padding) in [2, 4]:
			
 
				+            if len(padding) == 2:
			
 
				+                padding = [padding[0], padding[1], padding[0], padding[1]]
			
 
				+        elif isinstance(padding, numbers.Number):
			
 
				+            padding = [padding, padding, padding, padding]
			
 
				+        else:
			
 
				+            raise ValueError(
			
 
				+                "Padding must be a int or a 2, or 4 element list."
			
 
				+                f"But received {padding}"
			
 
				+            )
			
 
				+
			
 
				+        # check padding mode
			
 
				+        assert padding_mode in ["constant", "edge", "reflect", "symmetric"]
			
 
				+
			
 
				+        border_type = {
			
 
				+            "constant": cv2.BORDER_CONSTANT,
			
 
				+            "edge": cv2.BORDER_REPLICATE,
			
 
				+            "reflect": cv2.BORDER_REFLECT_101,
			
 
				+            "symmetric": cv2.BORDER_REFLECT,
			
 
				+        }
			
 
				+        img = cv2.copyMakeBorder(
			
 
				+            img,
			
 
				+            padding[1],
			
 
				+            padding[3],
			
 
				+            padding[0],
			
 
				+            padding[2],
			
 
				+            border_type[padding_mode],
			
 
				+            value=pad_val,
			
 
				+        )
			
 
				+
			
 
				+        return img
			
 
				+
			
 
				+    def impad_to_multiple(self, img, divisor, pad_val=0):
			
 
				+        """
			
 
				+        Pad an image to ensure each edge length is a multiple of a given number.
			
 
				+
			
 
				+        Args:
			
 
				+            img (numpy.ndarray): The input image.
			
 
				+            divisor (int): The number to which each edge length should be a multiple.
			
 
				+            pad_val (int, optional): The value to pad the image with. Defaults to 0.
			
 
				+
			
 
				+        Returns:
			
 
				+            numpy.ndarray: The padded image.
			
 
				+        """
			
 
				+        pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
			
 
				+        pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
			
 
				+        return self.impad(img, shape=(pad_h, pad_w), pad_val=pad_val)
			
 
				+
			
 
				+    def _pad_img(self, results):
			
 
				+        """
			
 
				+        Pad images according to ``self.size`` or adjust their shapes to be multiples of ``self.size_divisor``.
			
 
				+
			
 
				+        Args:
			
 
				+            results (dict): A dictionary containing image data, with 'img_fields' as an optional key
			
 
				+                pointing to a list of image field names.
			
 
				+        """
			
 
				+        for key in results.get("img_fields", ["img"]):
			
 
				+            if self.size is not None:
			
 
				+                padded_img = self.impad(
			
 
				+                    results[key], shape=self.size, pad_val=self.pad_val
			
 
				+                )
			
 
				+            elif self.size_divisor is not None:
			
 
				+                for idx in range(len(results[key])):
			
 
				+                    padded_img = self.impad_to_multiple(
			
 
				+                        results[key][idx], self.size_divisor, pad_val=self.pad_val
			
 
				+                    )
			
 
				+                    results[key][idx] = padded_img
			
 
				+        results["pad_shape"] = padded_img.shape
			
 
				+        results["pad_fixed_size"] = self.size
			
 
				+        results["pad_size_divisor"] = self.size_divisor
			
 
				+
			
 
				+    def _pad_masks(self, results):
			
 
				+        """Pad masks according to ``results['pad_shape']``."""
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    def _pad_seg(self, results):
			
 
				+        """Pad semantic segmentation map according to ``results['pad_shape']``."""
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    def __call__(self, results):
			
 
				+        """Call function to pad images, masks, semantic segmentation maps."""
			
 
				+        self._pad_img(results)
			
 
				+        return results
			
 
				+
			
 
				+
			
 
				+class SampleFilterByKey:
			
 
				+    """Collect data from the loader relevant to the specific task."""
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        keys,
			
 
				+        meta_keys=(
			
 
				+            "filename",
			
 
				+            "ori_shape",
			
 
				+            "img_shape",
			
 
				+            "lidar2img",
			
 
				+            "depth2img",
			
 
				+            "cam2img",
			
 
				+            "pad_shape",
			
 
				+            "scale_factor",
			
 
				+            "flip",
			
 
				+            "pcd_horizontal_flip",
			
 
				+            "pcd_vertical_flip",
			
 
				+            "box_type_3d",
			
 
				+            "img_norm_cfg",
			
 
				+            "pcd_trans",
			
 
				+            "sample_idx",
			
 
				+            "pcd_scale_factor",
			
 
				+            "pcd_rotation",
			
 
				+            "pts_filename",
			
 
				+            "transformation_3d_flow",
			
 
				+        ),
			
 
				+    ):
			
 
				+        self.keys = keys
			
 
				+        self.meta_keys = meta_keys
			
 
				+
			
 
				+    def __call__(self, sample):
			
 
				+        """Call function to filter sample by keys. The keys in `meta_keys` are used to filter metadata from the input sample.
			
 
				+
			
 
				+        Args:
			
 
				+            sample (Sample): The input sample to be filtered.
			
 
				+
			
 
				+        Returns:
			
 
				+            Sample: A new Sample object containing only the filtered metadata and specified keys.
			
 
				+        """
			
 
				+        filtered_sample = Sample(path=sample.path, modality=sample.modality)
			
 
				+        filtered_sample.meta.id = sample.meta.id
			
 
				+        img_metas = {}
			
 
				+
			
 
				+        for key in self.meta_keys:
			
 
				+            if key in sample:
			
 
				+                img_metas[key] = sample[key]
			
 
				+
			
 
				+        filtered_sample["img_metas"] = img_metas
			
 
				+        for key in self.keys:
			
 
				+            filtered_sample[key] = sample[key]
			
 
				+
			
 
				+        return filtered_sample
			
 
				+
			
 
				+
			
 
				+class GetInferInput:
			
 
				+    """Collect infer input data from transformed sample"""
			
 
				+
			
 
				+    def collate_fn(self, batch):
			
 
				+        sample = batch[0]
			
 
				+        collated_batch = {}
			
 
				+        collated_fields = [
			
 
				+            "img",
			
 
				+            "points",
			
 
				+            "img_metas",
			
 
				+            "gt_bboxes_3d",
			
 
				+            "gt_labels_3d",
			
 
				+            "modality",
			
 
				+            "meta",
			
 
				+            "idx",
			
 
				+            "img_depth",
			
 
				+        ]
			
 
				+        for k in list(sample.keys()):
			
 
				+            if k not in collated_fields:
			
 
				+                continue
			
 
				+            if k == "img":
			
 
				+                collated_batch[k] = np.stack([elem[k] for elem in batch], axis=0)
			
 
				+            elif k == "img_depth":
			
 
				+                collated_batch[k] = np.stack(
			
 
				+                    [np.stack(elem[k], axis=0) for elem in batch], axis=0
			
 
				+                )
			
 
				+            else:
			
 
				+                collated_batch[k] = [elem[k] for elem in batch]
			
 
				+        return collated_batch
			
 
				+
			
 
				+    def __call__(self, sample):
			
 
				+        """Call function to infer input data from transformed sample
			
 
				+
			
 
				+        Args:
			
 
				+            sample (Sample): The input sample data.
			
 
				+
			
 
				+        Returns:
			
 
				+            infer_input (list): A list containing all the input data for inference.
			
 
				+            sample_id (str): token id of the input sample.
			
 
				+        """
			
 
				+        if sample.modality == "multimodal" or sample.modality == "multiview":
			
 
				+            if "img" in sample.keys():
			
 
				+                sample.img = np.stack(
			
 
				+                    [img.transpose(2, 0, 1) for img in sample.img], axis=0
			
 
				+                )
			
 
				+
			
 
				+        sample = self.collate_fn([sample])
			
 
				+        infer_input = []
			
 
				+
			
 
				+        img = sample.get("img", None)[0]
			
 
				+        infer_input.append(img.astype(np.float32))
			
 
				+        lidar2img = np.stack(sample["img_metas"][0]["lidar2img"]).astype(np.float32)
			
 
				+        infer_input.append(lidar2img)
			
 
				+        points = sample.get("points", None)[0]
			
 
				+        infer_input.append(points.astype(np.float32))
			
 
				+        img_metas = {
			
 
				+            "input_lidar_path": sample["img_metas"][0]["pts_filename"],
			
 
				+            "input_img_paths": sample["img_metas"][0]["filename"],
			
 
				+            "sample_id": sample["img_metas"][0]["sample_idx"],
			
 
				+        }
			
 
				+
			
 
				+        return infer_input, img_metas
			
--- a/paddlex/inference/models/3d_bev_detection/result.py
+++ b/paddlex/inference/models/3d_bev_detection/result.py
@@ -0,0 +1,32 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+from ...common.result import BaseResult, StrMixin, JsonMixin
			
 
				+
			
 
				+
			
 
				+class BEV3DDetResult(BaseResult):
			
 
				+    """Base class for computer vision results."""
			
 
				+
			
 
				+    def __init__(self, data: dict) -> None:
			
 
				+        """
			
 
				+        Initialize the BaseCVResult.
			
 
				+
			
 
				+        Args:
			
 
				+            data (dict): The initial data.
			
 
				+
			
 
				+        Raises:
			
 
				+            AssertionError: If the required key (`BaseCVResult.INPUT_IMG_KEY`) are not found in the data.
			
 
				+        """
			
 
				+
			
 
				+        super().__init__(data)
			
--- a/paddlex/inference/models/__init__.py
+++ b/paddlex/inference/models/__init__.py
@@ -15,7 +15,7 @@
 
				 
			
 
				 from pathlib import Path
			
 
				 from typing import Any, Dict, Optional
			
 
				-
			
 
				+from importlib import import_module
			
 
				 from ...utils import errors
			
 
				 from ..utils.official_models import official_models
			
 
				 from .base import BasePredictor, BasicPredictor
			
@@ -49,6 +49,9 @@ from .multilingual_speech_recognition import WhisperPredictor
 
				 from .video_classification import VideoClasPredictor
			
 
				 from .video_detection import VideoDetPredictor
			
 
				 
			
 
				+module_3d_bev_detection = import_module(".3d_bev_detection", "paddlex.inference.models")
			
 
				+BEVDet3DPredictor = getattr(module_3d_bev_detection, "BEVDet3DPredictor")
			
 
				+
			
 
				 
			
 
				 def _create_hp_predictor(
			
 
				     model_name, model_dir, device, config, hpi_params, *args, **kwargs
			
--- a/paddlex/inference/utils/new_ir_blacklist.py
+++ b/paddlex/inference/utils/new_ir_blacklist.py
@@ -23,5 +23,6 @@ NEWIR_BLOCKLIST = [
 
				     "Co-DINO-R50",
			
 
				     "Co-DINO-Swin-L",
			
 
				     "LaTeX_OCR_rec",
			
 
				+    "BEVFusion",
			
 
				     "GroundingDINO-T",
			
 
				 ]
			
--- a/paddlex/inference/utils/official_models.py
+++ b/paddlex/inference/utils/official_models.py
@@ -332,6 +332,7 @@ PP-LCNet_x1_0_vehicle_attribute_infer.tar",
 
				     "PP-DocLayout-L": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/PP-DocLayout-L_infer.tar",
			
 
				     "PP-DocLayout-M": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/PP-DocLayout-M_infer.tar",
			
 
				     "PP-DocLayout-S": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/PP-DocLayout-S_infer.tar",
			
 
				+    "BEVFusion": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/BEVFusion_infer.tar",
			
 
				 }
			
 
				 
			
 
				 
			
--- a/paddlex/ops/__init__.py
+++ b/paddlex/ops/__init__.py
@@ -0,0 +1,149 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+import importlib
			
 
				+import inspect
			
 
				+import os
			
 
				+import sys
			
 
				+from types import ModuleType
			
 
				+
			
 
				+import filelock
			
 
				+from paddle.utils.cpp_extension import load as paddle_jit_load
			
 
				+from paddlex.utils import logging
			
 
				+
			
 
				+
			
 
				+def get_user_home() -> str:
			
 
				+    return os.path.expanduser("~")
			
 
				+
			
 
				+
			
 
				+def get_pprndr_home() -> str:
			
 
				+    return os.path.join(get_user_home(), ".pprndr")
			
 
				+
			
 
				+
			
 
				+def get_sub_home(directory: str) -> str:
			
 
				+    home = os.path.join(get_pprndr_home(), directory)
			
 
				+    os.makedirs(home, exist_ok=True)
			
 
				+    return home
			
 
				+
			
 
				+
			
 
				+TMP_HOME = get_sub_home("tmp")
			
 
				+
			
 
				+custom_ops = {
			
 
				+    "voxelize": {
			
 
				+        "sources": ["voxel/voxelize_op.cc", "voxel/voxelize_op.cu"],
			
 
				+        "version": "0.1.0",
			
 
				+    },
			
 
				+    "iou3d_nms": {
			
 
				+        "sources": [
			
 
				+            "iou3d_nms/iou3d_cpu.cpp",
			
 
				+            "iou3d_nms/iou3d_nms_api.cpp",
			
 
				+            "iou3d_nms/iou3d_nms.cpp",
			
 
				+            "iou3d_nms/iou3d_nms_kernel.cu",
			
 
				+        ],
			
 
				+        "version": "0.1.0",
			
 
				+    },
			
 
				+}
			
 
				+
			
 
				+
			
 
				+class CustomOpNotFoundException(Exception):
			
 
				+    def __init__(self, op_name):
			
 
				+        self.op_name = op_name
			
 
				+
			
 
				+    def __str__(self):
			
 
				+        return "Couldn't Found custom op {}".format(self.op_name)
			
 
				+
			
 
				+
			
 
				+class CustomOperatorPathFinder:
			
 
				+    def find_module(self, fullname: str, path: str = None):
			
 
				+        if not fullname.startswith("paddlex.ops"):
			
 
				+            return None
			
 
				+
			
 
				+        return CustomOperatorPathLoader()
			
 
				+
			
 
				+
			
 
				+class CustomOperatorPathLoader:
			
 
				+    def load_module(self, fullname: str):
			
 
				+        modulename = fullname.split(".")[-1]
			
 
				+
			
 
				+        if modulename not in custom_ops:
			
 
				+            raise CustomOpNotFoundException(modulename)
			
 
				+
			
 
				+        if fullname not in sys.modules:
			
 
				+            try:
			
 
				+                sys.modules[fullname] = importlib.import_module(modulename)
			
 
				+            except ImportError:
			
 
				+                sys.modules[fullname] = PaddleXCustomOperatorModule(
			
 
				+                    modulename, fullname
			
 
				+                )
			
 
				+        return sys.modules[fullname]
			
 
				+
			
 
				+
			
 
				+class PaddleXCustomOperatorModule(ModuleType):
			
 
				+    def __init__(self, modulename: str, fullname: str):
			
 
				+        self.fullname = fullname
			
 
				+        self.modulename = modulename
			
 
				+        self.module = None
			
 
				+        super().__init__(modulename)
			
 
				+
			
 
				+    def jit_build(self):
			
 
				+        try:
			
 
				+            lockfile = "paddlex.ops.{}".format(self.modulename)
			
 
				+            lockfile = os.path.join(TMP_HOME, lockfile)
			
 
				+            file = inspect.getabsfile(sys.modules["paddlex.ops"])
			
 
				+            rootdir = os.path.split(file)[0]
			
 
				+
			
 
				+            args = custom_ops[self.modulename].copy()
			
 
				+            sources = args.pop("sources")
			
 
				+            sources = [os.path.join(rootdir, file) for file in sources]
			
 
				+
			
 
				+            args.pop("version")
			
 
				+            with filelock.FileLock(lockfile):
			
 
				+                return paddle_jit_load(name=self.modulename, sources=sources, **args)
			
 
				+        except:
			
 
				+            logging.error("{} builded fail!".format(self.modulename))
			
 
				+            raise
			
 
				+
			
 
				+    def _load_module(self):
			
 
				+        if self.module is None:
			
 
				+            try:
			
 
				+                self.module = importlib.import_module(self.modulename)
			
 
				+            except ImportError:
			
 
				+                logging.warning(
			
 
				+                    "No custom op {} found, try JIT build".format(self.modulename)
			
 
				+                )
			
 
				+                self.module = self.jit_build()
			
 
				+                logging.info("{} builded success!".format(self.modulename))
			
 
				+
			
 
				+            # refresh
			
 
				+            sys.modules[self.fullname] = self.module
			
 
				+        return self.module
			
 
				+
			
 
				+    def __getattr__(self, attr: str):
			
 
				+        if attr in ["__path__", "__file__"]:
			
 
				+            return None
			
 
				+
			
 
				+        if attr in ["__loader__", "__package__", "__name__", "__spec__"]:
			
 
				+            return super().__getattr__(attr)
			
 
				+
			
 
				+        module = self._load_module()
			
 
				+        if not hasattr(module, attr):
			
 
				+            raise ImportError(
			
 
				+                "cannot import name '{}' from '{}' ({})".format(
			
 
				+                    attr, self.modulename, module.__file__
			
 
				+                )
			
 
				+            )
			
 
				+        return getattr(module, attr)
			
 
				+
			
 
				+
			
 
				+sys.meta_path.insert(0, CustomOperatorPathFinder())
			
--- a/paddlex/ops/iou3d_nms/iou3d_cpu.cpp
+++ b/paddlex/ops/iou3d_nms/iou3d_cpu.cpp
@@ -0,0 +1,264 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+/*
			
 
				+3D Rotated IoU Calculation (CPU)
			
 
				+Written by Shaoshuai Shi
			
 
				+All Rights Reserved 2020.
			
 
				+*/
			
 
				+
			
 
				+#include "iou3d_cpu.h"
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <math.h>
			
 
				+#include <paddle/extension.h>
			
 
				+#include <stdio.h>
			
 
				+
			
 
				+#include <vector>
			
 
				+
			
 
				+inline float min(float a, float b) { return a > b ? b : a; }
			
 
				+
			
 
				+inline float max(float a, float b) { return a > b ? a : b; }
			
 
				+
			
 
				+const float EPS = 1e-8;
			
 
				+struct Point {
			
 
				+  float x, y;
			
 
				+  __device__ Point() {}
			
 
				+  __device__ Point(double _x, double _y) { x = _x, y = _y; }
			
 
				+
			
 
				+  __device__ void set(float _x, float _y) {
			
 
				+    x = _x;
			
 
				+    y = _y;
			
 
				+  }
			
 
				+
			
 
				+  __device__ Point operator+(const Point &b) const {
			
 
				+    return Point(x + b.x, y + b.y);
			
 
				+  }
			
 
				+
			
 
				+  __device__ Point operator-(const Point &b) const {
			
 
				+    return Point(x - b.x, y - b.y);
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+inline float cross(const Point &a, const Point &b) {
			
 
				+  return a.x * b.y - a.y * b.x;
			
 
				+}
			
 
				+
			
 
				+inline float cross(const Point &p1, const Point &p2, const Point &p0) {
			
 
				+  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
			
 
				+}
			
 
				+
			
 
				+inline int check_rect_cross(const Point &p1, const Point &p2, const Point &q1,
			
 
				+                            const Point &q2) {
			
 
				+  int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) &&
			
 
				+            min(q1.x, q2.x) <= max(p1.x, p2.x) &&
			
 
				+            min(p1.y, p2.y) <= max(q1.y, q2.y) &&
			
 
				+            min(q1.y, q2.y) <= max(p1.y, p2.y);
			
 
				+  return ret;
			
 
				+}
			
 
				+
			
 
				+inline int check_in_box2d(const float *box, const Point &p) {
			
 
				+  // params: (7) [x, y, z, dx, dy, dz, heading]
			
 
				+  const float MARGIN = 1e-2;
			
 
				+
			
 
				+  float center_x = box[0], center_y = box[1];
			
 
				+  float angle_cos = cos(-box[6]),
			
 
				+        angle_sin =
			
 
				+            sin(-box[6]);  // rotate the point in the opposite direction of box
			
 
				+  float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
			
 
				+  float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
			
 
				+
			
 
				+  return (fabs(rot_x) < box[3] / 2 + MARGIN &&
			
 
				+          fabs(rot_y) < box[4] / 2 + MARGIN);
			
 
				+}
			
 
				+
			
 
				+inline int intersection(const Point &p1, const Point &p0, const Point &q1,
			
 
				+                        const Point &q0, Point &ans) {
			
 
				+  // fast exclusion
			
 
				+  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
			
 
				+
			
 
				+  // check cross standing
			
 
				+  float s1 = cross(q0, p1, p0);
			
 
				+  float s2 = cross(p1, q1, p0);
			
 
				+  float s3 = cross(p0, q1, q0);
			
 
				+  float s4 = cross(q1, p1, q0);
			
 
				+
			
 
				+  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
			
 
				+
			
 
				+  // calculate intersection of two lines
			
 
				+  float s5 = cross(q1, p1, p0);
			
 
				+  if (fabs(s5 - s1) > EPS) {
			
 
				+    ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
			
 
				+    ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
			
 
				+
			
 
				+  } else {
			
 
				+    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
			
 
				+    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
			
 
				+    float D = a0 * b1 - a1 * b0;
			
 
				+
			
 
				+    ans.x = (b0 * c1 - b1 * c0) / D;
			
 
				+    ans.y = (a1 * c0 - a0 * c1) / D;
			
 
				+  }
			
 
				+
			
 
				+  return 1;
			
 
				+}
			
 
				+
			
 
				+inline void rotate_around_center(const Point &center, const float angle_cos,
			
 
				+                                 const float angle_sin, Point &p) {
			
 
				+  float new_x =
			
 
				+      (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
			
 
				+  float new_y =
			
 
				+      (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
			
 
				+  p.set(new_x, new_y);
			
 
				+}
			
 
				+
			
 
				+inline int point_cmp(const Point &a, const Point &b, const Point &center) {
			
 
				+  return atan2(a.y - center.y, a.x - center.x) >
			
 
				+         atan2(b.y - center.y, b.x - center.x);
			
 
				+}
			
 
				+
			
 
				+inline float box_overlap(const float *box_a, const float *box_b) {
				+  // Area of the intersection of two rotated rectangles in the x-y plane.
				+  // params: box_a (7) [x, y, z, dx, dy, dz, heading]
				+  // params: box_b (7) [x, y, z, dx, dy, dz, heading]
				+  // returns: overlap area, 0 when no intersection vertex is found.
				+
				+  float a_angle = box_a[6], b_angle = box_b[6];
				+  float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
				+        a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
				+  float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
				+  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
				+  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
				+  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
				+
				+  Point center_a(box_a[0], box_a[1]);
				+  Point center_b(box_b[0], box_b[1]);
				+
				+  // Axis-aligned corners first; slot 4 later repeats corner 0 so that
				+  // edge i is always (corner[i], corner[i + 1]).
				+  Point box_a_corners[5];
				+  box_a_corners[0].set(a_x1, a_y1);
				+  box_a_corners[1].set(a_x2, a_y1);
				+  box_a_corners[2].set(a_x2, a_y2);
				+  box_a_corners[3].set(a_x1, a_y2);
				+
				+  Point box_b_corners[5];
				+  box_b_corners[0].set(b_x1, b_y1);
				+  box_b_corners[1].set(b_x2, b_y1);
				+  box_b_corners[2].set(b_x2, b_y2);
				+  box_b_corners[3].set(b_x1, b_y2);
				+
				+  // get oriented corners
				+  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
				+  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
				+
				+  for (int k = 0; k < 4; k++) {
				+    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
				+    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
				+  }
				+
				+  box_a_corners[4] = box_a_corners[0];
				+  box_b_corners[4] = box_b_corners[0];
				+
				+  // get intersection of lines
				+  Point cross_points[16];
				+  Point poly_center;
				+  int cnt = 0, flag = 0;
				+
				+  poly_center.set(0, 0);
				+  for (int i = 0; i < 4; i++) {
				+    for (int j = 0; j < 4; j++) {
				+      flag = intersection(box_a_corners[i + 1], box_a_corners[i],
				+                          box_b_corners[j + 1], box_b_corners[j],
				+                          cross_points[cnt]);
				+      if (flag) {
				+        poly_center = poly_center + cross_points[cnt];
				+        cnt++;
				+      }
				+    }
				+  }
				+
				+  // check corners
				+  for (int k = 0; k < 4; k++) {
				+    if (check_in_box2d(box_a, box_b_corners[k])) {
				+      poly_center = poly_center + box_b_corners[k];
				+      cross_points[cnt] = box_b_corners[k];
				+      cnt++;
				+    }
				+    if (check_in_box2d(box_b, box_a_corners[k])) {
				+      poly_center = poly_center + box_a_corners[k];
				+      cross_points[cnt] = box_a_corners[k];
				+      cnt++;
				+    }
				+  }
				+
				+  // No vertex collected means the boxes do not overlap. Return before
				+  // averaging so the centroid is never computed as 0 / 0.
				+  if (cnt == 0) return 0;
				+
				+  poly_center.x /= cnt;
				+  poly_center.y /= cnt;
				+
				+  // sort the points of polygon (bubble sort by polar angle around the
				+  // centroid)
				+  Point temp;
				+  for (int j = 0; j < cnt - 1; j++) {
				+    for (int i = 0; i < cnt - j - 1; i++) {
				+      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
				+        temp = cross_points[i];
				+        cross_points[i] = cross_points[i + 1];
				+        cross_points[i + 1] = temp;
				+      }
				+    }
				+  }
				+
				+  // get the overlap areas: fan of triangles anchored at vertex 0
				+  float area = 0;
				+  for (int k = 0; k < cnt - 1; k++) {
				+    area += cross(cross_points[k] - cross_points[0],
				+                  cross_points[k + 1] - cross_points[0]);
				+  }
				+
				+  return fabs(area) / 2.0;
				+}
			
 
				+
			
 
				+inline float iou_bev(const float *box_a, const float *box_b) {
				+  // Bird's-eye-view IoU of two boxes.
				+  // params: box_a (7) [x, y, z, dx, dy, dz, heading]
				+  // params: box_b (7) [x, y, z, dx, dy, dz, heading]
				+  const float inter = box_overlap(box_a, box_b);
				+  const float area_a = box_a[3] * box_a[4];
				+  const float area_b = box_b[3] * box_b[4];
				+  // Clamp the union from below by EPS so the division stays finite.
				+  const float uni = fmaxf(area_a + area_b - inter, EPS);
				+  return inter / uni;
				+}
			
 
				+
			
 
				+std::vector<paddle::Tensor> boxes_iou_bev_cpu(
				+    const paddle::Tensor &boxes_a_tensor,
				+    const paddle::Tensor &boxes_b_tensor) {
				+  // Pairwise bird's-eye-view IoU computed on the host.
				+  // params boxes_a_tensor: (N, 7) [x, y, z, dx, dy, dz, heading]
				+  // params boxes_b_tensor: (M, 7) [x, y, z, dx, dy, dz, heading]
				+  // returns: single-element vector holding the (N, M) float32 IoU matrix
				+
				+  const int num_boxes_a = boxes_a_tensor.shape()[0];
				+  const int num_boxes_b = boxes_b_tensor.shape()[0];
				+
				+  auto ans_iou_tensor =
				+      paddle::empty({num_boxes_a, num_boxes_b}, paddle::DataType::FLOAT32,
				+                    paddle::CPUPlace());
				+
				+  const float *boxes_a = boxes_a_tensor.data<float>();
				+  const float *boxes_b = boxes_b_tensor.data<float>();
				+  float *ans_iou = ans_iou_tensor.data<float>();
				+
				+  // Row i of the output holds the IoU of box i from A against every box in B.
				+  for (int i = 0; i < num_boxes_a; i++) {
				+    const float *box_a = boxes_a + i * 7;
				+    float *row = ans_iou + i * num_boxes_b;
				+    for (int j = 0; j < num_boxes_b; j++) {
				+      row[j] = iou_bev(box_a, boxes_b + j * 7);
				+    }
				+  }
				+  return {ans_iou_tensor};
				+}
			
--- a/paddlex/ops/iou3d_nms/iou3d_cpu.h
+++ b/paddlex/ops/iou3d_nms/iou3d_cpu.h
@@ -0,0 +1,27 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef IOU3D_CPU_H
			
 
				+#define IOU3D_CPU_H
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <paddle/extension.h>
			
 
				+
			
 
				+#include <vector>
			
 
				+
			
 
				+// Host-side pairwise bird's-eye-view IoU.
				+// boxes_a_tensor: (N, 7) and boxes_b_tensor: (M, 7), each row laid out as
				+// [x, y, z, dx, dy, dz, heading]. Returns a one-element vector holding the
				+// (N, M) float32 IoU matrix allocated on the CPU.
				+std::vector<paddle::Tensor> boxes_iou_bev_cpu(
				+    const paddle::Tensor& boxes_a_tensor, const paddle::Tensor& boxes_b_tensor);
			
 
				+
			
 
				+#endif
			
--- a/paddlex/ops/iou3d_nms/iou3d_nms.cpp
+++ b/paddlex/ops/iou3d_nms/iou3d_nms.cpp
@@ -0,0 +1,204 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+/*
			
 
				+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
			
 
				+Written by Shaoshuai Shi
			
 
				+All Rights Reserved 2019-2020.
			
 
				+*/
			
 
				+
			
 
				+#include "iou3d_nms.h"
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <paddle/extension.h>
			
 
				+
			
 
				+#include <vector>
			
 
				+
			
 
				+// Integer division of m by n, rounded up when there is a positive remainder.
				+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
				+
				+// Number of bits in one int64_t mask word; each box occupies one bit.
				+const int THREADS_PER_BLOCK_NMS = sizeof(int64_t) * 8;
				+
				+// Launchers declared here and defined in another translation unit
				+// (presumably iou3d_nms_kernel.cu -- verify). All pointers passed to them
				+// by this file come from tensors allocated with paddle::GPUPlace() or from
				+// the input tensors themselves.
				+
				+// Writes the (num_a, num_b) overlap matrix into ans_overlap.
				+void BoxesOverlapLauncher(const cudaStream_t &stream, const int num_a,
				+                          const float *boxes_a, const int num_b,
				+                          const float *boxes_b, float *ans_overlap);
				+// Writes the (num_a, num_b) IoU matrix into ans_iou.
				+void BoxesIouBevLauncher(const cudaStream_t &stream, const int num_a,
				+                         const float *boxes_a, const int num_b,
				+                         const float *boxes_b, float *ans_iou);
				+// Fills mask, which callers size as boxes_num * DIVUP(boxes_num, 64) words.
				+void NmsLauncher(const cudaStream_t &stream, const float *boxes, int64_t *mask,
				+                 int boxes_num, float nms_overlap_thresh);
				+// Same mask layout as NmsLauncher; how the two overlap tests differ is not
				+// visible from this file.
				+void NmsNormalLauncher(const cudaStream_t &stream, const float *boxes,
				+                       int64_t *mask, int boxes_num, float nms_overlap_thresh);
			
 
				+
			
 
				+std::vector<paddle::Tensor> boxes_overlap_bev_gpu(
				+    const paddle::Tensor &boxes_a, const paddle::Tensor &boxes_b) {
				+  // Pairwise overlap of two box sets, delegated to BoxesOverlapLauncher.
				+  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
				+  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
				+  // returns: single-element vector holding ans_overlap, (N, M) float32
				+  const int num_a = boxes_a.shape()[0];
				+  const int num_b = boxes_b.shape()[0];
				+
				+  auto ans_overlap = paddle::empty({num_a, num_b}, paddle::DataType::FLOAT32,
				+                                   paddle::GPUPlace());
				+
				+  // The launch is enqueued on the stream that owns boxes_a.
				+  BoxesOverlapLauncher(boxes_a.stream(), num_a, boxes_a.data<float>(), num_b,
				+                       boxes_b.data<float>(), ans_overlap.data<float>());
				+
				+  return {ans_overlap};
				+}
			
 
				+
			
 
				+std::vector<paddle::Tensor> boxes_iou_bev_gpu(
				+    const paddle::Tensor &boxes_a_tensor,
				+    const paddle::Tensor &boxes_b_tensor) {
				+  // Pairwise bird's-eye-view IoU, delegated to BoxesIouBevLauncher.
				+  // params boxes_a_tensor: (N, 7) [x, y, z, dx, dy, dz, heading]
				+  // params boxes_b_tensor: (M, 7) [x, y, z, dx, dy, dz, heading]
				+  // returns: single-element vector holding the (N, M) float32 IoU matrix
				+
				+  const int num_a = boxes_a_tensor.shape()[0];
				+  const int num_b = boxes_b_tensor.shape()[0];
				+
				+  auto ans_iou_tensor = paddle::empty({num_a, num_b}, paddle::DataType::FLOAT32,
				+                                      paddle::GPUPlace());
				+
				+  // The launch is enqueued on the stream that owns boxes_a_tensor.
				+  BoxesIouBevLauncher(boxes_a_tensor.stream(), num_a,
				+                      boxes_a_tensor.data<float>(), num_b,
				+                      boxes_b_tensor.data<float>(),
				+                      ans_iou_tensor.data<float>());
				+
				+  return {ans_iou_tensor};
				+}
			
 
				+
			
 
				+std::vector<paddle::Tensor> nms_gpu(const paddle::Tensor &boxes,
				+                                    float nms_overlap_thresh) {
				+  // NMS in two stages: NmsLauncher fills a per-box suppression bit mask on
				+  // the device, then the mask is reduced greedily on the host.
				+  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
				+  // params nms_overlap_thresh: forwarded unchanged to NmsLauncher
				+  // returns {keep, num_to_keep}: INT32 CPU tensors of shape (N) and (1).
				+  // Only the first num_to_keep entries of keep are written.
				+  // NOTE(review): boxes are visited in index order, so callers presumably
				+  // pass them sorted by descending score -- confirm at the call site.
				+  auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
				+                            paddle::CPUPlace());
				+  auto num_to_keep_tensor =
				+      paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
				+  int *num_to_keep_data = num_to_keep_tensor.data<int>();
				+
				+  const int boxes_num = boxes.shape()[0];
				+  if (boxes_num == 0) {
				+    // Empty input: nothing to keep, and no zero-sized mask or launch needed.
				+    num_to_keep_data[0] = 0;
				+    return {keep, num_to_keep_tensor};
				+  }
				+
				+  const float *boxes_data = boxes.data<float>();
				+  int *keep_data = keep.data<int>();
				+
				+  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
				+
				+  // One row of col_blocks 64-bit words per box. The element count is
				+  // computed in 64 bits because boxes_num * col_blocks can exceed INT_MAX.
				+  const int64_t mask_len = static_cast<int64_t>(boxes_num) * col_blocks;
				+  auto mask =
				+      paddle::empty({mask_len}, paddle::DataType::INT64, paddle::GPUPlace());
				+  int64_t *mask_data = mask.data<int64_t>();
				+  NmsLauncher(boxes.stream(), boxes_data, mask_data, boxes_num,
				+              nms_overlap_thresh);
				+
				+  // Blocking device-to-host copy of the mask.
				+  const paddle::Tensor mask_cpu_tensor = mask.copy_to(paddle::CPUPlace(), true);
				+  const int64_t *mask_cpu = mask_cpu_tensor.data<int64_t>();
				+
				+  // Bit b of remv_cpu[w] is set once box (w * 64 + b) has been suppressed.
				+  // Heap-allocated and zero-initialised; replaces a variable-length array.
				+  std::vector<int64_t> remv_cpu(col_blocks, 0);
				+
				+  int num_to_keep = 0;
				+
				+  for (int i = 0; i < boxes_num; i++) {
				+    const int nblock = i / THREADS_PER_BLOCK_NMS;
				+    const int inblock = i % THREADS_PER_BLOCK_NMS;
				+
				+    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
				+      keep_data[num_to_keep++] = i;
				+      // Row i of the mask; words before nblock are never read again.
				+      const int64_t *p = mask_cpu + static_cast<int64_t>(i) * col_blocks;
				+      for (int j = nblock; j < col_blocks; j++) {
				+        remv_cpu[j] |= p[j];
				+      }
				+    }
				+  }
				+
				+  num_to_keep_data[0] = num_to_keep;
				+  if (cudaSuccess != cudaGetLastError()) printf("Error!\n");
				+
				+  return {keep, num_to_keep_tensor};
				+}
			
 
				+
			
 
				+std::vector<paddle::Tensor> nms_normal_gpu(const paddle::Tensor &boxes,
				+                                           float nms_overlap_thresh) {
				+  // NMS in two stages: NmsNormalLauncher fills a per-box suppression bit
				+  // mask on the device, then the mask is reduced greedily on the host.
				+  // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
				+  // params nms_overlap_thresh: forwarded unchanged to NmsNormalLauncher
				+  // params keep: (N)
				+  // returns {keep, num_to_keep}: INT32 CPU tensors of shape (N) and (1).
				+  // Only the first num_to_keep entries of keep are written.
				+
				+  auto keep = paddle::empty({boxes.shape()[0]}, paddle::DataType::INT32,
				+                            paddle::CPUPlace());
				+  auto num_to_keep_tensor =
				+      paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace());
				+  int *num_to_keep_data = num_to_keep_tensor.data<int>();
				+
				+  const int boxes_num = boxes.shape()[0];
				+  if (boxes_num == 0) {
				+    // Empty input: nothing to keep, and no zero-sized mask or launch needed.
				+    num_to_keep_data[0] = 0;
				+    return {keep, num_to_keep_tensor};
				+  }
				+
				+  const float *boxes_data = boxes.data<float>();
				+  int *keep_data = keep.data<int>();
				+
				+  const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
				+
				+  // One row of col_blocks 64-bit words per box. The element count is
				+  // computed in 64 bits because boxes_num * col_blocks can exceed INT_MAX.
				+  const int64_t mask_len = static_cast<int64_t>(boxes_num) * col_blocks;
				+  auto mask =
				+      paddle::empty({mask_len}, paddle::DataType::INT64, paddle::GPUPlace());
				+  int64_t *mask_data = mask.data<int64_t>();
				+  NmsNormalLauncher(boxes.stream(), boxes_data, mask_data, boxes_num,
				+                    nms_overlap_thresh);
				+
				+  // Blocking device-to-host copy of the mask.
				+  const paddle::Tensor mask_cpu_tensor = mask.copy_to(paddle::CPUPlace(), true);
				+  const int64_t *mask_cpu = mask_cpu_tensor.data<int64_t>();
				+
				+  // Bit b of remv_cpu[w] is set once box (w * 64 + b) has been suppressed.
				+  // Heap-allocated and zero-initialised; replaces a variable-length array.
				+  std::vector<int64_t> remv_cpu(col_blocks, 0);
				+
				+  int num_to_keep = 0;
				+
				+  for (int i = 0; i < boxes_num; i++) {
				+    const int nblock = i / THREADS_PER_BLOCK_NMS;
				+    const int inblock = i % THREADS_PER_BLOCK_NMS;
				+
				+    if (!(remv_cpu[nblock] & (1ULL << inblock))) {
				+      keep_data[num_to_keep++] = i;
				+      // Row i of the mask; words before nblock are never read again.
				+      const int64_t *p = mask_cpu + static_cast<int64_t>(i) * col_blocks;
				+      for (int j = nblock; j < col_blocks; j++) {
				+        remv_cpu[j] |= p[j];
				+      }
				+    }
				+  }
				+
				+  num_to_keep_data[0] = num_to_keep;
				+  if (cudaSuccess != cudaGetLastError()) {
				+    printf("Error!\n");
				+  }
				+  return {keep, num_to_keep_tensor};
				+}
			
--- a/paddlex/ops/iou3d_nms/iou3d_nms.h
+++ b/paddlex/ops/iou3d_nms/iou3d_nms.h
@@ -0,0 +1,33 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#ifndef IOU3D_NMS_H
			
 
				+#define IOU3D_NMS_H
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <paddle/extension.h>
			
 
				+
			
 
				+#include <vector>
			
 
				+
			
 
				+// All boxes are rows of [x, y, z, dx, dy, dz, heading].
				+
				+// Pairwise overlap: boxes_a (N, 7), boxes_b (M, 7) -> {(N, M) float32}.
				+std::vector<paddle::Tensor> boxes_overlap_bev_gpu(
				+    const paddle::Tensor &boxes_a, const paddle::Tensor &boxes_b);
				+// Pairwise bird's-eye-view IoU: (N, 7), (M, 7) -> {(N, M) float32}.
				+std::vector<paddle::Tensor> boxes_iou_bev_gpu(
				+    const paddle::Tensor &boxes_a_tensor, const paddle::Tensor &boxes_b_tensor);
				+// NMS over boxes (N, 7). Returns {keep (N), num_to_keep (1)} on the CPU;
				+// only the first num_to_keep entries of keep are written.
				+std::vector<paddle::Tensor> nms_gpu(const paddle::Tensor &boxes,
				+                                    float nms_overlap_thresh);
				+// Same contract as nms_gpu, built on NmsNormalLauncher.
				+std::vector<paddle::Tensor> nms_normal_gpu(const paddle::Tensor &boxes,
				+                                           float nms_overlap_thresh);
			
 
				+
			
 
				+#endif
			
--- a/paddlex/ops/iou3d_nms/iou3d_nms_api.cpp
+++ b/paddlex/ops/iou3d_nms/iou3d_nms_api.cpp
@@ -0,0 +1,108 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <paddle/extension.h>
			
 
				+
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "iou3d_cpu.h"
			
 
				+#include "iou3d_nms.h"
			
 
				+
			
 
				+std::vector<paddle::DataType> BoxesIouBevCpuInferDtype(
				+    paddle::DataType boxes_a_dtype, paddle::DataType boxes_b_dtype) {
				+  // The single output is declared with the dtype of the first input.
				+  std::vector<paddle::DataType> out_dtypes;
				+  out_dtypes.push_back(boxes_a_dtype);
				+  return out_dtypes;
				+}
			
 
				+
			
 
				+std::vector<std::vector<int64_t>> BoxesIouBevCpuInferShape(
				+    std::vector<int64_t> boxes_a_shape, std::vector<int64_t> boxes_b_shape) {
				+  // Output is (N, M): the leading dimension of each input.
				+  std::vector<int64_t> iou_shape;
				+  iou_shape.push_back(boxes_a_shape[0]);
				+  iou_shape.push_back(boxes_b_shape[0]);
				+
				+  std::vector<std::vector<int64_t>> out_shapes;
				+  out_shapes.push_back(iou_shape);
				+  return out_shapes;
				+}
			
 
				+
			
 
				+std::vector<paddle::DataType> NmsInferDtype(paddle::DataType boxes_dtype) {
				+  // Declared dtypes of the two outputs (keep, num_to_keep). nms_gpu
				+  // allocates both as INT32 and writes them through int*, so INT32 is
				+  // declared here; the input dtype does not affect the result.
				+  return {paddle::DataType::INT32, paddle::DataType::INT32};
				+}
			
 
				+
			
 
				+std::vector<std::vector<int64_t>> NmsInferShape(
				+    std::vector<int64_t> boxes_shape) {
				+  // keep has one slot per input box; num_to_keep is a single element.
				+  std::vector<int64_t> keep_shape;
				+  keep_shape.push_back(boxes_shape[0]);
				+  std::vector<int64_t> count_shape;
				+  count_shape.push_back(1);
				+
				+  std::vector<std::vector<int64_t>> out_shapes;
				+  out_shapes.push_back(keep_shape);
				+  out_shapes.push_back(count_shape);
				+  return out_shapes;
				+}
			
 
				+
			
 
				+std::vector<paddle::DataType> NmsNormalInferDtype(
				+    paddle::DataType boxes_dtype) {
				+  // Declared dtypes of the two outputs (keep, num_to_keep). nms_normal_gpu
				+  // allocates both as INT32 and writes them through int*, so INT32 is
				+  // declared here; the input dtype does not affect the result.
				+  return {paddle::DataType::INT32, paddle::DataType::INT32};
				+}
			
 
				+
			
 
				+std::vector<std::vector<int64_t>> NmsNormalInferShape(
				+    std::vector<int64_t> boxes_shape) {
				+  // keep has one slot per input box; num_to_keep is a single element.
				+  std::vector<int64_t> keep_shape;
				+  keep_shape.push_back(boxes_shape[0]);
				+  std::vector<int64_t> count_shape;
				+  count_shape.push_back(1);
				+
				+  std::vector<std::vector<int64_t>> out_shapes;
				+  out_shapes.push_back(keep_shape);
				+  out_shapes.push_back(count_shape);
				+  return out_shapes;
				+}
			
 
				+
			
 
				+std::vector<paddle::DataType> BoxesIouBevGpuInferDtype(
				+    paddle::DataType boxes_a_dtype, paddle::DataType boxes_b_dtype) {
				+  // The single output is declared with the dtype of the first input.
				+  std::vector<paddle::DataType> out_dtypes;
				+  out_dtypes.push_back(boxes_a_dtype);
				+  return out_dtypes;
				+}
			
 
				+
			
 
				+std::vector<std::vector<int64_t>> BoxesIouBevGpuInferShape(
				+    std::vector<int64_t> boxes_a_shape, std::vector<int64_t> boxes_b_shape) {
				+  // Output is (N, M): the leading dimension of each input.
				+  std::vector<int64_t> iou_shape;
				+  iou_shape.push_back(boxes_a_shape[0]);
				+  iou_shape.push_back(boxes_b_shape[0]);
				+
				+  std::vector<std::vector<int64_t>> out_shapes;
				+  out_shapes.push_back(iou_shape);
				+  return out_shapes;
				+}
			
 
				+
			
 
				+std::vector<paddle::DataType> BoxesOverlapBevGpuInferDtype(
				+    paddle::DataType boxes_a_dtype, paddle::DataType boxes_b_dtype) {
				+  // The single output is declared with the dtype of the first input.
				+  std::vector<paddle::DataType> out_dtypes;
				+  out_dtypes.push_back(boxes_a_dtype);
				+  return out_dtypes;
				+}
			
 
				+
			
 
				+std::vector<std::vector<int64_t>> BoxesOverlapBevGpuInferShape(
				+    std::vector<int64_t> boxes_a_shape, std::vector<int64_t> boxes_b_shape) {
				+  // Output is (N, M): the leading dimension of each input.
				+  std::vector<int64_t> overlap_shape;
				+  overlap_shape.push_back(boxes_a_shape[0]);
				+  overlap_shape.push_back(boxes_b_shape[0]);
				+
				+  std::vector<std::vector<int64_t>> out_shapes;
				+  out_shapes.push_back(overlap_shape);
				+  return out_shapes;
				+}
			
 
				+
			
 
				+// Operator registrations. Each one binds an op name to its kernel function
				+// and to the dtype/shape inference helpers defined above.
				+// NOTE(review): the second input name of the three pairwise ops starts
				+// with a space (" boxes_b_tensor", " boxes_b"). It is part of the
				+// registered name, so renaming it may affect anything that refers to the
				+// input by name -- confirm before normalising.
				+
				+// Host pairwise BEV IoU.
				+PD_BUILD_OP(boxes_iou_bev_cpu)
				+    .Inputs({"boxes_a_tensor", " boxes_b_tensor"})
				+    .Outputs({"ans_iou_tensor"})
				+    .SetKernelFn(PD_KERNEL(boxes_iou_bev_cpu))
				+    .SetInferDtypeFn(PD_INFER_DTYPE(BoxesIouBevCpuInferDtype))
				+    .SetInferShapeFn(PD_INFER_SHAPE(BoxesIouBevCpuInferShape));
				+
				+// Device pairwise BEV IoU.
				+PD_BUILD_OP(boxes_iou_bev_gpu)
				+    .Inputs({"boxes_a_tensor", " boxes_b_tensor"})
				+    .Outputs({"ans_iou_tensor"})
				+    .SetKernelFn(PD_KERNEL(boxes_iou_bev_gpu))
				+    .SetInferDtypeFn(PD_INFER_DTYPE(BoxesIouBevGpuInferDtype))
				+    .SetInferShapeFn(PD_INFER_SHAPE(BoxesIouBevGpuInferShape));
				+
				+// Device pairwise BEV overlap.
				+PD_BUILD_OP(boxes_overlap_bev_gpu)
				+    .Inputs({"boxes_a", " boxes_b"})
				+    .Outputs({"ans_overlap"})
				+    .SetKernelFn(PD_KERNEL(boxes_overlap_bev_gpu))
				+    .SetInferDtypeFn(PD_INFER_DTYPE(BoxesOverlapBevGpuInferDtype))
				+    .SetInferShapeFn(PD_INFER_SHAPE(BoxesOverlapBevGpuInferShape));
				+
				+// NMS built on NmsLauncher; takes one float attribute.
				+PD_BUILD_OP(nms_gpu)
				+    .Inputs({"boxes"})
				+    .Outputs({"keep", "num_to_keep"})
				+    .Attrs({"nms_overlap_thresh: float"})
				+    .SetKernelFn(PD_KERNEL(nms_gpu))
				+    .SetInferDtypeFn(PD_INFER_DTYPE(NmsInferDtype))
				+    .SetInferShapeFn(PD_INFER_SHAPE(NmsInferShape));
				+
				+// NMS built on NmsNormalLauncher; same inputs, outputs and attribute as
				+// nms_gpu (the setters are merely chained in a different order).
				+PD_BUILD_OP(nms_normal_gpu)
				+    .Inputs({"boxes"})
				+    .Outputs({"keep", "num_to_keep"})
				+    .Attrs({"nms_overlap_thresh: float"})
				+    .SetInferShapeFn(PD_INFER_SHAPE(NmsNormalInferShape))
				+    .SetKernelFn(PD_KERNEL(nms_normal_gpu))
				+    .SetInferDtypeFn(PD_INFER_DTYPE(NmsNormalInferDtype));
			
--- a/paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu
+++ b/paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu
@@ -0,0 +1,482 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+/*
			
 
				+3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
			
 
				+Written by Shaoshuai Shi
			
 
				+All Rights Reserved 2019-2020.
			
 
				+*/
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+// Thread-block size constant; its use is outside the visible part of this
				+// file (presumably the launch configuration -- verify).
				+#define THREADS_PER_BLOCK 16
				+// Integer division of m by n, rounded up when there is a positive remainder.
				+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
				+
				+// Uncomment to compile in the printf tracing guarded by #ifdef DEBUG.
				+// #define DEBUG
				+// Number of bits in one int64_t mask word.
				+const int THREADS_PER_BLOCK_NMS = sizeof(int64_t) * 8;
				+// Tolerance used by intersection() to pick between its two formulas.
				+const float EPS = 1e-8;
			
 
				+// 2D point / vector with float coordinates, usable from device code.
				+struct Point {
				+  float x, y;
				+
				+  // Leaves x and y uninitialised.
				+  __device__ Point() {}
				+  // Coordinates are accepted as double and stored as float.
				+  __device__ Point(double _x, double _y) : x(_x), y(_y) {}
				+
				+  // Overwrites both coordinates.
				+  __device__ void set(float _x, float _y) {
				+    x = _x;
				+    y = _y;
				+  }
				+
				+  // Component-wise sum.
				+  __device__ Point operator+(const Point &b) const {
				+    return Point(x + b.x, y + b.y);
				+  }
				+
				+  // Component-wise difference.
				+  __device__ Point operator-(const Point &b) const {
				+    return Point(x - b.x, y - b.y);
				+  }
				+};
			
 
				+
			
 
				+// 2D cross product a x b, i.e. the z component of the 3D cross product of
				+// a and b taken as vectors in the plane.
				+__device__ inline float cross(const Point &a, const Point &b) {
				+  return a.x * b.y - a.y * b.x;
				+}
			
 
				+
			
 
				+// 2D cross product (p1 - p0) x (p2 - p0). The sign tells on which side of
				+// the line through p0 and p1 the point p2 lies; zero means collinear.
				+__device__ inline float cross(const Point &p1, const Point &p2,
				+                              const Point &p0) {
				+  return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
				+}
			
 
				+
			
 
				+__device__ int check_rect_cross(const Point &p1, const Point &p2,
				+                                const Point &q1, const Point &q2) {
				+  // Returns 1 when the axis-aligned bounding boxes of segments p1-p2 and
				+  // q1-q2 overlap (touching counts), 0 otherwise. Each test is written as
				+  // a negated <= so comparisons involving NaN still yield 0.
				+  if (!(min(p1.x, p2.x) <= max(q1.x, q2.x))) return 0;
				+  if (!(min(q1.x, q2.x) <= max(p1.x, p2.x))) return 0;
				+  if (!(min(p1.y, p2.y) <= max(q1.y, q2.y))) return 0;
				+  if (!(min(q1.y, q2.y) <= max(p1.y, p2.y))) return 0;
				+  return 1;
				+}
			
 
				+
			
 
				+__device__ inline int check_in_box2d(const float *box, const Point &p) {
				+  // Returns non-zero when p lies inside the rotated rectangle described by
				+  // box, with a small tolerance on each half extent.
				+  // params: (7) [x, y, z, dx, dy, dz, heading]
				+  const float MARGIN = 1e-2;
				+
				+  // Offset of p from the box centre.
				+  const float off_x = p.x - box[0];
				+  const float off_y = p.y - box[1];
				+
				+  // Rotate the offset by -heading so it is expressed in the box frame.
				+  const float c = cos(-box[6]);
				+  const float s = sin(-box[6]);
				+  const float local_x = off_x * c + off_y * (-s);
				+  const float local_y = off_x * s + off_y * c;
				+
				+  const bool within_dx = fabs(local_x) < box[3] / 2 + MARGIN;
				+  const bool within_dy = fabs(local_y) < box[4] / 2 + MARGIN;
				+  return within_dx && within_dy;
				+}
			
 
				+
			
 
				+// Tests whether segments p0-p1 and q0-q1 properly cross. On success stores
				+// the crossing point in ans and returns 1; otherwise returns 0 and leaves
				+// ans untouched.
				+__device__ inline int intersection(const Point &p1, const Point &p0,
				+                                   const Point &q1, const Point &q0,
				+                                   Point &ans) {
				+  // fast exclusion: bounding boxes of the two segments must overlap
				+  if (check_rect_cross(p0, p1, q0, q1) == 0) return 0;
				+
				+  // check cross standing
				+  // s1 * s2 > 0: q0 and q1 are strictly on opposite sides of line p0-p1.
				+  // s3 * s4 > 0: p0 and p1 are strictly on opposite sides of line q0-q1.
				+  float s1 = cross(q0, p1, p0);
				+  float s2 = cross(p1, q1, p0);
				+  float s3 = cross(p0, q1, q0);
				+  float s4 = cross(q1, p1, q0);
				+
				+  // Touching or collinear configurations (a zero product) are rejected.
				+  if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
				+
				+  // calculate intersection of two lines
				+  float s5 = cross(q1, p1, p0);
				+  if (fabs(s5 - s1) > EPS) {
				+    // Interpolate between q0 and q1, weighted by s5 and s1.
				+    ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
				+    ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
				+
				+  } else {
				+    // Fallback: solve the two line equations a*x + b*y + c = 0 by Cramer's
				+    // rule. NOTE(review): D is not checked against zero here.
				+    float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
				+    float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
				+    float D = a0 * b1 - a1 * b0;
				+
				+    ans.x = (b0 * c1 - b1 * c0) / D;
				+    ans.y = (a1 * c0 - a0 * c1) / D;
				+  }
				+
				+  return 1;
				+}
			
 
				+
			
 
				+__device__ inline void rotate_around_center(const Point &center,
				+                                            const float angle_cos,
				+                                            const float angle_sin, Point &p) {
				+  // Rotates p in place about center, using the caller-supplied cosine and
				+  // sine of the rotation angle.
				+  const float dx = p.x - center.x;
				+  const float dy = p.y - center.y;
				+  p.set(dx * angle_cos + dy * (-angle_sin) + center.x,
				+        dx * angle_sin + dy * angle_cos + center.y);
				+}
			
 
				+
			
 
				+__device__ inline int point_cmp(const Point &a, const Point &b,
				+                                const Point &center) {
				+  // Returns non-zero when a lies at a larger polar angle around center than
				+  // b does; used as the ordering predicate when sorting polygon vertices.
				+  const auto angle_a = atan2(a.y - center.y, a.x - center.x);
				+  const auto angle_b = atan2(b.y - center.y, b.x - center.x);
				+  return angle_a > angle_b;
				+}
			
 
				+
			
 
				+__device__ inline float box_overlap(const float *box_a, const float *box_b) {
			
 
				+  // params box_a: [x, y, z, dx, dy, dz, heading]
			
 
				+  // params box_b: [x, y, z, dx, dy, dz, heading]
			
 
				+
			
 
				+  float a_angle = box_a[6], b_angle = box_b[6];
			
 
				+  float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2,
			
 
				+        a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
			
 
				+  float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
			
 
				+  float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
			
 
				+  float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
			
 
				+  float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
			
 
				+
			
 
				+  Point center_a(box_a[0], box_a[1]);
			
 
				+  Point center_b(box_b[0], box_b[1]);
			
 
				+
			
 
				+#ifdef DEBUG
			
 
				+  printf(
			
 
				+      "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n",
			
 
				+      a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle);
			
 
				+  printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y,
			
 
				+         center_b.x, center_b.y);
			
 
				+#endif
			
 
				+
			
 
				+  Point box_a_corners[5];
			
 
				+  box_a_corners[0].set(a_x1, a_y1);
			
 
				+  box_a_corners[1].set(a_x2, a_y1);
			
 
				+  box_a_corners[2].set(a_x2, a_y2);
			
 
				+  box_a_corners[3].set(a_x1, a_y2);
			
 
				+
			
 
				+  Point box_b_corners[5];
			
 
				+  box_b_corners[0].set(b_x1, b_y1);
			
 
				+  box_b_corners[1].set(b_x2, b_y1);
			
 
				+  box_b_corners[2].set(b_x2, b_y2);
			
 
				+  box_b_corners[3].set(b_x1, b_y2);
			
 
				+
			
 
				+  // get oriented corners
			
 
				+  float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
			
 
				+  float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
			
 
				+
			
 
				+  for (int k = 0; k < 4; k++) {
			
 
				+#ifdef DEBUG
			
 
				+    printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k,
			
 
				+           box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x,
			
 
				+           box_b_corners[k].y);
			
 
				+#endif
			
 
				+    rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
			
 
				+    rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
			
 
				+#ifdef DEBUG
			
 
				+    printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x,
			
 
				+           box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
			
 
				+#endif
			
 
				+  }
			
 
				+
			
 
				+  box_a_corners[4] = box_a_corners[0];
			
 
				+  box_b_corners[4] = box_b_corners[0];
			
 
				+
			
 
				+  // get intersection of lines
			
 
				+  Point cross_points[16];
			
 
				+  Point poly_center;
			
 
				+  int cnt = 0, flag = 0;
			
 
				+
			
 
				+  poly_center.set(0, 0);
			
 
				+  for (int i = 0; i < 4; i++) {
			
 
				+    for (int j = 0; j < 4; j++) {
			
 
				+      flag = intersection(box_a_corners[i + 1], box_a_corners[i],
			
 
				+                          box_b_corners[j + 1], box_b_corners[j],
			
 
				+                          cross_points[cnt]);
			
 
				+      if (flag) {
			
 
				+        poly_center = poly_center + cross_points[cnt];
			
 
				+        cnt++;
			
 
				+#ifdef DEBUG
			
 
				+        printf(
			
 
				+            "Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, "
			
 
				+            "%.3f)->(%.3f, %.3f) \n",
			
 
				+            cross_points[cnt - 1].x, cross_points[cnt - 1].y,
			
 
				+            box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x,
			
 
				+            box_a_corners[i + 1].y, box_b_corners[i].x, box_b_corners[i].y,
			
 
				+            box_b_corners[i + 1].x, box_b_corners[i + 1].y);
			
 
				+#endif
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // check corners
			
 
				+  for (int k = 0; k < 4; k++) {
			
 
				+    if (check_in_box2d(box_a, box_b_corners[k])) {
			
 
				+      poly_center = poly_center + box_b_corners[k];
			
 
				+      cross_points[cnt] = box_b_corners[k];
			
 
				+      cnt++;
			
 
				+#ifdef DEBUG
			
 
				+      printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x,
			
 
				+             cross_points[cnt - 1].y);
			
 
				+#endif
			
 
				+    }
			
 
				+    if (check_in_box2d(box_b, box_a_corners[k])) {
			
 
				+      poly_center = poly_center + box_a_corners[k];
			
 
				+      cross_points[cnt] = box_a_corners[k];
			
 
				+      cnt++;
			
 
				+#ifdef DEBUG
			
 
				+      printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x,
			
 
				+             cross_points[cnt - 1].y);
			
 
				+#endif
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  poly_center.x /= cnt;
			
 
				+  poly_center.y /= cnt;
			
 
				+
			
 
				+  // sort the points of polygon
			
 
				+  Point temp;
			
 
				+  for (int j = 0; j < cnt - 1; j++) {
			
 
				+    for (int i = 0; i < cnt - j - 1; i++) {
			
 
				+      if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) {
			
 
				+        temp = cross_points[i];
			
 
				+        cross_points[i] = cross_points[i + 1];
			
 
				+        cross_points[i + 1] = temp;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+#ifdef DEBUG
			
 
				+  printf("cnt=%d\n", cnt);
			
 
				+  for (int i = 0; i < cnt; i++) {
			
 
				+    printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x,
			
 
				+           cross_points[i].y);
			
 
				+  }
			
 
				+#endif
			
 
				+
			
 
				+  // get the overlap areas
			
 
				+  float area = 0;
			
 
				+  for (int k = 0; k < cnt - 1; k++) {
			
 
				+    area += cross(cross_points[k] - cross_points[0],
			
 
				+                  cross_points[k + 1] - cross_points[0]);
			
 
				+  }
			
 
				+
			
 
				+  return fabs(area) / 2.0;
			
 
				+}
			
 
				+
			
 
				+__device__ inline float iou_bev(const float *box_a, const float *box_b) {
			
 
				+  // params box_a: [x, y, z, dx, dy, dz, heading]
			
 
				+  // params box_b: [x, y, z, dx, dy, dz, heading]
			
 
				+  float sa = box_a[3] * box_a[4];
			
 
				+  float sb = box_b[3] * box_b[4];
			
 
				+  float s_overlap = box_overlap(box_a, box_b);
			
 
				+  return s_overlap / fmaxf(sa + sb - s_overlap, EPS);
			
 
				+}
			
 
				+
			
 
				+__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a,
			
 
				+                                     const int num_b, const float *boxes_b,
			
 
				+                                     float *ans_overlap) {
			
 
				+  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
			
 
				+  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
			
 
				+  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
			
 
				+  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
			
 
				+
			
 
				+  if (a_idx >= num_a || b_idx >= num_b) {
			
 
				+    return;
			
 
				+  }
			
 
				+  const float *cur_box_a = boxes_a + a_idx * 7;
			
 
				+  const float *cur_box_b = boxes_b + b_idx * 7;
			
 
				+  float s_overlap = box_overlap(cur_box_a, cur_box_b);
			
 
				+  ans_overlap[a_idx * num_b + b_idx] = s_overlap;
			
 
				+}
			
 
				+
			
 
				+__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a,
			
 
				+                                     const int num_b, const float *boxes_b,
			
 
				+                                     float *ans_iou) {
			
 
				+  // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
			
 
				+  // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
			
 
				+  const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
			
 
				+  const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
			
 
				+
			
 
				+  if (a_idx >= num_a || b_idx >= num_b) {
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  const float *cur_box_a = boxes_a + a_idx * 7;
			
 
				+  const float *cur_box_b = boxes_b + b_idx * 7;
			
 
				+  float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
			
 
				+  ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
			
 
				+}
			
 
				+
			
 
				+__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
			
 
				+                           const float *boxes, int64_t *mask) {
			
 
				+  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
			
 
				+  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
			
 
				+
			
 
				+  const int row_start = blockIdx.y;
			
 
				+  const int col_start = blockIdx.x;
			
 
				+
			
 
				+  // if (row_start > col_start) return;
			
 
				+
			
 
				+  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
			
 
				+                             THREADS_PER_BLOCK_NMS);
			
 
				+  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
			
 
				+                             THREADS_PER_BLOCK_NMS);
			
 
				+
			
 
				+  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
			
 
				+
			
 
				+  if (threadIdx.x < col_size) {
			
 
				+    block_boxes[threadIdx.x * 7 + 0] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
			
 
				+    block_boxes[threadIdx.x * 7 + 1] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
			
 
				+    block_boxes[threadIdx.x * 7 + 2] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
			
 
				+    block_boxes[threadIdx.x * 7 + 3] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
			
 
				+    block_boxes[threadIdx.x * 7 + 4] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
			
 
				+    block_boxes[threadIdx.x * 7 + 5] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
			
 
				+    block_boxes[threadIdx.x * 7 + 6] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
			
 
				+  }
			
 
				+  __syncthreads();
			
 
				+
			
 
				+  if (threadIdx.x < row_size) {
			
 
				+    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
			
 
				+    const float *cur_box = boxes + cur_box_idx * 7;
			
 
				+
			
 
				+    int i = 0;
			
 
				+    int64_t t = 0;
			
 
				+    int start = 0;
			
 
				+    if (row_start == col_start) {
			
 
				+      start = threadIdx.x + 1;
			
 
				+    }
			
 
				+    for (i = start; i < col_size; i++) {
			
 
				+      if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
			
 
				+        t |= 1ULL << i;
			
 
				+      }
			
 
				+    }
			
 
				+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
			
 
				+    mask[cur_box_idx * col_blocks + col_start] = t;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+__device__ inline float iou_normal(float const *const a, float const *const b) {
			
 
				+  // params: a: [x, y, z, dx, dy, dz, heading]
			
 
				+  // params: b: [x, y, z, dx, dy, dz, heading]
			
 
				+
			
 
				+  float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2),
			
 
				+        right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
			
 
				+  float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2),
			
 
				+        bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
			
 
				+  float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
			
 
				+  float interS = width * height;
			
 
				+  float Sa = a[3] * a[4];
			
 
				+  float Sb = b[3] * b[4];
			
 
				+  return interS / fmaxf(Sa + Sb - interS, EPS);
			
 
				+}
			
 
				+
			
 
				+__global__ void nms_normal_kernel(const int boxes_num,
			
 
				+                                  const float nms_overlap_thresh,
			
 
				+                                  const float *boxes, int64_t *mask) {
			
 
				+  // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
			
 
				+  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
			
 
				+
			
 
				+  const int row_start = blockIdx.y;
			
 
				+  const int col_start = blockIdx.x;
			
 
				+
			
 
				+  // if (row_start > col_start) return;
			
 
				+
			
 
				+  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
			
 
				+                             THREADS_PER_BLOCK_NMS);
			
 
				+  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
			
 
				+                             THREADS_PER_BLOCK_NMS);
			
 
				+
			
 
				+  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
			
 
				+
			
 
				+  if (threadIdx.x < col_size) {
			
 
				+    block_boxes[threadIdx.x * 7 + 0] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
			
 
				+    block_boxes[threadIdx.x * 7 + 1] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
			
 
				+    block_boxes[threadIdx.x * 7 + 2] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
			
 
				+    block_boxes[threadIdx.x * 7 + 3] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
			
 
				+    block_boxes[threadIdx.x * 7 + 4] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
			
 
				+    block_boxes[threadIdx.x * 7 + 5] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
			
 
				+    block_boxes[threadIdx.x * 7 + 6] =
			
 
				+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
			
 
				+  }
			
 
				+  __syncthreads();
			
 
				+
			
 
				+  if (threadIdx.x < row_size) {
			
 
				+    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
			
 
				+    const float *cur_box = boxes + cur_box_idx * 7;
			
 
				+
			
 
				+    int i = 0;
			
 
				+    int64_t t = 0;
			
 
				+    int start = 0;
			
 
				+    if (row_start == col_start) {
			
 
				+      start = threadIdx.x + 1;
			
 
				+    }
			
 
				+    for (i = start; i < col_size; i++) {
			
 
				+      if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) {
			
 
				+        t |= 1ULL << i;
			
 
				+      }
			
 
				+    }
			
 
				+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
			
 
				+    mask[cur_box_idx * col_blocks + col_start] = t;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void BoxesOverlapLauncher(const cudaStream_t &stream, const int num_a,
			
 
				+                          const float *boxes_a, const int num_b,
			
 
				+                          const float *boxes_b, float *ans_overlap) {
			
 
				+  dim3 blocks(
			
 
				+      DIVUP(num_b, THREADS_PER_BLOCK),
			
 
				+      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
			
 
				+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
			
 
				+
			
 
				+  boxes_overlap_kernel<<<blocks, threads, 0, stream>>>(num_a, boxes_a, num_b,
			
 
				+                                                       boxes_b, ans_overlap);
			
 
				+#ifdef DEBUG
			
 
				+  cudaDeviceSynchronize();  // for using printf in kernel function
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+void BoxesIouBevLauncher(const cudaStream_t &stream, const int num_a,
			
 
				+                         const float *boxes_a, const int num_b,
			
 
				+                         const float *boxes_b, float *ans_iou) {
			
 
				+  dim3 blocks(
			
 
				+      DIVUP(num_b, THREADS_PER_BLOCK),
			
 
				+      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
			
 
				+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
			
 
				+
			
 
				+  boxes_iou_bev_kernel<<<blocks, threads, 0, stream>>>(num_a, boxes_a, num_b,
			
 
				+                                                       boxes_b, ans_iou);
			
 
				+#ifdef DEBUG
			
 
				+  cudaDeviceSynchronize();  // for using printf in kernel function
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+void NmsLauncher(const cudaStream_t &stream, const float *boxes, int64_t *mask,
			
 
				+                 int boxes_num, float nms_overlap_thresh) {
			
 
				+  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
			
 
				+              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
			
 
				+  dim3 threads(THREADS_PER_BLOCK_NMS);
			
 
				+  nms_kernel<<<blocks, threads, 0, stream>>>(boxes_num, nms_overlap_thresh,
			
 
				+                                             boxes, mask);
			
 
				+}
			
 
				+
			
 
				+void NmsNormalLauncher(const cudaStream_t &stream, const float *boxes,
			
 
				+                       int64_t *mask, int boxes_num, float nms_overlap_thresh) {
			
 
				+  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
			
 
				+              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
			
 
				+  dim3 threads(THREADS_PER_BLOCK_NMS);
			
 
				+  nms_normal_kernel<<<blocks, threads, 0, stream>>>(
			
 
				+      boxes_num, nms_overlap_thresh, boxes, mask);
			
 
				+}
			
--- a/paddlex/ops/setup.py
+++ b/paddlex/ops/setup.py
@@ -0,0 +1,37 @@
 
				+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+import lazy_paddle as paddle
			
 
				+from lazy_paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup
			
 
				+
			
 
				+from paddlex.ops import custom_ops
			
 
				+
			
 
				+for op_name, op_dict in custom_ops.items():
			
 
				+    sources = op_dict.pop("sources", [])
			
 
				+    flags = None
			
 
				+
			
 
				+    if paddle.device.is_compiled_with_cuda():
			
 
				+        extension = CUDAExtension
			
 
				+        flags = {"cxx": ["-DPADDLE_WITH_CUDA"]}
			
 
				+        if "extra_cuda_cflags" in op_dict:
			
 
				+            flags["nvcc"] = op_dict.pop("extra_cuda_cflags")
			
 
				+    else:
			
 
				+        sources = filter(lambda x: x.endswith("cu"), sources)
			
 
				+        extension = CppExtension
			
 
				+
			
 
				+    if len(sources) == 0:
			
 
				+        continue
			
 
				+
			
 
				+    extension = extension(sources=sources, extra_compile_args=flags)
			
 
				+    setup(name=op_name, ext_modules=extension)
			
--- a/paddlex/ops/voxel/voxelize_op.cc
+++ b/paddlex/ops/voxel/voxelize_op.cc
@@ -0,0 +1,191 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "paddle/extension.h"
			
 
				+
			
 
				+template <typename T, typename T_int>
			
 
				+bool hard_voxelize_cpu_kernel(
			
 
				+    const T *points, const float point_cloud_range_x_min,
			
 
				+    const float point_cloud_range_y_min, const float point_cloud_range_z_min,
			
 
				+    const float voxel_size_x, const float voxel_size_y,
			
 
				+    const float voxel_size_z, const int grid_size_x, const int grid_size_y,
			
 
				+    const int grid_size_z, const int64_t num_points, const int num_point_dim,
			
 
				+    const int max_num_points_in_voxel, const int max_voxels, T *voxels,
			
 
				+    T_int *coords, T_int *num_points_per_voxel, T_int *grid_idx_to_voxel_idx,
			
 
				+    T_int *num_voxels) {
			
 
				+  std::fill(voxels,
			
 
				+            voxels + max_voxels * max_num_points_in_voxel * num_point_dim,
			
 
				+            static_cast<T>(0));
			
 
				+
			
 
				+  num_voxels[0] = 0;
			
 
				+  int voxel_idx, grid_idx, curr_num_point;
			
 
				+  int coord_x, coord_y, coord_z;
			
 
				+  for (int point_idx = 0; point_idx < num_points; ++point_idx) {
			
 
				+    coord_x = floor(
			
 
				+        (points[point_idx * num_point_dim + 0] - point_cloud_range_x_min) /
			
 
				+        voxel_size_x);
			
 
				+    coord_y = floor(
			
 
				+        (points[point_idx * num_point_dim + 1] - point_cloud_range_y_min) /
			
 
				+        voxel_size_y);
			
 
				+    coord_z = floor(
			
 
				+        (points[point_idx * num_point_dim + 2] - point_cloud_range_z_min) /
			
 
				+        voxel_size_z);
			
 
				+
			
 
				+    if (coord_x < 0 || coord_x > grid_size_x || coord_x == grid_size_x) {
			
 
				+      continue;
			
 
				+    }
			
 
				+    if (coord_y < 0 || coord_y > grid_size_y || coord_y == grid_size_y) {
			
 
				+      continue;
			
 
				+    }
			
 
				+    if (coord_z < 0 || coord_z > grid_size_z || coord_z == grid_size_z) {
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    grid_idx =
			
 
				+        coord_z * grid_size_y * grid_size_x + coord_y * grid_size_x + coord_x;
			
 
				+    voxel_idx = grid_idx_to_voxel_idx[grid_idx];
			
 
				+    if (voxel_idx == -1) {
			
 
				+      voxel_idx = num_voxels[0];
			
 
				+      if (num_voxels[0] == max_voxels || num_voxels[0] > max_voxels) {
			
 
				+        continue;
			
 
				+      }
			
 
				+      num_voxels[0]++;
			
 
				+      grid_idx_to_voxel_idx[grid_idx] = voxel_idx;
			
 
				+      coords[voxel_idx * 3 + 0] = coord_z;
			
 
				+      coords[voxel_idx * 3 + 1] = coord_y;
			
 
				+      coords[voxel_idx * 3 + 2] = coord_x;
			
 
				+    }
			
 
				+    curr_num_point = num_points_per_voxel[voxel_idx];
			
 
				+    if (curr_num_point < max_num_points_in_voxel) {
			
 
				+      for (int j = 0; j < num_point_dim; ++j) {
			
 
				+        voxels[voxel_idx * max_num_points_in_voxel * num_point_dim +
			
 
				+               curr_num_point * num_point_dim + j] =
			
 
				+            points[point_idx * num_point_dim + j];
			
 
				+      }
			
 
				+      num_points_per_voxel[voxel_idx] = curr_num_point + 1;
			
 
				+    }
			
 
				+  }
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+std::vector<paddle::Tensor> hard_voxelize_cpu(
			
 
				+    const paddle::Tensor &points, const std::vector<float> &voxel_size,
			
 
				+    const std::vector<float> &point_cloud_range,
			
 
				+    const int max_num_points_in_voxel, const int max_voxels) {
			
 
				+  auto num_points = points.shape()[0];
			
 
				+  auto num_point_dim = points.shape()[1];
			
 
				+
			
 
				+  const float voxel_size_x = voxel_size[0];
			
 
				+  const float voxel_size_y = voxel_size[1];
			
 
				+  const float voxel_size_z = voxel_size[2];
			
 
				+  const float point_cloud_range_x_min = point_cloud_range[0];
			
 
				+  const float point_cloud_range_y_min = point_cloud_range[1];
			
 
				+  const float point_cloud_range_z_min = point_cloud_range[2];
			
 
				+  int grid_size_x = static_cast<int>(
			
 
				+      round((point_cloud_range[3] - point_cloud_range[0]) / voxel_size_x));
			
 
				+  int grid_size_y = static_cast<int>(
			
 
				+      round((point_cloud_range[4] - point_cloud_range[1]) / voxel_size_y));
			
 
				+  int grid_size_z = static_cast<int>(
			
 
				+      round((point_cloud_range[5] - point_cloud_range[2]) / voxel_size_z));
			
 
				+
			
 
				+  auto voxels =
			
 
				+      paddle::empty({max_voxels, max_num_points_in_voxel, num_point_dim},
			
 
				+                    paddle::DataType::FLOAT32, paddle::CPUPlace());
			
 
				+
			
 
				+  auto coords = paddle::full({max_voxels, 3}, 0, paddle::DataType::INT32,
			
 
				+                             paddle::CPUPlace());
			
 
				+  auto *coords_data = coords.data<int>();
			
 
				+
			
 
				+  auto num_points_per_voxel = paddle::full(
			
 
				+      {max_voxels}, 0, paddle::DataType::INT32, paddle::CPUPlace());
			
 
				+  auto *num_points_per_voxel_data = num_points_per_voxel.data<int>();
			
 
				+  std::fill(num_points_per_voxel_data,
			
 
				+            num_points_per_voxel_data + num_points_per_voxel.size(),
			
 
				+            static_cast<int>(0));
			
 
				+
			
 
				+  auto num_voxels =
			
 
				+      paddle::full({1}, 0, paddle::DataType::INT32, paddle::CPUPlace());
			
 
				+  auto *num_voxels_data = num_voxels.data<int>();
			
 
				+
			
 
				+  auto grid_idx_to_voxel_idx =
			
 
				+      paddle::full({grid_size_z, grid_size_y, grid_size_x}, -1,
			
 
				+                   paddle::DataType::INT32, paddle::CPUPlace());
			
 
				+  auto *grid_idx_to_voxel_idx_data = grid_idx_to_voxel_idx.data<int>();
			
 
				+
			
 
				+  PD_DISPATCH_FLOATING_TYPES(
			
 
				+      points.type(), "hard_voxelize_cpu_kernel", ([&] {
			
 
				+        hard_voxelize_cpu_kernel<data_t, int>(
			
 
				+            points.data<data_t>(), point_cloud_range_x_min,
			
 
				+            point_cloud_range_y_min, point_cloud_range_z_min, voxel_size_x,
			
 
				+            voxel_size_y, voxel_size_z, grid_size_x, grid_size_y, grid_size_z,
			
 
				+            num_points, num_point_dim, max_num_points_in_voxel, max_voxels,
			
 
				+            voxels.data<data_t>(), coords_data, num_points_per_voxel_data,
			
 
				+            grid_idx_to_voxel_idx_data, num_voxels_data);
			
 
				+      }));
			
 
				+
			
 
				+  return {voxels, coords, num_points_per_voxel, num_voxels};
			
 
				+}
			
 
				+
			
 
				+#ifdef PADDLE_WITH_CUDA
			
 
				+std::vector<paddle::Tensor> hard_voxelize_cuda(
			
 
				+    const paddle::Tensor &points, const std::vector<float> &voxel_size,
			
 
				+    const std::vector<float> &point_cloud_range, int max_num_points_in_voxel,
			
 
				+    int max_voxels);
			
 
				+#endif
			
 
				+
			
 
				+std::vector<paddle::Tensor> hard_voxelize(
			
 
				+    const paddle::Tensor &points, const std::vector<float> &voxel_size,
			
 
				+    const std::vector<float> &point_cloud_range,
			
 
				+    const int max_num_points_in_voxel, const int max_voxels) {
			
 
				+  if (points.is_cpu()) {
			
 
				+    return hard_voxelize_cpu(points, voxel_size, point_cloud_range,
			
 
				+                             max_num_points_in_voxel, max_voxels);
			
 
				+#ifdef PADDLE_WITH_CUDA
			
 
				+  } else if (points.is_gpu() || points.is_gpu_pinned()) {
			
 
				+    return hard_voxelize_cuda(points, voxel_size, point_cloud_range,
			
 
				+                              max_num_points_in_voxel, max_voxels);
			
 
				+#endif
			
 
				+  } else {
			
 
				+    PD_THROW(
			
 
				+        "Unsupported device type for hard_voxelize "
			
 
				+        "operator.");
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+std::vector<std::vector<int64_t>> HardInferShape(
			
 
				+    std::vector<int64_t> points_shape, const std::vector<float> &voxel_size,
			
 
				+    const std::vector<float> &point_cloud_range,
			
 
				+    const int &max_num_points_in_voxel, const int &max_voxels) {
			
 
				+  return {{max_voxels, max_num_points_in_voxel, points_shape[1]},
			
 
				+          {max_voxels, 3},
			
 
				+          {max_voxels},
			
 
				+          {1}};
			
 
				+}
			
 
				+
			
 
				+std::vector<paddle::DataType> HardInferDtype(paddle::DataType points_dtype) {
			
 
				+  return {points_dtype, paddle::DataType::INT32, paddle::DataType::INT32,
			
 
				+          paddle::DataType::INT32};
			
 
				+}
			
 
				+
			
 
// Registers the hard_voxelize custom operator: one input tensor, four output
// tensors, four attributes, with the kernel, shape-inference and
// dtype-inference functions defined above in this file.
PD_BUILD_OP(hard_voxelize)
    .Inputs({"POINTS"})
    .Outputs({"VOXELS", "COORS", "NUM_POINTS_PER_VOXEL", "num_voxels"})
    .SetKernelFn(PD_KERNEL(hard_voxelize))
    .Attrs({"voxel_size: std::vector<float>",
            "point_cloud_range: std::vector<float>",
            "max_num_points_in_voxel: int", "max_voxels: int"})
    .SetInferShapeFn(PD_INFER_SHAPE(HardInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(HardInferDtype));
			
--- a/paddlex/ops/voxel/voxelize_op.cu
+++ b/paddlex/ops/voxel/voxelize_op.cu
@@ -0,0 +1,346 @@
 
				+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+//
			
 
				+// Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+// you may not use this file except in compliance with the License.
			
 
				+// You may obtain a copy of the License at
			
 
				+//
			
 
				+//     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+//
			
 
				+// Unless required by applicable law or agreed to in writing, software
			
 
				+// distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+// See the License for the specific language governing permissions and
			
 
				+// limitations under the License.
			
 
				+
			
 
				+#include "paddle/extension.h"
			
 
				+
			
 
				+// Raises through PD_CHECK unless the tensor expression `x` reports
				+// is_gpu() or is_gpu_pinned(). The argument is parenthesized so that any
				+// expression (not only a plain identifier) expands correctly; the error
				+// message still quotes the argument text exactly as written by the caller.
				+#define CHECK_INPUT_CUDA(x) \
				+  PD_CHECK((x).is_gpu() || (x).is_gpu_pinned(), #x " must be a GPU Tensor.")
			
 
				+
			
 
				+// Loop header for device code: `i` starts at this thread's global index
				+// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number
				+// of launched threads (blockDim.x * gridDim.x) while `i < n`.
				+#define CUDA_KERNEL_LOOP(i, n)                          \
				+  for (auto i = threadIdx.x + blockIdx.x * blockDim.x;  \
				+       i < (n); i += gridDim.x * blockDim.x)
			
 
				+
			
 
				+// One thread per point. For every point whose (x, y, z) falls inside the
				+// grid, resets the state of the grid cell it lands in:
				+//   num_points_in_grid[cell] = 0
				+//   points_valid[cell]       = num_points   (sentinel: no point recorded)
				+// Cells that no in-range point maps to are not written by this kernel.
				+template <typename T, typename T_int>
				+__global__ void init_num_point_grid(
				+    const T *points, const float point_cloud_range_x_min,
				+    const float point_cloud_range_y_min, const float point_cloud_range_z_min,
				+    const float voxel_size_x, const float voxel_size_y,
				+    const float voxel_size_z, const int grid_size_x, const int grid_size_y,
				+    const int grid_size_z, const int64_t num_points, const int num_point_dim,
				+    T_int *num_points_in_grid, int *points_valid) {
				+  const int64_t point_idx = blockIdx.x * blockDim.x + threadIdx.x;
				+  if (point_idx >= num_points) {
				+    return;
				+  }
				+
				+  // The first three values of the point's row are its x, y, z coordinates.
				+  const T *pt = points + point_idx * num_point_dim;
				+  const int cx = floor((pt[0] - point_cloud_range_x_min) / voxel_size_x);
				+  const int cy = floor((pt[1] - point_cloud_range_y_min) / voxel_size_y);
				+  const int cz = floor((pt[2] - point_cloud_range_z_min) / voxel_size_z);
				+
				+  // Points outside the grid on any axis are ignored.
				+  const bool inside = cx >= 0 && cx < grid_size_x &&  //
				+                      cy >= 0 && cy < grid_size_y &&  //
				+                      cz >= 0 && cz < grid_size_z;
				+  if (!inside) {
				+    return;
				+  }
				+
				+  // Flattened cell index, z-major then y then x.
				+  const int cell = cz * grid_size_y * grid_size_x + cy * grid_size_x + cx;
				+  num_points_in_grid[cell] = 0;
				+  points_valid[cell] = num_points;
				+}
			
 
				+
			
 
				+// One thread per point. Each in-range point atomically increments the
				+// counter of the grid cell it lands in. If the value returned by the
				+// atomic (the count before the increment) is below
				+// max_num_points_in_voxel, the point is kept:
				+//   points_to_num_idx[point]  = that returned count (its slot in the cell)
				+//   points_to_grid_idx[point] = flattened cell index
				+//   points_valid[cell]        = min(points_valid[cell], point index)
				+// Out-of-range points and points arriving after the cell is full leave
				+// their per-point entries untouched.
				+template <typename T, typename T_int>
				+__global__ void map_point_to_grid_kernel(
				+    const T *points, const float point_cloud_range_x_min,
				+    const float point_cloud_range_y_min, const float point_cloud_range_z_min,
				+    const float voxel_size_x, const float voxel_size_y,
				+    const float voxel_size_z, const int grid_size_x, const int grid_size_y,
				+    const int grid_size_z, const int64_t num_points, const int num_point_dim,
				+    const int max_num_points_in_voxel, T_int *points_to_grid_idx,
				+    T_int *points_to_num_idx, T_int *num_points_in_grid, int *points_valid) {
				+  const int64_t point_idx = blockIdx.x * blockDim.x + threadIdx.x;
				+  if (point_idx >= num_points) {
				+    return;
				+  }
				+
				+  // The first three values of the point's row are its x, y, z coordinates.
				+  const T *pt = points + point_idx * num_point_dim;
				+  const int cx = floor((pt[0] - point_cloud_range_x_min) / voxel_size_x);
				+  const int cy = floor((pt[1] - point_cloud_range_y_min) / voxel_size_y);
				+  const int cz = floor((pt[2] - point_cloud_range_z_min) / voxel_size_z);
				+
				+  // Points outside the grid on any axis are ignored.
				+  const bool inside = cx >= 0 && cx < grid_size_x &&  //
				+                      cy >= 0 && cy < grid_size_y &&  //
				+                      cz >= 0 && cz < grid_size_z;
				+  if (!inside) {
				+    return;
				+  }
				+
				+  // Flattened cell index, z-major then y then x.
				+  const int cell = cz * grid_size_y * grid_size_x + cy * grid_size_x + cx;
				+
				+  // The counter keeps growing past the cap; only the first
				+  // max_num_points_in_voxel arrivals get a slot.
				+  const T_int slot = atomicAdd(num_points_in_grid + cell, 1);
				+  if (slot >= max_num_points_in_voxel) {
				+    return;
				+  }
				+  points_to_num_idx[point_idx] = slot;
				+  points_to_grid_idx[point_idx] = cell;
				+  atomicMin(points_valid + cell, static_cast<int>(point_idx));
				+}
			
 
				+
			
 
				+// Sets points_flag[i] = 1 for every point i that is the one recorded in
				+// points_valid for its own grid cell. Points without a cell
				+// (points_to_grid_idx[i] < 0) are skipped; other flags are left unchanged.
				+template <typename T_int>
				+__global__ void update_points_flag(const int *points_valid,
				+                                   const T_int *points_to_grid_idx,
				+                                   const int num_points, int *points_flag) {
				+  const unsigned int stride = gridDim.x * blockDim.x;
				+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num_points;
				+       i += stride) {
				+    const T_int cell = points_to_grid_idx[i];
				+    if (cell < 0) {
				+      continue;
				+    }
				+    const int recorded = points_valid[cell];
				+    // `recorded == num_points` is the "nothing recorded" sentinel.
				+    if (recorded != num_points && recorded == i) {
				+      points_flag[i] = 1;
				+    }
				+  }
				+}
			
 
				+
			
 
				+// Turns the per-point flags into voxel indices.
				+//   * For each flagged point i, points_flag_prefix_sum[i] becomes the voxel
				+//     index of that point's grid cell, provided it is below max_voxels.
				+//   * The iteration handling the last point writes the voxel count,
				+//     prefix_sum[last] + flag[last], clamped to max_voxels, to num_voxels[0].
				+template <typename T_int>
				+__global__ void get_voxel_idx_kernel(const int *points_flag,
				+                                     const T_int *points_to_grid_idx,
				+                                     const int *points_flag_prefix_sum,
				+                                     const int num_points, const int max_voxels,
				+                                     T_int *num_voxels,
				+                                     T_int *grid_idx_to_voxel_idx) {
				+  const unsigned int stride = gridDim.x * blockDim.x;
				+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num_points;
				+       i += stride) {
				+    const int flag = points_flag[i];
				+    const int preceding = points_flag_prefix_sum[i];
				+
				+    if (flag == 1 && preceding < max_voxels) {
				+      const T_int cell = points_to_grid_idx[i];
				+      grid_idx_to_voxel_idx[cell] = preceding;
				+    }
				+
				+    if (i == num_points - 1) {
				+      const int total = preceding + flag;
				+      num_voxels[0] = total < max_voxels ? total : max_voxels;
				+    }
				+  }
				+}
			
 
				+
			
 
				+// One thread per element: writes zero to each of the first `num` elements
				+// of `voxels`.
				+template <typename T>
				+__global__ void init_voxels_kernel(const int64_t num, T *voxels) {
				+  const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
				+  if (idx < num) {
				+    voxels[idx] = static_cast<T>(0);
				+  }
				+}
			
 
				+
			
 
				+// One thread per point. A point that has both a grid cell and a slot, and
				+// whose cell was given a voxel index, has its num_point_dim values copied
				+// to voxels[voxel_idx][slot][:]; `voxels` is indexed as a flat
				+// [voxel][max_num_points_in_voxel][num_point_dim] array.
				+template <typename T, typename T_int>
				+__global__ void assign_voxels_kernel(
				+    const T *points, const T_int *points_to_grid_idx,
				+    const T_int *points_to_num_idx, const T_int *grid_idx_to_voxel_idx,
				+    const int64_t num_points, const int num_point_dim,
				+    const int max_num_points_in_voxel, T *voxels) {
				+  const int64_t point_idx = blockIdx.x * blockDim.x + threadIdx.x;
				+  if (point_idx >= num_points) {
				+    return;
				+  }
				+
				+  const T_int cell = points_to_grid_idx[point_idx];
				+  const T_int slot = points_to_num_idx[point_idx];
				+  if (cell < 0 || slot < 0) {
				+    return;  // point was out of range or its cell was already full
				+  }
				+
				+  const T_int voxel_idx = grid_idx_to_voxel_idx[cell];
				+  if (voxel_idx < 0) {
				+    return;  // cell did not receive a voxel index
				+  }
				+
				+  const auto dst_offset =
				+      voxel_idx * max_num_points_in_voxel * num_point_dim +
				+      slot * num_point_dim;
				+  const T *src = points + point_idx * num_point_dim;
				+  T *dst = voxels + dst_offset;
				+  for (int64_t d = 0; d < num_point_dim; ++d) {
				+    dst[d] = src[d];
				+  }
				+}
			
 
				+
			
 
				+// One thread per grid cell. For every cell that owns a voxel index, writes
				+// the cell's (z, y, x) coordinates to coords[voxel_idx * 3 + 0..2] and
				+// stores min(points counted in the cell, max_num_points_in_voxel) in
				+// num_points_per_voxel[voxel_idx]. grid_size_z is not read.
				+template <typename T, typename T_int>
				+__global__ void assign_coords_kernel(const T_int *grid_idx_to_voxel_idx,
				+                                     const T_int *num_points_in_grid,
				+                                     const int num_grids, const int grid_size_x,
				+                                     const int grid_size_y,
				+                                     const int grid_size_z,
				+                                     const int max_num_points_in_voxel,
				+                                     T *coords, T *num_points_per_voxel) {
				+  const int64_t grid_idx = blockIdx.x * blockDim.x + threadIdx.x;
				+  if (grid_idx >= num_grids) {
				+    return;
				+  }
				+
				+  const T_int voxel_idx = grid_idx_to_voxel_idx[grid_idx];
				+  if (voxel_idx < 0) {
				+    return;  // cell holds no voxel
				+  }
				+
				+  // Undo the z-major flattening of the cell index.
				+  const T_int z = grid_idx / grid_size_x / grid_size_y;
				+  const int64_t in_layer = grid_idx - z * grid_size_x * grid_size_y;
				+  const T_int y = in_layer / grid_size_x;
				+  const T_int x = in_layer - y * grid_size_x;
				+
				+  T *voxel_coord = coords + voxel_idx * 3;
				+  voxel_coord[0] = z;
				+  voxel_coord[1] = y;
				+  voxel_coord[2] = x;
				+
				+  // The per-cell counter may exceed the cap, so clamp it.
				+  num_points_per_voxel[voxel_idx] =
				+      min(num_points_in_grid[grid_idx], max_num_points_in_voxel);
				+}
			
 
				+
			
 
				+// GPU implementation of hard voxelization: groups points into a bounded
				+// number of fixed-capacity voxels on a regular grid.
				+//
				+// Args:
				+//   points: GPU (or GPU-pinned) floating-point tensor read as rows of
				+//     num_point_dim values; the first three values of a row are x, y, z.
				+//   voxel_size: elements 0..2 are the voxel extent along x, y, z.
				+//   point_cloud_range: elements 0..2 are the x/y/z minimum and elements
				+//     3..5 the x/y/z maximum of the grid.
				+//   max_num_points_in_voxel: capacity of each voxel; extra points that fall
				+//     into a full cell are dropped.
				+//   max_voxels: number of voxel rows allocated in the outputs.
				+//
				+// Returns {voxels, coords, num_points_per_voxel, num_voxels}:
				+//   voxels               [max_voxels, max_num_points_in_voxel, num_point_dim],
				+//                        same dtype as `points`, zero where no point is stored
				+//   coords               [max_voxels, 3] int32, (z, y, x) grid index per voxel
				+//   num_points_per_voxel [max_voxels] int32
				+//   num_voxels           [1] int32, number of voxels produced (<= max_voxels)
				+//
				+// Raises (via PD_CHECK) when `points` is not a GPU tensor, has fewer than
				+// two dimensions, or when the attribute vectors are too short.
				+std::vector<paddle::Tensor> hard_voxelize_cuda(
				+    const paddle::Tensor &points, const std::vector<float> &voxel_size,
				+    const std::vector<float> &point_cloud_range, int max_num_points_in_voxel,
				+    int max_voxels) {
				+  // check device
				+  CHECK_INPUT_CUDA(points);
				+
				+  // Validate everything that is indexed below so that malformed attributes
				+  // fail with a message instead of reading out of bounds.
				+  PD_CHECK(points.shape().size() >= 2,
				+           "points must have at least 2 dimensions "
				+           "([num_points, num_point_dim]).");
				+  PD_CHECK(voxel_size.size() >= 3,
				+           "voxel_size must contain at least 3 values (x, y, z).");
				+  PD_CHECK(point_cloud_range.size() >= 6,
				+           "point_cloud_range must contain at least 6 values "
				+           "(x_min, y_min, z_min, x_max, y_max, z_max).");
				+
				+  const int64_t num_points = points.shape()[0];
				+  const int64_t num_point_dim = points.shape()[1];
				+
				+  const float voxel_size_x = voxel_size[0];
				+  const float voxel_size_y = voxel_size[1];
				+  const float voxel_size_z = voxel_size[2];
				+  const float point_cloud_range_x_min = point_cloud_range[0];
				+  const float point_cloud_range_y_min = point_cloud_range[1];
				+  const float point_cloud_range_z_min = point_cloud_range[2];
				+  const int grid_size_x = static_cast<int>(
				+      round((point_cloud_range[3] - point_cloud_range[0]) / voxel_size_x));
				+  const int grid_size_y = static_cast<int>(
				+      round((point_cloud_range[4] - point_cloud_range[1]) / voxel_size_y));
				+  const int grid_size_z = static_cast<int>(
				+      round((point_cloud_range[5] - point_cloud_range[2]) / voxel_size_z));
				+  const int num_grids = grid_size_x * grid_size_y * grid_size_z;
				+
				+  // NOTE(review): every buffer is placed on a default-constructed
				+  // paddle::GPUPlace(); confirm this is the device that holds `points` in
				+  // multi-GPU setups.
				+
				+  // The kernels below write `voxels` through data<data_t>() where data_t is
				+  // dispatched on the dtype of `points`, so the buffer must share that dtype.
				+  auto voxels =
				+      paddle::empty({max_voxels, max_num_points_in_voxel, num_point_dim},
				+                    points.type(), paddle::GPUPlace());
				+
				+  auto coords = paddle::full({max_voxels, 3}, 0, paddle::DataType::INT32,
				+                             paddle::GPUPlace());
				+  auto *coords_data = coords.data<int>();
				+
				+  auto num_points_per_voxel = paddle::full(
				+      {max_voxels}, 0, paddle::DataType::INT32, paddle::GPUPlace());
				+  auto *num_points_per_voxel_data = num_points_per_voxel.data<int>();
				+
				+  // Per-point bookkeeping; -1 means "no cell / no slot assigned".
				+  auto points_to_grid_idx = paddle::full(
				+      {num_points}, -1, paddle::DataType::INT32, paddle::GPUPlace());
				+  auto *points_to_grid_idx_data = points_to_grid_idx.data<int>();
				+
				+  auto points_to_num_idx = paddle::full(
				+      {num_points}, -1, paddle::DataType::INT32, paddle::GPUPlace());
				+  auto *points_to_num_idx_data = points_to_num_idx.data<int>();
				+
				+  // Per-cell buffers. The two `empty` ones are only initialised for cells
				+  // that some in-range point maps to (see init_num_point_grid).
				+  auto num_points_in_grid =
				+      paddle::empty({grid_size_z, grid_size_y, grid_size_x},
				+                    paddle::DataType::INT32, paddle::GPUPlace());
				+  auto *num_points_in_grid_data = num_points_in_grid.data<int>();
				+
				+  auto grid_idx_to_voxel_idx =
				+      paddle::full({grid_size_z, grid_size_y, grid_size_x}, -1,
				+                   paddle::DataType::INT32, paddle::GPUPlace());
				+  auto *grid_idx_to_voxel_idx_data = grid_idx_to_voxel_idx.data<int>();
				+
				+  auto num_voxels =
				+      paddle::full({1}, 0, paddle::DataType::INT32, paddle::GPUPlace());
				+  auto *num_voxels_data = num_voxels.data<int>();
				+
				+  auto points_valid =
				+      paddle::empty({grid_size_z, grid_size_y, grid_size_x},
				+                    paddle::DataType::INT32, paddle::GPUPlace());
				+  int *points_valid_data = points_valid.data<int>();
				+  auto points_flag = paddle::full({num_points}, 0, paddle::DataType::INT32,
				+                                  paddle::GPUPlace());
				+  int *points_flag_data = points_flag.data<int>();
				+
				+  const int64_t threads = 512;
				+  const auto blocks_for = [threads](int64_t work_items) {
				+    return (work_items + threads - 1) / threads;
				+  };
				+
				+  // A kernel launch with zero blocks is an invalid configuration, so every
				+  // per-point stage is skipped for an empty point cloud. The outputs then
				+  // keep their initial values (num_voxels == 0).
				+  const bool has_points = num_points > 0;
				+  const int64_t point_blocks = blocks_for(num_points);
				+
				+  if (has_points) {
				+    // 1. Find the grid index for each point, compute the
				+    // number of points in each grid
				+    PD_DISPATCH_FLOATING_TYPES(
				+        points.type(), "init_num_point_grid", ([&] {
				+          init_num_point_grid<data_t, int>
				+              <<<point_blocks, threads, 0, points.stream()>>>(
				+                  points.data<data_t>(), point_cloud_range_x_min,
				+                  point_cloud_range_y_min, point_cloud_range_z_min,
				+                  voxel_size_x, voxel_size_y, voxel_size_z, grid_size_x,
				+                  grid_size_y, grid_size_z, num_points, num_point_dim,
				+                  num_points_in_grid_data, points_valid_data);
				+        }));
				+
				+    PD_DISPATCH_FLOATING_TYPES(
				+        points.type(), "map_point_to_grid_kernel", ([&] {
				+          map_point_to_grid_kernel<data_t, int>
				+              <<<point_blocks, threads, 0, points.stream()>>>(
				+                  points.data<data_t>(), point_cloud_range_x_min,
				+                  point_cloud_range_y_min, point_cloud_range_z_min,
				+                  voxel_size_x, voxel_size_y, voxel_size_z, grid_size_x,
				+                  grid_size_y, grid_size_z, num_points, num_point_dim,
				+                  max_num_points_in_voxel, points_to_grid_idx_data,
				+                  points_to_num_idx_data, num_points_in_grid_data,
				+                  points_valid_data);
				+        }));
				+
				+    // 2. Find the number of non-zero voxels
				+    update_points_flag<int><<<point_blocks, threads, 0, points.stream()>>>(
				+        points_valid_data, points_to_grid_idx_data, num_points,
				+        points_flag_data);
				+
				+    // Presumably (axis=0, flatten=false, exclusive=true, reverse=false):
				+    // get_voxel_idx_kernel adds the last flag to the last prefix value to
				+    // obtain the total, which matches an exclusive scan - TODO confirm.
				+    auto points_flag_prefix_sum =
				+        paddle::experimental::cumsum(points_flag, 0, false, true, false);
				+    int *points_flag_prefix_sum_data = points_flag_prefix_sum.data<int>();
				+
				+    get_voxel_idx_kernel<int><<<point_blocks, threads, 0, points.stream()>>>(
				+        points_flag_data, points_to_grid_idx_data,
				+        points_flag_prefix_sum_data, num_points, max_voxels, num_voxels_data,
				+        grid_idx_to_voxel_idx_data);
				+  }
				+
				+  // 3. Store points to voxels coords and num_points_per_voxel
				+  // Widen before multiplying so the element count cannot overflow `int`.
				+  const int64_t num_voxel_elems = static_cast<int64_t>(max_voxels) *
				+                                  max_num_points_in_voxel * num_point_dim;
				+  if (num_voxel_elems > 0) {
				+    const int64_t voxel_blocks = blocks_for(num_voxel_elems);
				+    PD_DISPATCH_FLOATING_TYPES(
				+        points.type(), "init_voxels_kernel", ([&] {
				+          init_voxels_kernel<data_t>
				+              <<<voxel_blocks, threads, 0, points.stream()>>>(
				+                  num_voxel_elems, voxels.data<data_t>());
				+        }));
				+  }
				+
				+  if (has_points) {
				+    PD_DISPATCH_FLOATING_TYPES(
				+        points.type(), "assign_voxels_kernel", ([&] {
				+          assign_voxels_kernel<data_t, int>
				+              <<<point_blocks, threads, 0, points.stream()>>>(
				+                  points.data<data_t>(), points_to_grid_idx_data,
				+                  points_to_num_idx_data, grid_idx_to_voxel_idx_data,
				+                  num_points, num_point_dim, max_num_points_in_voxel,
				+                  voxels.data<data_t>());
				+        }));
				+  }
				+
				+  // 4. Store coords, num_points_per_voxel
				+  // Without points no cell owns a voxel, so there is nothing to write.
				+  if (has_points && num_grids > 0) {
				+    const int64_t grid_blocks = blocks_for(num_grids);
				+    assign_coords_kernel<int><<<grid_blocks, threads, 0, points.stream()>>>(
				+        grid_idx_to_voxel_idx_data, num_points_in_grid_data, num_grids,
				+        grid_size_x, grid_size_y, grid_size_z, max_num_points_in_voxel,
				+        coords_data, num_points_per_voxel_data);
				+  }
				+
				+  return {voxels, coords, num_points_per_voxel, num_voxels};
				+}