10 months ago · c6d4c935a0
--- a/docs/practical_tutorials/small_object_detection_tutorial.en.md
+++ b/docs/practical_tutorials/small_object_detection_tutorial.en.md
@@ -300,4 +300,4 @@ For more parameters, please refer to [Small Object Detection Pipeline Usage Tuto
 
				 * Service deployment: Service deployment is a common deployment form in actual production environments. By encapsulating inference functions into services, clients can access these services through network requests to obtain inference results. PaddleX enables users to deploy the pipeline as a service at low cost. For the detailed service deployment process, please refer to [PaddleX Service Deployment Guide](../pipeline_deploy/service_deploy.md).
			
 
				 * Edge deployment: Edge deployment is a way of placing computing and data processing functions on user devices themselves, where devices can directly process data without relying on remote servers. PaddleX supports deploying models on edge devices such as Android. For detailed edge deployment processes, please refer to [PaddleX Edge Deployment Guide](../pipeline_deploy/edge_deploy.md).
			
 
				 
			
 
				-You can choose an appropriate method to deploy the model pipeline according to your needs and proceed with subsequent AI application integration.
			
 
				+You can choose an appropriate method to deploy the model pipeline according to your needs and proceed with subsequent AI application integration.
			
--- a/paddlex/configs/modules/object_detection/Co-DINO-R50.yaml
+++ b/paddlex/configs/modules/object_detection/Co-DINO-R50.yaml
@@ -0,0 +1,40 @@
 
				+Global:
			
 
				+  model: Co-DINO-R50
			
 
				+  mode: check_dataset # check_dataset/train/evaluate/predict
			
 
				+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
			
 
				+  device: gpu:0,1,2,3
			
 
				+  output: "output"
			
 
				+
			
 
				+CheckDataset:
			
 
				+  convert:
			
 
				+    enable: False
			
 
				+    src_dataset_type: null
			
 
				+  split:
			
 
				+    enable: False
			
 
				+    train_percent: null
			
 
				+    val_percent: null
			
 
				+
			
 
				+Train:
			
 
				+  num_classes: 4
			
 
				+  epochs_iters: 50
			
 
				+  batch_size: 1
			
 
				+  learning_rate: 0.0001
			
 
				+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-R50_pretrained.pdparams
			
 
				+  warmup_steps: 100
			
 
				+  resume_path: null
			
 
				+  log_interval: 10
			
 
				+  eval_interval: 1
			
 
				+
			
 
				+Evaluate:
			
 
				+  weight_path: "output/best_model/best_model.pdparams"
			
 
				+  log_interval: 10
			
 
				+
			
 
				+Predict:
			
 
				+  batch_size: 1
			
 
				+  model_dir: "output/best_model/inference"
			
 
				+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
			
 
				+  kernel_option:
			
 
				+    run_mode: paddle
			
 
				+
			
 
				+Export:
			
 
				+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-R50_pretrained.pdparams
			
--- a/paddlex/configs/modules/object_detection/Co-DINO-Swin-L.yaml
+++ b/paddlex/configs/modules/object_detection/Co-DINO-Swin-L.yaml
@@ -0,0 +1,40 @@
 
				+Global:
			
 
				+  model: Co-DINO-Swin-L
			
 
				+  mode: check_dataset # check_dataset/train/evaluate/predict
			
 
				+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
			
 
				+  device: gpu:0,1,2,3
			
 
				+  output: "output"
			
 
				+
			
 
				+CheckDataset:
			
 
				+  convert:
			
 
				+    enable: False
			
 
				+    src_dataset_type: null
			
 
				+  split:
			
 
				+    enable: False
			
 
				+    train_percent: null
			
 
				+    val_percent: null
			
 
				+
			
 
				+Train:
			
 
				+  num_classes: 4
			
 
				+  epochs_iters: 50
			
 
				+  batch_size: 1
			
 
				+  learning_rate: 0.0001
			
 
				+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-Swin-L_pretrained.pdparams
			
 
				+  warmup_steps: 100
			
 
				+  resume_path: null
			
 
				+  log_interval: 10
			
 
				+  eval_interval: 1
			
 
				+
			
 
				+Evaluate:
			
 
				+  weight_path: "output/best_model/best_model.pdparams"
			
 
				+  log_interval: 10
			
 
				+
			
 
				+Predict:
			
 
				+  batch_size: 1
			
 
				+  model_dir: "output/best_model/inference"
			
 
				+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
			
 
				+  kernel_option:
			
 
				+    run_mode: paddle
			
 
				+
			
 
				+Export:
			
 
				+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-Swin-L_pretrained.pdparams
			
--- a/paddlex/configs/modules/object_detection/Co-Deformable-DETR-R50.yaml
+++ b/paddlex/configs/modules/object_detection/Co-Deformable-DETR-R50.yaml
@@ -0,0 +1,40 @@
 
				+Global:
			
 
				+  model: Co-Deformable-DETR-R50
			
 
				+  mode: check_dataset # check_dataset/train/evaluate/predict
			
 
				+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
			
 
				+  device: gpu:0,1,2,3
			
 
				+  output: "output"
			
 
				+
			
 
				+CheckDataset:
			
 
				+  convert:
			
 
				+    enable: False
			
 
				+    src_dataset_type: null
			
 
				+  split:
			
 
				+    enable: False
			
 
				+    train_percent: null
			
 
				+    val_percent: null
			
 
				+
			
 
				+Train:
			
 
				+  num_classes: 4
			
 
				+  epochs_iters: 50
			
 
				+  batch_size: 1
			
 
				+  learning_rate: 0.0001
			
 
				+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-R50_pretrained.pdparams
			
 
				+  warmup_steps: 100
			
 
				+  resume_path: null
			
 
				+  log_interval: 10
			
 
				+  eval_interval: 1
			
 
				+
			
 
				+Evaluate:
			
 
				+  weight_path: "output/best_model/best_model.pdparams"
			
 
				+  log_interval: 10
			
 
				+
			
 
				+Predict:
			
 
				+  batch_size: 1
			
 
				+  model_dir: "output/best_model/inference"
			
 
				+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
			
 
				+  kernel_option:
			
 
				+    run_mode: paddle
			
 
				+
			
 
				+Export:
			
 
				+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-R50_pretrained.pdparams
			
--- a/paddlex/configs/modules/object_detection/Co-Deformable-DETR-Swin-T.yaml
+++ b/paddlex/configs/modules/object_detection/Co-Deformable-DETR-Swin-T.yaml
@@ -0,0 +1,40 @@
 
				+Global:
			
 
				+  model: Co-Deformable-DETR-Swin-T
			
 
				+  mode: check_dataset # check_dataset/train/evaluate/predict
			
 
				+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
			
 
				+  device: gpu:0,1,2,3
			
 
				+  output: "output"
			
 
				+
			
 
				+CheckDataset:
			
 
				+  convert:
			
 
				+    enable: False
			
 
				+    src_dataset_type: null
			
 
				+  split:
			
 
				+    enable: False
			
 
				+    train_percent: null
			
 
				+    val_percent: null
			
 
				+
			
 
				+Train:
			
 
				+  num_classes: 4
			
 
				+  epochs_iters: 50
			
 
				+  batch_size: 1
			
 
				+  learning_rate: 0.0001
			
 
				+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-Swin-T_pretrained.pdparams
			
 
				+  warmup_steps: 100
			
 
				+  resume_path: null
			
 
				+  log_interval: 10
			
 
				+  eval_interval: 1
			
 
				+
			
 
				+Evaluate:
			
 
				+  weight_path: "output/best_model/best_model.pdparams"
			
 
				+  log_interval: 10
			
 
				+
			
 
				+Predict:
			
 
				+  batch_size: 1
			
 
				+  model_dir: "output/best_model/inference"
			
 
				+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
			
 
				+  kernel_option:
			
 
				+    run_mode: paddle
			
 
				+
			
 
				+Export:
			
 
				+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-Swin-T_pretrained.pdparams
			
--- a/paddlex/inference/models_new/__init__.py
+++ b/paddlex/inference/models_new/__init__.py
@@ -39,6 +39,7 @@ from .image_multilabel_classification import MLClasPredictor
 
				 # from .table_recognition import TablePredictor
			
 
				 # from .general_recognition import ShiTuRecPredictor
			
 
				 from .anomaly_detection import UadPredictor
			
 
				+
			
 
				 # from .face_recognition import FaceRecPredictor
			
 
				 from .multilingual_speech_recognition import WhisperPredictor
			
 
				 from .video_classification import VideoClasPredictor
			
--- a/paddlex/inference/models_new/object_detection/predictor.py
+++ b/paddlex/inference/models_new/object_detection/predictor.py
@@ -12,7 +12,7 @@
 
				 # See the License for the specific language governing permissions and
			
 
				 # limitations under the License.
			
 
				 
			
 
				-from typing import Any, List, Sequence, Optional
			
 
				+from typing import Any, List, Sequence, Optional, Union, Tuple
			
 
				 
			
 
				 import numpy as np
			
 
				 
			
@@ -34,6 +34,7 @@ from .processors import (
 
				     WarpAffine,
			
 
				 )
			
 
				 from .result import DetResult
			
 
				+from .utils import STATIC_SHAPE_MODEL_LIST
			
 
				 
			
 
				 
			
 
				 class DetPredictor(BasicPredictor):
			
@@ -43,15 +44,36 @@ class DetPredictor(BasicPredictor):
 
				     _FUNC_MAP = {}
			
 
				     register = FuncRegister(_FUNC_MAP)
			
 
				 
			
 
				-    def __init__(self, *args, threshold: Optional[float] = None, **kwargs):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        *args,
			
 
				+        imgsz: Optional[Union[int, Tuple[int, int]]] = None,
			
 
				+        threshold: Optional[float] = None,
			
 
				+        **kwargs,
			
 
				+    ):
			
 
				         """Initializes DetPredictor.
			
 
				         Args:
			
 
				             *args: Arbitrary positional arguments passed to the superclass.
			
 
				+            imgsz (Optional[Union[int, Tuple[int, int]]], optional): The input image size (w, h). Defaults to None.
			
 
				             threshold (Optional[float], optional): The threshold for filtering out low-confidence predictions.
			
 
				                 Defaults to None.
			
 
				             **kwargs: Arbitrary keyword arguments passed to the superclass.
			
 
				         """
			
 
				         super().__init__(*args, **kwargs)
			
 
				+
			
 
				+        if imgsz is not None:
			
 
				+            assert (
			
 
				+                self.model_name not in STATIC_SHAPE_MODEL_LIST
			
 
				+            ), f"The model {self.model_name} is not supported set input shape"
			
 
				+            if isinstance(imgsz, int):
			
 
				+                imgsz = (imgsz, imgsz)
			
 
				+            elif isinstance(imgsz, (tuple, list)):
			
 
				+                assert len(imgsz) == 2, f"The length of `imgsz` should be 2."
			
 
				+            else:
			
 
				+                raise ValueError(
			
 
				+                    f"The type of `imgsz` must be int or Tuple[int, int], but got {type(imgsz)}."
			
 
				+                )
			
 
				+        self.imgsz = imgsz
			
 
				         self.threshold = threshold
			
 
				         self.pre_ops, self.infer, self.post_op = self._build()
			
 
				 
			
@@ -61,7 +83,12 @@ class DetPredictor(BasicPredictor):
 
				     def _get_result_class(self):
			
 
				         return DetResult
			
 
				 
			
 
				-    def _build(self):
			
 
				+    def _build(self) -> Tuple:
			
 
				+        """Build the preprocessors, inference engine, and postprocessors based on the configuration.
			
 
				+
			
 
				+        Returns:
			
 
				+            tuple: A tuple containing the preprocessors, inference engine, and postprocessors.
			
 
				+        """
			
 
				         # build preprocess ops
			
 
				         pre_ops = [ReadImage(format="RGB")]
			
 
				         for cfg in self.config["Preprocess"]:
			
@@ -73,6 +100,10 @@ class DetPredictor(BasicPredictor):
 
				             if op:
			
 
				                 pre_ops.append(op)
			
 
				         pre_ops.append(self.build_to_batch())
			
 
				+        if self.imgsz is not None:
			
 
				+            if isinstance(pre_ops[1], Resize):
			
 
				+                pre_ops.pop(1)
			
 
				+            pre_ops.insert(1, self.build_resize(self.imgsz, False, 2))
			
 
				 
			
 
				         # build infer
			
 
				         infer = StaticInfer(
			
@@ -231,6 +262,7 @@ class DetPredictor(BasicPredictor):
 
				     def build_to_batch(self):
			
 
				         model_names_required_imgsize = [
			
 
				             "DETR",
			
 
				+            "DINO",
			
 
				             "RCNN",
			
 
				             "YOLOv3",
			
 
				             "CenterNet",
			
--- a/paddlex/inference/models_new/object_detection/processors.py
+++ b/paddlex/inference/models_new/object_detection/processors.py
@@ -406,19 +406,10 @@ class WarpAffine:
 
				             ori_img = data["img"]
			
 
				             if "ori_img_size" not in data:
			
 
				                 data["ori_img_size"] = [ori_img.shape[1], ori_img.shape[0]]
			
 
				-            ori_img_size = data["ori_img_size"]
			
 
				 
			
 
				             img = self.apply(ori_img)
			
 
				             data["img"] = img
			
 
				 
			
 
				-            img_size = [img.shape[1], img.shape[0]]
			
 
				-            data["img_size"] = img_size  # [size_w, size_h]
			
 
				-
			
 
				-            data["scale_factors"] = [  # [w_scale, h_scale]
			
 
				-                img_size[0] / ori_img_size[0],
			
 
				-                img_size[1] / ori_img_size[1],
			
 
				-            ]
			
 
				-
			
 
				         return datas
			
 
				 
			
 
				 
			
--- a/paddlex/inference/models_new/object_detection/utils.py
+++ b/paddlex/inference/models_new/object_detection/utils.py
@@ -0,0 +1,65 @@
 
				+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
			
 
				+#
			
 
				+# Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+# you may not use this file except in compliance with the License.
			
 
				+# You may obtain a copy of the License at
			
 
				+#
			
 
				+#    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+#
			
 
				+# Unless required by applicable law or agreed to in writing, software
			
 
				+# distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+# See the License for the specific language governing permissions and
			
 
				+# limitations under the License.
			
 
				+
			
 
				+STATIC_SHAPE_MODEL_LIST = [
			
 
				+    "CenterNet-DLA-34",
			
 
				+    "CenterNet-ResNet50",
			
 
				+    "Co-Deformable-DETR-Swin-T",
			
 
				+    "Co-DINO-Swin-L",
			
 
				+    "FasterRCNN-Swin-Tiny-FPN",
			
 
				+    "Mask-RT-DETR-H",
			
 
				+    "Mask-RT-DETR-L",
			
 
				+    "Mask-RT-DETR-M",
			
 
				+    "Mask-RT-DETR-S",
			
 
				+    "Mask-RT-DETR-X",
			
 
				+    "PicoDet_layout_1x_table",
			
 
				+    "PicoDet_layout_1x",
			
 
				+    "PicoDet-L_layout_17cls",
			
 
				+    "PicoDet-L_layout_3cls",
			
 
				+    "PicoDet-L",
			
 
				+    "PicoDet-M",
			
 
				+    "PicoDet-S_layout_17cls",
			
 
				+    "PicoDet-S_layout_3cls",
			
 
				+    "PicoDet-S",
			
 
				+    "PicoDet-XS",
			
 
				+    "PP-ShiTuV2_det",
			
 
				+    "PP-YOLOE-L_human",
			
 
				+    "PP-YOLOE-L_vehicle",
			
 
				+    "PP-YOLOE_plus-L",
			
 
				+    "PP-YOLOE_plus-M",
			
 
				+    "PP-YOLOE_plus_SOD-largesize-L",
			
 
				+    "PP-YOLOE_plus_SOD-L",
			
 
				+    "PP-YOLOE_plus_SOD-S",
			
 
				+    "PP-YOLOE_plus-S",
			
 
				+    "PP-YOLOE_plus-X",
			
 
				+    "PP-YOLOE_seg-S",
			
 
				+    "PP-YOLOE-S_human",
			
 
				+    "PP-YOLOE-S_vehicle",
			
 
				+    "RT-DETR-H_layout_17cls",
			
 
				+    "RT-DETR-H_layout_3cls",
			
 
				+    "RT-DETR-H",
			
 
				+    "RT-DETR-L",
			
 
				+    "RT-DETR-R18",
			
 
				+    "RT-DETR-R50",
			
 
				+    "RT-DETR-X",
			
 
				+    "YOLOv3-DarkNet53",
			
 
				+    "YOLOv3-MobileNetV3",
			
 
				+    "YOLOv3-ResNet50_vd_DCN",
			
 
				+    "YOLOX-L",
			
 
				+    "YOLOX-M",
			
 
				+    "YOLOX-N",
			
 
				+    "YOLOX-S",
			
 
				+    "YOLOX-T",
			
 
				+    "YOLOX-X",
			
 
				+]
			
--- a/paddlex/inference/utils/new_ir_blacklist.py
+++ b/paddlex/inference/utils/new_ir_blacklist.py
@@ -18,5 +18,9 @@ NEWIR_BLOCKLIST = [
 
				     "TimesNet_ad",
			
 
				     "Nonstationary_ad",
			
 
				     "DLinear_ad",
			
 
				+    "Co-Deformable-DETR-R50",
			
 
				+    "Co-Deformable-DETR-Swin-T",
			
 
				+    "Co-DINO-R50",
			
 
				+    "Co-DINO-Swin-L",
			
 
				     "LaTeX_OCR_rec",
			
 
				 ]
			
--- a/paddlex/inference/utils/official_models.py
+++ b/paddlex/inference/utils/official_models.py
@@ -304,6 +304,10 @@ PP-LCNet_x1_0_vehicle_attribute_infer.tar",
 
				     "MobileFaceNet": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/MobileFaceNet_infer.tar",
			
 
				     "ResNet50_face": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/ResNet50_face_infer.tar",
			
 
				     "PP-YOLOE-R_L": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/PP-YOLOE-R_L_infer.tar",
			
 
				+    "Co-Deformable-DETR-R50": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-Deformable-DETR-R50_infer.tar",
			
 
				+    "Co-Deformable-DETR-Swin-T": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-Deformable-DETR-Swin-T_infer.tar",
			
 
				+    "Co-DINO-R50": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-DINO-R50_infer.tar",
			
 
				+    "Co-DINO-Swin-L": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-DINO-Swin-L_infer.tar",
			
 
				     "whisper_large": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/whisper_large.tar",
			
 
				     "whisper_base": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/whisper_base.tar",
			
 
				     "whisper_medium": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/whisper_medium.tar",
			
--- a/paddlex/modules/object_detection/model_list.py
+++ b/paddlex/modules/object_detection/model_list.py
@@ -72,6 +72,10 @@ MODELS = [
 
				     "BlazeFace-FPN-SSH",
			
 
				     "PP-YOLOE_plus-S_face",
			
 
				     "PP-YOLOE-R_L",
			
 
				+    "Co-Deformable-DETR-R50",
			
 
				+    "Co-Deformable-DETR-Swin-T",
			
 
				+    "Co-DINO-R50",
			
 
				+    "Co-DINO-Swin-L",
			
 
				     "RT-DETR-L_wired_table_cell_det",
			
 
				     "RT-DETR-L_wireless_table_cell_det",
			
 
				 ]
			
--- a/paddlex/repo_apis/PaddleDetection_api/configs/Co-DINO-R50.yaml
+++ b/paddlex/repo_apis/PaddleDetection_api/configs/Co-DINO-R50.yaml
@@ -0,0 +1,300 @@
 
				+# Runtime
			
 
				+find_unused_parameters: True
			
 
				+use_gpu: true
			
 
				+use_xpu: false
			
 
				+use_mlu: false
			
 
				+use_npu: false
			
 
				+log_iter: 20
			
 
				+save_dir: output
			
 
				+snapshot_epoch: 1
			
 
				+print_flops: false
			
 
				+print_params: false
			
 
				+use_ema: true
			
 
				+
			
 
				+
			
 
				+# Dataset
			
 
				+metric: COCO
			
 
				+num_classes: 80
			
 
				+
			
 
				+TrainDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: train2017
			
 
				+  anno_path: annotations/instances_train2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
			
 
				+
			
 
				+EvalDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: val2017
			
 
				+  anno_path: annotations/instances_val2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+
			
 
				+TestDataset:
			
 
				+  name: ImageFolder
			
 
				+  anno_path: annotations/instances_val2017.json # also supports txt (like VOC's label_list.txt)
			
 
				+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
			
 
				+
			
 
				+
			
 
				+# Reader
			
 
				+worker_num: 2
			
 
				+TrainReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - RandomFlip: {prob: 0.5}
			
 
				+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
			
 
				+                    transforms2: [
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
			
 
				+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
			
 
				+  }
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_transforms:
			
 
				+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
			
 
				+  batch_size: 2
			
 
				+  shuffle: true
			
 
				+  drop_last: true
			
 
				+  collate_batch: false
			
 
				+  use_shared_memory: false
			
 
				+
			
 
				+EvalReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: [800, 1333], keep_ratio: True}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+TestReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: [800, 1333], keep_ratio: True}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+
			
 
				+# Model
			
 
				+architecture: CO_DETR
			
 
				+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams
			
 
				+num_dec_layer: &num_dec_layer 6
			
 
				+
			
 
				+CO_DETR:
			
 
				+  backbone: ResNet
			
 
				+  backbone_lr_mult: 0.1
			
 
				+  neck: ChannelMapper
			
 
				+  query_head: CoDINOHead
			
 
				+  rpn_head: RPNHead
			
 
				+  roi_head: Co_RoiHead
			
 
				+  bbox_head:
			
 
				+    name: CoATSSHead
			
 
				+    in_channels: 256
			
 
				+    stacked_convs: 1
			
 
				+    feat_channels: 256
			
 
				+    bbox_weight: [10., 10., 5., 5.]
			
 
				+    anchor_generator: 
			
 
				+      name: CoAnchorGenerator
			
 
				+      octave_base_scale: 8
			
 
				+      scales_per_octave: 1
			
 
				+      aspect_ratios: [1.0]
			
 
				+      strides: [4., 8., 16., 32., 64., 128.]
			
 
				+    assigner: 
			
 
				+      name: ATSSAssigner
			
 
				+      topk: 9
			
 
				+      sm_use: True
			
 
				+    loss_cls: 
			
 
				+      name: Weighted_FocalLoss
			
 
				+      use_sigmoid: true
			
 
				+      gamma: 2.0
			
 
				+      alpha: 0.25
			
 
				+      loss_weight: 12.0
			
 
				+    loss_bbox: 
			
 
				+      name: GIoULoss
			
 
				+      loss_weight: 24.0
			
 
				+      reduction: sum
			
 
				+    loss_cent_weight: 12.0
			
 
				+
			
 
				+ResNet:
			
 
				+  # index 0 stands for res2
			
 
				+  depth: 50
			
 
				+  norm_type: bn
			
 
				+  freeze_at: 0
			
 
				+  return_idx: [0, 1, 2, 3]
			
 
				+  num_stages: 4
			
 
				+
			
 
				+ChannelMapper:
			
 
				+  in_channels: [256, 512, 1024, 2048]
			
 
				+  kernel_size: 1
			
 
				+  out_channels: 256
			
 
				+  norm_type: "gn"
			
 
				+  norm_groups: 32
			
 
				+  act: None
			
 
				+  num_outs: 5
			
 
				+  strides: [4., 8., 16., 32., 64.]
			
 
				+ 
			
 
				+CoDINOHead:
			
 
				+  num_query: 900
			
 
				+  num_dn_query: 100
			
 
				+  label_noise_ratio: 0.5
			
 
				+  box_noise_scale: 1.0
			
 
				+  in_channels: 2048
			
 
				+  sync_cls_avg_factor: True
			
 
				+  with_box_refine: True
			
 
				+  as_two_stage: True
			
 
				+  mixed_selection: True
			
 
				+  transformer:
			
 
				+    name: CoDINOTransformer
			
 
				+    two_stage_num_proposals: 900
			
 
				+    with_pos_coord: True
			
 
				+    with_coord_feat: False
			
 
				+    num_co_heads: 2
			
 
				+    num_feature_levels: 5
			
 
				+    as_two_stage: True
			
 
				+    mixed_selection: True
			
 
				+    embed_dims: &embed_dims 256
			
 
				+    encoder:
			
 
				+      name: DeformableTransformerEncoder
			
 
				+      num_layers: *num_dec_layer
			
 
				+      with_rp: 6
			
 
				+      encoder_layer:
			
 
				+        name: DeformableTransformerEncoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        n_head: 8
			
 
				+        dim_feedforward: 2048
			
 
				+        n_levels: 5
			
 
				+        n_points: 4
			
 
				+        dropout: 0.0
			
 
				+    decoder:
			
 
				+      name: DINOTransformerDecoder
			
 
				+      hidden_dim: *embed_dims
			
 
				+      num_layers: *num_dec_layer
			
 
				+      decoder_layer:
			
 
				+        name: DINOTransformerDecoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        n_head: 8
			
 
				+        dim_feedforward: 2048
			
 
				+        n_points: 4
			
 
				+        n_levels: 5
			
 
				+        dropout: 0.0
			
 
				+  positional_encoding:
			
 
				+    name: PositionEmbedding
			
 
				+    num_pos_feats: 128
			
 
				+    temperature: 20
			
 
				+    normalize: true
			
 
				+  loss_cls:
			
 
				+    name: QualityFocalLoss
			
 
				+    use_sigmoid: true
			
 
				+    beta: 2.0
			
 
				+    loss_weight: 1.0
			
 
				+  loss_bbox:
			
 
				+    name: L1Loss
			
 
				+    loss_weight: 5.0
			
 
				+  loss_iou:
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 2.0
			
 
				+    reduction: sum
			
 
				+  assigner:
			
 
				+    name: HungarianAssigner
			
 
				+    cls_cost:
			
 
				+      name: FocalLossCost
			
 
				+      weight: 2.0
			
 
				+    reg_cost:
			
 
				+      name: BBoxL1Cost
			
 
				+      weight: 5.0
			
 
				+      box_format: xywh
			
 
				+    iou_cost:
			
 
				+      name: IoUCost
			
 
				+      iou_mode: giou
			
 
				+      weight: 2.0
			
 
				+  test_cfg:
			
 
				+    max_per_img: 300
			
 
				+    score_thr: 0.0
			
 
				+  nms: 
			
 
				+    name: MultiClassNMS
			
 
				+    keep_top_k: -1
			
 
				+    score_threshold: 0.0
			
 
				+    nms_threshold: 0.8
			
 
				+
			
 
				+RPNHead:
			
 
				+  loss_rpn_bbox: 
			
 
				+    name: L1Loss
			
 
				+    reduction: sum
			
 
				+    loss_weight: 12.0
			
 
				+  in_channel: 256
			
 
				+  anchor_generator: 
			
 
				+    name: RetinaAnchorGenerator
			
 
				+    octave_base_scale: 4
			
 
				+    scales_per_octave: 3
			
 
				+    aspect_ratios: [0.5, 1.0, 2.0]
			
 
				+    strides: [4., 8., 16., 32., 64., 128.]
			
 
				+  rpn_target_assign:
			
 
				+    batch_size_per_im: 256
			
 
				+    fg_fraction: 0.5
			
 
				+    negative_overlap: 0.3
			
 
				+    positive_overlap: 0.7
			
 
				+    use_random: True
			
 
				+  train_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 4000
			
 
				+    post_nms_top_n: 1000
			
 
				+    topk_after_collect: True
			
 
				+  test_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 1000
			
 
				+    post_nms_top_n: 1000
			
 
				+
			
 
				+Co_RoiHead:
			
 
				+  in_channel: 256
			
 
				+  loss_normalize_pos: True
			
 
				+  head: TwoFCHead
			
 
				+  roi_extractor:
			
 
				+    end_level: 4
			
 
				+    resolution: 7
			
 
				+    sampling_ratio: 0
			
 
				+    aligned: True
			
 
				+  bbox_assigner: 
			
 
				+    name: BBoxAssigner
			
 
				+    batch_size_per_im: 512
			
 
				+    bg_thresh: 0.5
			
 
				+    fg_thresh: 0.5
			
 
				+    fg_fraction: 0.25
			
 
				+    use_random: True
			
 
				+  bbox_loss: 
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 120.0
			
 
				+  cls_loss_weight: 12.0
			
 
				+
			
 
				+
			
 
				+# Optimizer
			
 
				+epoch: 12
			
 
				+
			
 
				+LearningRate:
			
 
				+  base_lr: 0.0002
			
 
				+  schedulers:
			
 
				+  - !PiecewiseDecay
			
 
				+    gamma: 0.1
			
 
				+    milestones: [11]
			
 
				+    use_warmup: false
			
 
				+
			
 
				+OptimizerBuilder:
			
 
				+  clip_grad_by_norm: 0.1
			
 
				+  regularizer: false
			
 
				+  optimizer:
			
 
				+    type: AdamW
			
 
				+    weight_decay: 0.0001
			
 
				+
			
 
				+
			
 
				+# Exporting the model
			
 
				+export:
			
 
				+  post_process: True  # Whether post-processing is included in the network when export model.
			
 
				+  nms: True           # Whether NMS is included in the network when export model.
			
 
				+  benchmark: False    # Used for testing model performance; if set to `True`, post-processing and NMS will not be exported.
			
 
				+  fuse_conv_bn: False
			
--- a/paddlex/repo_apis/PaddleDetection_api/configs/Co-DINO-Swin-L.yaml
+++ b/paddlex/repo_apis/PaddleDetection_api/configs/Co-DINO-Swin-L.yaml
@@ -0,0 +1,301 @@
 
				+# Runtime
			
 
				+find_unused_parameters: True
			
 
				+use_gpu: true
			
 
				+use_xpu: false
			
 
				+use_mlu: false
			
 
				+use_npu: false
			
 
				+log_iter: 20
			
 
				+save_dir: output
			
 
				+snapshot_epoch: 1
			
 
				+print_flops: false
			
 
				+print_params: false
			
 
				+use_ema: true
			
 
				+
			
 
				+
			
 
				+# Dataset
			
 
				+metric: COCO
			
 
				+num_classes: 80
			
 
				+
			
 
				+TrainDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: train2017
			
 
				+  anno_path: annotations/instances_train2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
			
 
				+
			
 
				+EvalDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: val2017
			
 
				+  anno_path: annotations/instances_val2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+
			
 
				+TestDataset:
			
 
				+  name: ImageFolder
			
 
				+  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)
			
 
				+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
			
 
				+
			
 
				+
			
 
				+# Reader
			
 
				+worker_num: 1
			
 
				+TrainReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - RandomFlip: {prob: 0.5}
			
 
				+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
			
 
				+                    transforms2: [
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
			
 
				+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
			
 
				+  }
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_transforms:
			
 
				+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
			
 
				+  batch_size: 1
			
 
				+  shuffle: true
			
 
				+  drop_last: true
			
 
				+  collate_batch: false
			
 
				+  use_shared_memory: false
			
 
				+
			
 
				+EvalReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: [800, 1333], keep_ratio: True}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+TestReader:
			
 
				+  inputs_def:
			
 
				+    image_shape: [-1, 3, 640, 640]
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: 640, keep_ratio: false}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+
			
 
				+# Model
			
 
				+architecture: CO_DETR
			
 
				+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams
			
 
				+num_dec_layer: &num_dec_layer 6
			
 
				+
			
 
				+CO_DETR:
			
 
				+  backbone: SwinTransformer
			
 
				+  backbone_lr_mult: 0.1
			
 
				+  neck: ChannelMapper
			
 
				+  query_head: CoDINOHead
			
 
				+  rpn_head: RPNHead
			
 
				+  roi_head: Co_RoiHead
			
 
				+  bbox_head:
			
 
				+    name: CoATSSHead
			
 
				+    in_channels: 256
			
 
				+    stacked_convs: 1
			
 
				+    feat_channels: 256
			
 
				+    bbox_weight: [10., 10., 5., 5.]
			
 
				+    anchor_generator: 
			
 
				+      name: CoAnchorGenerator
			
 
				+      octave_base_scale: 8
			
 
				+      scales_per_octave: 1
			
 
				+      aspect_ratios: [1.0]
			
 
				+      strides: [4., 8., 16., 32., 64., 128.]
			
 
				+    assigner: 
			
 
				+      name: ATSSAssigner
			
 
				+      topk: 9
			
 
				+      sm_use: True
			
 
				+    loss_cls: 
			
 
				+      name: Weighted_FocalLoss
			
 
				+      use_sigmoid: true
			
 
				+      gamma: 2.0
			
 
				+      alpha: 0.25
			
 
				+      loss_weight: 12.0
			
 
				+    loss_bbox: 
			
 
				+      name: GIoULoss
			
 
				+      loss_weight: 24.0
			
 
				+      reduction: sum
			
 
				+    loss_cent_weight: 12.0
			
 
				+
			
 
				+SwinTransformer:
			
 
				+  arch: 'swin_L_384' # ['swin_T_224', 'swin_S_224', 'swin_B_224', 'swin_L_224', 'swin_B_384', 'swin_L_384']
			
 
				+  out_indices: [0, 1, 2, 3]
			
 
				+  ape: false
			
 
				+  drop_path_rate: 0.3
			
 
				+  patch_norm: true
			
 
				+
			
 
				+ChannelMapper:
			
 
				+  in_channels: [192, 384, 768, 1536]
			
 
				+  kernel_size: 1
			
 
				+  out_channels: 256
			
 
				+  norm_type: "gn"
			
 
				+  norm_groups: 32
			
 
				+  act: None
			
 
				+  num_outs: 5
			
 
				+  strides: [4., 8., 16., 32., 64.]
			
 
				+ 
			
 
				+CoDINOHead:
			
 
				+  num_query: 900
			
 
				+  num_dn_query: 100
			
 
				+  label_noise_ratio: 0.5
			
 
				+  box_noise_scale: 1.0
			
 
				+  in_channels: 2048
			
 
				+  sync_cls_avg_factor: True
			
 
				+  with_box_refine: True
			
 
				+  as_two_stage: True
			
 
				+  mixed_selection: True
			
 
				+  transformer:
			
 
				+    name: CoDINOTransformer
			
 
				+    two_stage_num_proposals: 900
			
 
				+    with_pos_coord: True
			
 
				+    with_coord_feat: False
			
 
				+    num_co_heads: 2
			
 
				+    num_feature_levels: 5
			
 
				+    as_two_stage: True
			
 
				+    mixed_selection: True
			
 
				+    embed_dims: &embed_dims 256
			
 
				+    encoder:
			
 
				+      name: DeformableTransformerEncoder
			
 
				+      num_layers: *num_dec_layer
			
 
				+      with_rp: 6
			
 
				+      encoder_layer:
			
 
				+        name: DeformableTransformerEncoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        n_head: 8
			
 
				+        dim_feedforward: 2048
			
 
				+        n_levels: 5
			
 
				+        n_points: 4
			
 
				+        dropout: 0.0
			
 
				+    decoder:
			
 
				+      name: DINOTransformerDecoder
			
 
				+      hidden_dim: *embed_dims
			
 
				+      num_layers: *num_dec_layer
			
 
				+      decoder_layer:
			
 
				+        name: DINOTransformerDecoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        n_head: 8
			
 
				+        dim_feedforward: 2048
			
 
				+        n_points: 4
			
 
				+        n_levels: 5
			
 
				+        dropout: 0.0
			
 
				+  positional_encoding:
			
 
				+    name: PositionEmbedding
			
 
				+    num_pos_feats: 128
			
 
				+    temperature: 20
			
 
				+    normalize: true
			
 
				+  loss_cls:
			
 
				+    name: QualityFocalLoss
			
 
				+    use_sigmoid: true
			
 
				+    beta: 2.0
			
 
				+    loss_weight: 1.0
			
 
				+  loss_bbox:
			
 
				+    name: L1Loss
			
 
				+    loss_weight: 5.0
			
 
				+  loss_iou:
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 2.0
			
 
				+    reduction: sum
			
 
				+  assigner:
			
 
				+    name: HungarianAssigner
			
 
				+    cls_cost:
			
 
				+      name: FocalLossCost
			
 
				+      weight: 2.0
			
 
				+    reg_cost:
			
 
				+      name: BBoxL1Cost
			
 
				+      weight: 5.0
			
 
				+      box_format: xywh
			
 
				+    iou_cost:
			
 
				+      name: IoUCost
			
 
				+      iou_mode: giou
			
 
				+      weight: 2.0
			
 
				+  test_cfg:
			
 
				+    max_per_img: 300
			
 
				+    score_thr: 0.0
			
 
				+  nms: 
			
 
				+    name: MultiClassNMS
			
 
				+    keep_top_k: -1
			
 
				+    score_threshold: 0.0
			
 
				+    nms_threshold: 0.8
			
 
				+
			
 
				+RPNHead:
			
 
				+  loss_rpn_bbox: 
			
 
				+    name: L1Loss
			
 
				+    reduction: sum
			
 
				+    loss_weight: 12.0
			
 
				+  in_channel: 256
			
 
				+  anchor_generator: 
			
 
				+    name: RetinaAnchorGenerator
			
 
				+    octave_base_scale: 4
			
 
				+    scales_per_octave: 3
			
 
				+    aspect_ratios: [0.5, 1.0, 2.0]
			
 
				+    strides: [4., 8., 16., 32., 64., 128.]
			
 
				+  rpn_target_assign:
			
 
				+    batch_size_per_im: 256
			
 
				+    fg_fraction: 0.5
			
 
				+    negative_overlap: 0.3
			
 
				+    positive_overlap: 0.7
			
 
				+    use_random: True
			
 
				+  train_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 4000
			
 
				+    post_nms_top_n: 1000
			
 
				+    topk_after_collect: True
			
 
				+  test_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 1000
			
 
				+    post_nms_top_n: 1000
			
 
				+
			
 
				+Co_RoiHead:
			
 
				+  in_channel: 256
			
 
				+  loss_normalize_pos: True
			
 
				+  head: TwoFCHead
			
 
				+  roi_extractor:
			
 
				+    end_level: 4
			
 
				+    resolution: 7
			
 
				+    sampling_ratio: 0
			
 
				+    aligned: True
			
 
				+  bbox_assigner: 
			
 
				+    name: BBoxAssigner
			
 
				+    batch_size_per_im: 512
			
 
				+    bg_thresh: 0.5
			
 
				+    fg_thresh: 0.5
			
 
				+    fg_fraction: 0.25
			
 
				+    use_random: True
			
 
				+  bbox_loss: 
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 120.0
			
 
				+  cls_loss_weight: 12.0
			
 
				+
			
 
				+
			
 
				+# Optimizer
			
 
				+epoch: 12
			
 
				+
			
 
				+LearningRate:
			
 
				+  base_lr: 0.0001
			
 
				+  schedulers:
			
 
				+  - !PiecewiseDecay
			
 
				+    gamma: 0.1
			
 
				+    milestones: [11]
			
 
				+    use_warmup: false
			
 
				+
			
 
				+OptimizerBuilder:
			
 
				+  clip_grad_by_norm: 0.1
			
 
				+  regularizer: false
			
 
				+  optimizer:
			
 
				+    type: AdamW
			
 
				+    weight_decay: 0.0001
			
 
				+
			
 
				+
			
 
				+# Exporting the model
			
 
				+export:
			
 
				+  post_process: True  # Whether post-processing is included in the network when exporting the model.
			
 
				+  nms: True           # Whether NMS is included in the network when exporting the model.
			
 
				+  benchmark: False    # Used for benchmarking model performance; if set to `True`, post-processing and NMS will not be exported.
			
 
				+  fuse_conv_bn: False
			
--- a/paddlex/repo_apis/PaddleDetection_api/configs/Co-Deformable-DETR-R50.yaml
+++ b/paddlex/repo_apis/PaddleDetection_api/configs/Co-Deformable-DETR-R50.yaml
@@ -0,0 +1,285 @@
 
				+# Runtime
			
 
				+find_unused_parameters: True
			
 
				+use_gpu: true
			
 
				+use_xpu: false
			
 
				+use_mlu: false
			
 
				+use_npu: false
			
 
				+log_iter: 20
			
 
				+save_dir: output
			
 
				+snapshot_epoch: 1
			
 
				+print_flops: false
			
 
				+print_params: false
			
 
				+use_ema: true
			
 
				+
			
 
				+
			
 
				+# Dataset
			
 
				+metric: COCO
			
 
				+num_classes: 80
			
 
				+
			
 
				+TrainDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: train2017
			
 
				+  anno_path: annotations/instances_train2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
			
 
				+
			
 
				+EvalDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: val2017
			
 
				+  anno_path: annotations/instances_val2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+
			
 
				+TestDataset:
			
 
				+  name: ImageFolder
			
 
				+  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)
			
 
				+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
			
 
				+
			
 
				+
			
 
				+# Reader
			
 
				+worker_num: 2
			
 
				+TrainReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - RandomFlip: {prob: 0.5}
			
 
				+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
			
 
				+                    transforms2: [
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
			
 
				+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
			
 
				+  }
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_transforms:
			
 
				+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
			
 
				+  batch_size: 2
			
 
				+  shuffle: true
			
 
				+  drop_last: true
			
 
				+  collate_batch: false
			
 
				+  use_shared_memory: false
			
 
				+
			
 
				+EvalReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: [800, 1333], keep_ratio: True}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+TestReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: [800, 1333], keep_ratio: True}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+
			
 
				+# Model
			
 
				+architecture: CO_DETR
			
 
				+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams
			
 
				+num_dec_layer: &num_dec_layer 6
			
 
				+
			
 
				+CO_DETR:
			
 
				+  backbone: ResNet
			
 
				+  backbone_lr_mult: 0.1
			
 
				+  neck: ChannelMapper
			
 
				+  query_head: CoDeformDETRHead
			
 
				+  rpn_head: RPNHead
			
 
				+  roi_head: Co_RoiHead
			
 
				+  bbox_head:
			
 
				+    name: CoATSSHead
			
 
				+    in_channels: 256
			
 
				+    stacked_convs: 1
			
 
				+    feat_channels: 256
			
 
				+    bbox_weight: [10., 10., 5., 5.]
			
 
				+    anchor_generator: 
			
 
				+      name: CoAnchorGenerator
			
 
				+      octave_base_scale: 8
			
 
				+      scales_per_octave: 1
			
 
				+      aspect_ratios: [1.0]
			
 
				+      strides: [8., 16., 32., 64., 128.]
			
 
				+    assigner: 
			
 
				+      name: ATSSAssigner
			
 
				+      topk: 9
			
 
				+      sm_use: True
			
 
				+    loss_cls: 
			
 
				+      name: Weighted_FocalLoss
			
 
				+      use_sigmoid: true
			
 
				+      gamma: 2.0
			
 
				+      alpha: 0.25
			
 
				+      loss_weight: 12.0
			
 
				+    loss_bbox: 
			
 
				+      name: GIoULoss
			
 
				+      loss_weight: 24.0
			
 
				+      reduction: sum
			
 
				+    loss_cent_weight: 12.0
			
 
				+
			
 
				+ResNet:
			
 
				+  # index 0 stands for res2
			
 
				+  depth: 50
			
 
				+  norm_type: bn
			
 
				+  freeze_at: 0
			
 
				+  return_idx: [1, 2, 3]
			
 
				+  num_stages: 4
			
 
				+
			
 
				+ChannelMapper:
			
 
				+  in_channels: [512, 1024, 2048]
			
 
				+  kernel_size: 1
			
 
				+  out_channels: 256
			
 
				+  norm_type: "gn"
			
 
				+  norm_groups: 32
			
 
				+  act: None
			
 
				+  num_outs: 4
			
 
				+  strides: [8., 16., 32., 64.]
			
 
				+ 
			
 
				+CoDeformDETRHead:
			
 
				+  num_query: 300
			
 
				+  in_channels: 2048
			
 
				+  sync_cls_avg_factor: True
			
 
				+  with_box_refine: True
			
 
				+  as_two_stage: True
			
 
				+  mixed_selection: True
			
 
				+  transformer:
			
 
				+    name: CoDeformableDetrTransformer
			
 
				+    num_co_heads: 2
			
 
				+    as_two_stage: True
			
 
				+    mixed_selection: True
			
 
				+    embed_dims: &embed_dims 256
			
 
				+    encoder:
			
 
				+      name: DeformableTransformerEncoder
			
 
				+      num_layers: *num_dec_layer
			
 
				+      encoder_layer:
			
 
				+        name: DeformableTransformerEncoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        n_head: 8
			
 
				+        dim_feedforward: 2048
			
 
				+        n_levels: 4
			
 
				+        n_points: 4
			
 
				+        dropout: 0.0
			
 
				+    decoder:
			
 
				+      name: CoDeformableDetrTransformerDecoder
			
 
				+      num_layers: *num_dec_layer
			
 
				+      return_intermediate: True
			
 
				+      look_forward_twice: True
			
 
				+      decoder_layer:
			
 
				+        name: DeformableTransformerDecoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        dim_feedforward: 2048
			
 
				+        dropout: 0.0
			
 
				+  positional_encoding:
			
 
				+    name: PositionEmbedding
			
 
				+    num_pos_feats: 128
			
 
				+    normalize: true
			
 
				+    offset: -0.5
			
 
				+  loss_cls:
			
 
				+    name: Weighted_FocalLoss
			
 
				+    use_sigmoid: true
			
 
				+    gamma: 2.0
			
 
				+    alpha: 0.25
			
 
				+    loss_weight: 2.0
			
 
				+  loss_bbox:
			
 
				+    name: L1Loss
			
 
				+    loss_weight: 5.0
			
 
				+  loss_iou:
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 2.0
			
 
				+    reduction: sum
			
 
				+  assigner:
			
 
				+    name: HungarianAssigner
			
 
				+    cls_cost:
			
 
				+      name: FocalLossCost
			
 
				+      weight: 2.0
			
 
				+    reg_cost:
			
 
				+      name: BBoxL1Cost
			
 
				+      weight: 5.0
			
 
				+      box_format: xywh
			
 
				+    iou_cost:
			
 
				+      name: IoUCost
			
 
				+      iou_mode: giou
			
 
				+      weight: 2.0
			
 
				+  test_cfg:
			
 
				+    max_per_img: 100
			
 
				+    score_thr: 0.0
			
 
				+
			
 
				+RPNHead:
			
 
				+  loss_rpn_bbox: 
			
 
				+    name: L1Loss
			
 
				+    reduction: sum
			
 
				+    loss_weight: 12.0
			
 
				+  in_channel: 256
			
 
				+  anchor_generator: 
			
 
				+    name: RetinaAnchorGenerator
			
 
				+    octave_base_scale: 4
			
 
				+    scales_per_octave: 3
			
 
				+    aspect_ratios: [0.5, 1.0, 2.0]
			
 
				+    strides: [8.0, 16.0, 32.0, 64.0, 128.0]
			
 
				+  rpn_target_assign:
			
 
				+    batch_size_per_im: 256
			
 
				+    fg_fraction: 0.5
			
 
				+    negative_overlap: 0.3
			
 
				+    positive_overlap: 0.7
			
 
				+    use_random: True
			
 
				+  train_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 4000
			
 
				+    post_nms_top_n: 1000
			
 
				+    topk_after_collect: True
			
 
				+  test_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 1000
			
 
				+    post_nms_top_n: 1000
			
 
				+
			
 
				+Co_RoiHead:
			
 
				+  in_channel: 256
			
 
				+  loss_normalize_pos: True
			
 
				+  head: TwoFCHead
			
 
				+  roi_extractor:
			
 
				+    resolution: 7
			
 
				+    sampling_ratio: 0
			
 
				+    aligned: True
			
 
				+  bbox_assigner: 
			
 
				+    name: BBoxAssigner
			
 
				+    batch_size_per_im: 512
			
 
				+    bg_thresh: 0.5
			
 
				+    fg_thresh: 0.5
			
 
				+    fg_fraction: 0.25
			
 
				+    use_random: True
			
 
				+  bbox_loss: 
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 120.0
			
 
				+  cls_loss_weight: 12.0
			
 
				+
			
 
				+
			
 
				+# Optimizer
			
 
				+epoch: 12
			
 
				+
			
 
				+LearningRate:
			
 
				+  base_lr: 0.0002
			
 
				+  schedulers:
			
 
				+  - !PiecewiseDecay
			
 
				+    gamma: 0.1
			
 
				+    milestones: [11]
			
 
				+    use_warmup: false
			
 
				+
			
 
				+OptimizerBuilder:
			
 
				+  clip_grad_by_norm: 0.1
			
 
				+  regularizer: false
			
 
				+  optimizer:
			
 
				+    type: AdamW
			
 
				+    weight_decay: 0.0001
			
 
				+
			
 
				+
			
 
				+# Exporting the model
			
 
				+export:
			
 
				+  post_process: True  # Whether post-processing is included in the network when exporting the model.
			
 
				+  nms: True           # Whether NMS is included in the network when exporting the model.
			
 
				+  benchmark: False    # Used for benchmarking model performance; if set to `True`, post-processing and NMS will not be exported.
			
 
				+  fuse_conv_bn: False
			
--- a/paddlex/repo_apis/PaddleDetection_api/configs/Co-Deformable-DETR-Swin-T.yaml
+++ b/paddlex/repo_apis/PaddleDetection_api/configs/Co-Deformable-DETR-Swin-T.yaml
@@ -0,0 +1,286 @@
 
				+# Runtime
			
 
				+find_unused_parameters: True
			
 
				+use_gpu: true
			
 
				+use_xpu: false
			
 
				+use_mlu: false
			
 
				+use_npu: false
			
 
				+log_iter: 20
			
 
				+save_dir: output
			
 
				+snapshot_epoch: 1
			
 
				+print_flops: false
			
 
				+print_params: false
			
 
				+use_ema: true
			
 
				+
			
 
				+
			
 
				+# Dataset
			
 
				+metric: COCO
			
 
				+num_classes: 80
			
 
				+
			
 
				+TrainDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: train2017
			
 
				+  anno_path: annotations/instances_train2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
			
 
				+
			
 
				+EvalDataset:
			
 
				+  name: COCODataSet
			
 
				+  image_dir: val2017
			
 
				+  anno_path: annotations/instances_val2017.json
			
 
				+  dataset_dir: dataset/coco
			
 
				+  allow_empty: true
			
 
				+
			
 
				+TestDataset:
			
 
				+  name: ImageFolder
			
 
				+  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)
			
 
				+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
			
 
				+
			
 
				+
			
 
				+# Reader
			
 
				+worker_num: 2
			
 
				+TrainReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - RandomFlip: {prob: 0.5}
			
 
				+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
			
 
				+                    transforms2: [
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
			
 
				+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
			
 
				+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
			
 
				+  }
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_transforms:
			
 
				+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
			
 
				+  batch_size: 2
			
 
				+  shuffle: true
			
 
				+  drop_last: true
			
 
				+  collate_batch: false
			
 
				+  use_shared_memory: false
			
 
				+
			
 
				+EvalReader:
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: [800, 1333], keep_ratio: True}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+TestReader:
			
 
				+  inputs_def:
			
 
				+    image_shape: [-1, 3, 640, 640]
			
 
				+  sample_transforms:
			
 
				+  - Decode: {}
			
 
				+  - Resize: {target_size: 640, keep_ratio: false}
			
 
				+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				+  - Permute: {}
			
 
				+  batch_size: 1
			
 
				+  shuffle: false
			
 
				+  drop_last: false
			
 
				+
			
 
				+
			
 
				+# Model
			
 
				+architecture: CO_DETR
			
 
				+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams
			
 
				+num_dec_layer: &num_dec_layer 6
			
 
				+
			
 
				+CO_DETR:
			
 
				+  backbone: SwinTransformer
			
 
				+  backbone_lr_mult: 0.1
			
 
				+  neck: ChannelMapper
			
 
				+  query_head: CoDeformDETRHead
			
 
				+  rpn_head: RPNHead
			
 
				+  roi_head: Co_RoiHead
			
 
				+  bbox_head:
			
 
				+    name: CoATSSHead
			
 
				+    in_channels: 256
			
 
				+    stacked_convs: 1
			
 
				+    feat_channels: 256
			
 
				+    bbox_weight: [10., 10., 5., 5.]
			
 
				+    anchor_generator: 
			
 
				+      name: CoAnchorGenerator
			
 
				+      octave_base_scale: 8
			
 
				+      scales_per_octave: 1
			
 
				+      aspect_ratios: [1.0]
			
 
				+      strides: [8., 16., 32., 64., 128.]
			
 
				+    assigner: 
			
 
				+      name: ATSSAssigner
			
 
				+      topk: 9
			
 
				+      sm_use: True
			
 
				+    loss_cls: 
			
 
				+      name: Weighted_FocalLoss
			
 
				+      use_sigmoid: true
			
 
				+      gamma: 2.0
			
 
				+      alpha: 0.25
			
 
				+      loss_weight: 12.0
			
 
				+    loss_bbox: 
			
 
				+      name: GIoULoss
			
 
				+      loss_weight: 24.0
			
 
				+      reduction: sum
			
 
				+    loss_cent_weight: 12.0
			
 
				+
			
 
				+SwinTransformer:
			
 
				+  arch: 'swin_T_224' # ['swin_T_224', 'swin_S_224', 'swin_B_224', 'swin_L_224', 'swin_B_384', 'swin_L_384']
			
 
				+  out_indices: [1, 2, 3]
			
 
				+  ape: false
			
 
				+  drop_path_rate: 0.2
			
 
				+  patch_norm: true
			
 
				+
			
 
				+ChannelMapper:
			
 
				+  in_channels: [192, 384, 768]
			
 
				+  kernel_size: 1
			
 
				+  out_channels: 256
			
 
				+  norm_type: "gn"
			
 
				+  norm_groups: 32
			
 
				+  act: None
			
 
				+  num_outs: 4
			
 
				+  strides: [8., 16., 32., 64.]
			
 
				+ 
			
 
				+CoDeformDETRHead:
			
 
				+  num_query: 300
			
 
				+  in_channels: 2048
			
 
				+  sync_cls_avg_factor: True
			
 
				+  with_box_refine: True
			
 
				+  as_two_stage: True
			
 
				+  mixed_selection: True
			
 
				+  transformer:
			
 
				+    name: CoDeformableDetrTransformer
			
 
				+    num_co_heads: 2
			
 
				+    as_two_stage: True
			
 
				+    mixed_selection: True
			
 
				+    embed_dims: &embed_dims 256
			
 
				+    encoder:
			
 
				+      name: DeformableTransformerEncoder
			
 
				+      num_layers: *num_dec_layer
			
 
				+      encoder_layer:
			
 
				+        name: DeformableTransformerEncoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        n_head: 8
			
 
				+        dim_feedforward: 2048
			
 
				+        n_levels: 4
			
 
				+        n_points: 4
			
 
				+        dropout: 0.0
			
 
				+    decoder:
			
 
				+      name: CoDeformableDetrTransformerDecoder
			
 
				+      num_layers: *num_dec_layer
			
 
				+      return_intermediate: True
			
 
				+      look_forward_twice: True
			
 
				+      decoder_layer:
			
 
				+        name: DeformableTransformerDecoderLayer
			
 
				+        d_model: *embed_dims
			
 
				+        dim_feedforward: 2048
			
 
				+        dropout: 0.0
			
 
				+  positional_encoding:
			
 
				+    name: PositionEmbedding
			
 
				+    num_pos_feats: 128
			
 
				+    normalize: true
			
 
				+    offset: -0.5
			
 
				+  loss_cls:
			
 
				+    name: Weighted_FocalLoss
			
 
				+    use_sigmoid: true
			
 
				+    gamma: 2.0
			
 
				+    alpha: 0.25
			
 
				+    loss_weight: 2.0
			
 
				+  loss_bbox:
			
 
				+    name: L1Loss
			
 
				+    loss_weight: 5.0
			
 
				+  loss_iou:
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 2.0
			
 
				+    reduction: sum
			
 
				+  assigner:
			
 
				+    name: HungarianAssigner
			
 
				+    cls_cost:
			
 
				+      name: FocalLossCost
			
 
				+      weight: 2.0
			
 
				+    reg_cost:
			
 
				+      name: BBoxL1Cost
			
 
				+      weight: 5.0
			
 
				+      box_format: xywh
			
 
				+    iou_cost:
			
 
				+      name: IoUCost
			
 
				+      iou_mode: giou
			
 
				+      weight: 2.0
			
 
				+  test_cfg:
			
 
				+    max_per_img: 100
			
 
				+    score_thr: 0.0
			
 
				+
			
 
				+RPNHead:
			
 
				+  loss_rpn_bbox: 
			
 
				+    name: L1Loss
			
 
				+    reduction: sum
			
 
				+    loss_weight: 12.0
			
 
				+  in_channel: 256
			
 
				+  anchor_generator: 
			
 
				+    name: RetinaAnchorGenerator
			
 
				+    octave_base_scale: 4
			
 
				+    scales_per_octave: 3
			
 
				+    aspect_ratios: [0.5, 1.0, 2.0]
			
 
				+    strides: [8.0, 16.0, 32.0, 64.0, 128.0]
			
 
				+  rpn_target_assign:
			
 
				+    batch_size_per_im: 256
			
 
				+    fg_fraction: 0.5
			
 
				+    negative_overlap: 0.3
			
 
				+    positive_overlap: 0.7
			
 
				+    use_random: True
			
 
				+  train_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 4000
			
 
				+    post_nms_top_n: 1000
			
 
				+    topk_after_collect: True
			
 
				+  test_proposal:
			
 
				+    min_size: 0.0
			
 
				+    nms_thresh: 0.7
			
 
				+    pre_nms_top_n: 1000
			
 
				+    post_nms_top_n: 1000
			
 
				+
			
 
				+Co_RoiHead:
			
 
				+  in_channel: 256
			
 
				+  loss_normalize_pos: True
			
 
				+  head: TwoFCHead
			
 
				+  roi_extractor:
			
 
				+    resolution: 7
			
 
				+    sampling_ratio: 0
			
 
				+    aligned: True
			
 
				+  bbox_assigner: 
			
 
				+    name: BBoxAssigner
			
 
				+    batch_size_per_im: 512
			
 
				+    bg_thresh: 0.5
			
 
				+    fg_thresh: 0.5
			
 
				+    fg_fraction: 0.25
			
 
				+    use_random: True
			
 
				+  bbox_loss: 
			
 
				+    name: GIoULoss
			
 
				+    loss_weight: 120.0
			
 
				+  cls_loss_weight: 12.0
			
 
				+
			
 
				+
			
 
				+# Optimizer
			
 
				+epoch: 12
			
 
				+
			
 
				+LearningRate:
			
 
				+  base_lr: 0.0002
			
 
				+  schedulers:
			
 
				+  - !PiecewiseDecay
			
 
				+    gamma: 0.1
			
 
				+    milestones: [11]
			
 
				+    use_warmup: false
			
 
				+
			
 
				+OptimizerBuilder:
			
 
				+  clip_grad_by_norm: 0.1
			
 
				+  regularizer: false
			
 
				+  optimizer:
			
 
				+    type: AdamW
			
 
				+    weight_decay: 0.0001
			
 
				+
			
 
				+
			
 
				+# Exporting the model
			
 
				+export:
			
 
				+  post_process: True  # Whether post-processing is included in the network when exporting the model.
			
 
				+  nms: True           # Whether NMS is included in the network when exporting the model.
			
 
				+  benchmark: False    # Used for benchmarking model performance; if set to `True`, post-processing and NMS will not be exported.
			
 
				+  fuse_conv_bn: False
			
--- a/paddlex/repo_apis/PaddleDetection_api/configs/PP-YOLOE_plus-S.yaml
+++ b/paddlex/repo_apis/PaddleDetection_api/configs/PP-YOLOE_plus-S.yaml
@@ -98,6 +98,7 @@ CSPResNet:
 
				   channels: [64, 128, 256, 512, 1024]
			
 
				   return_idx: [1, 2, 3]
			
 
				   use_large_stem: true
			
 
				+  use_alpha: True
			
 
				 
			
 
				 CustomCSPPAN:
			
 
				   out_channels: [768, 384, 192]
			
--- a/paddlex/repo_apis/PaddleDetection_api/configs/PP-YOLOE_plus-S_face.yaml
+++ b/paddlex/repo_apis/PaddleDetection_api/configs/PP-YOLOE_plus-S_face.yaml
@@ -17,10 +17,6 @@ metric: COCO
 
				 num_classes: 1
			
 
				 
			
 
				 worker_num: 4
			
 
				-eval_height: &eval_height 1088
			
 
				-eval_width: &eval_width 1088
			
 
				-eval_size: &eval_size [*eval_height, *eval_width]
			
 
				-
			
 
				 TrainDataset:
			
 
				   name: COCODataSet
			
 
				   image_dir: WIDER_train/images
			
@@ -62,17 +58,15 @@ TrainReader:
 
				 EvalReader:
			
 
				   sample_transforms:
			
 
				     - Decode: {}
			
 
				-    - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
			
 
				+    - Resize: {target_size: [1088, 1088], keep_ratio: False, interp: 2}
			
 
				     - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
			
 
				     - Permute: {}
			
 
				   batch_size: 2
			
 
				 
			
 
				 TestReader:
			
 
				-  inputs_def:
			
 
				-    image_shape: [3, *eval_height, *eval_width]
			
 
				   sample_transforms:
			
 
				     - Decode: {}
			
 
				-    - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
			
 
				+    - Resize: {target_size: [1088, 1088], keep_ratio: False, interp: 2}
			
 
				     - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
			
 
				     - Permute: {}
			
 
				   batch_size: 1
			
--- a/paddlex/repo_apis/PaddleDetection_api/configs/PicoDet_LCNet_x2_5_face.yaml
+++ b/paddlex/repo_apis/PaddleDetection_api/configs/PicoDet_LCNet_x2_5_face.yaml
@@ -93,10 +93,6 @@ OptimizerBuilder:
 
				     type: L2
			
 
				 
			
 
				 worker_num: 6
			
 
				-eval_height: &eval_height 1088
			
 
				-eval_width: &eval_width 1088
			
 
				-eval_size: &eval_size [*eval_height, *eval_width]
			
 
				-
			
 
				 TrainReader:
			
 
				   sample_transforms:
			
 
				   - Decode: {}
			
@@ -116,7 +112,7 @@ TrainReader:
 
				 EvalReader:
			
 
				   sample_transforms:
			
 
				   - Decode: {}
			
 
				-  - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False}
			
 
				+  - Resize: {interp: 2, target_size: [1088, 1088], keep_ratio: False}
			
 
				   - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				   - Permute: {}
			
 
				   batch_transforms:
			
@@ -126,11 +122,9 @@ EvalReader:
 
				 
			
 
				 
			
 
				 TestReader:
			
 
				-  inputs_def:
			
 
				-    image_shape: [3, *eval_height, *eval_width]
			
 
				   sample_transforms:
			
 
				   - Decode: {}
			
 
				-  - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False}
			
 
				+  - Resize: {interp: 2, target_size: [1088, 1088], keep_ratio: False}
			
 
				   - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
			
 
				   - Permute: {}
			
 
				   batch_size: 1
			
--- a/paddlex/repo_apis/PaddleDetection_api/object_det/register.py
+++ b/paddlex/repo_apis/PaddleDetection_api/object_det/register.py
@@ -972,3 +972,63 @@ register_model_info(
 
				         },
			
 
				     }
			
 
				 )
			
 
				+
			
 
				+register_model_info(
			
 
				+    {
			
 
				+        "model_name": "Co-Deformable-DETR-R50",
			
 
				+        "suite": "Det",
			
 
				+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-Deformable-DETR-R50.yaml"),
			
 
				+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
			
 
				+        "supported_dataset_types": ["COCODetDataset"],
			
 
				+        "supported_train_opts": {
			
 
				+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
			
 
				+            "dy2st": False,
			
 
				+            "amp": ["OFF"],
			
 
				+        },
			
 
				+    }
			
 
				+)
			
 
				+
			
 
				+register_model_info(
			
 
				+    {
			
 
				+        "model_name": "Co-Deformable-DETR-Swin-T",
			
 
				+        "suite": "Det",
			
 
				+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-Deformable-DETR-Swin-T.yaml"),
			
 
				+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
			
 
				+        "supported_dataset_types": ["COCODetDataset"],
			
 
				+        "supported_train_opts": {
			
 
				+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
			
 
				+            "dy2st": False,
			
 
				+            "amp": ["OFF"],
			
 
				+        },
			
 
				+    }
			
 
				+)
			
 
				+
			
 
				+register_model_info(
			
 
				+    {
			
 
				+        "model_name": "Co-DINO-R50",
			
 
				+        "suite": "Det",
			
 
				+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-DINO-R50.yaml"),
			
 
				+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
			
 
				+        "supported_dataset_types": ["COCODetDataset"],
			
 
				+        "supported_train_opts": {
			
 
				+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
			
 
				+            "dy2st": False,
			
 
				+            "amp": ["OFF"],
			
 
				+        },
			
 
				+    }
			
 
				+)
			
 
				+
			
 
				+register_model_info(
			
 
				+    {
			
 
				+        "model_name": "Co-DINO-Swin-L",
			
 
				+        "suite": "Det",
			
 
				+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-DINO-Swin-L.yaml"),
			
 
				+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
			
 
				+        "supported_dataset_types": ["COCODetDataset"],
			
 
				+        "supported_train_opts": {
			
 
				+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
			
 
				+            "dy2st": False,
			
 
				+            "amp": ["OFF"],
			
 
				+        },
			
 
				+    }
			
 
				+)