Explorar el Código

support co_dino and support imgsz for DetPredictor (#2777)

* support co_dino and support imgsz for DetPredictor

* fixed codetr swin model export bug

* use bs=1 for codetr and fixed ppyoloe+s bug

* support codetr with ema

* fixed face model eval bug
学卿 hace 10 meses
padre
commit
c6d4c935a0

+ 1 - 1
docs/practical_tutorials/small_object_detection_tutorial.en.md

@@ -300,4 +300,4 @@ For more parameters, please refer to [Small Object Detection Pipeline Usage Tuto
 * Service deployment: Service deployment is a common deployment form in actual production environments. By encapsulating inference functions into services, clients can access these services through network requests to obtain inference results. PaddleX supports users to achieve service deployment of the pipeline at a low cost. For detailed service deployment processes, please refer to [PaddleX Service Deployment Guide](../pipeline_deploy/service_deploy.md).
 * Edge deployment: Edge deployment is a way of placing computing and data processing functions on user devices themselves, where devices can directly process data without relying on remote servers. PaddleX supports deploying models on edge devices such as Android. For detailed edge deployment processes, please refer to [PaddleX Edge Deployment Guide](../pipeline_deploy/edge_deploy.md).
 
-You can choose an appropriate method to deploy the model pipeline according to your needs and proceed with subsequent AI application integration.
+You can choose an appropriate method to deploy the model pipeline according to your needs and proceed with subsequent AI application integration.

+ 40 - 0
paddlex/configs/modules/object_detection/Co-DINO-R50.yaml

@@ -0,0 +1,40 @@
+Global:
+  model: Co-DINO-R50
+  mode: check_dataset # check_dataset/train/evaluate/predict
+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
+  device: gpu:0,1,2,3
+  output: "output"
+
+CheckDataset:
+  convert:
+    enable: False
+    src_dataset_type: null
+  split:
+    enable: False
+    train_percent: null
+    val_percent: null
+
+Train:
+  num_classes: 4
+  epochs_iters: 50
+  batch_size: 1
+  learning_rate: 0.0001
+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-R50_pretrained.pdparams
+  warmup_steps: 100
+  resume_path: null
+  log_interval: 10
+  eval_interval: 1
+
+Evaluate:
+  weight_path: "output/best_model/best_model.pdparams"
+  log_interval: 10
+
+Predict:
+  batch_size: 1
+  model_dir: "output/best_model/inference"
+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
+  kernel_option:
+    run_mode: paddle
+
+Export:
+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-R50_pretrained.pdparams

+ 40 - 0
paddlex/configs/modules/object_detection/Co-DINO-Swin-L.yaml

@@ -0,0 +1,40 @@
+Global:
+  model: Co-DINO-Swin-L
+  mode: check_dataset # check_dataset/train/evaluate/predict
+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
+  device: gpu:0,1,2,3
+  output: "output"
+
+CheckDataset:
+  convert:
+    enable: False
+    src_dataset_type: null
+  split:
+    enable: False
+    train_percent: null
+    val_percent: null
+
+Train:
+  num_classes: 4
+  epochs_iters: 50
+  batch_size: 1
+  learning_rate: 0.0001
+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-Swin-L_pretrained.pdparams
+  warmup_steps: 100
+  resume_path: null
+  log_interval: 10
+  eval_interval: 1
+
+Evaluate:
+  weight_path: "output/best_model/best_model.pdparams"
+  log_interval: 10
+
+Predict:
+  batch_size: 1
+  model_dir: "output/best_model/inference"
+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
+  kernel_option:
+    run_mode: paddle
+
+Export:
+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-DINO-Swin-L_pretrained.pdparams

+ 40 - 0
paddlex/configs/modules/object_detection/Co-Deformable-DETR-R50.yaml

@@ -0,0 +1,40 @@
+Global:
+  model: Co-Deformable-DETR-R50
+  mode: check_dataset # check_dataset/train/evaluate/predict
+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
+  device: gpu:0,1,2,3
+  output: "output"
+
+CheckDataset:
+  convert:
+    enable: False
+    src_dataset_type: null
+  split:
+    enable: False
+    train_percent: null
+    val_percent: null
+
+Train:
+  num_classes: 4
+  epochs_iters: 50
+  batch_size: 1
+  learning_rate: 0.0001
+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-R50_pretrained.pdparams
+  warmup_steps: 100
+  resume_path: null
+  log_interval: 10
+  eval_interval: 1
+
+Evaluate:
+  weight_path: "output/best_model/best_model.pdparams"
+  log_interval: 10
+
+Predict:
+  batch_size: 1
+  model_dir: "output/best_model/inference"
+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
+  kernel_option:
+    run_mode: paddle
+
+Export:
+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-R50_pretrained.pdparams

+ 40 - 0
paddlex/configs/modules/object_detection/Co-Deformable-DETR-Swin-T.yaml

@@ -0,0 +1,40 @@
+Global:
+  model: Co-Deformable-DETR-Swin-T
+  mode: check_dataset # check_dataset/train/evaluate/predict
+  dataset_dir: "/paddle/dataset/paddlex/det/det_coco_examples"
+  device: gpu:0,1,2,3
+  output: "output"
+
+CheckDataset:
+  convert:
+    enable: False
+    src_dataset_type: null
+  split:
+    enable: False
+    train_percent: null
+    val_percent: null
+
+Train:
+  num_classes: 4
+  epochs_iters: 50
+  batch_size: 1
+  learning_rate: 0.0001
+  pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-Swin-T_pretrained.pdparams
+  warmup_steps: 100
+  resume_path: null
+  log_interval: 10
+  eval_interval: 1
+
+Evaluate:
+  weight_path: "output/best_model/best_model.pdparams"
+  log_interval: 10
+
+Predict:
+  batch_size: 1
+  model_dir: "output/best_model/inference"
+  input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_object_detection_002.png"
+  kernel_option:
+    run_mode: paddle
+
+Export:
+  weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/Co-Deformable-DETR-Swin-T_pretrained.pdparams

+ 1 - 0
paddlex/inference/models_new/__init__.py

@@ -39,6 +39,7 @@ from .image_multilabel_classification import MLClasPredictor
 # from .table_recognition import TablePredictor
 # from .general_recognition import ShiTuRecPredictor
 from .anomaly_detection import UadPredictor
+
 # from .face_recognition import FaceRecPredictor
 from .multilingual_speech_recognition import WhisperPredictor
 from .video_classification import VideoClasPredictor

+ 35 - 3
paddlex/inference/models_new/object_detection/predictor.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, List, Sequence, Optional
+from typing import Any, List, Sequence, Optional, Union, Tuple
 
 import numpy as np
 
@@ -34,6 +34,7 @@ from .processors import (
     WarpAffine,
 )
 from .result import DetResult
+from .utils import STATIC_SHAPE_MODEL_LIST
 
 
 class DetPredictor(BasicPredictor):
@@ -43,15 +44,36 @@ class DetPredictor(BasicPredictor):
     _FUNC_MAP = {}
     register = FuncRegister(_FUNC_MAP)
 
-    def __init__(self, *args, threshold: Optional[float] = None, **kwargs):
+    def __init__(
+        self,
+        *args,
+        imgsz: Optional[Union[int, Tuple[int, int]]] = None,
+        threshold: Optional[float] = None,
+        **kwargs,
+    ):
         """Initializes DetPredictor.
         Args:
             *args: Arbitrary positional arguments passed to the superclass.
+            imgsz (Optional[Union[int, Tuple[int, int]]], optional): The input image size (w, h). Defaults to None.
             threshold (Optional[float], optional): The threshold for filtering out low-confidence predictions.
                 Defaults to None.
             **kwargs: Arbitrary keyword arguments passed to the superclass.
         """
         super().__init__(*args, **kwargs)
+
+        if imgsz is not None:
+            assert (
+                self.model_name not in STATIC_SHAPE_MODEL_LIST
+            ), f"The model {self.model_name} is not supported set input shape"
+            if isinstance(imgsz, int):
+                imgsz = (imgsz, imgsz)
+            elif isinstance(imgsz, (tuple, list)):
+                assert len(imgsz) == 2, f"The length of `imgsz` should be 2."
+            else:
+                raise ValueError(
+                    f"The type of `imgsz` must be int or Tuple[int, int], but got {type(imgsz)}."
+                )
+        self.imgsz = imgsz
         self.threshold = threshold
         self.pre_ops, self.infer, self.post_op = self._build()
 
@@ -61,7 +83,12 @@ class DetPredictor(BasicPredictor):
     def _get_result_class(self):
         return DetResult
 
-    def _build(self):
+    def _build(self) -> Tuple:
+        """Build the preprocessors, inference engine, and postprocessors based on the configuration.
+
+        Returns:
+            tuple: A tuple containing the preprocessors, inference engine, and postprocessors.
+        """
         # build preprocess ops
         pre_ops = [ReadImage(format="RGB")]
         for cfg in self.config["Preprocess"]:
@@ -73,6 +100,10 @@ class DetPredictor(BasicPredictor):
             if op:
                 pre_ops.append(op)
         pre_ops.append(self.build_to_batch())
+        if self.imgsz is not None:
+            if isinstance(pre_ops[1], Resize):
+                pre_ops.pop(1)
+            pre_ops.insert(1, self.build_resize(self.imgsz, False, 2))
 
         # build infer
         infer = StaticInfer(
@@ -231,6 +262,7 @@ class DetPredictor(BasicPredictor):
     def build_to_batch(self):
         model_names_required_imgsize = [
             "DETR",
+            "DINO",
             "RCNN",
             "YOLOv3",
             "CenterNet",

+ 0 - 9
paddlex/inference/models_new/object_detection/processors.py

@@ -406,19 +406,10 @@ class WarpAffine:
             ori_img = data["img"]
             if "ori_img_size" not in data:
                 data["ori_img_size"] = [ori_img.shape[1], ori_img.shape[0]]
-            ori_img_size = data["ori_img_size"]
 
             img = self.apply(ori_img)
             data["img"] = img
 
-            img_size = [img.shape[1], img.shape[0]]
-            data["img_size"] = img_size  # [size_w, size_h]
-
-            data["scale_factors"] = [  # [w_scale, h_scale]
-                img_size[0] / ori_img_size[0],
-                img_size[1] / ori_img_size[1],
-            ]
-
         return datas
 
 

+ 65 - 0
paddlex/inference/models_new/object_detection/utils.py

@@ -0,0 +1,65 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+STATIC_SHAPE_MODEL_LIST = [
+    "CenterNet-DLA-34",
+    "CenterNet-ResNet50",
+    "Co-Deformable-DETR-Swin-T",
+    "Co-DINO-Swin-L",
+    "FasterRCNN-Swin-Tiny-FPN",
+    "Mask-RT-DETR-H",
+    "Mask-RT-DETR-L",
+    "Mask-RT-DETR-M",
+    "Mask-RT-DETR-S",
+    "Mask-RT-DETR-X",
+    "PicoDet_layout_1x_table",
+    "PicoDet_layout_1x",
+    "PicoDet-L_layout_17cls",
+    "PicoDet-L_layout_3cls",
+    "PicoDet-L",
+    "PicoDet-M",
+    "PicoDet-S_layout_17cls",
+    "PicoDet-S_layout_3cls",
+    "PicoDet-S",
+    "PicoDet-XS",
+    "PP-ShiTuV2_det",
+    "PP-YOLOE-L_human",
+    "PP-YOLOE-L_vehicle",
+    "PP-YOLOE_plus-L",
+    "PP-YOLOE_plus-M",
+    "PP-YOLOE_plus_SOD-largesize-L",
+    "PP-YOLOE_plus_SOD-L",
+    "PP-YOLOE_plus_SOD-S",
+    "PP-YOLOE_plus-S",
+    "PP-YOLOE_plus-X",
+    "PP-YOLOE_seg-S",
+    "PP-YOLOE-S_human",
+    "PP-YOLOE-S_vehicle",
+    "RT-DETR-H_layout_17cls",
+    "RT-DETR-H_layout_3cls",
+    "RT-DETR-H",
+    "RT-DETR-L",
+    "RT-DETR-R18",
+    "RT-DETR-R50",
+    "RT-DETR-X",
+    "YOLOv3-DarkNet53",
+    "YOLOv3-MobileNetV3",
+    "YOLOv3-ResNet50_vd_DCN",
+    "YOLOX-L",
+    "YOLOX-M",
+    "YOLOX-N",
+    "YOLOX-S",
+    "YOLOX-T",
+    "YOLOX-X",
+]

+ 4 - 0
paddlex/inference/utils/new_ir_blacklist.py

@@ -18,5 +18,9 @@ NEWIR_BLOCKLIST = [
     "TimesNet_ad",
     "Nonstationary_ad",
     "DLinear_ad",
+    "Co-Deformable-DETR-R50",
+    "Co-Deformable-DETR-Swin-T",
+    "Co-DINO-R50",
+    "Co-DINO-Swin-L",
     "LaTeX_OCR_rec",
 ]

+ 4 - 0
paddlex/inference/utils/official_models.py

@@ -304,6 +304,10 @@ PP-LCNet_x1_0_vehicle_attribute_infer.tar",
     "MobileFaceNet": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/MobileFaceNet_infer.tar",
     "ResNet50_face": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/ResNet50_face_infer.tar",
     "PP-YOLOE-R_L": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/PP-YOLOE-R_L_infer.tar",
+    "Co-Deformable-DETR-R50": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-Deformable-DETR-R50_infer.tar",
+    "Co-Deformable-DETR-Swin-T": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-Deformable-DETR-Swin-T_infer.tar",
+    "Co-DINO-R50": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-DINO-R50_infer.tar",
+    "Co-DINO-Swin-L": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/Co-DINO-Swin-L_infer.tar",
     "whisper_large": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/whisper_large.tar",
     "whisper_base": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/whisper_base.tar",
     "whisper_medium": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/whisper_medium.tar",

+ 4 - 0
paddlex/modules/object_detection/model_list.py

@@ -72,6 +72,10 @@ MODELS = [
     "BlazeFace-FPN-SSH",
     "PP-YOLOE_plus-S_face",
     "PP-YOLOE-R_L",
+    "Co-Deformable-DETR-R50",
+    "Co-Deformable-DETR-Swin-T",
+    "Co-DINO-R50",
+    "Co-DINO-Swin-L",
     "RT-DETR-L_wired_table_cell_det",
     "RT-DETR-L_wireless_table_cell_det",
 ]

+ 300 - 0
paddlex/repo_apis/PaddleDetection_api/configs/Co-DINO-R50.yaml

@@ -0,0 +1,300 @@
+# Runtime
+find_unused_parameters: True
+use_gpu: true
+use_xpu: false
+use_mlu: false
+use_npu: false
+log_iter: 20
+save_dir: output
+snapshot_epoch: 1
+print_flops: false
+print_params: false
+use_ema: true
+
+
+# Dataset
+metric: COCO
+num_classes: 80
+
+TrainDataset:
+  name: COCODataSet
+  image_dir: train2017
+  anno_path: annotations/instances_train2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  name: COCODataSet
+  image_dir: val2017
+  anno_path: annotations/instances_val2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+
+TestDataset:
+  name: ImageFolder
+  anno_path: annotations/instances_val2017.json # also supports txt (like VOC's label_list.txt)
+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
+
+
+# Reader
+worker_num: 2
+TrainReader:
+  sample_transforms:
+  - Decode: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
+                    transforms2: [
+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
+  }
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
+  batch_size: 2
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+  use_shared_memory: false
+
+EvalReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+TestReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+
+# Model
+architecture: CO_DETR
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams
+num_dec_layer: &num_dec_layer 6
+
+CO_DETR:
+  backbone: ResNet
+  backbone_lr_mult: 0.1
+  neck: ChannelMapper
+  query_head: CoDINOHead
+  rpn_head: RPNHead
+  roi_head: Co_RoiHead
+  bbox_head:
+    name: CoATSSHead
+    in_channels: 256
+    stacked_convs: 1
+    feat_channels: 256
+    bbox_weight: [10., 10., 5., 5.]
+    anchor_generator: 
+      name: CoAnchorGenerator
+      octave_base_scale: 8
+      scales_per_octave: 1
+      aspect_ratios: [1.0]
+      strides: [4., 8., 16., 32., 64., 128.]
+    assigner: 
+      name: ATSSAssigner
+      topk: 9
+      sm_use: True
+    loss_cls: 
+      name: Weighted_FocalLoss
+      use_sigmoid: true
+      gamma: 2.0
+      alpha: 0.25
+      loss_weight: 12.0
+    loss_bbox: 
+      name: GIoULoss
+      loss_weight: 24.0
+      reduction: sum
+    loss_cent_weight: 12.0
+
+ResNet:
+  # index 0 stands for res2
+  depth: 50
+  norm_type: bn
+  freeze_at: 0
+  return_idx: [0, 1, 2, 3]
+  num_stages: 4
+
+ChannelMapper:
+  in_channels: [256, 512, 1024, 2048]
+  kernel_size: 1
+  out_channels: 256
+  norm_type: "gn"
+  norm_groups: 32
+  act: None
+  num_outs: 5
+  strides: [4., 8., 16., 32., 64.]
+ 
+CoDINOHead:
+  num_query: 900
+  num_dn_query: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+  in_channels: 2048
+  sync_cls_avg_factor: True
+  with_box_refine: True
+  as_two_stage: True
+  mixed_selection: True
+  transformer:
+    name: CoDINOTransformer
+    two_stage_num_proposals: 900
+    with_pos_coord: True
+    with_coord_feat: False
+    num_co_heads: 2
+    num_feature_levels: 5
+    as_two_stage: True
+    mixed_selection: True
+    embed_dims: &embed_dims 256
+    encoder:
+      name: DeformableTransformerEncoder
+      num_layers: *num_dec_layer
+      with_rp: 6
+      encoder_layer:
+        name: DeformableTransformerEncoderLayer
+        d_model: *embed_dims
+        n_head: 8
+        dim_feedforward: 2048
+        n_levels: 5
+        n_points: 4
+        dropout: 0.0
+    decoder:
+      name: DINOTransformerDecoder
+      hidden_dim: *embed_dims
+      num_layers: *num_dec_layer
+      decoder_layer:
+        name: DINOTransformerDecoderLayer
+        d_model: *embed_dims
+        n_head: 8
+        dim_feedforward: 2048
+        n_points: 4
+        n_levels: 5
+        dropout: 0.0
+  positional_encoding:
+    name: PositionEmbedding
+    num_pos_feats: 128
+    temperature: 20
+    normalize: true
+  loss_cls:
+    name: QualityFocalLoss
+    use_sigmoid: true
+    beta: 2.0
+    loss_weight: 1.0
+  loss_bbox:
+    name: L1Loss
+    loss_weight: 5.0
+  loss_iou:
+    name: GIoULoss
+    loss_weight: 2.0
+    reduction: sum
+  assigner:
+    name: HungarianAssigner
+    cls_cost:
+      name: FocalLossCost
+      weight: 2.0
+    reg_cost:
+      name: BBoxL1Cost
+      weight: 5.0
+      box_format: xywh
+    iou_cost:
+      name: IoUCost
+      iou_mode: giou
+      weight: 2.0
+  test_cfg:
+    max_per_img: 300
+    score_thr: 0.0
+  nms: 
+    name: MultiClassNMS
+    keep_top_k: -1
+    score_threshold: 0.0
+    nms_threshold: 0.8
+
+RPNHead:
+  loss_rpn_bbox: 
+    name: L1Loss
+    reduction: sum
+    loss_weight: 12.0
+  in_channel: 256
+  anchor_generator: 
+    name: RetinaAnchorGenerator
+    octave_base_scale: 4
+    scales_per_octave: 3
+    aspect_ratios: [0.5, 1.0, 2.0]
+    strides: [4., 8., 16., 32., 64., 128.]
+  rpn_target_assign:
+    batch_size_per_im: 256
+    fg_fraction: 0.5
+    negative_overlap: 0.3
+    positive_overlap: 0.7
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 4000
+    post_nms_top_n: 1000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
+
+Co_RoiHead:
+  in_channel: 256
+  loss_normalize_pos: True
+  head: TwoFCHead
+  roi_extractor:
+    end_level: 4
+    resolution: 7
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: 
+    name: BBoxAssigner
+    batch_size_per_im: 512
+    bg_thresh: 0.5
+    fg_thresh: 0.5
+    fg_fraction: 0.25
+    use_random: True
+  bbox_loss: 
+    name: GIoULoss
+    loss_weight: 120.0
+  cls_loss_weight: 12.0
+
+
+# Optimizer
+epoch: 12
+
+LearningRate:
+  base_lr: 0.0002
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 0.1
+    milestones: [11]
+    use_warmup: false
+
+OptimizerBuilder:
+  clip_grad_by_norm: 0.1
+  regularizer: false
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
+
+
+# Exporting the model
+export:
+  post_process: True  # Whether post-processing is included in the network when exporting the model.
+  nms: True           # Whether NMS is included in the network when exporting the model.
+  benchmark: False    # Used for testing model performance; if set to `True`, post-processing and NMS will not be exported.
+  fuse_conv_bn: False

+ 301 - 0
paddlex/repo_apis/PaddleDetection_api/configs/Co-DINO-Swin-L.yaml

@@ -0,0 +1,301 @@
+# Runtime
+find_unused_parameters: True
+use_gpu: true
+use_xpu: false
+use_mlu: false
+use_npu: false
+log_iter: 20
+save_dir: output
+snapshot_epoch: 1
+print_flops: false
+print_params: false
+use_ema: true
+
+
+# Dataset
+metric: COCO
+num_classes: 80
+
+TrainDataset:
+  name: COCODataSet
+  image_dir: train2017
+  anno_path: annotations/instances_train2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  name: COCODataSet
+  image_dir: val2017
+  anno_path: annotations/instances_val2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+
+TestDataset:
+  name: ImageFolder
+  anno_path: annotations/instances_val2017.json # also supports txt (like VOC's label_list.txt)
+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
+
+
+# Reader
+worker_num: 1
+TrainReader:
+  sample_transforms:
+  - Decode: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
+                    transforms2: [
+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
+  }
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
+  batch_size: 1
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+  use_shared_memory: false
+
+EvalReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+TestReader:
+  inputs_def:
+    image_shape: [-1, 3, 640, 640]
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: 640, keep_ratio: false}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+
+# Model
+architecture: CO_DETR
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams
+num_dec_layer: &num_dec_layer 6
+
+CO_DETR:
+  backbone: SwinTransformer
+  backbone_lr_mult: 0.1
+  neck: ChannelMapper
+  query_head: CoDINOHead
+  rpn_head: RPNHead
+  roi_head: Co_RoiHead
+  bbox_head:
+    name: CoATSSHead
+    in_channels: 256
+    stacked_convs: 1
+    feat_channels: 256
+    bbox_weight: [10., 10., 5., 5.]
+    anchor_generator: 
+      name: CoAnchorGenerator
+      octave_base_scale: 8
+      scales_per_octave: 1
+      aspect_ratios: [1.0]
+      strides: [4., 8., 16., 32., 64., 128.]
+    assigner: 
+      name: ATSSAssigner
+      topk: 9
+      sm_use: True
+    loss_cls: 
+      name: Weighted_FocalLoss
+      use_sigmoid: true
+      gamma: 2.0
+      alpha: 0.25
+      loss_weight: 12.0
+    loss_bbox: 
+      name: GIoULoss
+      loss_weight: 24.0
+      reduction: sum
+    loss_cent_weight: 12.0
+
+SwinTransformer:
+  arch: 'swin_L_384' # ['swin_T_224', 'swin_S_224', 'swin_B_224', 'swin_L_224', 'swin_B_384', 'swin_L_384']
+  out_indices: [0, 1, 2, 3]
+  ape: false
+  drop_path_rate: 0.3
+  patch_norm: true
+
+ChannelMapper:
+  in_channels: [192, 384, 768, 1536]
+  kernel_size: 1
+  out_channels: 256
+  norm_type: "gn"
+  norm_groups: 32
+  act: None
+  num_outs: 5
+  strides: [4., 8., 16., 32., 64.]
+ 
+CoDINOHead:
+  num_query: 900
+  num_dn_query: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+  in_channels: 2048
+  sync_cls_avg_factor: True
+  with_box_refine: True
+  as_two_stage: True
+  mixed_selection: True
+  transformer:
+    name: CoDINOTransformer
+    two_stage_num_proposals: 900
+    with_pos_coord: True
+    with_coord_feat: False
+    num_co_heads: 2
+    num_feature_levels: 5
+    as_two_stage: True
+    mixed_selection: True
+    embed_dims: &embed_dims 256
+    encoder:
+      name: DeformableTransformerEncoder
+      num_layers: *num_dec_layer
+      with_rp: 6
+      encoder_layer:
+        name: DeformableTransformerEncoderLayer
+        d_model: *embed_dims
+        n_head: 8
+        dim_feedforward: 2048
+        n_levels: 5
+        n_points: 4
+        dropout: 0.0
+    decoder:
+      name: DINOTransformerDecoder
+      hidden_dim: *embed_dims
+      num_layers: *num_dec_layer
+      decoder_layer:
+        name: DINOTransformerDecoderLayer
+        d_model: *embed_dims
+        n_head: 8
+        dim_feedforward: 2048
+        n_points: 4
+        n_levels: 5
+        dropout: 0.0
+  positional_encoding:
+    name: PositionEmbedding
+    num_pos_feats: 128
+    temperature: 20
+    normalize: true
+  loss_cls:
+    name: QualityFocalLoss
+    use_sigmoid: true
+    beta: 2.0
+    loss_weight: 1.0
+  loss_bbox:
+    name: L1Loss
+    loss_weight: 5.0
+  loss_iou:
+    name: GIoULoss
+    loss_weight: 2.0
+    reduction: sum
+  assigner:
+    name: HungarianAssigner
+    cls_cost:
+      name: FocalLossCost
+      weight: 2.0
+    reg_cost:
+      name: BBoxL1Cost
+      weight: 5.0
+      box_format: xywh
+    iou_cost:
+      name: IoUCost
+      iou_mode: giou
+      weight: 2.0
+  test_cfg:
+    max_per_img: 300
+    score_thr: 0.0
+  nms: 
+    name: MultiClassNMS
+    keep_top_k: -1
+    score_threshold: 0.0
+    nms_threshold: 0.8
+
+RPNHead:
+  loss_rpn_bbox: 
+    name: L1Loss
+    reduction: sum
+    loss_weight: 12.0
+  in_channel: 256
+  anchor_generator: 
+    name: RetinaAnchorGenerator
+    octave_base_scale: 4
+    scales_per_octave: 3
+    aspect_ratios: [0.5, 1.0, 2.0]
+    strides: [4., 8., 16., 32., 64., 128.]
+  rpn_target_assign:
+    batch_size_per_im: 256
+    fg_fraction: 0.5
+    negative_overlap: 0.3
+    positive_overlap: 0.7
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 4000
+    post_nms_top_n: 1000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
+
+Co_RoiHead:
+  in_channel: 256
+  loss_normalize_pos: True
+  head: TwoFCHead
+  roi_extractor:
+    end_level: 4
+    resolution: 7
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: 
+    name: BBoxAssigner
+    batch_size_per_im: 512
+    bg_thresh: 0.5
+    fg_thresh: 0.5
+    fg_fraction: 0.25
+    use_random: True
+  bbox_loss: 
+    name: GIoULoss
+    loss_weight: 120.0
+  cls_loss_weight: 12.0
+
+
+# Optimizer
+epoch: 12
+
+LearningRate:
+  base_lr: 0.0001
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 0.1
+    milestones: [11]
+    use_warmup: false
+
+OptimizerBuilder:
+  clip_grad_by_norm: 0.1
+  regularizer: false
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
+
+
+# Exporting the model
+export:
+  post_process: True  # Whether post-processing is included in the network when exporting the model.
+  nms: True           # Whether NMS is included in the network when exporting the model.
+  benchmark: False    # Used for testing model performance; if set to `True`, post-processing and NMS will not be exported.
+  fuse_conv_bn: False

+ 285 - 0
paddlex/repo_apis/PaddleDetection_api/configs/Co-Deformable-DETR-R50.yaml

@@ -0,0 +1,285 @@
+# Runtime
+find_unused_parameters: True
+use_gpu: true
+use_xpu: false
+use_mlu: false
+use_npu: false
+log_iter: 20
+save_dir: output
+snapshot_epoch: 1
+print_flops: false
+print_params: false
+use_ema: true
+
+
+# Dataset
+metric: COCO
+num_classes: 80
+
+TrainDataset:
+  name: COCODataSet
+  image_dir: train2017
+  anno_path: annotations/instances_train2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  name: COCODataSet
+  image_dir: val2017
+  anno_path: annotations/instances_val2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+
+TestDataset:
+  name: ImageFolder
+  anno_path: annotations/instances_val2017.json # also supports txt (like VOC's label_list.txt)
+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
+
+
+# Reader
+worker_num: 2
+TrainReader:
+  sample_transforms:
+  - Decode: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
+                    transforms2: [
+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
+  }
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
+  batch_size: 2
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+  use_shared_memory: false
+
+EvalReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+TestReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+
+# Model
+architecture: CO_DETR
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams
+num_dec_layer: &num_dec_layer 6
+
+CO_DETR:
+  backbone: ResNet
+  backbone_lr_mult: 0.1
+  neck: ChannelMapper
+  query_head: CoDeformDETRHead
+  rpn_head: RPNHead
+  roi_head: Co_RoiHead
+  bbox_head:
+    name: CoATSSHead
+    in_channels: 256
+    stacked_convs: 1
+    feat_channels: 256
+    bbox_weight: [10., 10., 5., 5.]
+    anchor_generator: 
+      name: CoAnchorGenerator
+      octave_base_scale: 8
+      scales_per_octave: 1
+      aspect_ratios: [1.0]
+      strides: [8., 16., 32., 64., 128.]
+    assigner: 
+      name: ATSSAssigner
+      topk: 9
+      sm_use: True
+    loss_cls: 
+      name: Weighted_FocalLoss
+      use_sigmoid: true
+      gamma: 2.0
+      alpha: 0.25
+      loss_weight: 12.0
+    loss_bbox: 
+      name: GIoULoss
+      loss_weight: 24.0
+      reduction: sum
+    loss_cent_weight: 12.0
+
+ResNet:
+  # index 0 stands for res2
+  depth: 50
+  norm_type: bn
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  num_stages: 4
+
+ChannelMapper:
+  in_channels: [512, 1024, 2048]
+  kernel_size: 1
+  out_channels: 256
+  norm_type: "gn"
+  norm_groups: 32
+  act: None
+  num_outs: 4
+  strides: [8., 16., 32., 64.]
+ 
+CoDeformDETRHead:
+  num_query: 300
+  in_channels: 2048
+  sync_cls_avg_factor: True
+  with_box_refine: True
+  as_two_stage: True
+  mixed_selection: True
+  transformer:
+    name: CoDeformableDetrTransformer
+    num_co_heads: 2
+    as_two_stage: True
+    mixed_selection: True
+    embed_dims: &embed_dims 256
+    encoder:
+      name: DeformableTransformerEncoder
+      num_layers: *num_dec_layer
+      encoder_layer:
+        name: DeformableTransformerEncoderLayer
+        d_model: *embed_dims
+        n_head: 8
+        dim_feedforward: 2048
+        n_levels: 4
+        n_points: 4
+        dropout: 0.0
+    decoder:
+      name: CoDeformableDetrTransformerDecoder
+      num_layers: *num_dec_layer
+      return_intermediate: True
+      look_forward_twice: True
+      decoder_layer:
+        name: DeformableTransformerDecoderLayer
+        d_model: *embed_dims
+        dim_feedforward: 2048
+        dropout: 0.0
+  positional_encoding:
+    name: PositionEmbedding
+    num_pos_feats: 128
+    normalize: true
+    offset: -0.5
+  loss_cls:
+    name: Weighted_FocalLoss
+    use_sigmoid: true
+    gamma: 2.0
+    alpha: 0.25
+    loss_weight: 2.0
+  loss_bbox:
+    name: L1Loss
+    loss_weight: 5.0
+  loss_iou:
+    name: GIoULoss
+    loss_weight: 2.0
+    reduction: sum
+  assigner:
+    name: HungarianAssigner
+    cls_cost:
+      name: FocalLossCost
+      weight: 2.0
+    reg_cost:
+      name: BBoxL1Cost
+      weight: 5.0
+      box_format: xywh
+    iou_cost:
+      name: IoUCost
+      iou_mode: giou
+      weight: 2.0
+  test_cfg:
+    max_per_img: 100
+    score_thr: 0.0
+
+RPNHead:
+  loss_rpn_bbox: 
+    name: L1Loss
+    reduction: sum
+    loss_weight: 12.0
+  in_channel: 256
+  anchor_generator: 
+    name: RetinaAnchorGenerator
+    octave_base_scale: 4
+    scales_per_octave: 3
+    aspect_ratios: [0.5, 1.0, 2.0]
+    strides: [8.0, 16.0, 32.0, 64.0, 128.0]
+  rpn_target_assign:
+    batch_size_per_im: 256
+    fg_fraction: 0.5
+    negative_overlap: 0.3
+    positive_overlap: 0.7
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 4000
+    post_nms_top_n: 1000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
+
+Co_RoiHead:
+  in_channel: 256
+  loss_normalize_pos: True
+  head: TwoFCHead
+  roi_extractor:
+    resolution: 7
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: 
+    name: BBoxAssigner
+    batch_size_per_im: 512
+    bg_thresh: 0.5
+    fg_thresh: 0.5
+    fg_fraction: 0.25
+    use_random: True
+  bbox_loss: 
+    name: GIoULoss
+    loss_weight: 120.0
+  cls_loss_weight: 12.0
+
+
+# Optimizer
+epoch: 12
+
+LearningRate:
+  base_lr: 0.0002
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 0.1
+    milestones: [11]
+    use_warmup: false
+
+OptimizerBuilder:
+  clip_grad_by_norm: 0.1
+  regularizer: false
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
+
+
+# Exporting the model
+export:
+  post_process: True  # Whether post-processing is included in the network when exporting the model.
+  nms: True           # Whether NMS is included in the network when exporting the model.
+  benchmark: False    # Used for testing model performance; if set to `True`, post-processing and NMS will not be exported.
+  fuse_conv_bn: False

+ 286 - 0
paddlex/repo_apis/PaddleDetection_api/configs/Co-Deformable-DETR-Swin-T.yaml

@@ -0,0 +1,286 @@
+# Runtime
+find_unused_parameters: True
+use_gpu: true
+use_xpu: false
+use_mlu: false
+use_npu: false
+log_iter: 20
+save_dir: output
+snapshot_epoch: 1
+print_flops: false
+print_params: false
+use_ema: true
+
+
+# Dataset
+metric: COCO
+num_classes: 80
+
+TrainDataset:
+  name: COCODataSet
+  image_dir: train2017
+  anno_path: annotations/instances_train2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  name: COCODataSet
+  image_dir: val2017
+  anno_path: annotations/instances_val2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+
+TestDataset:
+  name: ImageFolder
+  anno_path: annotations/instances_val2017.json # also supports txt (like VOC's label_list.txt)
+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
+
+
+# Reader
+worker_num: 2
+TrainReader:
+  sample_transforms:
+  - Decode: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ],
+                    transforms2: [
+                        RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] },
+                        RandomSizeCrop: { min_size: 384, max_size: 600 },
+                        RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ]
+  }
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true}
+  batch_size: 2
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+  use_shared_memory: false
+
+EvalReader:
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+TestReader:
+  inputs_def:
+    image_shape: [-1, 3, 640, 640]
+  sample_transforms:
+  - Decode: {}
+  - Resize: {target_size: 640, keep_ratio: false}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
+
+
+# Model
+architecture: CO_DETR
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams
+num_dec_layer: &num_dec_layer 6
+
+CO_DETR:
+  backbone: SwinTransformer
+  backbone_lr_mult: 0.1
+  neck: ChannelMapper
+  query_head: CoDeformDETRHead
+  rpn_head: RPNHead
+  roi_head: Co_RoiHead
+  bbox_head:
+    name: CoATSSHead
+    in_channels: 256
+    stacked_convs: 1
+    feat_channels: 256
+    bbox_weight: [10., 10., 5., 5.]
+    anchor_generator: 
+      name: CoAnchorGenerator
+      octave_base_scale: 8
+      scales_per_octave: 1
+      aspect_ratios: [1.0]
+      strides: [8., 16., 32., 64., 128.]
+    assigner: 
+      name: ATSSAssigner
+      topk: 9
+      sm_use: True
+    loss_cls: 
+      name: Weighted_FocalLoss
+      use_sigmoid: true
+      gamma: 2.0
+      alpha: 0.25
+      loss_weight: 12.0
+    loss_bbox: 
+      name: GIoULoss
+      loss_weight: 24.0
+      reduction: sum
+    loss_cent_weight: 12.0
+
+SwinTransformer:
+  arch: 'swin_T_224' # ['swin_T_224', 'swin_S_224', 'swin_B_224', 'swin_L_224', 'swin_B_384', 'swin_L_384']
+  out_indices: [1, 2, 3]
+  ape: false
+  drop_path_rate: 0.2
+  patch_norm: true
+
+ChannelMapper:
+  in_channels: [192, 384, 768]
+  kernel_size: 1
+  out_channels: 256
+  norm_type: "gn"
+  norm_groups: 32
+  act: None
+  num_outs: 4
+  strides: [8., 16., 32., 64.]
+ 
+CoDeformDETRHead:
+  num_query: 300
+  in_channels: 2048
+  sync_cls_avg_factor: True
+  with_box_refine: True
+  as_two_stage: True
+  mixed_selection: True
+  transformer:
+    name: CoDeformableDetrTransformer
+    num_co_heads: 2
+    as_two_stage: True
+    mixed_selection: True
+    embed_dims: &embed_dims 256
+    encoder:
+      name: DeformableTransformerEncoder
+      num_layers: *num_dec_layer
+      encoder_layer:
+        name: DeformableTransformerEncoderLayer
+        d_model: *embed_dims
+        n_head: 8
+        dim_feedforward: 2048
+        n_levels: 4
+        n_points: 4
+        dropout: 0.0
+    decoder:
+      name: CoDeformableDetrTransformerDecoder
+      num_layers: *num_dec_layer
+      return_intermediate: True
+      look_forward_twice: True
+      decoder_layer:
+        name: DeformableTransformerDecoderLayer
+        d_model: *embed_dims
+        dim_feedforward: 2048
+        dropout: 0.0
+  positional_encoding:
+    name: PositionEmbedding
+    num_pos_feats: 128
+    normalize: true
+    offset: -0.5
+  loss_cls:
+    name: Weighted_FocalLoss
+    use_sigmoid: true
+    gamma: 2.0
+    alpha: 0.25
+    loss_weight: 2.0
+  loss_bbox:
+    name: L1Loss
+    loss_weight: 5.0
+  loss_iou:
+    name: GIoULoss
+    loss_weight: 2.0
+    reduction: sum
+  assigner:
+    name: HungarianAssigner
+    cls_cost:
+      name: FocalLossCost
+      weight: 2.0
+    reg_cost:
+      name: BBoxL1Cost
+      weight: 5.0
+      box_format: xywh
+    iou_cost:
+      name: IoUCost
+      iou_mode: giou
+      weight: 2.0
+  test_cfg:
+    max_per_img: 100
+    score_thr: 0.0
+
+RPNHead:
+  loss_rpn_bbox: 
+    name: L1Loss
+    reduction: sum
+    loss_weight: 12.0
+  in_channel: 256
+  anchor_generator: 
+    name: RetinaAnchorGenerator
+    octave_base_scale: 4
+    scales_per_octave: 3
+    aspect_ratios: [0.5, 1.0, 2.0]
+    strides: [8.0, 16.0, 32.0, 64.0, 128.0]
+  rpn_target_assign:
+    batch_size_per_im: 256
+    fg_fraction: 0.5
+    negative_overlap: 0.3
+    positive_overlap: 0.7
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 4000
+    post_nms_top_n: 1000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
+
+Co_RoiHead:
+  in_channel: 256
+  loss_normalize_pos: True
+  head: TwoFCHead
+  roi_extractor:
+    resolution: 7
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: 
+    name: BBoxAssigner
+    batch_size_per_im: 512
+    bg_thresh: 0.5
+    fg_thresh: 0.5
+    fg_fraction: 0.25
+    use_random: True
+  bbox_loss: 
+    name: GIoULoss
+    loss_weight: 120.0
+  cls_loss_weight: 12.0
+
+
+# Optimizer
+epoch: 12
+
+LearningRate:
+  base_lr: 0.0002
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 0.1
+    milestones: [11]
+    use_warmup: false
+
+OptimizerBuilder:
+  clip_grad_by_norm: 0.1
+  regularizer: false
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
+
+
+# Exporting the model
+export:
+  post_process: True  # Whether post-processing is included in the network when exporting the model.
+  nms: True           # Whether NMS is included in the network when exporting the model.
+  benchmark: False    # Used for testing model performance; if set to `True`, post-processing and NMS will not be exported.
+  fuse_conv_bn: False

+ 1 - 0
paddlex/repo_apis/PaddleDetection_api/configs/PP-YOLOE_plus-S.yaml

@@ -98,6 +98,7 @@ CSPResNet:
   channels: [64, 128, 256, 512, 1024]
   return_idx: [1, 2, 3]
   use_large_stem: true
+  use_alpha: True
 
 CustomCSPPAN:
   out_channels: [768, 384, 192]

+ 2 - 8
paddlex/repo_apis/PaddleDetection_api/configs/PP-YOLOE_plus-S_face.yaml

@@ -17,10 +17,6 @@ metric: COCO
 num_classes: 1
 
 worker_num: 4
-eval_height: &eval_height 1088
-eval_width: &eval_width 1088
-eval_size: &eval_size [*eval_height, *eval_width]
-
 TrainDataset:
   name: COCODataSet
   image_dir: WIDER_train/images
@@ -62,17 +58,15 @@ TrainReader:
 EvalReader:
   sample_transforms:
     - Decode: {}
-    - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
+    - Resize: {target_size: [1088, 1088], keep_ratio: False, interp: 2}
     - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
     - Permute: {}
   batch_size: 2
 
 TestReader:
-  inputs_def:
-    image_shape: [3, *eval_height, *eval_width]
   sample_transforms:
     - Decode: {}
-    - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
+    - Resize: {target_size: [1088, 1088], keep_ratio: False, interp: 2}
     - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
     - Permute: {}
   batch_size: 1

+ 2 - 8
paddlex/repo_apis/PaddleDetection_api/configs/PicoDet_LCNet_x2_5_face.yaml

@@ -93,10 +93,6 @@ OptimizerBuilder:
     type: L2
 
 worker_num: 6
-eval_height: &eval_height 1088
-eval_width: &eval_width 1088
-eval_size: &eval_size [*eval_height, *eval_width]
-
 TrainReader:
   sample_transforms:
   - Decode: {}
@@ -116,7 +112,7 @@ TrainReader:
 EvalReader:
   sample_transforms:
   - Decode: {}
-  - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False}
+  - Resize: {interp: 2, target_size: [1088, 1088], keep_ratio: False}
   - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
   - Permute: {}
   batch_transforms:
@@ -126,11 +122,9 @@ EvalReader:
 
 
 TestReader:
-  inputs_def:
-    image_shape: [3, *eval_height, *eval_width]
   sample_transforms:
   - Decode: {}
-  - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False}
+  - Resize: {interp: 2, target_size: [1088, 1088], keep_ratio: False}
   - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
   - Permute: {}
   batch_size: 1

+ 60 - 0
paddlex/repo_apis/PaddleDetection_api/object_det/register.py

@@ -972,3 +972,63 @@ register_model_info(
         },
     }
 )
+
+register_model_info(
+    {
+        "model_name": "Co-Deformable-DETR-R50",
+        "suite": "Det",
+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-Deformable-DETR-R50.yaml"),
+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
+        "supported_dataset_types": ["COCODetDataset"],
+        "supported_train_opts": {
+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
+            "dy2st": False,
+            "amp": ["OFF"],
+        },
+    }
+)
+
+register_model_info(
+    {
+        "model_name": "Co-Deformable-DETR-Swin-T",
+        "suite": "Det",
+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-Deformable-DETR-Swin-T.yaml"),
+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
+        "supported_dataset_types": ["COCODetDataset"],
+        "supported_train_opts": {
+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
+            "dy2st": False,
+            "amp": ["OFF"],
+        },
+    }
+)
+
+register_model_info(
+    {
+        "model_name": "Co-DINO-R50",
+        "suite": "Det",
+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-DINO-R50.yaml"),
+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
+        "supported_dataset_types": ["COCODetDataset"],
+        "supported_train_opts": {
+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
+            "dy2st": False,
+            "amp": ["OFF"],
+        },
+    }
+)
+
+register_model_info(
+    {
+        "model_name": "Co-DINO-Swin-L",
+        "suite": "Det",
+        "config_path": osp.join(PDX_CONFIG_DIR, "Co-DINO-Swin-L.yaml"),
+        "supported_apis": ["train", "evaluate", "predict", "export", "infer"],
+        "supported_dataset_types": ["COCODetDataset"],
+        "supported_train_opts": {
+            "device": ["cpu", "gpu_nxcx", "xpu", "npu", "mlu"],
+            "dy2st": False,
+            "amp": ["OFF"],
+        },
+    }
+)