# zhengchun/PaddleX

# Runtime
# Top-level runtime switches. Booleans are written in canonical lowercase.
find_unused_parameters: true

# Device flags: only use_gpu is enabled in this config.
use_gpu: true
use_xpu: false
use_mlu: false
use_npu: false

# Logging / output settings (exact semantics are defined by the consumer of
# this file — confirm there before changing).
log_iter: 20
save_dir: output
snapshot_epoch: 1
print_flops: false
print_params: false

use_ema: true


# Dataset
# Top-level dataset settings; the Train/Eval/Test dataset sections below all
# point at dataset/coco.
metric: COCO
# NOTE(review): presumably the number of object categories — keep in sync
# with the annotation files if the dataset changes.
num_classes: 80

# Training split definition.
TrainDataset:
  name: COCODataSet
  # NOTE(review): image_dir / anno_path are presumably resolved relative to
  # dataset_dir — confirm against the dataset class.
  dataset_dir: dataset/coco
  image_dir: train2017
  anno_path: annotations/instances_train2017.json
  allow_empty: true
  # Per-sample fields requested from the dataset.
  data_fields:
  - image
  - gt_bbox
  - gt_class
  - is_crowd

# Evaluation split definition; same dataset class and dataset_dir as
# TrainDataset, pointing at the val2017 images and annotations.
EvalDataset:
  name: COCODataSet
  image_dir: val2017
  anno_path: annotations/instances_val2017.json
  dataset_dir: dataset/coco
  allow_empty: true

# Inference-time dataset; uses the ImageFolder class rather than COCODataSet.
TestDataset:
  name: ImageFolder
  anno_path: annotations/instances_val2017.json  # also supports txt (like VOC's label_list.txt)
  dataset_dir: dataset/coco  # if set, anno_path will be 'dataset_dir/anno_path'


# Reader
# NOTE(review): presumably the number of data-loading worker processes shared
# by the readers below — confirm against the reader implementation.
worker_num: 2
# Training data pipeline.
#
# The RandomSelect entry is written in block style. The previous multi-line
# flow mapping closed its `}` at a shallower indent than its parent block
# node, which lenient parsers accept but spec-strict YAML parsers reject.
# The parsed data is unchanged.
TrainReader:
  # Per-sample transforms, applied in the listed order.
  sample_transforms:
  - Decode: {}
  - RandomFlip: {prob: 0.5}
  # RandomSelect is given two alternative transform lists.
  # NOTE(review): presumably one of the two is chosen per sample — confirm.
  - RandomSelect:
      # Option 1: a single multi-scale short-side resize.
      transforms1:
      - RandomShortSideResize:
          short_side_sizes: [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
          max_size: 1333
      # Option 2: coarse resize, then a random crop, then the same
      # multi-scale short-side resize as option 1.
      transforms2:
      - RandomShortSideResize:
          short_side_sizes: [400, 500, 600]
      - RandomSizeCrop:
          min_size: 384
          max_size: 600
      - RandomShortSideResize:
          short_side_sizes: [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
          max_size: 1333
  - NormalizeImage:
      is_scale: true
      mean: [0.485, 0.456, 0.406]
      std: [0.229, 0.224, 0.225]
  - Permute: {}
  # Batch-level transforms.
  batch_transforms:
  - PadMaskBatch:
      pad_to_stride: -1
      return_pad_mask: true
  batch_size: 2
  shuffle: true
  drop_last: true
  collate_batch: false
  use_shared_memory: false

# Evaluation data pipeline: fixed-size resize, no augmentation entries.
EvalReader:
  sample_transforms:
  - Decode: {}
  - Resize:
      target_size: [800, 1333]
      keep_ratio: true
  # Same normalization constants as TrainReader.
  - NormalizeImage:
      is_scale: true
      mean: [0.485, 0.456, 0.406]
      std: [0.229, 0.224, 0.225]
  - Permute: {}
  batch_size: 1
  shuffle: false
  drop_last: false

# Inference data pipeline; the transform list and batch settings match
# EvalReader.
TestReader:
  sample_transforms:
  - Decode: {}
  - Resize:
      target_size: [800, 1333]
      keep_ratio: true
  - NormalizeImage:
      is_scale: true
      mean: [0.485, 0.456, 0.406]
      std: [0.229, 0.224, 0.225]
  - Permute: {}
  batch_size: 1
  shuffle: false
  drop_last: false


# Model
# Names the top-level CO_DETR section below as the model definition.
architecture: CO_DETR
# URL of the ResNet50 pretrained weights.
pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams
# Anchored value; aliased as *num_dec_layer by both encoder.num_layers and
# decoder.num_layers under CoDINOHead.transformer.
num_dec_layer: &num_dec_layer 6

# Top-level model wiring. backbone / neck / query_head / rpn_head / roi_head
# name other top-level sections of this file; bbox_head is configured inline.
CO_DETR:
  backbone: ResNet
  backbone_lr_mult: 0.1
  neck: ChannelMapper
  query_head: CoDINOHead
  rpn_head: RPNHead
  roi_head: Co_RoiHead
  bbox_head:
    name: CoATSSHead
    in_channels: 256
    stacked_convs: 1
    feat_channels: 256
    bbox_weight: [10.0, 10.0, 5.0, 5.0]
    # One scale and one aspect ratio per location, over six stride levels.
    anchor_generator:
      name: CoAnchorGenerator
      octave_base_scale: 8
      scales_per_octave: 1
      aspect_ratios: [1.0]
      strides: [4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
    assigner:
      name: ATSSAssigner
      topk: 9
      sm_use: true
    # Loss terms and their weights.
    loss_cls:
      name: Weighted_FocalLoss
      use_sigmoid: true
      gamma: 2.0
      alpha: 0.25
      loss_weight: 12.0
    loss_bbox:
      name: GIoULoss
      loss_weight: 24.0
      reduction: sum
    loss_cent_weight: 12.0

# Backbone settings (referenced by CO_DETR.backbone).
ResNet:
  depth: 50
  norm_type: bn
  # index 0 stands for res2
  freeze_at: 0
  # Four stage outputs are returned; ChannelMapper.in_channels lists four
  # entries to match.
  return_idx: [0, 1, 2, 3]
  num_stages: 4

# Neck settings (referenced by CO_DETR.neck): four input channel widths are
# mapped to out_channels, with num_outs output levels.
ChannelMapper:
  in_channels: [256, 512, 1024, 2048]
  kernel_size: 1
  out_channels: 256
  norm_type: gn
  norm_groups: 32
  # NOTE(review): YAML reads `None` as the string "None", not as null.
  # Left unchanged; confirm the consumer treats this string as
  # "no activation" before switching to `null`.
  act: None
  num_outs: 5
  strides: [4.0, 8.0, 16.0, 32.0, 64.0]
 
# Query head settings (referenced by CO_DETR.query_head).
CoDINOHead:
  num_query: 900
  num_dn_query: 100
  label_noise_ratio: 0.5
  box_noise_scale: 1.0
  in_channels: 2048
  sync_cls_avg_factor: true
  with_box_refine: true
  as_two_stage: true
  mixed_selection: true

  transformer:
    name: CoDINOTransformer
    # Same value as num_query above.
    two_stage_num_proposals: 900
    with_pos_coord: true
    with_coord_feat: false
    num_co_heads: 2
    # Matches n_levels in both layer configs below.
    num_feature_levels: 5
    as_two_stage: true
    mixed_selection: true
    # Anchored here; aliased as *embed_dims by the encoder/decoder below.
    embed_dims: &embed_dims 256
    encoder:
      name: DeformableTransformerEncoder
      # Alias of the top-level num_dec_layer anchor, so the encoder depth
      # follows that value.
      num_layers: *num_dec_layer
      with_rp: 6
      encoder_layer:
        name: DeformableTransformerEncoderLayer
        d_model: *embed_dims
        n_head: 8
        dim_feedforward: 2048
        n_levels: 5
        n_points: 4
        dropout: 0.0
    decoder:
      name: DINOTransformerDecoder
      hidden_dim: *embed_dims
      num_layers: *num_dec_layer
      decoder_layer:
        name: DINOTransformerDecoderLayer
        d_model: *embed_dims
        n_head: 8
        dim_feedforward: 2048
        n_points: 4
        n_levels: 5
        dropout: 0.0

  positional_encoding:
    name: PositionEmbedding
    num_pos_feats: 128
    temperature: 20
    normalize: true

  # Loss terms and their weights.
  loss_cls:
    name: QualityFocalLoss
    use_sigmoid: true
    beta: 2.0
    loss_weight: 1.0
  loss_bbox:
    name: L1Loss
    loss_weight: 5.0
  loss_iou:
    name: GIoULoss
    loss_weight: 2.0
    reduction: sum

  # Matching costs; the reg/iou weights mirror the loss_bbox/loss_iou
  # weights above.
  assigner:
    name: HungarianAssigner
    cls_cost:
      name: FocalLossCost
      weight: 2.0
    reg_cost:
      name: BBoxL1Cost
      weight: 5.0
      box_format: xywh
    iou_cost:
      name: IoUCost
      iou_mode: giou
      weight: 2.0

  # Inference-time output settings.
  test_cfg:
    max_per_img: 300
    score_thr: 0.0
  nms:
    name: MultiClassNMS
    keep_top_k: -1
    score_threshold: 0.0
    nms_threshold: 0.8

# RPN head settings (referenced by CO_DETR.rpn_head).
RPNHead:
  in_channel: 256
  loss_rpn_bbox:
    name: L1Loss
    reduction: sum
    loss_weight: 12.0
  # Three scales and three aspect ratios per location, over six strides.
  anchor_generator:
    name: RetinaAnchorGenerator
    octave_base_scale: 4
    scales_per_octave: 3
    aspect_ratios: [0.5, 1.0, 2.0]
    strides: [4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
  rpn_target_assign:
    batch_size_per_im: 256
    fg_fraction: 0.5
    negative_overlap: 0.3
    positive_overlap: 0.7
    use_random: true
  # Proposal settings used in training.
  train_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 4000
    post_nms_top_n: 1000
    topk_after_collect: true
  # Proposal settings used at test time; pre_nms_top_n is smaller than in
  # training and topk_after_collect is not set.
  test_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 1000
    post_nms_top_n: 1000

# RoI head settings (referenced by CO_DETR.roi_head).
Co_RoiHead:
  in_channel: 256
  loss_normalize_pos: true
  head: TwoFCHead
  roi_extractor:
    end_level: 4
    resolution: 7
    sampling_ratio: 0
    aligned: true
  # bg_thresh and fg_thresh are both 0.5 in this config.
  bbox_assigner:
    name: BBoxAssigner
    batch_size_per_im: 512
    bg_thresh: 0.5
    fg_thresh: 0.5
    fg_fraction: 0.25
    use_random: true
  # Loss terms and their weights.
  bbox_loss:
    name: GIoULoss
    loss_weight: 120.0
  cls_loss_weight: 12.0


# Optimizer
# NOTE(review): presumably the total number of training epochs; the single
# LearningRate milestone (11) lies just below this value.
epoch: 12

# Learning-rate schedule.
LearningRate:
  base_lr: 0.0002
  schedulers:
  # `!PiecewiseDecay` is an application-specific YAML tag: the file must be
  # read by a loader that registers it (generic safe loaders will reject it).
  # NOTE(review): presumably scales the LR by gamma at each milestone epoch —
  # confirm against the scheduler implementation.
  - !PiecewiseDecay
    gamma: 0.1
    milestones: [11]
    use_warmup: false

# Optimizer construction settings.
OptimizerBuilder:
  clip_grad_by_norm: 0.1
  # No separate regularizer entry; weight_decay is set on the optimizer below.
  regularizer: false
  optimizer:
    type: AdamW
    weight_decay: 0.0001


# Exporting the model
# Model export options.
export:
  # Whether post-processing is included in the network when exporting the model.
  post_process: true
  # Whether NMS is included in the network when exporting the model.
  nms: true
  # Used for testing model performance; if set to `true`, post-processing and
  # NMS will not be exported.
  benchmark: false
  fuse_conv_bn: false