# Per-device batch size; with 8 GPUs the total batch size is 16.
batch_size: 2
# Number of training epochs.
epochs: 12

# Training split: multimodal (LiDAR points + multi-view images) nuScenes samples
# with 3D box annotations.
train_dataset:
  type: NuscenesMMDataset
  ann_file: ./data/nuscenes/nuscenes_infos_train.pkl
  data_root: ./data/nuscenes
  # The ten detection classes; the same list is repeated for SampleNameFilter below.
  class_names:
    - car
    - truck
    - trailer
    - bus
    - construction_vehicle
    - bicycle
    - motorcycle
    - pedestrian
    - traffic_cone
    - barrier
  modality: multimodal
  transforms:
    # --- Loading: points, extra sweeps, annotations, camera images ---
    - type: LoadPointsFromFile
      load_dim: 5
      use_dim: 5
    - type: LoadPointsFromMultiSweeps
      sweeps_num: 10
    - type: LoadAnnotations3D
      with_bbox_3d: true
      with_label_3d: true
    - type: LoadMultiViewImageFromFiles
      # Presumably what produces the 'img_depth' key kept by SampleFilterByKey
      # below -- TODO confirm against the transform implementation.
      project_pts_to_img_depth: true
    # --- Filtering: both range filters use the same range as the model's
    # point_cloud_range (presumably [x_min, y_min, z_min, x_max, y_max, z_max]) ---
    - type: PointsRangeFilter
      point_cloud_range: [-50, -50, -5, 50, 50, 3]
    - type: SampleRangeFilter
      point_cloud_range: [-50, -50, -5, 50, 50, 3]
    - type: SampleNameFilter
      classes:
        - car
        - truck
        - trailer
        - bus
        - construction_vehicle
        - bicycle
        - motorcycle
        - pedestrian
        - traffic_cone
        - barrier
    - type: PointShuffle
    # --- Image preprocessing ---
    - type: ResizeImage
      # A list holding a single scale.
      img_scale:
        - [800, 448]
      keep_ratio: true
    - type: NormalizeImage
      mean: [123.675, 116.28, 103.53]
      std: [58.395, 57.12, 57.375]
      to_rgb: true
    - type: PadImage
      size_divisor: 32
    # --- Keep only the entries listed here in each training sample ---
    - type: SampleFilterByKey
      keys:
        - img
        - img_depth
        - points
        - gt_bboxes_3d
        - gt_labels_3d
  mode: train

# Validation split: same dataset type and image preprocessing as training, but
# without annotation loading, range/name filtering, or point shuffling.
val_dataset:
  type: NuscenesMMDataset
  ann_file: ./data/nuscenes/nuscenes_infos_val.pkl
  data_root: ./data/nuscenes
  class_names:
    - car
    - truck
    - trailer
    - bus
    - construction_vehicle
    - bicycle
    - motorcycle
    - pedestrian
    - traffic_cone
    - barrier
  modality: multimodal
  transforms:
    # --- Loading: points, extra sweeps, camera images ---
    - type: LoadPointsFromFile
      load_dim: 5
      use_dim: 5
    - type: LoadPointsFromMultiSweeps
      sweeps_num: 10
    # No project_pts_to_img_depth here, and 'img_depth' is not among the kept keys.
    - type: LoadMultiViewImageFromFiles
    # --- Image preprocessing (same values as the training pipeline) ---
    - type: ResizeImage
      # A list holding a single scale.
      img_scale:
        - [800, 448]
      keep_ratio: true
    - type: NormalizeImage
      mean: [123.675, 116.28, 103.53]
      std: [58.395, 57.12, 57.375]
      to_rgb: true
    - type: PadImage
      size_divisor: 32
    # --- Keep only points and images for evaluation ---
    - type: SampleFilterByKey
      keys:
        - points
        - img
  mode: val

# Detector definition: a point-cloud branch (pts_*) and a camera branch (img_*)
# configured alongside a single anchor-based 3D detection head.
model:
  type: BEVFFasterRCNN
  # Feature toggles of the detector.
  se: true
  lc_fusion: true
  camera_stream: true
  lss: false
  grid: 0.5
  num_views: 6
  # Same final_dim / downsample values are repeated under img_neck.
  final_dim: [900, 1600]
  downsample: 8

  # ---------------- Point-cloud branch ----------------
  pts_voxel_layer:
    max_num_points_in_voxel: 64
    point_cloud_range: [-50.0, -50.0, -5.0, 50.0, 50.0, 3.0]
    # The z size (8) equals the full z extent of the range (-5 to 3), so each
    # x/y cell holds a single voxel.
    voxel_size: [0.25, 0.25, 8.0]
    # Two limits; presumably [training, evaluation] -- TODO confirm.
    max_num_voxels: [30000, 40000]
  pts_voxel_encoder:
    type: HardVFE
    in_channels: 4
    feat_channels: [64, 64]
    with_distance: false
    voxel_size: [0.25, 0.25, 8]
    with_cluster_center: true
    with_voxel_center: true
    point_cloud_range: [-50, -50, -5, 50, 50, 3]
  pts_middle_encoder:
    type: PointPillarsScatter
    # Equals the last entry of pts_voxel_encoder.feat_channels.
    in_channels: 64
    point_cloud_range: [-50, -50, -5, 50, 50, 3]
    voxel_size: [0.25, 0.25, 8]
  pts_backbone:
    type: SecondBackbone
    in_channels: 64
    layer_nums: [3, 5, 5]
    downsample_strides: [2, 2, 2]
    out_channels: [64, 128, 256]
  pts_neck:
    type: SecondFPN
    # Same values as pts_backbone.out_channels.
    in_channels: [64, 128, 256]
    upsample_strides: [1, 2, 4]
    out_channels: [128, 128, 128]

  # ---------------- Camera branch ----------------
  img_backbone:
    type: CBSwinTransformer
    embed_dim: 96
    depths: [2, 2, 6, 2]
    num_heads: [3, 6, 12, 24]
    window_size: 7
    mlp_ratio: 4.0
    qkv_bias: true
    qk_scale: null
    drop_rate: 0.0
    attn_drop_rate: 0.0
    drop_path_rate: 0.2
    ape: false
    patch_norm: true
    out_indices: [0, 1, 2, 3]
  img_neck:
    type: FPNC
    final_dim: [900, 1600]
    downsample: 8
    # embed_dim (96) times 1, 2, 4, 8 -- one entry per img_backbone out_indices.
    in_channels: [96, 192, 384, 768]
    out_channels: 256
    outC: 256
    use_adp: true
    num_outs: 5

  # ---------------- Detection head ----------------
  pts_bbox_head:
    type: Anchor3DHead
    num_classes: 10
    # 384 = 3 x 128; presumably the concatenated pts_neck outputs -- TODO confirm.
    in_channels: 384
    feat_channels: 384
    use_direction_classifier: true
    anchor_generator:
      type: AlignedAnchor3DRangeGenerator
      # Seven anchor ranges, paired by position with the seven sizes below.
      # Within each row the two z values are identical.
      ranges:
        - [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795]
        - [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365]
        - [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504]
        - [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111]
        - [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072]
        - [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986]
        - [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965]
      sizes:
        - [1.95017717, 4.60718145, 1.72270761]
        - [2.4560939, 6.73778078, 2.73004906]
        - [2.87427237, 12.01320693, 3.81509561]
        - [0.60058911, 1.68452161, 1.27192197]
        - [0.66344886, 0.7256437, 1.75748069]
        - [0.39694519, 0.40359262, 1.06232151]
        - [2.49008838, 0.48578221, 0.98297065]
      custom_values: [0, 0]
      rotations: [0, 1.57]
      reshape_out: true
    assigner_per_size: false
    diff_rad_by_sin: true
    dir_offset: 0.7854  # pi/4
    dir_limit_offset: 0
    bbox_coder:
      type: DeltaXYZWLHRBBoxCoder
      code_size: 9
    loss_cls:
      type: WeightedFocalLoss
      use_sigmoid: true
      gamma: 2.0
      alpha: 0.25
      loss_weight: 1.0
    loss_bbox:
      type: SmoothL1Loss
      beta: 0.1111111111111111  # 1/9
      loss_weight: 1.0
    loss_dir:
      type: CrossEntropyLoss
      use_sigmoid: false
      loss_weight: 0.2
    use_sigmoid_cls: true
    train_cfg:
      # Nine weights, one per bbox_coder.code_size entry; the last two are 0.2.
      code_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
      pos_weight: -1
    test_cfg:
      use_rotate_nms: true
      nms_across_levels: false
      nms_pre: 1000
      nms_thr: 0.2
      score_thr: 0.05
      min_bbox_size: 0
      max_num: 500

# AdamW optimizer with weight decay and gradient clipping by global norm.
optimizer:
  type: AdamW
  weight_decay: 0.05
  beta1: 0.9
  beta2: 0.999
  # Gradient clipping with a norm threshold of 35.
  grad_clip:
    type: ClipGradByGlobalNorm
    clip_norm: 35

# LinearWarmup (start_lr -> end_lr over warmup_steps) wrapped around a
# MultiStepDecay schedule.
lr_scheduler:
  type: LinearWarmup
  warmup_steps: 1000
  start_lr: 1.0e-6
  # Same value as the base learning rate of the wrapped schedule.
  end_lr: 0.001
  learning_rate:
    type: MultiStepDecay
    learning_rate: 0.001
    # Milestones are [879 * 8 - 1000, 879 * 11 - 1000]: steps at epochs 8 and 11
    # minus the 1000 warmup steps (presumably because the wrapped scheduler only
    # starts counting once warmup has finished -- TODO confirm).
    # NOTE(review): 879 appears to be the number of steps per epoch; verify that
    # it matches the effective total batch size actually used for this run.
    milestones:
      - 6032
      - 8669