batch_size: 2  # 8 GPUs, total batch size 16
epochs: 12

train_dataset:
  type: NuscenesMMDataset
  ann_file: ./data/nuscenes/nuscenes_infos_train.pkl
  data_root: ./data/nuscenes
  class_names: [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
    'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
  ]
  modality: multimodal
  transforms:
    - type: LoadPointsFromFile
      load_dim: 5
      use_dim: 5
    - type: LoadPointsFromMultiSweeps
      sweeps_num: 10
    - type: LoadAnnotations3D
      with_bbox_3d: true
      with_label_3d: true
    - type: LoadMultiViewImageFromFiles
      project_pts_to_img_depth: true
    - type: PointsRangeFilter
      point_cloud_range: [-50, -50, -5, 50, 50, 3]
    - type: SampleRangeFilter
      point_cloud_range: [-50, -50, -5, 50, 50, 3]
    - type: SampleNameFilter
      classes: [
        'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
        'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
      ]
    - type: PointShuffle
    - type: ResizeImage
      img_scale: [[800, 448]]
      keep_ratio: true
    - type: NormalizeImage
      mean: [123.675, 116.28, 103.53]
      std: [58.395, 57.12, 57.375]
      to_rgb: true
    - type: PadImage
      size_divisor: 32
    - type: SampleFilterByKey
      keys: ['img', 'img_depth', 'points', 'gt_bboxes_3d', 'gt_labels_3d']
  mode: train
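
# The validation pipeline below reuses the image preprocessing from the training
# pipeline but drops annotation loading, depth projection, the range/name filters
# and PointShuffle, since only 'points' and 'img' are passed to the model at eval time.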
val_dataset:
  type: NuscenesMMDataset
  ann_file: ./data/nuscenes/nuscenes_infos_val.pkl
  data_root: ./data/nuscenes
  class_names: [
    'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
    'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
  ]
  modality: multimodal
  transforms:
    - type: LoadPointsFromFile
      load_dim: 5
      use_dim: 5
    - type: LoadPointsFromMultiSweeps
      sweeps_num: 10
    - type: LoadMultiViewImageFromFiles
    - type: ResizeImage
      img_scale: [[800, 448]]
      keep_ratio: true
    - type: NormalizeImage
      mean: [123.675, 116.28, 103.53]
      std: [58.395, 57.12, 57.375]
      to_rgb: true
    - type: PadImage
      size_divisor: 32
    - type: SampleFilterByKey
      keys: ['points', 'img']
  mode: val
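
# BEVFFasterRCNN pairs a LiDAR pillar branch (the pts_* modules below) with a camera
# stream built from the 6 surround-view images (num_views: 6). Judging by the flag
# names (not verified against the implementation): lc_fusion enables LiDAR-camera
# fusion in BEV, se adds SE-style channel attention on the fused features, and
# lss: false selects a non-LSS view transform for the camera stream.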
model:
  type: BEVFFasterRCNN
  se: true
  lc_fusion: true
  camera_stream: true
  lss: false
  grid: 0.5
  num_views: 6
  final_dim: [900, 1600]
  downsample: 8
  pts_voxel_layer:
    max_num_points_in_voxel: 64
    point_cloud_range: [-50., -50., -5., 50., 50., 3.]
    voxel_size: [0.25, 0.25, 8.]
    max_num_voxels: [30000, 40000]
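  # With voxel_size [0.25, 0.25, 8] over the [-50, 50] m x/y range and the full 8 m z
  # extent, every voxel is a pillar spanning the whole height, so the scatter step
  # below yields a (100 / 0.25) x (100 / 0.25) = 400 x 400 BEV pseudo-image.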
  pts_voxel_encoder:
    type: HardVFE
    in_channels: 4
    feat_channels: [64, 64]
    with_distance: false
    voxel_size: [0.25, 0.25, 8]
    with_cluster_center: true
    with_voxel_center: true
    point_cloud_range: [-50, -50, -5, 50, 50, 3]
  pts_middle_encoder:
    type: PointPillarsScatter
    in_channels: 64
    point_cloud_range: [-50, -50, -5, 50, 50, 3]
    voxel_size: [0.25, 0.25, 8]
  pts_backbone:
    type: SecondBackbone
    in_channels: 64
    layer_nums: [3, 5, 5]
    downsample_strides: [2, 2, 2]
    out_channels: [64, 128, 256]
  pts_neck:
    type: SecondFPN
    in_channels: [64, 128, 256]
    upsample_strides: [1, 2, 4]
    out_channels: [128, 128, 128]
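  # SecondFPN upsamples the three backbone stages back to a common resolution; assuming
  # its three 128-channel outputs are concatenated, the LiDAR BEV feature has
  # 3 * 128 = 384 channels, matching in_channels of the Anchor3DHead below.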
  img_backbone:
    type: CBSwinTransformer
    embed_dim: 96
    depths: [2, 2, 6, 2]
    num_heads: [3, 6, 12, 24]
    window_size: 7
    mlp_ratio: 4.0
    qkv_bias: true
    qk_scale: null
    drop_rate: 0.0
    attn_drop_rate: 0.0
    drop_path_rate: 0.2
    ape: false
    patch_norm: true
    out_indices: [0, 1, 2, 3]
  img_neck:
    type: FPNC
    final_dim: [900, 1600]
    downsample: 8
    in_channels: [96, 192, 384, 768]
    out_channels: 256
    outC: 256
    use_adp: true
    num_outs: 5
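  # The image branch is a Swin Transformer at the Tiny scale (embed_dim 96, depths
  # [2, 2, 6, 2], heads [3, 6, 12, 24]) with all four stages fed to the FPNC neck;
  # final_dim and downsample repeat the model-level settings and presumably control
  # the resolution at which image features are lifted into BEV.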
  pts_bbox_head:
    type: Anchor3DHead
    num_classes: 10
    in_channels: 384
    feat_channels: 384
    use_direction_classifier: true
    anchor_generator:
      type: AlignedAnchor3DRangeGenerator
      ranges: [[-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
               [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
               [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
               [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
               [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
               [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
               [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965]]
      sizes: [[1.95017717, 4.60718145, 1.72270761],
              [2.4560939, 6.73778078, 2.73004906],
              [2.87427237, 12.01320693, 3.81509561],
              [0.60058911, 1.68452161, 1.27192197],
              [0.66344886, 0.7256437, 1.75748069],
              [0.39694519, 0.40359262, 1.06232151],
              [2.49008838, 0.48578221, 0.98297065]]
      custom_values: [0, 0]
      rotations: [0, 1.57]
      reshape_out: true
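    # Seven anchor range/size triples with two yaw rotations each, presumably shared
    # across the ten classes; custom_values [0, 0] appends velocity placeholders to
    # every anchor, which is why the bbox coder below uses code_size: 9
    # (7 box parameters plus vx, vy).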
    assigner_per_size: false
    diff_rad_by_sin: true
    dir_offset: 0.7854  # pi/4
    dir_limit_offset: 0
    bbox_coder:
      type: DeltaXYZWLHRBBoxCoder
      code_size: 9
    loss_cls:
      type: WeightedFocalLoss
      use_sigmoid: true
      gamma: 2.0
      alpha: 0.25
      loss_weight: 1.0
    loss_bbox:
      type: SmoothL1Loss
      beta: 0.1111111111111111
      loss_weight: 1.0
    loss_dir:
      type: CrossEntropyLoss
      use_sigmoid: false
      loss_weight: 0.2
    use_sigmoid_cls: true
  train_cfg:
    code_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
    pos_weight: -1
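  # code_weight has one entry per regression target (9, matching code_size), with the
  # two trailing 0.2 values down-weighting the velocity terms; pos_weight: -1
  # presumably leaves positive samples unweighted in the classification loss.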
  test_cfg:
    use_rotate_nms: true
    nms_across_levels: false
    nms_pre: 1000
    nms_thr: 0.2
    score_thr: 0.05
    min_bbox_size: 0
    max_num: 500

optimizer:
  type: AdamW
  beta1: 0.9
  beta2: 0.999
  weight_decay: 0.05
  grad_clip:
    type: ClipGradByGlobalNorm
    clip_norm: 35

lr_scheduler:
  type: LinearWarmup
  learning_rate:
    type: MultiStepDecay
    milestones: [6032, 8669]  # 879 iters/epoch: [879*8 - 1000, 879*11 - 1000]
    learning_rate: 0.001
  warmup_steps: 1000
  start_lr: 1.0e-6
  end_lr: 0.001
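
# LinearWarmup ramps the learning rate from start_lr to end_lr over the first 1000
# iterations, after which the wrapped MultiStepDecay schedule takes over; the
# milestones appear to be counted in post-warmup iterations, landing near the end of
# epochs 8 and 11 of the 12-epoch run (assuming 879 iterations per epoch, per the
# comment above).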