# BEVFusion.yaml - BEVFusion training configuration (nuScenes, multimodal).
  # Global training schedule.
  # Per-GPU batch size; with 8 GPUs the total batch size is 16.
  batch_size: 2  # 8 gpu, total bs=16
  epochs: 12
  # Training dataset: nuScenes annotations plus the ordered preprocessing
  # pipeline applied to every sample (points, 3D labels and multi-view images).
  train_dataset:
    type: NuscenesMMDataset
    ann_file: ./data/nuscenes/nuscenes_infos_train.pkl
    data_root: ./data/nuscenes
    class_names:
      - car
      - truck
      - trailer
      - bus
      - construction_vehicle
      - bicycle
      - motorcycle
      - pedestrian
      - traffic_cone
      - barrier
    modality: multimodal
    # Transforms run in the order listed.
    transforms:
      - type: LoadPointsFromFile
        load_dim: 5
        use_dim: 5
      - type: LoadPointsFromMultiSweeps
        sweeps_num: 10
      - type: LoadAnnotations3D
        with_bbox_3d: true
        with_label_3d: true
      - type: LoadMultiViewImageFromFiles
        project_pts_to_img_depth: true
      # Same range for points and samples; presumably
      # [x_min, y_min, z_min, x_max, y_max, z_max] - TODO confirm.
      - type: PointsRangeFilter
        point_cloud_range: [-50, -50, -5, 50, 50, 3]
      - type: SampleRangeFilter
        point_cloud_range: [-50, -50, -5, 50, 50, 3]
      # Same ten names as class_names above.
      - type: SampleNameFilter
        classes:
          - car
          - truck
          - trailer
          - bus
          - construction_vehicle
          - bicycle
          - motorcycle
          - pedestrian
          - traffic_cone
          - barrier
      - type: PointShuffle
      - type: ResizeImage
        img_scale: [[800, 448]]
        keep_ratio: true
      - type: NormalizeImage
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
        to_rgb: true
      - type: PadImage
        size_divisor: 32
      # Keys kept in the final training sample.
      - type: SampleFilterByKey
        keys: ['img', 'img_depth', 'points', 'gt_bboxes_3d', 'gt_labels_3d']
    mode: train
  # Validation dataset: same source layout as training, but the pipeline loads
  # no annotations and applies no range/name filtering or point shuffling.
  val_dataset:
    type: NuscenesMMDataset
    ann_file: ./data/nuscenes/nuscenes_infos_val.pkl
    data_root: ./data/nuscenes
    class_names:
      - car
      - truck
      - trailer
      - bus
      - construction_vehicle
      - bicycle
      - motorcycle
      - pedestrian
      - traffic_cone
      - barrier
    modality: multimodal
    # Transforms run in the order listed.
    transforms:
      - type: LoadPointsFromFile
        load_dim: 5
        use_dim: 5
      - type: LoadPointsFromMultiSweeps
        sweeps_num: 10
      - type: LoadMultiViewImageFromFiles
      - type: ResizeImage
        img_scale: [[800, 448]]
        keep_ratio: true
      - type: NormalizeImage
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
        to_rgb: true
      - type: PadImage
        size_divisor: 32
      # Only points and images are kept for evaluation samples.
      - type: SampleFilterByKey
        keys: ['points', 'img']
    mode: val
  # Detector definition: LiDAR (pts_*) stream, camera (img_*) stream and the
  # anchor-based 3D detection head.
  # NOTE(review): nesting below was reconstructed from key order and component
  # names; confirm against the model/head constructors before relying on it.
  model:
    type: BEVFFasterRCNN
    se: true
    lc_fusion: true
    camera_stream: true
    lss: false
    grid: 0.5
    num_views: 6
    final_dim: [900, 1600]
    downsample: 8

    # --- LiDAR stream ---
    pts_voxel_layer:
      max_num_points_in_voxel: 64
      point_cloud_range: [-50., -50., -5., 50., 50., 3.]
      # z size 8 equals the z span of point_cloud_range (-5 .. 3), i.e.
      # presumably a single voxel (pillar) along height - TODO confirm.
      voxel_size: [0.25, 0.25, 8.]
      max_num_voxels: [30000, 40000]
    pts_voxel_encoder:
      type: HardVFE
      in_channels: 4
      feat_channels: [64, 64]
      with_distance: false
      voxel_size: [0.25, 0.25, 8]
      with_cluster_center: true
      with_voxel_center: true
      point_cloud_range: [-50, -50, -5, 50, 50, 3]
    pts_middle_encoder:
      type: PointPillarsScatter
      in_channels: 64
      point_cloud_range: [-50, -50, -5, 50, 50, 3]
      voxel_size: [0.25, 0.25, 8]
    pts_backbone:
      type: SecondBackbone
      in_channels: 64
      layer_nums: [3, 5, 5]
      downsample_strides: [2, 2, 2]
      out_channels: [64, 128, 256]
    pts_neck:
      type: SecondFPN
      in_channels: [64, 128, 256]
      upsample_strides: [1, 2, 4]
      out_channels: [128, 128, 128]

    # --- Camera stream ---
    img_backbone:
      type: CBSwinTransformer
      embed_dim: 96
      depths: [2, 2, 6, 2]
      num_heads: [3, 6, 12, 24]
      window_size: 7
      mlp_ratio: 4.0
      qkv_bias: true
      qk_scale: null
      drop_rate: 0.0
      attn_drop_rate: 0.0
      drop_path_rate: 0.2
      ape: false
      patch_norm: true
      out_indices: [0, 1, 2, 3]
    img_neck:
      type: FPNC
      # Same final_dim / downsample values as at the model level above.
      final_dim: [900, 1600]
      downsample: 8
      # 96 * [1, 2, 4, 8], i.e. multiples of img_backbone.embed_dim.
      in_channels: [96, 192, 384, 768]
      out_channels: 256
      outC: 256
      use_adp: true
      num_outs: 5

    # --- Detection head ---
    pts_bbox_head:
      type: Anchor3DHead
      num_classes: 10
      # 384 = 3 x 128 (sum of pts_neck.out_channels); presumably the neck
      # outputs are concatenated - TODO confirm.
      in_channels: 384
      feat_channels: 384
      use_direction_classifier: true
      anchor_generator:
        type: AlignedAnchor3DRangeGenerator
        # One row of `ranges` per row of `sizes` (7 each).
        ranges:
          - [-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795]
          - [-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365]
          - [-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504]
          - [-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111]
          - [-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072]
          - [-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986]
          - [-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965]
        sizes:
          - [1.95017717, 4.60718145, 1.72270761]
          - [2.4560939, 6.73778078, 2.73004906]
          - [2.87427237, 12.01320693, 3.81509561]
          - [0.60058911, 1.68452161, 1.27192197]
          - [0.66344886, 0.7256437, 1.75748069]
          - [0.39694519, 0.40359262, 1.06232151]
          - [2.49008838, 0.48578221, 0.98297065]
        custom_values: [0, 0]
        rotations: [0, 1.57]
        reshape_out: true
      assigner_per_size: false
      diff_rad_by_sin: true
      dir_offset: 0.7854  # pi/4
      dir_limit_offset: 0
      bbox_coder:
        type: DeltaXYZWLHRBBoxCoder
        # Matches the 9 entries of train_cfg.code_weight below.
        code_size: 9
      loss_cls:
        type: WeightedFocalLoss
        use_sigmoid: true
        gamma: 2.0
        alpha: 0.25
        loss_weight: 1.0
      loss_bbox:
        type: SmoothL1Loss
        beta: 0.1111111111111111  # 1/9
        loss_weight: 1.0
      loss_dir:
        type: CrossEntropyLoss
        use_sigmoid: false
        loss_weight: 0.2
      use_sigmoid_cls: true
      # NOTE(review): train_cfg / test_cfg are placed under pts_bbox_head
      # because they directly follow its keys and carry no `pts:` sub-level;
      # confirm they are not meant to be model-level keys.
      train_cfg:
        code_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
        pos_weight: -1
      test_cfg:
        use_rotate_nms: true
        nms_across_levels: false
        nms_pre: 1000
        nms_thr: 0.2
        score_thr: 0.05
        min_bbox_size: 0
        max_num: 500
  # Optimizer: AdamW with gradient clipping by global norm.
  optimizer:
    type: AdamW
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.05
    grad_clip:
      type: ClipGradByGlobalNorm
      clip_norm: 35
  # Learning-rate schedule: linear warmup from start_lr to end_lr over
  # warmup_steps, wrapping a multi-step decay schedule.
  lr_scheduler:
    type: LinearWarmup
    # Wrapped schedule (the inner `learning_rate` is its base rate).
    learning_rate:
      type: MultiStepDecay
      # 6032 = 879*8 - 1000 and 8669 = 879*11 - 1000; presumably 879
      # iterations per epoch (epochs 8 and 11) with the 1000 warmup steps
      # subtracted - TODO confirm.
      milestones: [6032, 8669]
      learning_rate: 0.001
    warmup_steps: 1000
    start_lr: 1.0e-6
    # Equals the base learning_rate of the wrapped schedule.
    end_lr: 0.001