Zizizi-hao committed on
Commit
39bf596
·
verified ·
1 Parent(s): 38b9ba0

UniAD V2.0 training config file

Browse files
config/base_bevformer.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
2
+ class_names = [
3
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
4
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
5
+ ]
6
+ dataset_type = 'CustomNuScenesDataset'
7
+ data_root = 'data/nuscenes/'
8
+ input_modality = dict(
9
+ use_lidar=False,
10
+ use_camera=True,
11
+ use_radar=False,
12
+ use_map=False,
13
+ use_external=True)
14
+ file_client_args = dict(backend='disk')
15
+ train_pipeline = [
16
+ dict(
17
+ type='LoadMultiViewImageFromFilesInCeph',
18
+ to_float32=True,
19
+ file_client_args=dict(backend='disk'),
20
+ img_root=''),
21
+ dict(type='PhotoMetricDistortionMultiViewImage'),
22
+ dict(
23
+ type='LoadAnnotations3D',
24
+ with_bbox_3d=True,
25
+ with_label_3d=True,
26
+ with_attr_label=False),
27
+ dict(
28
+ type='ObjectRangeFilter',
29
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
30
+ dict(
31
+ type='ObjectNameFilter',
32
+ classes=[
33
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
34
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
35
+ ]),
36
+ dict(
37
+ type='NormalizeMultiviewImage',
38
+ mean=[103.53, 116.28, 123.675],
39
+ std=[1.0, 1.0, 1.0],
40
+ to_rgb=False),
41
+ dict(type='PadMultiViewImage', size_divisor=32),
42
+ dict(
43
+ type='DefaultFormatBundle3D',
44
+ class_names=[
45
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
46
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
47
+ ]),
48
+ dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
49
+ ]
50
+ test_pipeline = [
51
+ dict(
52
+ type='LoadMultiViewImageFromFilesInCeph',
53
+ to_float32=True,
54
+ file_client_args=dict(backend='disk'),
55
+ img_root=''),
56
+ dict(
57
+ type='NormalizeMultiviewImage',
58
+ mean=[103.53, 116.28, 123.675],
59
+ std=[1.0, 1.0, 1.0],
60
+ to_rgb=False),
61
+ dict(type='PadMultiViewImage', size_divisor=32),
62
+ dict(
63
+ type='MultiScaleFlipAug3D',
64
+ img_scale=(1600, 900),
65
+ pts_scale_ratio=1,
66
+ flip=False,
67
+ transforms=[
68
+ dict(
69
+ type='DefaultFormatBundle3D',
70
+ class_names=[
71
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
72
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
73
+ 'traffic_cone'
74
+ ],
75
+ with_label=False),
76
+ dict(type='CustomCollect3D', keys=['img'])
77
+ ])
78
+ ]
79
+ eval_pipeline = [
80
+ dict(
81
+ type='LoadPointsFromFile',
82
+ coord_type='LIDAR',
83
+ load_dim=5,
84
+ use_dim=5,
85
+ file_client_args=dict(backend='disk')),
86
+ dict(
87
+ type='LoadPointsFromMultiSweeps',
88
+ sweeps_num=10,
89
+ file_client_args=dict(backend='disk')),
90
+ dict(
91
+ type='DefaultFormatBundle3D',
92
+ class_names=[
93
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
94
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
95
+ ],
96
+ with_label=False),
97
+ dict(type='Collect3D', keys=['points'])
98
+ ]
99
+ data = dict(
100
+ samples_per_gpu=1,
101
+ workers_per_gpu=4,
102
+ train=dict(
103
+ type='CustomNuScenesDataset',
104
+ data_root='data/nuscenes/',
105
+ ann_file='data/infos/nuscenes_infos_temporal_train.pkl',
106
+ pipeline=[
107
+ dict(
108
+ type='LoadMultiViewImageFromFilesInCeph',
109
+ to_float32=True,
110
+ file_client_args=dict(backend='disk'),
111
+ img_root=''),
112
+ dict(type='PhotoMetricDistortionMultiViewImage'),
113
+ dict(
114
+ type='LoadAnnotations3D',
115
+ with_bbox_3d=True,
116
+ with_label_3d=True,
117
+ with_attr_label=False),
118
+ dict(
119
+ type='ObjectRangeFilter',
120
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
121
+ dict(
122
+ type='ObjectNameFilter',
123
+ classes=[
124
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
125
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
126
+ 'traffic_cone'
127
+ ]),
128
+ dict(
129
+ type='NormalizeMultiviewImage',
130
+ mean=[103.53, 116.28, 123.675],
131
+ std=[1.0, 1.0, 1.0],
132
+ to_rgb=False),
133
+ dict(type='PadMultiViewImage', size_divisor=32),
134
+ dict(
135
+ type='DefaultFormatBundle3D',
136
+ class_names=[
137
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
138
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
139
+ 'traffic_cone'
140
+ ]),
141
+ dict(
142
+ type='CustomCollect3D',
143
+ keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
144
+ ],
145
+ classes=[
146
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
147
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
148
+ ],
149
+ modality=dict(
150
+ use_lidar=False,
151
+ use_camera=True,
152
+ use_radar=False,
153
+ use_map=False,
154
+ use_external=True),
155
+ test_mode=False,
156
+ box_type_3d='LiDAR',
157
+ use_valid_flag=True,
158
+ bev_size=(200, 200),
159
+ queue_length=4),
160
+ val=dict(
161
+ type='CustomNuScenesDataset',
162
+ data_root='data/nuscenes/',
163
+ ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
164
+ pipeline=[
165
+ dict(
166
+ type='LoadMultiViewImageFromFilesInCeph',
167
+ to_float32=True,
168
+ file_client_args=dict(backend='disk'),
169
+ img_root=''),
170
+ dict(
171
+ type='NormalizeMultiviewImage',
172
+ mean=[103.53, 116.28, 123.675],
173
+ std=[1.0, 1.0, 1.0],
174
+ to_rgb=False),
175
+ dict(type='PadMultiViewImage', size_divisor=32),
176
+ dict(
177
+ type='MultiScaleFlipAug3D',
178
+ img_scale=(1600, 900),
179
+ pts_scale_ratio=1,
180
+ flip=False,
181
+ transforms=[
182
+ dict(
183
+ type='DefaultFormatBundle3D',
184
+ class_names=[
185
+ 'car', 'truck', 'construction_vehicle', 'bus',
186
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
187
+ 'pedestrian', 'traffic_cone'
188
+ ],
189
+ with_label=False),
190
+ dict(type='CustomCollect3D', keys=['img'])
191
+ ])
192
+ ],
193
+ classes=[
194
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
195
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
196
+ ],
197
+ modality=dict(
198
+ use_lidar=False,
199
+ use_camera=True,
200
+ use_radar=False,
201
+ use_map=False,
202
+ use_external=True),
203
+ test_mode=True,
204
+ box_type_3d='LiDAR',
205
+ bev_size=(200, 200),
206
+ samples_per_gpu=1),
207
+ test=dict(
208
+ type='CustomNuScenesDataset',
209
+ data_root='data/nuscenes/',
210
+ ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
211
+ pipeline=[
212
+ dict(
213
+ type='LoadMultiViewImageFromFilesInCeph',
214
+ to_float32=True,
215
+ file_client_args=dict(backend='disk'),
216
+ img_root=''),
217
+ dict(
218
+ type='NormalizeMultiviewImage',
219
+ mean=[103.53, 116.28, 123.675],
220
+ std=[1.0, 1.0, 1.0],
221
+ to_rgb=False),
222
+ dict(type='PadMultiViewImage', size_divisor=32),
223
+ dict(
224
+ type='MultiScaleFlipAug3D',
225
+ img_scale=(1600, 900),
226
+ pts_scale_ratio=1,
227
+ flip=False,
228
+ transforms=[
229
+ dict(
230
+ type='DefaultFormatBundle3D',
231
+ class_names=[
232
+ 'car', 'truck', 'construction_vehicle', 'bus',
233
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
234
+ 'pedestrian', 'traffic_cone'
235
+ ],
236
+ with_label=False),
237
+ dict(type='CustomCollect3D', keys=['img'])
238
+ ])
239
+ ],
240
+ classes=[
241
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
242
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
243
+ ],
244
+ modality=dict(
245
+ use_lidar=False,
246
+ use_camera=True,
247
+ use_radar=False,
248
+ use_map=False,
249
+ use_external=True),
250
+ test_mode=True,
251
+ box_type_3d='LiDAR',
252
+ bev_size=(200, 200)),
253
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
254
+ nonshuffler_sampler=dict(type='DistributedSampler'))
255
+ evaluation = dict(
256
+ interval=6,
257
+ pipeline=[
258
+ dict(
259
+ type='LoadMultiViewImageFromFilesInCeph',
260
+ to_float32=True,
261
+ file_client_args=dict(backend='disk'),
262
+ img_root=''),
263
+ dict(
264
+ type='NormalizeMultiviewImage',
265
+ mean=[103.53, 116.28, 123.675],
266
+ std=[1.0, 1.0, 1.0],
267
+ to_rgb=False),
268
+ dict(type='PadMultiViewImage', size_divisor=32),
269
+ dict(
270
+ type='MultiScaleFlipAug3D',
271
+ img_scale=(1600, 900),
272
+ pts_scale_ratio=1,
273
+ flip=False,
274
+ transforms=[
275
+ dict(
276
+ type='DefaultFormatBundle3D',
277
+ class_names=[
278
+ 'car', 'truck', 'construction_vehicle', 'bus',
279
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
280
+ 'pedestrian', 'traffic_cone'
281
+ ],
282
+ with_label=False),
283
+ dict(type='CustomCollect3D', keys=['img'])
284
+ ])
285
+ ])
286
+ checkpoint_config = dict(interval=1)
287
+ log_config = dict(
288
+ interval=50,
289
+ hooks=[dict(type='TextLoggerHook'),
290
+ dict(type='TensorboardLoggerHook')])
291
+ dist_params = dict(backend='nccl')
292
+ log_level = 'INFO'
293
+ work_dir = 'projects/work_dirs/bevformer/base_bevformer/'
294
+ load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
295
+ resume_from = None
296
+ workflow = [('train', 1)]
297
+ plugin = True
298
+ plugin_dir = 'projects/mmdet3d_plugin/'
299
+ voxel_size = [0.2, 0.2, 8]
300
+ img_norm_cfg = dict(
301
+ mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
302
+ _dim_ = 256
303
+ _pos_dim_ = 128
304
+ _ffn_dim_ = 512
305
+ _num_levels_ = 4
306
+ bev_h_ = 200
307
+ bev_w_ = 200
308
+ queue_length = 4
309
+ model = dict(
310
+ type='BEVFormer',
311
+ use_grid_mask=True,
312
+ video_test_mode=True,
313
+ img_backbone=dict(
314
+ type='ResNet',
315
+ depth=101,
316
+ num_stages=4,
317
+ out_indices=(1, 2, 3),
318
+ frozen_stages=1,
319
+ norm_cfg=dict(type='BN2d', requires_grad=False),
320
+ norm_eval=True,
321
+ style='caffe',
322
+ dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
323
+ stage_with_dcn=(False, False, True, True)),
324
+ img_neck=dict(
325
+ type='FPN',
326
+ in_channels=[512, 1024, 2048],
327
+ out_channels=256,
328
+ start_level=0,
329
+ add_extra_convs='on_output',
330
+ num_outs=4,
331
+ relu_before_extra_convs=True),
332
+ pts_bbox_head=dict(
333
+ type='BEVFormerHead',
334
+ bev_h=200,
335
+ bev_w=200,
336
+ num_query=900,
337
+ num_classes=10,
338
+ in_channels=256,
339
+ sync_cls_avg_factor=True,
340
+ with_box_refine=True,
341
+ as_two_stage=False,
342
+ transformer=dict(
343
+ type='PerceptionTransformer',
344
+ rotate_prev_bev=True,
345
+ use_shift=True,
346
+ use_can_bus=True,
347
+ embed_dims=256,
348
+ encoder=dict(
349
+ type='BEVFormerEncoder',
350
+ num_layers=6,
351
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
352
+ num_points_in_pillar=4,
353
+ return_intermediate=False,
354
+ transformerlayers=dict(
355
+ type='BEVFormerLayer',
356
+ attn_cfgs=[
357
+ dict(
358
+ type='TemporalSelfAttention',
359
+ embed_dims=256,
360
+ num_levels=1),
361
+ dict(
362
+ type='SpatialCrossAttention',
363
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
364
+ deformable_attention=dict(
365
+ type='MSDeformableAttention3D',
366
+ embed_dims=256,
367
+ num_points=8,
368
+ num_levels=4),
369
+ embed_dims=256)
370
+ ],
371
+ feedforward_channels=512,
372
+ ffn_dropout=0.1,
373
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
374
+ 'ffn', 'norm'))),
375
+ decoder=dict(
376
+ type='DetectionTransformerDecoder',
377
+ num_layers=6,
378
+ return_intermediate=True,
379
+ transformerlayers=dict(
380
+ type='DetrTransformerDecoderLayer',
381
+ attn_cfgs=[
382
+ dict(
383
+ type='MultiheadAttention',
384
+ embed_dims=256,
385
+ num_heads=8,
386
+ dropout=0.1),
387
+ dict(
388
+ type='CustomMSDeformableAttention',
389
+ embed_dims=256,
390
+ num_levels=1)
391
+ ],
392
+ feedforward_channels=512,
393
+ ffn_dropout=0.1,
394
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
395
+ 'ffn', 'norm')))),
396
+ bbox_coder=dict(
397
+ type='NMSFreeCoder',
398
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
399
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
400
+ max_num=300,
401
+ voxel_size=[0.2, 0.2, 8],
402
+ num_classes=10),
403
+ positional_encoding=dict(
404
+ type='LearnedPositionalEncoding',
405
+ num_feats=128,
406
+ row_num_embed=200,
407
+ col_num_embed=200),
408
+ loss_cls=dict(
409
+ type='FocalLoss',
410
+ use_sigmoid=True,
411
+ gamma=2.0,
412
+ alpha=0.25,
413
+ loss_weight=2.0),
414
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
415
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
416
+ train_cfg=dict(
417
+ pts=dict(
418
+ grid_size=[512, 512, 1],
419
+ voxel_size=[0.2, 0.2, 8],
420
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
421
+ out_size_factor=4,
422
+ assigner=dict(
423
+ type='HungarianAssigner3D',
424
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
425
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
426
+ iou_cost=dict(type='IoUCost', weight=0.0),
427
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))))
428
+ info_root = 'data/infos/'
429
+ ann_file_train = 'data/infos/nuscenes_infos_temporal_train.pkl'
430
+ ann_file_val = 'data/infos/nuscenes_infos_temporal_val.pkl'
431
+ ann_file_test = 'data/infos/nuscenes_infos_temporal_val.pkl'
432
+ optimizer = dict(
433
+ type='AdamW',
434
+ lr=0.0002,
435
+ paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.1))),
436
+ weight_decay=0.01)
437
+ optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
438
+ lr_config = dict(
439
+ policy='CosineAnnealing',
440
+ warmup='linear',
441
+ warmup_iters=500,
442
+ warmup_ratio=0.3333333333333333,
443
+ min_lr_ratio=0.001)
444
+ total_epochs = 24
445
+ runner = dict(type='EpochBasedRunner', max_epochs=24)
446
+ logger_name = 'mmdet'
447
+ gpu_ids = range(0, 1)
config/base_e2e.py ADDED
@@ -0,0 +1,897 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
2
+ class_names = [
3
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
4
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
5
+ ]
6
+ dataset_type = 'NuScenesE2EDataset'
7
+ data_root = 'data/nuscenes/'
8
+ input_modality = dict(
9
+ use_lidar=False,
10
+ use_camera=True,
11
+ use_radar=False,
12
+ use_map=False,
13
+ use_external=True)
14
+ file_client_args = dict(backend='disk')
15
+ train_pipeline = [
16
+ dict(
17
+ type='LoadMultiViewImageFromFilesInCeph',
18
+ to_float32=True,
19
+ file_client_args=dict(backend='disk'),
20
+ img_root=''),
21
+ dict(type='PhotoMetricDistortionMultiViewImage'),
22
+ dict(
23
+ type='LoadAnnotations3D_E2E',
24
+ with_bbox_3d=True,
25
+ with_label_3d=True,
26
+ with_attr_label=False,
27
+ with_future_anns=True,
28
+ with_ins_inds_3d=True,
29
+ ins_inds_add_1=True),
30
+ dict(
31
+ type='GenerateOccFlowLabels',
32
+ grid_conf=dict(
33
+ xbound=[-50.0, 50.0, 0.5],
34
+ ybound=[-50.0, 50.0, 0.5],
35
+ zbound=[-10.0, 10.0, 20.0]),
36
+ ignore_index=255,
37
+ only_vehicle=True,
38
+ filter_invisible=False),
39
+ dict(
40
+ type='ObjectRangeFilterTrack',
41
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
42
+ dict(
43
+ type='ObjectNameFilterTrack',
44
+ classes=[
45
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
46
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
47
+ ]),
48
+ dict(
49
+ type='NormalizeMultiviewImage',
50
+ mean=[103.53, 116.28, 123.675],
51
+ std=[1.0, 1.0, 1.0],
52
+ to_rgb=False),
53
+ dict(type='PadMultiViewImage', size_divisor=32),
54
+ dict(
55
+ type='DefaultFormatBundle3D',
56
+ class_names=[
57
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
58
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
59
+ ]),
60
+ dict(
61
+ type='CustomCollect3D',
62
+ keys=[
63
+ 'gt_bboxes_3d', 'gt_labels_3d', 'gt_inds', 'img', 'timestamp',
64
+ 'l2g_r_mat', 'l2g_t', 'gt_fut_traj', 'gt_fut_traj_mask',
65
+ 'gt_past_traj', 'gt_past_traj_mask', 'gt_sdc_bbox', 'gt_sdc_label',
66
+ 'gt_sdc_fut_traj', 'gt_sdc_fut_traj_mask', 'gt_lane_labels',
67
+ 'gt_lane_bboxes', 'gt_lane_masks', 'gt_segmentation',
68
+ 'gt_instance', 'gt_centerness', 'gt_offset', 'gt_flow',
69
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
70
+ 'gt_occ_img_is_valid', 'gt_future_boxes', 'gt_future_labels',
71
+ 'sdc_planning', 'sdc_planning_mask', 'command'
72
+ ])
73
+ ]
74
+ test_pipeline = [
75
+ dict(
76
+ type='LoadMultiViewImageFromFilesInCeph',
77
+ to_float32=True,
78
+ file_client_args=dict(backend='disk'),
79
+ img_root=''),
80
+ dict(
81
+ type='NormalizeMultiviewImage',
82
+ mean=[103.53, 116.28, 123.675],
83
+ std=[1.0, 1.0, 1.0],
84
+ to_rgb=False),
85
+ dict(type='PadMultiViewImage', size_divisor=32),
86
+ dict(
87
+ type='LoadAnnotations3D_E2E',
88
+ with_bbox_3d=False,
89
+ with_label_3d=False,
90
+ with_attr_label=False,
91
+ with_future_anns=True,
92
+ with_ins_inds_3d=False,
93
+ ins_inds_add_1=True),
94
+ dict(
95
+ type='GenerateOccFlowLabels',
96
+ grid_conf=dict(
97
+ xbound=[-50.0, 50.0, 0.5],
98
+ ybound=[-50.0, 50.0, 0.5],
99
+ zbound=[-10.0, 10.0, 20.0]),
100
+ ignore_index=255,
101
+ only_vehicle=True,
102
+ filter_invisible=False),
103
+ dict(
104
+ type='MultiScaleFlipAug3D',
105
+ img_scale=(1600, 900),
106
+ pts_scale_ratio=1,
107
+ flip=False,
108
+ transforms=[
109
+ dict(
110
+ type='DefaultFormatBundle3D',
111
+ class_names=[
112
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
113
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
114
+ 'traffic_cone'
115
+ ],
116
+ with_label=False),
117
+ dict(
118
+ type='CustomCollect3D',
119
+ keys=[
120
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t', 'gt_lane_labels',
121
+ 'gt_lane_bboxes', 'gt_lane_masks', 'gt_segmentation',
122
+ 'gt_instance', 'gt_centerness', 'gt_offset', 'gt_flow',
123
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
124
+ 'gt_occ_img_is_valid', 'sdc_planning', 'sdc_planning_mask',
125
+ 'command'
126
+ ])
127
+ ])
128
+ ]
129
+ eval_pipeline = [
130
+ dict(
131
+ type='LoadPointsFromFile',
132
+ coord_type='LIDAR',
133
+ load_dim=5,
134
+ use_dim=5,
135
+ file_client_args=dict(backend='disk')),
136
+ dict(
137
+ type='LoadPointsFromMultiSweeps',
138
+ sweeps_num=10,
139
+ file_client_args=dict(backend='disk')),
140
+ dict(
141
+ type='DefaultFormatBundle3D',
142
+ class_names=[
143
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
144
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
145
+ ],
146
+ with_label=False),
147
+ dict(type='Collect3D', keys=['points'])
148
+ ]
149
+ data = dict(
150
+ samples_per_gpu=1,
151
+ workers_per_gpu=8,
152
+ train=dict(
153
+ type='NuScenesE2EDataset',
154
+ data_root='data/nuscenes/',
155
+ ann_file='data/infos/nuscenes_infos_temporal_train.pkl',
156
+ pipeline=[
157
+ dict(
158
+ type='LoadMultiViewImageFromFilesInCeph',
159
+ to_float32=True,
160
+ file_client_args=dict(backend='disk'),
161
+ img_root=''),
162
+ dict(type='PhotoMetricDistortionMultiViewImage'),
163
+ dict(
164
+ type='LoadAnnotations3D_E2E',
165
+ with_bbox_3d=True,
166
+ with_label_3d=True,
167
+ with_attr_label=False,
168
+ with_future_anns=True,
169
+ with_ins_inds_3d=True,
170
+ ins_inds_add_1=True),
171
+ dict(
172
+ type='GenerateOccFlowLabels',
173
+ grid_conf=dict(
174
+ xbound=[-50.0, 50.0, 0.5],
175
+ ybound=[-50.0, 50.0, 0.5],
176
+ zbound=[-10.0, 10.0, 20.0]),
177
+ ignore_index=255,
178
+ only_vehicle=True,
179
+ filter_invisible=False),
180
+ dict(
181
+ type='ObjectRangeFilterTrack',
182
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
183
+ dict(
184
+ type='ObjectNameFilterTrack',
185
+ classes=[
186
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
187
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
188
+ 'traffic_cone'
189
+ ]),
190
+ dict(
191
+ type='NormalizeMultiviewImage',
192
+ mean=[103.53, 116.28, 123.675],
193
+ std=[1.0, 1.0, 1.0],
194
+ to_rgb=False),
195
+ dict(type='PadMultiViewImage', size_divisor=32),
196
+ dict(
197
+ type='DefaultFormatBundle3D',
198
+ class_names=[
199
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
200
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
201
+ 'traffic_cone'
202
+ ]),
203
+ dict(
204
+ type='CustomCollect3D',
205
+ keys=[
206
+ 'gt_bboxes_3d', 'gt_labels_3d', 'gt_inds', 'img',
207
+ 'timestamp', 'l2g_r_mat', 'l2g_t', 'gt_fut_traj',
208
+ 'gt_fut_traj_mask', 'gt_past_traj', 'gt_past_traj_mask',
209
+ 'gt_sdc_bbox', 'gt_sdc_label', 'gt_sdc_fut_traj',
210
+ 'gt_sdc_fut_traj_mask', 'gt_lane_labels', 'gt_lane_bboxes',
211
+ 'gt_lane_masks', 'gt_segmentation', 'gt_instance',
212
+ 'gt_centerness', 'gt_offset', 'gt_flow',
213
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
214
+ 'gt_occ_img_is_valid', 'gt_future_boxes',
215
+ 'gt_future_labels', 'sdc_planning', 'sdc_planning_mask',
216
+ 'command'
217
+ ])
218
+ ],
219
+ classes=[
220
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
221
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
222
+ ],
223
+ modality=dict(
224
+ use_lidar=False,
225
+ use_camera=True,
226
+ use_radar=False,
227
+ use_map=False,
228
+ use_external=True),
229
+ test_mode=False,
230
+ box_type_3d='LiDAR',
231
+ file_client_args=dict(backend='disk'),
232
+ use_valid_flag=True,
233
+ patch_size=[102.4, 102.4],
234
+ canvas_size=(200, 200),
235
+ bev_size=(200, 200),
236
+ queue_length=3,
237
+ predict_steps=12,
238
+ past_steps=4,
239
+ fut_steps=4,
240
+ use_nonlinear_optimizer=True,
241
+ occ_receptive_field=3,
242
+ occ_n_future=6,
243
+ occ_filter_invalid_sample=False),
244
+ val=dict(
245
+ type='NuScenesE2EDataset',
246
+ data_root='data/nuscenes/',
247
+ ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
248
+ pipeline=[
249
+ dict(
250
+ type='LoadMultiViewImageFromFilesInCeph',
251
+ to_float32=True,
252
+ file_client_args=dict(backend='disk'),
253
+ img_root=''),
254
+ dict(
255
+ type='NormalizeMultiviewImage',
256
+ mean=[103.53, 116.28, 123.675],
257
+ std=[1.0, 1.0, 1.0],
258
+ to_rgb=False),
259
+ dict(type='PadMultiViewImage', size_divisor=32),
260
+ dict(
261
+ type='LoadAnnotations3D_E2E',
262
+ with_bbox_3d=False,
263
+ with_label_3d=False,
264
+ with_attr_label=False,
265
+ with_future_anns=True,
266
+ with_ins_inds_3d=False,
267
+ ins_inds_add_1=True),
268
+ dict(
269
+ type='GenerateOccFlowLabels',
270
+ grid_conf=dict(
271
+ xbound=[-50.0, 50.0, 0.5],
272
+ ybound=[-50.0, 50.0, 0.5],
273
+ zbound=[-10.0, 10.0, 20.0]),
274
+ ignore_index=255,
275
+ only_vehicle=True,
276
+ filter_invisible=False),
277
+ dict(
278
+ type='MultiScaleFlipAug3D',
279
+ img_scale=(1600, 900),
280
+ pts_scale_ratio=1,
281
+ flip=False,
282
+ transforms=[
283
+ dict(
284
+ type='DefaultFormatBundle3D',
285
+ class_names=[
286
+ 'car', 'truck', 'construction_vehicle', 'bus',
287
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
288
+ 'pedestrian', 'traffic_cone'
289
+ ],
290
+ with_label=False),
291
+ dict(
292
+ type='CustomCollect3D',
293
+ keys=[
294
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
295
+ 'gt_lane_labels', 'gt_lane_bboxes',
296
+ 'gt_lane_masks', 'gt_segmentation', 'gt_instance',
297
+ 'gt_centerness', 'gt_offset', 'gt_flow',
298
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
299
+ 'gt_occ_img_is_valid', 'sdc_planning',
300
+ 'sdc_planning_mask', 'command'
301
+ ])
302
+ ])
303
+ ],
304
+ classes=[
305
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
306
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
307
+ ],
308
+ modality=dict(
309
+ use_lidar=False,
310
+ use_camera=True,
311
+ use_radar=False,
312
+ use_map=False,
313
+ use_external=True),
314
+ test_mode=True,
315
+ box_type_3d='LiDAR',
316
+ file_client_args=dict(backend='disk'),
317
+ patch_size=[102.4, 102.4],
318
+ canvas_size=(200, 200),
319
+ bev_size=(200, 200),
320
+ predict_steps=12,
321
+ past_steps=4,
322
+ fut_steps=4,
323
+ use_nonlinear_optimizer=True,
324
+ samples_per_gpu=1,
325
+ eval_mod=['det', 'map', 'track', 'motion'],
326
+ occ_receptive_field=3,
327
+ occ_n_future=6,
328
+ occ_filter_invalid_sample=False),
329
+ test=dict(
330
+ type='NuScenesE2EDataset',
331
+ data_root='data/nuscenes/',
332
+ ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
333
+ pipeline=[
334
+ dict(
335
+ type='LoadMultiViewImageFromFilesInCeph',
336
+ to_float32=True,
337
+ file_client_args=dict(backend='disk'),
338
+ img_root=''),
339
+ dict(
340
+ type='NormalizeMultiviewImage',
341
+ mean=[103.53, 116.28, 123.675],
342
+ std=[1.0, 1.0, 1.0],
343
+ to_rgb=False),
344
+ dict(type='PadMultiViewImage', size_divisor=32),
345
+ dict(
346
+ type='LoadAnnotations3D_E2E',
347
+ with_bbox_3d=False,
348
+ with_label_3d=False,
349
+ with_attr_label=False,
350
+ with_future_anns=True,
351
+ with_ins_inds_3d=False,
352
+ ins_inds_add_1=True),
353
+ dict(
354
+ type='GenerateOccFlowLabels',
355
+ grid_conf=dict(
356
+ xbound=[-50.0, 50.0, 0.5],
357
+ ybound=[-50.0, 50.0, 0.5],
358
+ zbound=[-10.0, 10.0, 20.0]),
359
+ ignore_index=255,
360
+ only_vehicle=True,
361
+ filter_invisible=False),
362
+ dict(
363
+ type='MultiScaleFlipAug3D',
364
+ img_scale=(1600, 900),
365
+ pts_scale_ratio=1,
366
+ flip=False,
367
+ transforms=[
368
+ dict(
369
+ type='DefaultFormatBundle3D',
370
+ class_names=[
371
+ 'car', 'truck', 'construction_vehicle', 'bus',
372
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
373
+ 'pedestrian', 'traffic_cone'
374
+ ],
375
+ with_label=False),
376
+ dict(
377
+ type='CustomCollect3D',
378
+ keys=[
379
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
380
+ 'gt_lane_labels', 'gt_lane_bboxes',
381
+ 'gt_lane_masks', 'gt_segmentation', 'gt_instance',
382
+ 'gt_centerness', 'gt_offset', 'gt_flow',
383
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
384
+ 'gt_occ_img_is_valid', 'sdc_planning',
385
+ 'sdc_planning_mask', 'command'
386
+ ])
387
+ ])
388
+ ],
389
+ classes=[
390
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
391
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
392
+ ],
393
+ modality=dict(
394
+ use_lidar=False,
395
+ use_camera=True,
396
+ use_radar=False,
397
+ use_map=False,
398
+ use_external=True),
399
+ test_mode=True,
400
+ box_type_3d='LiDAR',
401
+ file_client_args=dict(backend='disk'),
402
+ patch_size=[102.4, 102.4],
403
+ canvas_size=(200, 200),
404
+ bev_size=(200, 200),
405
+ predict_steps=12,
406
+ past_steps=4,
407
+ fut_steps=4,
408
+ occ_n_future=6,
409
+ use_nonlinear_optimizer=True,
410
+ eval_mod=['det', 'map', 'track', 'motion']),
411
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
412
+ nonshuffler_sampler=dict(type='DistributedSampler'))
413
+ evaluation = dict(
414
+ interval=20,
415
+ pipeline=[
416
+ dict(
417
+ type='LoadMultiViewImageFromFilesInCeph',
418
+ to_float32=True,
419
+ file_client_args=dict(backend='disk'),
420
+ img_root=''),
421
+ dict(
422
+ type='NormalizeMultiviewImage',
423
+ mean=[103.53, 116.28, 123.675],
424
+ std=[1.0, 1.0, 1.0],
425
+ to_rgb=False),
426
+ dict(type='PadMultiViewImage', size_divisor=32),
427
+ dict(
428
+ type='LoadAnnotations3D_E2E',
429
+ with_bbox_3d=False,
430
+ with_label_3d=False,
431
+ with_attr_label=False,
432
+ with_future_anns=True,
433
+ with_ins_inds_3d=False,
434
+ ins_inds_add_1=True),
435
+ dict(
436
+ type='GenerateOccFlowLabels',
437
+ grid_conf=dict(
438
+ xbound=[-50.0, 50.0, 0.5],
439
+ ybound=[-50.0, 50.0, 0.5],
440
+ zbound=[-10.0, 10.0, 20.0]),
441
+ ignore_index=255,
442
+ only_vehicle=True,
443
+ filter_invisible=False),
444
+ dict(
445
+ type='MultiScaleFlipAug3D',
446
+ img_scale=(1600, 900),
447
+ pts_scale_ratio=1,
448
+ flip=False,
449
+ transforms=[
450
+ dict(
451
+ type='DefaultFormatBundle3D',
452
+ class_names=[
453
+ 'car', 'truck', 'construction_vehicle', 'bus',
454
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
455
+ 'pedestrian', 'traffic_cone'
456
+ ],
457
+ with_label=False),
458
+ dict(
459
+ type='CustomCollect3D',
460
+ keys=[
461
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
462
+ 'gt_lane_labels', 'gt_lane_bboxes', 'gt_lane_masks',
463
+ 'gt_segmentation', 'gt_instance', 'gt_centerness',
464
+ 'gt_offset', 'gt_flow', 'gt_backward_flow',
465
+ 'gt_occ_has_invalid_frame', 'gt_occ_img_is_valid',
466
+ 'sdc_planning', 'sdc_planning_mask', 'command'
467
+ ])
468
+ ])
469
+ ],
470
+ planning_evaluation_strategy='uniad')
471
+ checkpoint_config = dict(interval=4)
472
+ log_config = dict(
473
+ interval=10,
474
+ hooks=[dict(type='TextLoggerHook'),
475
+ dict(type='TensorboardLoggerHook')])
476
+ dist_params = dict(backend='nccl')
477
+ log_level = 'INFO'
478
+ work_dir = 'projects/work_dirs/stage2_e2e/base_e2e/'
479
+ load_from = 'ckpts/uniad_base_track_map.pth'
480
+ resume_from = None
481
+ workflow = [('train', 1)]
482
+ plugin = True
483
+ plugin_dir = 'projects/mmdet3d_plugin/'
484
+ voxel_size = [0.2, 0.2, 8]
485
+ patch_size = [102.4, 102.4]
486
+ img_norm_cfg = dict(
487
+ mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
488
+ vehicle_id_list = [0, 1, 2, 3, 4, 6, 7]
489
+ group_id_list = [[0, 1, 2, 3, 4], [6, 7], [8], [5, 9]]
490
+ _dim_ = 256
491
+ _pos_dim_ = 128
492
+ _ffn_dim_ = 512
493
+ _num_levels_ = 4
494
+ bev_h_ = 200
495
+ bev_w_ = 200
496
+ _feed_dim_ = 512
497
+ _dim_half_ = 128
498
+ canvas_size = (200, 200)
499
+ queue_length = 3
500
+ predict_steps = 12
501
+ predict_modes = 6
502
+ fut_steps = 4
503
+ past_steps = 4
504
+ use_nonlinear_optimizer = True
505
+ occ_n_future = 4
506
+ occ_n_future_plan = 6
507
+ occ_n_future_max = 6
508
+ planning_steps = 6
509
+ use_col_optim = True
510
+ planning_evaluation_strategy = 'uniad'
511
+ occflow_grid_conf = dict(
512
+ xbound=[-50.0, 50.0, 0.5],
513
+ ybound=[-50.0, 50.0, 0.5],
514
+ zbound=[-10.0, 10.0, 20.0])
515
+ train_gt_iou_threshold = 0.3
516
+ model = dict(
517
+ type='UniAD',
518
+ gt_iou_threshold=0.3,
519
+ queue_length=3,
520
+ use_grid_mask=True,
521
+ video_test_mode=True,
522
+ num_query=900,
523
+ num_classes=10,
524
+ vehicle_id_list=[0, 1, 2, 3, 4, 6, 7],
525
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
526
+ img_backbone=dict(
527
+ type='ResNet',
528
+ depth=101,
529
+ num_stages=4,
530
+ out_indices=(1, 2, 3),
531
+ frozen_stages=4,
532
+ norm_cfg=dict(type='BN2d', requires_grad=False),
533
+ norm_eval=True,
534
+ style='caffe',
535
+ dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
536
+ stage_with_dcn=(False, False, True, True)),
537
+ img_neck=dict(
538
+ type='FPN',
539
+ in_channels=[512, 1024, 2048],
540
+ out_channels=256,
541
+ start_level=0,
542
+ add_extra_convs='on_output',
543
+ num_outs=4,
544
+ relu_before_extra_convs=True),
545
+ freeze_img_backbone=True,
546
+ freeze_img_neck=True,
547
+ freeze_bn=True,
548
+ freeze_bev_encoder=True,
549
+ score_thresh=0.4,
550
+ filter_score_thresh=0.35,
551
+ qim_args=dict(
552
+ qim_type='QIMBase',
553
+ merger_dropout=0,
554
+ update_query_pos=True,
555
+ fp_ratio=0.3,
556
+ random_drop=0.1),
557
+ mem_args=dict(
558
+ memory_bank_type='MemoryBank',
559
+ memory_bank_score_thresh=0.0,
560
+ memory_bank_len=4),
561
+ loss_cfg=dict(
562
+ type='ClipMatcher',
563
+ num_classes=10,
564
+ weight_dict=None,
565
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
566
+ assigner=dict(
567
+ type='HungarianAssigner3DTrack',
568
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
569
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
570
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
571
+ loss_cls=dict(
572
+ type='FocalLoss',
573
+ use_sigmoid=True,
574
+ gamma=2.0,
575
+ alpha=0.25,
576
+ loss_weight=2.0),
577
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25)),
578
+ pts_bbox_head=dict(
579
+ type='BEVFormerTrackHead',
580
+ bev_h=200,
581
+ bev_w=200,
582
+ num_query=900,
583
+ num_classes=10,
584
+ in_channels=256,
585
+ sync_cls_avg_factor=True,
586
+ with_box_refine=True,
587
+ as_two_stage=False,
588
+ past_steps=4,
589
+ fut_steps=4,
590
+ transformer=dict(
591
+ type='PerceptionTransformer',
592
+ rotate_prev_bev=True,
593
+ use_shift=True,
594
+ use_can_bus=True,
595
+ embed_dims=256,
596
+ encoder=dict(
597
+ type='BEVFormerEncoder',
598
+ num_layers=6,
599
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
600
+ num_points_in_pillar=4,
601
+ return_intermediate=False,
602
+ transformerlayers=dict(
603
+ type='BEVFormerLayer',
604
+ attn_cfgs=[
605
+ dict(
606
+ type='TemporalSelfAttention',
607
+ embed_dims=256,
608
+ num_levels=1),
609
+ dict(
610
+ type='SpatialCrossAttention',
611
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
612
+ deformable_attention=dict(
613
+ type='MSDeformableAttention3D',
614
+ embed_dims=256,
615
+ num_points=8,
616
+ num_levels=4),
617
+ embed_dims=256)
618
+ ],
619
+ feedforward_channels=512,
620
+ ffn_dropout=0.1,
621
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
622
+ 'ffn', 'norm'))),
623
+ decoder=dict(
624
+ type='DetectionTransformerDecoder',
625
+ num_layers=6,
626
+ return_intermediate=True,
627
+ transformerlayers=dict(
628
+ type='DetrTransformerDecoderLayer',
629
+ attn_cfgs=[
630
+ dict(
631
+ type='MultiheadAttention',
632
+ embed_dims=256,
633
+ num_heads=8,
634
+ dropout=0.1),
635
+ dict(
636
+ type='CustomMSDeformableAttention',
637
+ embed_dims=256,
638
+ num_levels=1)
639
+ ],
640
+ feedforward_channels=512,
641
+ ffn_dropout=0.1,
642
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
643
+ 'ffn', 'norm')))),
644
+ bbox_coder=dict(
645
+ type='NMSFreeCoder',
646
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
647
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
648
+ max_num=300,
649
+ voxel_size=[0.2, 0.2, 8],
650
+ num_classes=10),
651
+ positional_encoding=dict(
652
+ type='LearnedPositionalEncoding',
653
+ num_feats=128,
654
+ row_num_embed=200,
655
+ col_num_embed=200),
656
+ loss_cls=dict(
657
+ type='FocalLoss',
658
+ use_sigmoid=True,
659
+ gamma=2.0,
660
+ alpha=0.25,
661
+ loss_weight=2.0),
662
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
663
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
664
+ seg_head=dict(
665
+ type='PansegformerHead',
666
+ bev_h=200,
667
+ bev_w=200,
668
+ canvas_size=(200, 200),
669
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
670
+ num_query=300,
671
+ num_classes=4,
672
+ num_things_classes=3,
673
+ num_stuff_classes=1,
674
+ in_channels=2048,
675
+ sync_cls_avg_factor=True,
676
+ as_two_stage=False,
677
+ with_box_refine=True,
678
+ transformer=dict(
679
+ type='SegDeformableTransformer',
680
+ encoder=dict(
681
+ type='DetrTransformerEncoder',
682
+ num_layers=6,
683
+ transformerlayers=dict(
684
+ type='BaseTransformerLayer',
685
+ attn_cfgs=dict(
686
+ type='MultiScaleDeformableAttention',
687
+ embed_dims=256,
688
+ num_levels=4),
689
+ feedforward_channels=512,
690
+ ffn_dropout=0.1,
691
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
692
+ decoder=dict(
693
+ type='DeformableDetrTransformerDecoder',
694
+ num_layers=6,
695
+ return_intermediate=True,
696
+ transformerlayers=dict(
697
+ type='DetrTransformerDecoderLayer',
698
+ attn_cfgs=[
699
+ dict(
700
+ type='MultiheadAttention',
701
+ embed_dims=256,
702
+ num_heads=8,
703
+ dropout=0.1),
704
+ dict(
705
+ type='MultiScaleDeformableAttention',
706
+ embed_dims=256,
707
+ num_levels=4)
708
+ ],
709
+ feedforward_channels=512,
710
+ ffn_dropout=0.1,
711
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
712
+ 'ffn', 'norm')))),
713
+ positional_encoding=dict(
714
+ type='SinePositionalEncoding',
715
+ num_feats=128,
716
+ normalize=True,
717
+ offset=-0.5),
718
+ loss_cls=dict(
719
+ type='FocalLoss',
720
+ use_sigmoid=True,
721
+ gamma=2.0,
722
+ alpha=0.25,
723
+ loss_weight=2.0),
724
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
725
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
726
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
727
+ thing_transformer_head=dict(
728
+ type='SegMaskHead', d_model=256, nhead=8, num_decoder_layers=4),
729
+ stuff_transformer_head=dict(
730
+ type='SegMaskHead',
731
+ d_model=256,
732
+ nhead=8,
733
+ num_decoder_layers=6,
734
+ self_attn=True),
735
+ train_cfg=dict(
736
+ assigner=dict(
737
+ type='HungarianAssigner',
738
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
739
+ reg_cost=dict(
740
+ type='BBoxL1Cost', weight=5.0, box_format='xywh'),
741
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
742
+ assigner_with_mask=dict(
743
+ type='HungarianAssigner_multi_info',
744
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
745
+ reg_cost=dict(
746
+ type='BBoxL1Cost', weight=5.0, box_format='xywh'),
747
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
748
+ mask_cost=dict(type='DiceCost', weight=2.0)),
749
+ sampler=dict(type='PseudoSampler'),
750
+ sampler_with_mask=dict(type='PseudoSampler_segformer'))),
751
+ occ_head=dict(
752
+ type='OccHead',
753
+ grid_conf=dict(
754
+ xbound=[-50.0, 50.0, 0.5],
755
+ ybound=[-50.0, 50.0, 0.5],
756
+ zbound=[-10.0, 10.0, 20.0]),
757
+ ignore_index=255,
758
+ bev_proj_dim=256,
759
+ bev_proj_nlayers=4,
760
+ attn_mask_thresh=0.3,
761
+ transformer_decoder=dict(
762
+ type='DetrTransformerDecoder',
763
+ return_intermediate=True,
764
+ num_layers=5,
765
+ transformerlayers=dict(
766
+ type='DetrTransformerDecoderLayer',
767
+ attn_cfgs=dict(
768
+ type='MultiheadAttention',
769
+ embed_dims=256,
770
+ num_heads=8,
771
+ attn_drop=0.0,
772
+ proj_drop=0.0,
773
+ dropout_layer=None,
774
+ batch_first=False),
775
+ ffn_cfgs=dict(
776
+ embed_dims=256,
777
+ feedforward_channels=2048,
778
+ num_fcs=2,
779
+ act_cfg=dict(type='ReLU', inplace=True),
780
+ ffn_drop=0.0,
781
+ dropout_layer=None,
782
+ add_identity=True),
783
+ feedforward_channels=2048,
784
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
785
+ 'ffn', 'norm')),
786
+ init_cfg=None),
787
+ query_dim=256,
788
+ query_mlp_layers=3,
789
+ aux_loss_weight=1.0,
790
+ loss_mask=dict(
791
+ type='FieryBinarySegmentationLoss',
792
+ use_top_k=True,
793
+ top_k_ratio=0.25,
794
+ future_discount=0.95,
795
+ loss_weight=5.0,
796
+ ignore_index=255),
797
+ loss_dice=dict(
798
+ type='DiceLossWithMasks',
799
+ use_sigmoid=True,
800
+ activate=True,
801
+ reduction='mean',
802
+ naive_dice=True,
803
+ eps=1.0,
804
+ ignore_index=255,
805
+ loss_weight=1.0),
806
+ pan_eval=True,
807
+ test_seg_thresh=0.1,
808
+ test_with_track_score=True),
809
+ motion_head=dict(
810
+ type='MotionHead',
811
+ bev_h=200,
812
+ bev_w=200,
813
+ num_query=300,
814
+ num_classes=10,
815
+ predict_steps=12,
816
+ predict_modes=6,
817
+ embed_dims=256,
818
+ loss_traj=dict(
819
+ type='TrajLoss',
820
+ use_variance=True,
821
+ cls_loss_weight=0.5,
822
+ nll_loss_weight=0.5,
823
+ loss_weight_minade=0.0,
824
+ loss_weight_minfde=0.25),
825
+ num_cls_fcs=3,
826
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
827
+ group_id_list=[[0, 1, 2, 3, 4], [6, 7], [8], [5, 9]],
828
+ num_anchor=6,
829
+ use_nonlinear_optimizer=True,
830
+ anchor_info_path='data/others/motion_anchor_infos_mode6_new.pkl',
831
+ transformerlayers=dict(
832
+ type='MotionTransformerDecoder',
833
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
834
+ embed_dims=256,
835
+ num_layers=3,
836
+ transformerlayers=dict(
837
+ type='MotionTransformerAttentionLayer',
838
+ batch_first=True,
839
+ attn_cfgs=[
840
+ dict(
841
+ type='MotionDeformableAttention',
842
+ num_steps=12,
843
+ embed_dims=256,
844
+ num_levels=1,
845
+ num_heads=8,
846
+ num_points=4,
847
+ sample_index=-1)
848
+ ],
849
+ feedforward_channels=512,
850
+ ffn_dropout=0.1,
851
+ operation_order=('cross_attn', 'norm', 'ffn', 'norm')))),
852
+ planning_head=dict(
853
+ type='PlanningHeadSingleMode',
854
+ embed_dims=256,
855
+ planning_steps=6,
856
+ loss_planning=dict(type='PlanningLoss'),
857
+ loss_collision=[
858
+ dict(type='CollisionLoss', delta=0.0, weight=2.5),
859
+ dict(type='CollisionLoss', delta=0.5, weight=1.0),
860
+ dict(type='CollisionLoss', delta=1.0, weight=0.25)
861
+ ],
862
+ use_col_optim=True,
863
+ planning_eval=True,
864
+ with_adapter=True),
865
+ train_cfg=dict(
866
+ pts=dict(
867
+ grid_size=[512, 512, 1],
868
+ voxel_size=[0.2, 0.2, 8],
869
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
870
+ out_size_factor=4,
871
+ assigner=dict(
872
+ type='HungarianAssigner3D',
873
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
874
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
875
+ iou_cost=dict(type='IoUCost', weight=0.0),
876
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))))
877
+ info_root = 'data/infos/'
878
+ ann_file_train = 'data/infos/nuscenes_infos_temporal_train.pkl'
879
+ ann_file_val = 'data/infos/nuscenes_infos_temporal_val.pkl'
880
+ ann_file_test = 'data/infos/nuscenes_infos_temporal_val.pkl'
881
+ optimizer = dict(
882
+ type='AdamW',
883
+ lr=0.0002,
884
+ paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.1))),
885
+ weight_decay=0.01)
886
+ optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
887
+ lr_config = dict(
888
+ policy='CosineAnnealing',
889
+ warmup='linear',
890
+ warmup_iters=500,
891
+ warmup_ratio=0.3333333333333333,
892
+ min_lr_ratio=0.001)
893
+ total_epochs = 20
894
+ runner = dict(type='EpochBasedRunner', max_epochs=20)
895
+ find_unused_parameters = True
896
+ logger_name = 'mmdet'
897
+ gpu_ids = range(0, 16)
config/base_track_map.py ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
2
+ class_names = [
3
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
4
+ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
5
+ ]
6
+ dataset_type = 'NuScenesE2EDataset'
7
+ data_root = 'data/nuscenes/'
8
+ input_modality = dict(
9
+ use_lidar=False,
10
+ use_camera=True,
11
+ use_radar=False,
12
+ use_map=False,
13
+ use_external=True)
14
+ file_client_args = dict(backend='disk')
15
+ train_pipeline = [
16
+ dict(
17
+ type='LoadMultiViewImageFromFilesInCeph',
18
+ to_float32=True,
19
+ file_client_args=dict(backend='disk'),
20
+ img_root=''),
21
+ dict(type='PhotoMetricDistortionMultiViewImage'),
22
+ dict(
23
+ type='LoadAnnotations3D_E2E',
24
+ with_bbox_3d=True,
25
+ with_label_3d=True,
26
+ with_attr_label=False,
27
+ with_future_anns=True,
28
+ with_ins_inds_3d=True,
29
+ ins_inds_add_1=True),
30
+ dict(
31
+ type='GenerateOccFlowLabels',
32
+ grid_conf=dict(
33
+ xbound=[-50.0, 50.0, 0.5],
34
+ ybound=[-50.0, 50.0, 0.5],
35
+ zbound=[-10.0, 10.0, 20.0]),
36
+ ignore_index=255,
37
+ only_vehicle=True,
38
+ filter_invisible=False),
39
+ dict(
40
+ type='ObjectRangeFilterTrack',
41
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
42
+ dict(
43
+ type='ObjectNameFilterTrack',
44
+ classes=[
45
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
46
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
47
+ ]),
48
+ dict(
49
+ type='NormalizeMultiviewImage',
50
+ mean=[103.53, 116.28, 123.675],
51
+ std=[1.0, 1.0, 1.0],
52
+ to_rgb=False),
53
+ dict(type='PadMultiViewImage', size_divisor=32),
54
+ dict(
55
+ type='DefaultFormatBundle3D',
56
+ class_names=[
57
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
58
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
59
+ ]),
60
+ dict(
61
+ type='CustomCollect3D',
62
+ keys=[
63
+ 'gt_bboxes_3d', 'gt_labels_3d', 'gt_inds', 'img', 'timestamp',
64
+ 'l2g_r_mat', 'l2g_t', 'gt_fut_traj', 'gt_fut_traj_mask',
65
+ 'gt_past_traj', 'gt_past_traj_mask', 'gt_sdc_bbox', 'gt_sdc_label',
66
+ 'gt_sdc_fut_traj', 'gt_sdc_fut_traj_mask', 'gt_lane_labels',
67
+ 'gt_lane_bboxes', 'gt_lane_masks', 'gt_segmentation',
68
+ 'gt_instance', 'gt_centerness', 'gt_offset', 'gt_flow',
69
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
70
+ 'gt_occ_img_is_valid', 'gt_future_boxes', 'gt_future_labels',
71
+ 'sdc_planning', 'sdc_planning_mask', 'command'
72
+ ])
73
+ ]
74
+ test_pipeline = [
75
+ dict(
76
+ type='LoadMultiViewImageFromFilesInCeph',
77
+ to_float32=True,
78
+ file_client_args=dict(backend='disk'),
79
+ img_root=''),
80
+ dict(
81
+ type='NormalizeMultiviewImage',
82
+ mean=[103.53, 116.28, 123.675],
83
+ std=[1.0, 1.0, 1.0],
84
+ to_rgb=False),
85
+ dict(type='PadMultiViewImage', size_divisor=32),
86
+ dict(
87
+ type='LoadAnnotations3D_E2E',
88
+ with_bbox_3d=False,
89
+ with_label_3d=False,
90
+ with_attr_label=False,
91
+ with_future_anns=True,
92
+ with_ins_inds_3d=False,
93
+ ins_inds_add_1=True),
94
+ dict(
95
+ type='GenerateOccFlowLabels',
96
+ grid_conf=dict(
97
+ xbound=[-50.0, 50.0, 0.5],
98
+ ybound=[-50.0, 50.0, 0.5],
99
+ zbound=[-10.0, 10.0, 20.0]),
100
+ ignore_index=255,
101
+ only_vehicle=True,
102
+ filter_invisible=False),
103
+ dict(
104
+ type='MultiScaleFlipAug3D',
105
+ img_scale=(1600, 900),
106
+ pts_scale_ratio=1,
107
+ flip=False,
108
+ transforms=[
109
+ dict(
110
+ type='DefaultFormatBundle3D',
111
+ class_names=[
112
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
113
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
114
+ 'traffic_cone'
115
+ ],
116
+ with_label=False),
117
+ dict(
118
+ type='CustomCollect3D',
119
+ keys=[
120
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t', 'gt_lane_labels',
121
+ 'gt_lane_bboxes', 'gt_lane_masks', 'gt_segmentation',
122
+ 'gt_instance', 'gt_centerness', 'gt_offset', 'gt_flow',
123
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
124
+ 'gt_occ_img_is_valid', 'sdc_planning', 'sdc_planning_mask',
125
+ 'command'
126
+ ])
127
+ ])
128
+ ]
129
+ eval_pipeline = [
130
+ dict(
131
+ type='LoadPointsFromFile',
132
+ coord_type='LIDAR',
133
+ load_dim=5,
134
+ use_dim=5,
135
+ file_client_args=dict(backend='disk')),
136
+ dict(
137
+ type='LoadPointsFromMultiSweeps',
138
+ sweeps_num=10,
139
+ file_client_args=dict(backend='disk')),
140
+ dict(
141
+ type='DefaultFormatBundle3D',
142
+ class_names=[
143
+ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
144
+ 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
145
+ ],
146
+ with_label=False),
147
+ dict(type='Collect3D', keys=['points'])
148
+ ]
149
+ data = dict(
150
+ samples_per_gpu=1,
151
+ workers_per_gpu=8,
152
+ train=dict(
153
+ type='NuScenesE2EDataset',
154
+ data_root='data/nuscenes/',
155
+ ann_file='data/infos/nuscenes_infos_temporal_train.pkl',
156
+ pipeline=[
157
+ dict(
158
+ type='LoadMultiViewImageFromFilesInCeph',
159
+ to_float32=True,
160
+ file_client_args=dict(backend='disk'),
161
+ img_root=''),
162
+ dict(type='PhotoMetricDistortionMultiViewImage'),
163
+ dict(
164
+ type='LoadAnnotations3D_E2E',
165
+ with_bbox_3d=True,
166
+ with_label_3d=True,
167
+ with_attr_label=False,
168
+ with_future_anns=True,
169
+ with_ins_inds_3d=True,
170
+ ins_inds_add_1=True),
171
+ dict(
172
+ type='GenerateOccFlowLabels',
173
+ grid_conf=dict(
174
+ xbound=[-50.0, 50.0, 0.5],
175
+ ybound=[-50.0, 50.0, 0.5],
176
+ zbound=[-10.0, 10.0, 20.0]),
177
+ ignore_index=255,
178
+ only_vehicle=True,
179
+ filter_invisible=False),
180
+ dict(
181
+ type='ObjectRangeFilterTrack',
182
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
183
+ dict(
184
+ type='ObjectNameFilterTrack',
185
+ classes=[
186
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
187
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
188
+ 'traffic_cone'
189
+ ]),
190
+ dict(
191
+ type='NormalizeMultiviewImage',
192
+ mean=[103.53, 116.28, 123.675],
193
+ std=[1.0, 1.0, 1.0],
194
+ to_rgb=False),
195
+ dict(type='PadMultiViewImage', size_divisor=32),
196
+ dict(
197
+ type='DefaultFormatBundle3D',
198
+ class_names=[
199
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
200
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian',
201
+ 'traffic_cone'
202
+ ]),
203
+ dict(
204
+ type='CustomCollect3D',
205
+ keys=[
206
+ 'gt_bboxes_3d', 'gt_labels_3d', 'gt_inds', 'img',
207
+ 'timestamp', 'l2g_r_mat', 'l2g_t', 'gt_fut_traj',
208
+ 'gt_fut_traj_mask', 'gt_past_traj', 'gt_past_traj_mask',
209
+ 'gt_sdc_bbox', 'gt_sdc_label', 'gt_sdc_fut_traj',
210
+ 'gt_sdc_fut_traj_mask', 'gt_lane_labels', 'gt_lane_bboxes',
211
+ 'gt_lane_masks', 'gt_segmentation', 'gt_instance',
212
+ 'gt_centerness', 'gt_offset', 'gt_flow',
213
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
214
+ 'gt_occ_img_is_valid', 'gt_future_boxes',
215
+ 'gt_future_labels', 'sdc_planning', 'sdc_planning_mask',
216
+ 'command'
217
+ ])
218
+ ],
219
+ classes=[
220
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
221
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
222
+ ],
223
+ modality=dict(
224
+ use_lidar=False,
225
+ use_camera=True,
226
+ use_radar=False,
227
+ use_map=False,
228
+ use_external=True),
229
+ test_mode=False,
230
+ box_type_3d='LiDAR',
231
+ file_client_args=dict(backend='disk'),
232
+ use_valid_flag=True,
233
+ patch_size=[102.4, 102.4],
234
+ canvas_size=(200, 200),
235
+ bev_size=(200, 200),
236
+ queue_length=5,
237
+ predict_steps=12,
238
+ past_steps=4,
239
+ fut_steps=4,
240
+ use_nonlinear_optimizer=True,
241
+ occ_receptive_field=3,
242
+ occ_n_future=6,
243
+ occ_filter_invalid_sample=False),
244
+ val=dict(
245
+ type='NuScenesE2EDataset',
246
+ data_root='data/nuscenes/',
247
+ ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
248
+ pipeline=[
249
+ dict(
250
+ type='LoadMultiViewImageFromFilesInCeph',
251
+ to_float32=True,
252
+ file_client_args=dict(backend='disk'),
253
+ img_root=''),
254
+ dict(
255
+ type='NormalizeMultiviewImage',
256
+ mean=[103.53, 116.28, 123.675],
257
+ std=[1.0, 1.0, 1.0],
258
+ to_rgb=False),
259
+ dict(type='PadMultiViewImage', size_divisor=32),
260
+ dict(
261
+ type='LoadAnnotations3D_E2E',
262
+ with_bbox_3d=False,
263
+ with_label_3d=False,
264
+ with_attr_label=False,
265
+ with_future_anns=True,
266
+ with_ins_inds_3d=False,
267
+ ins_inds_add_1=True),
268
+ dict(
269
+ type='GenerateOccFlowLabels',
270
+ grid_conf=dict(
271
+ xbound=[-50.0, 50.0, 0.5],
272
+ ybound=[-50.0, 50.0, 0.5],
273
+ zbound=[-10.0, 10.0, 20.0]),
274
+ ignore_index=255,
275
+ only_vehicle=True,
276
+ filter_invisible=False),
277
+ dict(
278
+ type='MultiScaleFlipAug3D',
279
+ img_scale=(1600, 900),
280
+ pts_scale_ratio=1,
281
+ flip=False,
282
+ transforms=[
283
+ dict(
284
+ type='DefaultFormatBundle3D',
285
+ class_names=[
286
+ 'car', 'truck', 'construction_vehicle', 'bus',
287
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
288
+ 'pedestrian', 'traffic_cone'
289
+ ],
290
+ with_label=False),
291
+ dict(
292
+ type='CustomCollect3D',
293
+ keys=[
294
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
295
+ 'gt_lane_labels', 'gt_lane_bboxes',
296
+ 'gt_lane_masks', 'gt_segmentation', 'gt_instance',
297
+ 'gt_centerness', 'gt_offset', 'gt_flow',
298
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
299
+ 'gt_occ_img_is_valid', 'sdc_planning',
300
+ 'sdc_planning_mask', 'command'
301
+ ])
302
+ ])
303
+ ],
304
+ classes=[
305
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
306
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
307
+ ],
308
+ modality=dict(
309
+ use_lidar=False,
310
+ use_camera=True,
311
+ use_radar=False,
312
+ use_map=False,
313
+ use_external=True),
314
+ test_mode=True,
315
+ box_type_3d='LiDAR',
316
+ file_client_args=dict(backend='disk'),
317
+ patch_size=[102.4, 102.4],
318
+ canvas_size=(200, 200),
319
+ bev_size=(200, 200),
320
+ predict_steps=12,
321
+ past_steps=4,
322
+ fut_steps=4,
323
+ use_nonlinear_optimizer=True,
324
+ samples_per_gpu=1,
325
+ eval_mod=['det', 'track', 'map'],
326
+ occ_receptive_field=3,
327
+ occ_n_future=6,
328
+ occ_filter_invalid_sample=False),
329
+ test=dict(
330
+ type='NuScenesE2EDataset',
331
+ data_root='data/nuscenes/',
332
+ ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
333
+ pipeline=[
334
+ dict(
335
+ type='LoadMultiViewImageFromFilesInCeph',
336
+ to_float32=True,
337
+ file_client_args=dict(backend='disk'),
338
+ img_root=''),
339
+ dict(
340
+ type='NormalizeMultiviewImage',
341
+ mean=[103.53, 116.28, 123.675],
342
+ std=[1.0, 1.0, 1.0],
343
+ to_rgb=False),
344
+ dict(type='PadMultiViewImage', size_divisor=32),
345
+ dict(
346
+ type='LoadAnnotations3D_E2E',
347
+ with_bbox_3d=False,
348
+ with_label_3d=False,
349
+ with_attr_label=False,
350
+ with_future_anns=True,
351
+ with_ins_inds_3d=False,
352
+ ins_inds_add_1=True),
353
+ dict(
354
+ type='GenerateOccFlowLabels',
355
+ grid_conf=dict(
356
+ xbound=[-50.0, 50.0, 0.5],
357
+ ybound=[-50.0, 50.0, 0.5],
358
+ zbound=[-10.0, 10.0, 20.0]),
359
+ ignore_index=255,
360
+ only_vehicle=True,
361
+ filter_invisible=False),
362
+ dict(
363
+ type='MultiScaleFlipAug3D',
364
+ img_scale=(1600, 900),
365
+ pts_scale_ratio=1,
366
+ flip=False,
367
+ transforms=[
368
+ dict(
369
+ type='DefaultFormatBundle3D',
370
+ class_names=[
371
+ 'car', 'truck', 'construction_vehicle', 'bus',
372
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
373
+ 'pedestrian', 'traffic_cone'
374
+ ],
375
+ with_label=False),
376
+ dict(
377
+ type='CustomCollect3D',
378
+ keys=[
379
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
380
+ 'gt_lane_labels', 'gt_lane_bboxes',
381
+ 'gt_lane_masks', 'gt_segmentation', 'gt_instance',
382
+ 'gt_centerness', 'gt_offset', 'gt_flow',
383
+ 'gt_backward_flow', 'gt_occ_has_invalid_frame',
384
+ 'gt_occ_img_is_valid', 'sdc_planning',
385
+ 'sdc_planning_mask', 'command'
386
+ ])
387
+ ])
388
+ ],
389
+ classes=[
390
+ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
391
+ 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
392
+ ],
393
+ modality=dict(
394
+ use_lidar=False,
395
+ use_camera=True,
396
+ use_radar=False,
397
+ use_map=False,
398
+ use_external=True),
399
+ test_mode=True,
400
+ box_type_3d='LiDAR',
401
+ file_client_args=dict(backend='disk'),
402
+ patch_size=[102.4, 102.4],
403
+ canvas_size=(200, 200),
404
+ bev_size=(200, 200),
405
+ predict_steps=12,
406
+ past_steps=4,
407
+ fut_steps=4,
408
+ occ_n_future=6,
409
+ use_nonlinear_optimizer=True,
410
+ eval_mod=['det', 'map', 'track']),
411
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
412
+ nonshuffler_sampler=dict(type='DistributedSampler'))
413
+ evaluation = dict(
414
+ interval=6,
415
+ pipeline=[
416
+ dict(
417
+ type='LoadMultiViewImageFromFilesInCeph',
418
+ to_float32=True,
419
+ file_client_args=dict(backend='disk'),
420
+ img_root=''),
421
+ dict(
422
+ type='NormalizeMultiviewImage',
423
+ mean=[103.53, 116.28, 123.675],
424
+ std=[1.0, 1.0, 1.0],
425
+ to_rgb=False),
426
+ dict(type='PadMultiViewImage', size_divisor=32),
427
+ dict(
428
+ type='LoadAnnotations3D_E2E',
429
+ with_bbox_3d=False,
430
+ with_label_3d=False,
431
+ with_attr_label=False,
432
+ with_future_anns=True,
433
+ with_ins_inds_3d=False,
434
+ ins_inds_add_1=True),
435
+ dict(
436
+ type='GenerateOccFlowLabels',
437
+ grid_conf=dict(
438
+ xbound=[-50.0, 50.0, 0.5],
439
+ ybound=[-50.0, 50.0, 0.5],
440
+ zbound=[-10.0, 10.0, 20.0]),
441
+ ignore_index=255,
442
+ only_vehicle=True,
443
+ filter_invisible=False),
444
+ dict(
445
+ type='MultiScaleFlipAug3D',
446
+ img_scale=(1600, 900),
447
+ pts_scale_ratio=1,
448
+ flip=False,
449
+ transforms=[
450
+ dict(
451
+ type='DefaultFormatBundle3D',
452
+ class_names=[
453
+ 'car', 'truck', 'construction_vehicle', 'bus',
454
+ 'trailer', 'barrier', 'motorcycle', 'bicycle',
455
+ 'pedestrian', 'traffic_cone'
456
+ ],
457
+ with_label=False),
458
+ dict(
459
+ type='CustomCollect3D',
460
+ keys=[
461
+ 'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
462
+ 'gt_lane_labels', 'gt_lane_bboxes', 'gt_lane_masks',
463
+ 'gt_segmentation', 'gt_instance', 'gt_centerness',
464
+ 'gt_offset', 'gt_flow', 'gt_backward_flow',
465
+ 'gt_occ_has_invalid_frame', 'gt_occ_img_is_valid',
466
+ 'sdc_planning', 'sdc_planning_mask', 'command'
467
+ ])
468
+ ])
469
+ ],
470
+ planning_evaluation_strategy='uniad')
471
+ checkpoint_config = dict(interval=1)
472
+ log_config = dict(
473
+ interval=10,
474
+ hooks=[dict(type='TextLoggerHook'),
475
+ dict(type='TensorboardLoggerHook')])
476
+ dist_params = dict(backend='nccl')
477
+ log_level = 'INFO'
478
+ work_dir = 'projects/work_dirs/stage1_track_map/base_track_map/'
479
+ load_from = 'ckpts/bevformer_r101_dcn_24ep.pth'
480
+ resume_from = None
481
+ workflow = [('train', 1)]
482
+ plugin = True
483
+ plugin_dir = 'projects/mmdet3d_plugin/'
484
+ voxel_size = [0.2, 0.2, 8]
485
+ patch_size = [102.4, 102.4]
486
+ img_norm_cfg = dict(
487
+ mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
488
+ _dim_ = 256
489
+ _pos_dim_ = 128
490
+ _ffn_dim_ = 512
491
+ _num_levels_ = 4
492
+ bev_h_ = 200
493
+ bev_w_ = 200
494
+ _feed_dim_ = 512
495
+ _dim_half_ = 128
496
+ canvas_size = (200, 200)
497
+ queue_length = 5
498
+ predict_steps = 12
499
+ predict_modes = 6
500
+ fut_steps = 4
501
+ past_steps = 4
502
+ use_nonlinear_optimizer = True
503
+ occ_n_future = 4
504
+ occ_n_future_plan = 6
505
+ occ_n_future_max = 6
506
+ planning_steps = 6
507
+ use_col_optim = True
508
+ planning_evaluation_strategy = 'uniad'
509
+ occflow_grid_conf = dict(
510
+ xbound=[-50.0, 50.0, 0.5],
511
+ ybound=[-50.0, 50.0, 0.5],
512
+ zbound=[-10.0, 10.0, 20.0])
513
+ train_gt_iou_threshold = 0.3
514
+ model = dict(
515
+ type='UniAD',
516
+ gt_iou_threshold=0.3,
517
+ queue_length=5,
518
+ use_grid_mask=True,
519
+ video_test_mode=True,
520
+ num_query=900,
521
+ num_classes=10,
522
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
523
+ img_backbone=dict(
524
+ type='ResNet',
525
+ depth=101,
526
+ num_stages=4,
527
+ out_indices=(1, 2, 3),
528
+ frozen_stages=4,
529
+ norm_cfg=dict(type='BN2d', requires_grad=False),
530
+ norm_eval=True,
531
+ style='caffe',
532
+ dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
533
+ stage_with_dcn=(False, False, True, True)),
534
+ img_neck=dict(
535
+ type='FPN',
536
+ in_channels=[512, 1024, 2048],
537
+ out_channels=256,
538
+ start_level=0,
539
+ add_extra_convs='on_output',
540
+ num_outs=4,
541
+ relu_before_extra_convs=True),
542
+ freeze_img_backbone=True,
543
+ freeze_img_neck=False,
544
+ freeze_bn=False,
545
+ score_thresh=0.4,
546
+ filter_score_thresh=0.35,
547
+ qim_args=dict(
548
+ qim_type='QIMBase',
549
+ merger_dropout=0,
550
+ update_query_pos=True,
551
+ fp_ratio=0.3,
552
+ random_drop=0.1),
553
+ mem_args=dict(
554
+ memory_bank_type='MemoryBank',
555
+ memory_bank_score_thresh=0.0,
556
+ memory_bank_len=4),
557
+ loss_cfg=dict(
558
+ type='ClipMatcher',
559
+ num_classes=10,
560
+ weight_dict=None,
561
+ code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
562
+ assigner=dict(
563
+ type='HungarianAssigner3DTrack',
564
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
565
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
566
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
567
+ loss_cls=dict(
568
+ type='FocalLoss',
569
+ use_sigmoid=True,
570
+ gamma=2.0,
571
+ alpha=0.25,
572
+ loss_weight=2.0),
573
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
574
+ loss_past_traj_weight=0.0),
575
+ pts_bbox_head=dict(
576
+ type='BEVFormerTrackHead',
577
+ bev_h=200,
578
+ bev_w=200,
579
+ num_query=900,
580
+ num_classes=10,
581
+ in_channels=256,
582
+ sync_cls_avg_factor=True,
583
+ with_box_refine=True,
584
+ as_two_stage=False,
585
+ past_steps=4,
586
+ fut_steps=4,
587
+ transformer=dict(
588
+ type='PerceptionTransformer',
589
+ rotate_prev_bev=True,
590
+ use_shift=True,
591
+ use_can_bus=True,
592
+ embed_dims=256,
593
+ encoder=dict(
594
+ type='BEVFormerEncoder',
595
+ num_layers=6,
596
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
597
+ num_points_in_pillar=4,
598
+ return_intermediate=False,
599
+ transformerlayers=dict(
600
+ type='BEVFormerLayer',
601
+ attn_cfgs=[
602
+ dict(
603
+ type='TemporalSelfAttention',
604
+ embed_dims=256,
605
+ num_levels=1),
606
+ dict(
607
+ type='SpatialCrossAttention',
608
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
609
+ deformable_attention=dict(
610
+ type='MSDeformableAttention3D',
611
+ embed_dims=256,
612
+ num_points=8,
613
+ num_levels=4),
614
+ embed_dims=256)
615
+ ],
616
+ feedforward_channels=512,
617
+ ffn_dropout=0.1,
618
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
619
+ 'ffn', 'norm'))),
620
+ decoder=dict(
621
+ type='DetectionTransformerDecoder',
622
+ num_layers=6,
623
+ return_intermediate=True,
624
+ transformerlayers=dict(
625
+ type='DetrTransformerDecoderLayer',
626
+ attn_cfgs=[
627
+ dict(
628
+ type='MultiheadAttention',
629
+ embed_dims=256,
630
+ num_heads=8,
631
+ dropout=0.1),
632
+ dict(
633
+ type='CustomMSDeformableAttention',
634
+ embed_dims=256,
635
+ num_levels=1)
636
+ ],
637
+ feedforward_channels=512,
638
+ ffn_dropout=0.1,
639
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
640
+ 'ffn', 'norm')))),
641
+ bbox_coder=dict(
642
+ type='NMSFreeCoder',
643
+ post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
644
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
645
+ max_num=300,
646
+ voxel_size=[0.2, 0.2, 8],
647
+ num_classes=10),
648
+ positional_encoding=dict(
649
+ type='LearnedPositionalEncoding',
650
+ num_feats=128,
651
+ row_num_embed=200,
652
+ col_num_embed=200),
653
+ loss_cls=dict(
654
+ type='FocalLoss',
655
+ use_sigmoid=True,
656
+ gamma=2.0,
657
+ alpha=0.25,
658
+ loss_weight=2.0),
659
+ loss_bbox=dict(type='L1Loss', loss_weight=0.25),
660
+ loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
661
+ seg_head=dict(
662
+ type='PansegformerHead',
663
+ bev_h=200,
664
+ bev_w=200,
665
+ canvas_size=(200, 200),
666
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
667
+ num_query=300,
668
+ num_classes=4,
669
+ num_things_classes=3,
670
+ num_stuff_classes=1,
671
+ in_channels=2048,
672
+ sync_cls_avg_factor=True,
673
+ as_two_stage=False,
674
+ with_box_refine=True,
675
+ transformer=dict(
676
+ type='SegDeformableTransformer',
677
+ encoder=dict(
678
+ type='DetrTransformerEncoder',
679
+ num_layers=6,
680
+ transformerlayers=dict(
681
+ type='BaseTransformerLayer',
682
+ attn_cfgs=dict(
683
+ type='MultiScaleDeformableAttention',
684
+ embed_dims=256,
685
+ num_levels=4),
686
+ feedforward_channels=512,
687
+ ffn_dropout=0.1,
688
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
689
+ decoder=dict(
690
+ type='DeformableDetrTransformerDecoder',
691
+ num_layers=6,
692
+ return_intermediate=True,
693
+ transformerlayers=dict(
694
+ type='DetrTransformerDecoderLayer',
695
+ attn_cfgs=[
696
+ dict(
697
+ type='MultiheadAttention',
698
+ embed_dims=256,
699
+ num_heads=8,
700
+ dropout=0.1),
701
+ dict(
702
+ type='MultiScaleDeformableAttention',
703
+ embed_dims=256,
704
+ num_levels=4)
705
+ ],
706
+ feedforward_channels=512,
707
+ ffn_dropout=0.1,
708
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
709
+ 'ffn', 'norm')))),
710
+ positional_encoding=dict(
711
+ type='SinePositionalEncoding',
712
+ num_feats=128,
713
+ normalize=True,
714
+ offset=-0.5),
715
+ loss_cls=dict(
716
+ type='FocalLoss',
717
+ use_sigmoid=True,
718
+ gamma=2.0,
719
+ alpha=0.25,
720
+ loss_weight=2.0),
721
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
722
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
723
+ loss_mask=dict(type='DiceLoss', loss_weight=2.0),
724
+ thing_transformer_head=dict(
725
+ type='SegMaskHead', d_model=256, nhead=8, num_decoder_layers=4),
726
+ stuff_transformer_head=dict(
727
+ type='SegMaskHead',
728
+ d_model=256,
729
+ nhead=8,
730
+ num_decoder_layers=6,
731
+ self_attn=True),
732
+ train_cfg=dict(
733
+ assigner=dict(
734
+ type='HungarianAssigner',
735
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
736
+ reg_cost=dict(
737
+ type='BBoxL1Cost', weight=5.0, box_format='xywh'),
738
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
739
+ assigner_with_mask=dict(
740
+ type='HungarianAssigner_multi_info',
741
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
742
+ reg_cost=dict(
743
+ type='BBoxL1Cost', weight=5.0, box_format='xywh'),
744
+ iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
745
+ mask_cost=dict(type='DiceCost', weight=2.0)),
746
+ sampler=dict(type='PseudoSampler'),
747
+ sampler_with_mask=dict(type='PseudoSampler_segformer'))),
748
+ train_cfg=dict(
749
+ pts=dict(
750
+ grid_size=[512, 512, 1],
751
+ voxel_size=[0.2, 0.2, 8],
752
+ point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
753
+ out_size_factor=4,
754
+ assigner=dict(
755
+ type='HungarianAssigner3D',
756
+ cls_cost=dict(type='FocalLossCost', weight=2.0),
757
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
758
+ iou_cost=dict(type='IoUCost', weight=0.0),
759
+ pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))))
760
+ info_root = 'data/infos/'  # directory prefix that the three ann_file_* paths below also start with
760
+ ann_file_train = 'data/infos/nuscenes_infos_temporal_train.pkl'  # path to the *_train.pkl info file
761
+ ann_file_val = 'data/infos/nuscenes_infos_temporal_val.pkl'  # path to the *_val.pkl info file
762
+ ann_file_test = 'data/infos/nuscenes_infos_temporal_val.pkl'  # NOTE: same path as ann_file_val, so "test" points at the val info file
763
+ optimizer = dict(  # optimizer settings: type name plus its keyword arguments
764
+ type='AdamW',
765
+ lr=0.0002,  # base learning rate
766
+ paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.1))),  # lr_mult=0.1 for the 'img_backbone' key; presumably scales the base lr for matching params -- confirm against the optimizer constructor in use
767
+ weight_decay=0.01)
768
+ optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))  # gradient clipping settings (max_norm=35, norm_type=2)
769
+ lr_config = dict(  # learning-rate schedule: 'CosineAnnealing' policy with a 'linear' warmup
770
+ policy='CosineAnnealing',
771
+ warmup='linear',
772
+ warmup_iters=500,  # length of the warmup phase, in iterations
773
+ warmup_ratio=0.3333333333333333,  # 1/3; presumably the starting lr as a fraction of the base lr -- confirm
774
+ min_lr_ratio=0.001)  # presumably the final lr as a fraction of the base lr -- confirm
775
+ total_epochs = 6  # same value as runner.max_epochs below; keep the two in sync when changing either
776
+ runner = dict(type='EpochBasedRunner', max_epochs=6)
777
+ find_unused_parameters = True  # NOTE(review): presumably forwarded to the distributed model wrapper -- confirm against the training script
778
+ logger_name = 'mmdet'  # NOTE(review): the consumer of this name is not visible in this file
779
+ gpu_ids = range(0, 1)  # range(0, 1) contains only the id 0