You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Why do our reproduced results always stabilize around 65.0, compared with the 66.4 reported in your paper? The configuration is as follows:
`_base_ = [
'../datasets/custom_nus-3d.py',
'../_base_/default_runtime.py'
]
There may be some randomness in the final results. In our experience, it is normal to get results around 66.0. How many ‘grad_norm: nan’ entries are there in your log? Too many ‘grad_norm: nan’ entries may cause a lower result, which should not happen.
Why do our reproduced results always stabilize around 65.0, compared with the 66.4 reported in your paper? The configuration is as follows:
`_base_ = [
'../datasets/custom_nus-3d.py',
'../_base_/default_runtime.py'
]
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range = [-15.0, -30.0,-2.0, 15.0, 30.0, 2.0]
voxel_size = [0.15, 0.15, 4.0]
dbound=[1.0, 35.0, 0.5]
grid_config = {
'x': [-30.0, -30.0, 0.15], # useless
'y': [-15.0, -15.0, 0.15], # useless
'z': [-10, 10, 20], # useless
'depth': [1.0, 35.0, 0.5], # useful
}
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
# map has classes: divider, ped_crossing, boundary
map_classes = ['divider', 'ped_crossing','boundary']
fixed_ptsnum_per_line = 20
map_classes = ['divider',]
num_vec=50
fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
fixed_ptsnum_per_pred_line = 20
eval_use_same_gt_sample_num_flag=True
num_map_classes = len(map_classes)
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=True)
dim = 256
pos_dim = dim//2
ffn_dim = dim*2
num_levels = 1
num_points_in_pillar = 8
bev_h_ = 50
bev_w_ = 50
bev_h_ = 200
bev_w_ = 100
queue_length = 1 # each sequence contains `queue_length` frames.
aux_seg_cfg = dict(
use_aux_seg=True,
bev_seg=True,
pv_seg=True,
seg_classes=1,
feat_down_sample=32,
pv_thickness=1,
)
model = dict(
type='MapTRv2',
use_grid_mask=True,
video_test_mode=False,
pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(3,),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='pytorch'),
img_neck=dict(
type='FPN',
in_channels=[2048],
out_channels=dim,
start_level=0,
add_extra_convs='on_output',
num_outs=num_levels,
relu_before_extra_convs=True),
pts_bbox_head=dict(
type='MapTRv2Head',
bev_h=bev_h_,
bev_w=bev_w_,
num_query=900,
num_vec_one2one=100,
num_vec_one2many=600,
k_one2many=6,
num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
dir_interval=1,
# query_embed_type='instance_pts',
query_embed_type='instance',
transform_method='minmax',
gt_shift_pts_pattern='v2',
num_classes=num_map_classes,
in_channels=dim,
sync_cls_avg_factor=True,
with_box_refine=True,
as_two_stage=False,
code_size=2,
code_weights=[1.0, 1.0, 1.0, 1.0],
aux_seg=aux_seg_cfg,
# z_cfg=z_cfg,
transformer=dict(
type='MapTRPerceptionTransformer',
rotate_prev_bev=True,
use_shift=True,
use_can_bus=True,
embed_dims=dim,
encoder=dict(
type='BEVFormerEncoder',
num_layers=3,
pc_range=point_cloud_range,
num_points_in_pillar=num_points_in_pillar,
return_intermediate=False,
with_height_refine=True,
transformerlayers=dict(
type='BEVFormerLayer',
attn_cfgs=[
dict(
type='TemporalSelfAttention',
embed_dims=dim,
num_levels=1),
dict(
type='HeightKernelAttention',
pc_range=point_cloud_range,
num_points_in_pillar=num_points_in_pillar,
attention=dict(
type='MSDeformableAttentionKernel',
embed_dims=dim,
num_heads=num_points_in_pillar,
dilation=1,
kernel_size=(2, 4),
num_levels=num_levels),
embed_dims=dim,
)
],
feedforward_channels=ffn_dim,
ffn_dropout=0.1,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))),
decoder=dict(
type='MapTRDecoder',
num_layers=6,
return_intermediate=True,
query_pos_embedding='instance',
num_pts_per_vec=fixed_ptsnum_per_pred_line,
transformerlayers=dict(
type='DetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=dim,
num_heads=8,
dropout=0.1),
dict(
type='InstancePointAttention',
embed_dims=dim,
num_levels=1,
num_pts_per_vec=fixed_ptsnum_per_pred_line,
),
],
feedforward_channels=ffn_dim,
ffn_dropout=0.1,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))
)),
bbox_coder=dict(
type='MapTRNMSFreeCoder',
# post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
pc_range=point_cloud_range,
max_num=50,
voxel_size=voxel_size,
num_classes=num_map_classes),
positional_encoding=dict(
type='LearnedPositionalEncoding',
num_feats=pos_dim,
row_num_embed=bev_h_,
col_num_embed=bev_w_,
),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0),
loss_bbox=dict(type='L1Loss', loss_weight=0.0),
loss_iou=dict(type='GIoULoss', loss_weight=0.0),
loss_pts=dict(type='PtsL1Loss',
loss_weight=5.0),
loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005),
loss_seg=dict(type='SimpleLoss',
pos_weight=4.0,
loss_weight=1.0),
loss_pv_seg=dict(type='SimpleLoss',
pos_weight=1.0,
loss_weight=2.0),),
# model training and testing settings
train_cfg=dict(pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='MapTRAssigner',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
# reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
# iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
pts_cost=dict(type='OrderedPtsL1Cost',
weight=5),
pc_range=point_cloud_range))))
dataset_type = 'CustomNuScenesOfflineLocalMapDataset'
data_root =
file_client_args = dict(backend='disk')
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(type='CustomPointToMultiViewDepth', downsample=1, grid_config=grid_config),
dict(type='PadMultiViewImageDepth', size_divisor=32),
dict(type='DefaultFormatBundle3D', with_gt=False, with_label=False,class_names=map_classes),
dict(type='CustomCollect3D', keys=['img', 'gt_depth'])
]
test_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4, # TODO
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_map_infos_temporal_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
aux_seg=aux_seg_cfg,
test_mode=False,
use_valid_flag=True,
bev_size=(bev_h_, bev_w_),
pc_range=point_cloud_range,
fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
padding_value=-10000,
map_classes=map_classes,
queue_length=queue_length,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_map_infos_temporal_val.pkl',
map_ann_file=data_root + 'nuscenes_map_anns_val.json',
pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
pc_range=point_cloud_range,
fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
padding_value=-10000,
map_classes=map_classes,
classes=class_names, modality=input_modality, samples_per_gpu=1),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_map_infos_temporal_val.pkl',
map_ann_file=data_root + 'nuscenes_map_anns_val.json',
pipeline=test_pipeline,
bev_size=(bev_h_, bev_w_),
pc_range=point_cloud_range,
fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
padding_value=-10000,
map_classes=map_classes,
classes=class_names,
modality=input_modality),
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler')
)
optimizer = dict(
type='AdamW',
lr=6e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
total_epochs = 24
evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer',
save_best='NuscMap_chamfer/mAP', rule='greater')
total_epochs = 50
evaluation = dict(interval=1, pipeline=test_pipeline)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
fp16 = dict(loss_scale=512.)
checkpoint_config = dict(max_keep_ckpts=1, interval=2)
find_unused_parameters=True`
The text was updated successfully, but these errors were encountered: