SparseBev / configs /r50_nuimg_704x256.py

Haisong Liu

Release model: vit_eva02_1600x640_trainval_future (#46)

102ac67 unverified over 2 years ago

7.05 kB

	dataset_type = 'CustomNuScenesDataset'
	dataset_root = 'data/nuscenes/'

	input_modality = dict(
	use_lidar=False,
	use_camera=True,
	use_radar=False,
	use_map=False,
	use_external=True
	)

	# For nuScenes we usually do 10-class detection
	class_names = [
	'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
	'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
	]

	# If point cloud range is changed, the models should also change their point
	# cloud range accordingly
	point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
	voxel_size = [0.2, 0.2, 8]

	# arch config
	embed_dims = 256
	num_layers = 6
	num_query = 900
	num_frames = 8
	num_levels = 4
	num_points = 4

	img_backbone = dict(
	type='ResNet',
	depth=50,
	num_stages=4,
	out_indices=(0, 1, 2, 3),
	frozen_stages=1,
	norm_cfg=dict(type='BN2d', requires_grad=True),
	norm_eval=True,
	style='pytorch',
	with_cp=True)
	img_neck = dict(
	type='FPN',
	in_channels=[256, 512, 1024, 2048],
	out_channels=embed_dims,
	num_outs=num_levels)
	img_norm_cfg = dict(
	mean=[123.675, 116.280, 103.530],
	std=[58.395, 57.120, 57.375],
	to_rgb=True)

	model = dict(
	type='SparseBEV',
	data_aug=dict(
	img_color_aug=True, # Move some augmentations to GPU
	img_norm_cfg=img_norm_cfg,
	img_pad_cfg=dict(size_divisor=32)),
	stop_prev_grad=0,
	img_backbone=img_backbone,
	img_neck=img_neck,
	pts_bbox_head=dict(
	type='SparseBEVHead',
	num_classes=10,
	in_channels=embed_dims,
	num_query=num_query,
	query_denoising=True,
	query_denoising_groups=10,
	code_size=10,
	code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
	sync_cls_avg_factor=True,
	transformer=dict(
	type='SparseBEVTransformer',
	embed_dims=embed_dims,
	num_frames=num_frames,
	num_points=num_points,
	num_layers=num_layers,
	num_levels=num_levels,
	num_classes=10,
	code_size=10,
	pc_range=point_cloud_range),
	bbox_coder=dict(
	type='NMSFreeCoder',
	post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
	pc_range=point_cloud_range,
	max_num=300,
	voxel_size=voxel_size,
	score_threshold=0.05,
	num_classes=10),
	positional_encoding=dict(
	type='SinePositionalEncoding',
	num_feats=embed_dims // 2,
	normalize=True,
	offset=-0.5),
	loss_cls=dict(
	type='FocalLoss',
	use_sigmoid=True,
	gamma=2.0,
	alpha=0.25,
	loss_weight=2.0),
	loss_bbox=dict(type='L1Loss', loss_weight=0.25),
	loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
	train_cfg=dict(pts=dict(
	grid_size=[512, 512, 1],
	voxel_size=voxel_size,
	point_cloud_range=point_cloud_range,
	out_size_factor=4,
	assigner=dict(
	type='HungarianAssigner3D',
	cls_cost=dict(type='FocalLossCost', weight=2.0),
	reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
	iou_cost=dict(type='IoUCost', weight=0.0),
	)
	))
	)

	ida_aug_conf = {
	'resize_lim': (0.38, 0.55),
	'final_dim': (256, 704),
	'bot_pct_lim': (0.0, 0.0),
	'rot_lim': (0.0, 0.0),
	'H': 900, 'W': 1600,
	'rand_flip': True,
	}

	train_pipeline = [
	dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
	dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1),
	dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
	dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
	dict(type='ObjectNameFilter', classes=class_names),
	dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True),
	dict(type='GlobalRotScaleTransImage', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05]),
	dict(type='DefaultFormatBundle3D', class_names=class_names),
	dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], meta_keys=(
	'filename', 'ori_shape', 'img_shape', 'pad_shape', 'lidar2img', 'img_timestamp'))
	]

	test_pipeline = [
	dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
	dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True),
	dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False),
	dict(
	type='MultiScaleFlipAug3D',
	img_scale=(1600, 900),
	pts_scale_ratio=1,
	flip=False,
	transforms=[
	dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False),
	dict(type='Collect3D', keys=['img'], meta_keys=(
	'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape',
	'lidar2img', 'img_timestamp'))
	])
	]

	data = dict(
	workers_per_gpu=8,
	train=dict(
	type=dataset_type,
	data_root=dataset_root,
	ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl',
	pipeline=train_pipeline,
	classes=class_names,
	modality=input_modality,
	test_mode=False,
	use_valid_flag=True,
	box_type_3d='LiDAR'),
	val=dict(
	type=dataset_type,
	data_root=dataset_root,
	ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl',
	pipeline=test_pipeline,
	classes=class_names,
	modality=input_modality,
	test_mode=True,
	box_type_3d='LiDAR'),
	test=dict(
	type=dataset_type,
	data_root=dataset_root,
	ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl',
	pipeline=test_pipeline,
	classes=class_names,
	modality=input_modality,
	test_mode=True,
	box_type_3d='LiDAR')
	)

	optimizer = dict(
	type='AdamW',
	lr=2e-4,
	paramwise_cfg=dict(custom_keys={
	'img_backbone': dict(lr_mult=0.1),
	'sampling_offset': dict(lr_mult=0.1),
	}),
	weight_decay=0.01
	)

	optimizer_config = dict(
	type='Fp16OptimizerHook',
	loss_scale=512.0,
	grad_clip=dict(max_norm=35, norm_type=2)
	)

	# learning policy
	lr_config = dict(
	policy='CosineAnnealing',
	warmup='linear',
	warmup_iters=500,
	warmup_ratio=1.0 / 3,
	min_lr_ratio=1e-3
	)
	total_epochs = 24
	batch_size = 8

	# load pretrained weights
	load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth'
	revise_keys = [('backbone', 'img_backbone')]

	# resume the last training
	resume_from = None

	# checkpointing
	checkpoint_config = dict(interval=1, max_keep_ckpts=1)

	# logging
	log_config = dict(
	interval=1,
	hooks=[
	dict(type='MyTextLoggerHook', interval=1, reset_flag=True),
	dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True)
	]
	)

	# evaluation
	eval_config = dict(interval=total_epochs)

	# other flags
	debug = False