# Multi-view-VLA/LIBERO/config.yaml
run_id: 0428_liberoall
run_root_dir: ./checkpoints
seed: 42
trackers:
  - jsonl
  - wandb
wandb_entity: junjin
wandb_project: 0428_liberoall
is_debug: false
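# A minimal sketch of how the tracker settings above typically map onto a
# wandb run (wandb.init is the standard API; reusing run_id as the run name
# is an assumption, not taken from this repo):
#
#   import wandb
#
#   wandb.init(entity="junjin", project="0428_liberoall", name="0428_liberoall")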
framework:
  name: QwenAML
  qwenvl:
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
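# A minimal sketch of loading the backbone named in base_vlm with Hugging Face
# transformers; AutoModel and bf16 are assumptions, only attn_implementation
# comes from the config above:
#
#   import torch
#   from transformers import AutoModel, AutoProcessor
#
#   path = "./checkpoints/Qwen3-VL-4B-Instruct-Action"
#   vlm = AutoModel.from_pretrained(
#       path,
#       torch_dtype=torch.bfloat16,
#       attn_implementation="flash_attention_2",
#   )
#   processor = AutoProcessor.from_pretrained(path)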
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 14
    state_dim: 14
    future_action_window_size: 9
    action_horizon: 10
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    use_state: false
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
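# A minimal sketch of how noise_beta_alpha/noise_beta_beta, noise_s and
# num_timestep_buckets could interact, assuming GR00T-style flow-matching
# timestep sampling (the exact squashing used here is an assumption):
#
#   import torch
#
#   def sample_timestep_buckets(batch, alpha=1.5, beta=1.0, s=0.999, buckets=1000):
#       t = torch.distributions.Beta(alpha, beta).sample((batch,))
#       t = (1.0 - t) * s                 # skew sampling toward low noise levels
#       return (t * buckets).long()       # discrete bucket ids in [0, buckets)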
  spatial_model:
    model_name_or_path: ./checkpoints/vggt
    output_dim: 2048
  spatial_projector:
    hidden_dim: 2048
    output_dim: 2560
  fuser:
    type: cross_attention
  reduce_in_full_precision: true
  use_mv_images: false
  layer_qformer:
    num_layers: 4
    num_query_tokens: 128
    input_dim: 2560
    output_dim: 2560
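# A minimal sketch of a layer Q-Former matching the sizes above: 128 learned
# query tokens cross-attend into the 2560-d projected spatial features over 4
# layers (class and layout are illustrative, not this repo's implementation):
#
#   import torch
#   import torch.nn as nn
#
#   class LayerQFormer(nn.Module):
#       def __init__(self, dim=2560, num_layers=4, num_query_tokens=128, heads=8):
#           super().__init__()
#           self.queries = nn.Parameter(torch.randn(num_query_tokens, dim) * 0.02)
#           self.blocks = nn.ModuleList(
#               nn.MultiheadAttention(dim, heads, batch_first=True)
#               for _ in range(num_layers))
#       def forward(self, feats):                        # feats: (B, N, dim)
#           q = self.queries.expand(feats.size(0), -1, -1)
#           for attn in self.blocks:
#               q = q + attn(q, feats, feats)[0]         # cross-attn + residual
#           return q                                     # (B, 128, dim)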
  image_edit_model:
    model_name_or_path: ./checkpoints/LongCat-Image-Edit
    lora_path: ./checkpoints/Multi-view-VLA/LongCat-lora
    view_num: 2
    fuser_type: mlp_gated_transformer
    read_from_local: true
    num_inference_steps: 8
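# A rough sketch of what the "mlp_gated" part of fuser_type might denote for
# view_num: 2 — a sigmoid-gated MLP merging the per-view token streams (the
# transformer part is omitted; everything here is an assumption):
#
#   import torch
#   import torch.nn as nn
#
#   class GatedViewFuser(nn.Module):
#       def __init__(self, dim=2560, view_num=2):
#           super().__init__()
#           self.proj = nn.Linear(view_num * dim, dim)
#           self.gate = nn.Sequential(nn.Linear(view_num * dim, dim), nn.Sigmoid())
#       def forward(self, views):                        # (B, view_num, N, dim)
#           x = torch.cat(views.unbind(dim=1), dim=-1)   # (B, N, view_num*dim)
#           return self.gate(x) * self.proj(x)           # (B, N, dim)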
datasets:
  vlm_data:
    dataset_py: vlm_datasets
    dataformat: llava_json
    dataset_use: sharegpt4v_coco
    eval_dataset: sharegpt4v_coco
    data_flatten: false
    base_interval: 2
    max_pixels: 307200
    min_pixels: 784
    model_max_length: 2048
    model_type: qwen2.5vl
    per_device_batch_size: 4
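# For reference, a record in the standard llava_json conversation format
# (field names follow the LLaVA convention; whether this repo expects extra
# fields is not known from the config):
#
#   {
#     "image": "coco/train2017/<image_id>.jpg",
#     "conversations": [
#       {"from": "human", "value": "<image>\nDescribe the scene."},
#       {"from": "gpt", "value": "A city street with ..."}
#     ]
#   }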
  vla_data:
    num_workers: 4
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/xlab-nas-2/vla_dataset/benchmark/libero
    mv_data_root_dir: ./dataset/libero_mv_feats
    data_mix: libero_all_ration
    action_type: delta_qpos
    CoT_prompt: Your task is {instruction}. Identify the key objects for your
      task and locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    default_image_resolution:
      - 3
      - 224
      - 224
    per_device_batch_size: 16
    load_all_data_for_training: true
    obs:
      - image_0
    video_backend: torchvision_av
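# A minimal sketch of the usual reading of action_type "delta_qpos": actions
# are frame-to-frame deltas of absolute joint positions (the exact convention,
# including gripper handling, is an assumption):
#
#   import numpy as np
#
#   def to_delta_qpos(qpos):                  # qpos: (T, 14) absolute states
#       delta = np.diff(qpos, axis=0)         # a_t = q_{t+1} - q_t
#       return np.concatenate([delta, delta[-1:]], axis=0)   # keep length T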
trainer:
  epochs: 100
  max_train_steps: 40000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 1000000
  learning_rate:
    base: 2.5e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
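# A minimal sketch of wiring the per-module learning rates and the
# cosine_with_min_lr schedule together (get_scheduler supports this name in
# recent transformers; vlm/action_model/other_params are placeholders):
#
#   import torch
#   from transformers import get_scheduler
#
#   param_groups = [
#       {"params": vlm.parameters(),          "lr": 1.0e-05},  # qwen_vl_interface
#       {"params": action_model.parameters(), "lr": 1.0e-04},  # action_model
#       {"params": other_params,              "lr": 2.5e-05},  # base
#   ]
#   optimizer = torch.optim.AdamW(param_groups, betas=(0.9, 0.95),
#                                 eps=1e-08, weight_decay=1e-08)
#   scheduler = get_scheduler("cosine_with_min_lr", optimizer,
#                             num_warmup_steps=5000, num_training_steps=40000,
#                             scheduler_specific_kwargs={"min_lr": 1.0e-06})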
  freeze_modules: spatial_model,image_edit_model
  loss_scale:
    vla: 1.0
    vlm: 0.1
    forcing: 0.2
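# The loss_scale block above implies a weighted sum of the three objectives;
# as a sketch (the loss names are placeholders):
#
#   total_loss = 1.0 * loss_vla + 0.1 * loss_vlm + 0.2 * loss_forcing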
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_from_checkpoint: null
  pretrained_checkpoint: ./checkpoints/Multi-view-VLA/pretrained_model/checkpoints/steps_14000_pytorch_model.pt
  reload_modules: qwen_vl_interface,action_model
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
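# A minimal sketch of the selective reload implied by pretrained_checkpoint
# and reload_modules (the key layout inside the .pt file and the top-level
# model object are assumptions):
#
#   import torch
#
#   ckpt = torch.load("./checkpoints/Multi-view-VLA/pretrained_model/"
#                     "checkpoints/steps_14000_pytorch_model.pt",
#                     map_location="cpu")
#   keep = ("qwen_vl_interface", "action_model")
#   state = {k: v for k, v in ckpt.items() if k.startswith(keep)}
#   missing, unexpected = model.load_state_dict(state, strict=False)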
vla_data:
  video_backend: torchvision_av
output_dir: ./checkpoints/0428_liberoall_Qwen3vlGR00TAML_vggt_longcat_view2_mlp_gated_transformer_bs16_4gpus_reload_vlm_action_ration
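# A minimal sketch of consuming this file (yaml.safe_load is standard; the
# repo's own loader, e.g. OmegaConf/Hydra, may differ):
#
#   import yaml
#
#   with open("config.yaml") as f:
#       cfg = yaml.safe_load(f)
#   print(cfg["framework"]["action_model"]["action_horizon"])   # -> 10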