InternRobotics
/

InternVLA-M1-LIBERO-Spatial

vision-language-action-model

vision-language-model

Model card Files Files and versions

InternVLA-M1-LIBERO-Spatial / config.yaml

Jinhuiye's picture

Update config.yaml

b1bd418 verified 6 months ago

history blame contribute delete

2.88 kB

	run_id: 0903_libero_spatial_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_16_pretrained_vlm
	run_root_dir: ./playground/Checkpoints
	seed: 42
	trackers:
	- jsonl
	- wandb
	wandb_entity: michaelyu-1101-fudanuniversity
	wandb_project: Internvla
	is_debug: false
	framework:
	framework_py: InternVLA-M1
	qwenvl:
	base_vlm: Qwen/Qwen2.5-VL-3B-Instruct
	attn_implementation: flash_attention_2
	vl_hidden_dim: 2048
	dino:
	dino_backbone: dinov2_vitl14
	layer_qformer:
	qformer_end_layer: 37
	qformer_start_layer: 36
	num_query_tokens: 64
	input_dim: 2048
	ouptput_dim: 768
	grad_scale: 0.5
	action_model:
	action_model_type: DiT-B
	action_hidden_dim: 768
	action_dim: 7
	input_dim: 2048
	ouptput_dim: 768
	use_ema: false
	future_action_window_size: 7
	past_action_window_size: 0
	repeated_diffusion_steps: 8
	reduce_in_full_precision: true
	datasets:
	vlm_data:
	dataformat: llava_json
	dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
	eval_dataset: aokvqa_cauldron_llava_format
	data_flatten: false
	base_interval: 2
	max_pixels: 50176
	min_pixels: 784
	fix_image_size:
	- 224
	- 224
	model_max_length: 1024
	model_type: qwen2.5vl
	per_device_batch_size: 4
	vla_data:
	dataset_py: lerobot_libero
	data_root_dir: playground/Datasets/LEROBOT_LIBERO_DATA
	data_mix: libero_spatial
	action_type: delta_qpos
	CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
	Locate their bounding boxes in [x1,y1,x2,y2] format.
	CoT_answer: bbox
	default_image_resolution:
	- 3
	- 224
	- 224
	per_device_batch_size: 16
	load_all_data_for_training: true
	obs:
	- image_0
	trainer:
	epochs: 100
	max_train_steps: 100000
	num_warmup_steps: 5000
	save_interval: 10000
	eval_interval: 1000
	learning_rate:
	base: 2.5e-05
	lr_scheduler_type: cosine_with_min_lr
	scheduler_specific_kwargs:
	min_lr: 1.0e-06
	freeze_modules: ''
	loss_scale:
	vla: 1.0
	vlm: 0.1
	max_grad_norm: 1.0
	warmup_ratio: 0.1
	weight_decay: 0.0
	logging_frequency: 10
	gradient_clipping: 1.0
	gradient_accumulation_steps: 1
	optimizer:
	name: AdamW
	betas:
	- 0.9
	- 0.95
	eps: 1.0e-08
	weight_decay: 1.0e-08
	is_resume: false
	resume_epoch: null
	resume_step: null
	enable_gradient_checkpointing: true
	enable_mixed_precision_training: true
	output_dir: ./playground/Checkpoints/0903_libero_spatial_augsteps_0_wo_flash_attention_wo_augsteps_two_view_action_chunk_16_pretrained_vlm