# Multi-view-VLA/LIBERO/config.yaml
run_id: 0428_liberoall
run_root_dir: ./checkpoints
seed: 42
trackers:
  - jsonl
  - wandb
wandb_entity: junjin
wandb_project: 0428_liberoall
is_debug: false
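# A minimal sketch of how the tracker settings above typically map onto a
# wandb run (wandb.init is the standard API; reusing run_id as the run name
# is an assumption, not taken from this repo):
#
#   import wandb
#
#   wandb.init(entity="junjin", project="0428_liberoall", name="0428_liberoall")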
framework:
  name: QwenAML
  qwenvl:
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
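# A minimal sketch of loading the backbone named in base_vlm with Hugging Face
# transformers; AutoModel and bf16 are assumptions, only attn_implementation
# comes from the config above:
#
#   import torch
#   from transformers import AutoModel, AutoProcessor
#
#   path = "./checkpoints/Qwen3-VL-4B-Instruct-Action"
#   vlm = AutoModel.from_pretrained(
#       path,
#       torch_dtype=torch.bfloat16,
#       attn_implementation="flash_attention_2",
#   )
#   processor = AutoProcessor.from_pretrained(path)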
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 14
    state_dim: 14
    future_action_window_size: 9
    action_horizon: 10
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    use_state: false
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
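# A minimal sketch of how noise_beta_alpha/noise_beta_beta, noise_s and
# num_timestep_buckets could interact, assuming GR00T-style flow-matching
# timestep sampling (the exact squashing used here is an assumption):
#
#   import torch
#
#   def sample_timestep_buckets(batch, alpha=1.5, beta=1.0, s=0.999, buckets=1000):
#       t = torch.distributions.Beta(alpha, beta).sample((batch,))
#       t = (1.0 - t) * s                 # skew sampling toward low noise levels
#       return (t * buckets).long()       # discrete bucket ids in [0, buckets)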
  spatial_model:
    model_name_or_path: ./checkpoints/vggt
    output_dim: 2048
  spatial_projector:
    hidden_dim: 2048
    output_dim: 2560
  fuser:
    type: cross_attention
  reduce_in_full_precision: true
  use_mv_images: false
  layer_qformer:
    num_layers: 4
    num_query_tokens: 128
    input_dim: 2560
    output_dim: 2560
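# A minimal sketch of a layer Q-Former matching the sizes above: 128 learned
# query tokens cross-attend into the 2560-d projected spatial features over 4
# layers (class and layout are illustrative, not this repo's implementation):
#
#   import torch
#   import torch.nn as nn
#
#   class LayerQFormer(nn.Module):
#       def __init__(self, dim=2560, num_layers=4, num_query_tokens=128, heads=8):
#           super().__init__()
#           self.queries = nn.Parameter(torch.randn(num_query_tokens, dim) * 0.02)
#           self.blocks = nn.ModuleList(
#               nn.MultiheadAttention(dim, heads, batch_first=True)
#               for _ in range(num_layers))
#       def forward(self, feats):                        # feats: (B, N, dim)
#           q = self.queries.expand(feats.size(0), -1, -1)
#           for attn in self.blocks:
#               q = q + attn(q, feats, feats)[0]         # cross-attn + residual
#           return q                                     # (B, 128, dim)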
  image_edit_model:
    model_name_or_path: ./checkpoints/LongCat-Image-Edit
    lora_path: ./checkpoints/Multi-view-VLA/LongCat-lora
    view_num: 2
    fuser_type: mlp_gated_transformer
    read_from_local: true
    num_inference_steps: 8
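# A rough sketch of what the "mlp_gated" part of fuser_type might denote for
# view_num: 2 — a sigmoid-gated MLP merging the per-view token streams (the
# transformer part is omitted; everything here is an assumption):
#
#   import torch
#   import torch.nn as nn
#
#   class GatedViewFuser(nn.Module):
#       def __init__(self, dim=2560, view_num=2):
#           super().__init__()
#           self.proj = nn.Linear(view_num * dim, dim)
#           self.gate = nn.Sequential(nn.Linear(view_num * dim, dim), nn.Sigmoid())
#       def forward(self, views):                        # (B, view_num, N, dim)
#           x = torch.cat(views.unbind(dim=1), dim=-1)   # (B, N, view_num*dim)
#           return self.gate(x) * self.proj(x)           # (B, N, dim)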
datasets:
  vlm_data:
    dataset_py: vlm_datasets
    dataformat: llava_json
    dataset_use: sharegpt4v_coco
    eval_dataset: sharegpt4v_coco
    data_flatten: false
    base_interval: 2
    max_pixels: 307200
    min_pixels: 784
    model_max_length: 2048
    model_type: qwen2.5vl
    per_device_batch_size: 4
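# For reference, a record in the standard llava_json conversation format
# (field names follow the LLaVA convention; whether this repo expects extra
# fields is not known from the config):
#
#   {
#     "image": "coco/train2017/<image_id>.jpg",
#     "conversations": [
#       {"from": "human", "value": "<image>\nDescribe the scene."},
#       {"from": "gpt", "value": "A city street with ..."}
#     ]
#   }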
  vla_data:
    num_workers: 4
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/xlab-nas-2/vla_dataset/benchmark/libero
    mv_data_root_dir: ./dataset/libero_mv_feats
    data_mix: libero_all_ration
    action_type: delta_qpos
    CoT_prompt: Your task is {instruction}. Identify the key objects for your
      task and locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    default_image_resolution:
      - 3
      - 224
      - 224
    per_device_batch_size: 16
    load_all_data_for_training: true
    obs:
      - image_0
    video_backend: torchvision_av
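# A minimal sketch of the usual reading of action_type "delta_qpos": actions
# are frame-to-frame deltas of absolute joint positions (the exact convention,
# including gripper handling, is an assumption):
#
#   import numpy as np
#
#   def to_delta_qpos(qpos):                  # qpos: (T, 14) absolute states
#       delta = np.diff(qpos, axis=0)         # a_t = q_{t+1} - q_t
#       return np.concatenate([delta, delta[-1:]], axis=0)   # keep length T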
trainer:
  epochs: 100
  max_train_steps: 40000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 1000000
  learning_rate:
    base: 2.5e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
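# A minimal sketch of wiring the per-module learning rates and the
# cosine_with_min_lr schedule together (get_scheduler supports this name in
# recent transformers; vlm/action_model/other_params are placeholders):
#
#   import torch
#   from transformers import get_scheduler
#
#   param_groups = [
#       {"params": vlm.parameters(),          "lr": 1.0e-05},  # qwen_vl_interface
#       {"params": action_model.parameters(), "lr": 1.0e-04},  # action_model
#       {"params": other_params,              "lr": 2.5e-05},  # base
#   ]
#   optimizer = torch.optim.AdamW(param_groups, betas=(0.9, 0.95),
#                                 eps=1e-08, weight_decay=1e-08)
#   scheduler = get_scheduler("cosine_with_min_lr", optimizer,
#                             num_warmup_steps=5000, num_training_steps=40000,
#                             scheduler_specific_kwargs={"min_lr": 1.0e-06})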
  freeze_modules: spatial_model,image_edit_model
  loss_scale:
    vla: 1.0
    vlm: 0.1
    forcing: 0.2
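# The loss_scale block above implies a weighted sum of the three objectives;
# as a sketch (the loss names are placeholders):
#
#   total_loss = 1.0 * loss_vla + 0.1 * loss_vlm + 0.2 * loss_forcing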
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_from_checkpoint: null
  pretrained_checkpoint: ./checkpoints/Multi-view-VLA/pretrained_model/checkpoints/steps_14000_pytorch_model.pt
  reload_modules: qwen_vl_interface,action_model
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
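# A minimal sketch of the selective reload implied by pretrained_checkpoint
# and reload_modules (the key layout inside the .pt file and the top-level
# model object are assumptions):
#
#   import torch
#
#   ckpt = torch.load("./checkpoints/Multi-view-VLA/pretrained_model/"
#                     "checkpoints/steps_14000_pytorch_model.pt",
#                     map_location="cpu")
#   keep = ("qwen_vl_interface", "action_model")
#   state = {k: v for k, v in ckpt.items() if k.startswith(keep)}
#   missing, unexpected = model.load_state_dict(state, strict=False)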
vla_data:
  video_backend: torchvision_av
output_dir: ./checkpoints/0428_liberoall_Qwen3vlGR00TAML_vggt_longcat_view2_mlp_gated_transformer_bs16_4gpus_reload_vlm_action_ration
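# A minimal sketch of consuming this file (yaml.safe_load is standard; the
# repo's own loader, e.g. OmegaConf/Hydra, may differ):
#
#   import yaml
#
#   with open("config.yaml") as f:
#       cfg = yaml.safe_load(f)
#   print(cfg["framework"]["action_model"]["action_horizon"])   # -> 10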