run_id: 0428_liberoall
run_root_dir: ./checkpoints
seed: 42
trackers:
- jsonl
- wandb
wandb_entity: junjin
wandb_project: 0428_liberoall
is_debug: false
framework:
  name: QwenAML
  qwenvl:
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 14
    state_dim: 14
    future_action_window_size: 9
    action_horizon: 10
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    use_state: false
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
  spatial_model:
    model_name_or_path: ./checkpoints/vggt
    output_dim: 2048
  spatial_projector:
    hidden_dim: 2048
    output_dim: 2560
  fuser:
    type: cross_attention
  reduce_in_full_precision: true
  use_mv_images: false
  layer_qformer:
    num_layers: 4
    num_query_tokens: 128
    input_dim: 2560
    ouptput_dim: 2560
  image_edit_model:
    model_name_or_path: ./checkpoints/LongCat-Image-Edit
    lora_path: ./checkpoints/Multi-view-VLA/LongCat-lora
    view_num: 2
    fuser_type: mlp_gated_tranformer
    read_from_local: true
    num_inference_steps: 8
datasets:
  vlm_data:
    dataset_py: vlm_datasets
    dataformat: llava_json
    dataset_use: sharegpt4v_coco
    eval_dataset: sharegpt4v_coco
    data_flatten: false
    base_interval: 2
    max_pixels: 307200
    min_pixels: 784
    model_max_length: 2048
    model_type: qwen2.5vl
    per_device_batch_size: 4
  vla_data:
    num_workers: 4
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/xlab-nas-2/vla_dataset/benchmark/libero
    mv_data_root_dir: ./dataset/libero_mv_feats
    data_mix: libero_all_ration
    action_type: delta_qpos
    CoT_prompt: Your task is {instruction}. To identify the key objects for your task. Locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    default_image_resolution:
    - 3
    - 224
    - 224
    per_device_batch_size: 16
    load_all_data_for_training: true
    obs:
    - image_0
    video_backend: torchvision_av
trainer:
  epochs: 100
  max_train_steps: 40000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 1000000
  learning_rate:
    base: 2.5e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
  freeze_modules: spatial_model,image_edit_model
  loss_scale:
    vla: 1.0
    vlm: 0.1
    forcing: 0.2
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_from_checkpoint: null
  pretrained_checkpoint: ./checkpoints/Multi-view-VLA/pretrained_model/checkpoints/steps_14000_pytorch_model.pt
  reload_modules: qwen_vl_interface,action_model
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
vla_data:
  video_backend: torchvision_av
output_dir: ./checkpoints/0428_liberoall_Qwen3vlGR00TAML_vggt_longcat_view2_mlp_gated_tranformer_bs16_4gpus_reload_vlm_action_ration
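# The sketch below shows one way the `trainer` block could be consumed: per-module
# learning rates from `trainer.learning_rate`, `freeze_modules` excluding the spatial
# and image-edit models from optimization, and the cosine_with_min_lr schedule built
# via transformers' `get_scheduler`. It is a minimal sketch, not the repository's
# actual training code; the file name `config.yaml`, the submodule prefixes
# (`qwen_vl_interface`, `action_model`, ...), and the surrounding `model` object are
# assumptions used for illustration only.
#
# ```python
# import yaml
# import torch
# from transformers import get_scheduler  # cosine_with_min_lr needs a recent transformers release
#
# with open("config.yaml") as f:          # assumed file name for the config above
#     cfg = yaml.safe_load(f)
#
# trainer_cfg = cfg["trainer"]
# lr_cfg = trainer_cfg["learning_rate"]
# frozen = set(trainer_cfg["freeze_modules"].split(","))  # {"spatial_model", "image_edit_model"}
#
# def param_groups(model):
#     """Group parameters by submodule prefix so each gets its configured learning rate."""
#     groups = {"qwen_vl_interface": [], "action_model": [], "base": []}
#     for name, p in model.named_parameters():
#         if any(name.startswith(m) for m in frozen):
#             p.requires_grad_(False)       # frozen modules receive no updates
#             continue
#         key = next((k for k in ("qwen_vl_interface", "action_model")
#                     if name.startswith(k)), "base")
#         groups[key].append(p)
#     return [{"params": ps, "lr": lr_cfg[k]} for k, ps in groups.items() if ps]
#
# opt_cfg = trainer_cfg["optimizer"]
# optimizer = torch.optim.AdamW(
#     param_groups(model),                  # `model` is assumed to exist with these submodules
#     betas=tuple(opt_cfg["betas"]),        # (0.9, 0.95)
#     eps=opt_cfg["eps"],
#     weight_decay=opt_cfg["weight_decay"],
# )
#
# scheduler = get_scheduler(
#     trainer_cfg["lr_scheduler_type"],                  # "cosine_with_min_lr"
#     optimizer=optimizer,
#     num_warmup_steps=trainer_cfg["num_warmup_steps"],  # 5000
#     num_training_steps=trainer_cfg["max_train_steps"], # 40000
#     scheduler_specific_kwargs=trainer_cfg["scheduler_specific_kwargs"],  # {"min_lr": 1e-6}
# )
# ```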