run_id: 0428_liberoall
run_root_dir: ./checkpoints
seed: 42
trackers:
  - jsonl
  - wandb
wandb_entity: junjin
wandb_project: 0428_liberoall
is_debug: false
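# Model stack: a Qwen3-VL-4B backbone, a DINOv2 ViT-S/14 encoder, a DiT-B
# diffusion action head, a VGGT spatial branch, and a LongCat image-edit
# model, composed as the QwenAML framework.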
framework:
  name: QwenAML
  qwenvl:
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
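  # Diffusion action head. action_horizon (10) matches 1 current step plus
  # future_action_window_size (9), i.e. the model predicts chunks of 10
  # 14-dim actions; past actions are not conditioned on.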
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 14
    state_dim: 14
    future_action_window_size: 9
    action_horizon: 10
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    use_state: false
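    # DiT internals: 16 blocks with AdaLN conditioning; cross_attention_dim
    # (2048) matches vl_hidden_dim above, suggesting the head cross-attends
    # to the VL backbone's hidden states.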
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
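  # VGGT spatial branch: 2048-dim features projected to 2560 by the
  # spatial_projector, then merged via a cross-attention fuser and a
  # 4-layer Q-Former with 128 query tokens (presumably into the 2560-dim
  # fused stream).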
  spatial_model:
    model_name_or_path: ./checkpoints/vggt
    output_dim: 2048
    spatial_projector:
      hidden_dim: 2048
      output_dim: 2560
    fuser:
      type: cross_attention
    reduce_in_full_precision: true
    use_mv_images: false
    layer_qformer:
      num_layers: 4
      num_query_tokens: 128
      input_dim: 2560
      output_dim: 2560
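  # LongCat image-edit model with a task-specific LoRA. view_num (2) sets
  # the number of views and fuser_type how they are combined; 8 denoising
  # steps are used at inference.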
  image_edit_model:
    model_name_or_path: ./checkpoints/LongCat-Image-Edit
    lora_path: ./checkpoints/Multi-view-VLA/LongCat-lora
    view_num: 2
    fuser_type: mlp_gated_tranformer
    read_from_local: true
    num_inference_steps: 8
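# Two training streams: VLM co-training data (ShareGPT4V-COCO) and VLA robot
# data (LIBERO), weighted by trainer.loss_scale below. model_type still
# reads qwen2.5vl, possibly a holdover from an earlier backbone.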
datasets:
  vlm_data:
    dataset_py: vlm_datasets
    dataformat: llava_json
    dataset_use: sharegpt4v_coco
    eval_dataset: sharegpt4v_coco
    data_flatten: false
    base_interval: 2
    max_pixels: 307200
    min_pixels: 784
    model_max_length: 2048
    model_type: qwen2.5vl
    per_device_batch_size: 4
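  # Robot-trajectory stream in LeRobot format: delta joint-position actions
  # (delta_qpos), a single 3x224x224 camera view (image_0), and a CoT prompt
  # that grounds task-relevant objects with bounding boxes before acting.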
  vla_data:
    num_workers: 4
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/xlab-nas-2/vla_dataset/benchmark/libero
    mv_data_root_dir: ./dataset/libero_mv_feats
    data_mix: libero_all_ration
    action_type: delta_qpos
    CoT_prompt: Your task is {instruction}. Identify the key objects for your
      task and locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    default_image_resolution:
      - 3
      - 224
      - 224
    per_device_batch_size: 16
    load_all_data_for_training: true
    obs:
      - image_0
    video_backend: torchvision_av
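# Optimization. Effective VLA batch size is 16 per device x 1 gradient
# accumulation step x world size; with the 4 GPUs implied by the output_dir
# name, that is 64 samples per update. eval_interval (1000000) exceeds
# max_train_steps (40000), so in-training evaluation is effectively disabled.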
trainer:
  epochs: 100
  max_train_steps: 40000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 1000000
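  # Per-module learning rates: the scheduler warms up for num_warmup_steps
  # (5000), then cosine-decays each group from its LR toward min_lr (1e-6)
  # over the remaining 35000 of 40000 steps. num_warmup_steps presumably
  # takes precedence over warmup_ratio (0.1, which would imply 4000 steps).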
  learning_rate:
    base: 2.5e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
  freeze_modules: spatial_model,image_edit_model
  loss_scale:
    vla: 1.0
    vlm: 0.1
    forcing: 0.2
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
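  # AdamW with beta2 = 0.95, a common choice for large-model training. The
  # 1e-8 weight decay here is effectively zero and appears to supersede the
  # trainer-level weight_decay of 0.0 above.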
  optimizer:
    name: AdamW
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
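  # Fresh run rather than a resume: qwen_vl_interface and action_model are
  # reloaded from the 14k-step Multi-view-VLA pretraining checkpoint, while
  # spatial_model and image_edit_model remain frozen (freeze_modules above).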
  is_resume: false
  resume_from_checkpoint: null
  pretrained_checkpoint: ./checkpoints/Multi-view-VLA/pretrained_model/checkpoints/steps_14000_pytorch_model.pt
  reload_modules: qwen_vl_interface,action_model
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
vla_data:
  video_backend: torchvision_av
output_dir: ./checkpoints/0428_liberoall_Qwen3vlGR00TAML_vggt_longcat_view2_mlp_gated_tranformer_bs16_4gpus_reload_vlm_action_ration