---
# Training configuration for run `0323_pretrain_Qwen3VL4BJAT_bs2048`.
#
# Structure restored to block style (2-space indent); the one-line form with
# chained `key: key: value` pairs is not parseable YAML. Keys and values are
# unchanged. Lines marked NOTE(review) are observations to confirm, not
# verified facts.

# Dataset and dataloader settings.
datasets:
  vla_data:
    data_mix: delta_state
    # NOTE(review): absolute, machine-specific mount path.
    data_root_dir: /mnt/xlab-nas-2/vla_dataset
    dataset_py: lerobot_datasets
    # Two-element size; presumably (height, width) in pixels -- confirm.
    image_size:
      - 224
      - 224
    num_workers: 6
    per_device_batch_size: 16
    sequential_step_sampling: false

# Model definition: action head plus vision-language backbone.
framework:
  action_model:
    action_dim: 14
    # NOTE(review): 16 == future_action_window_size (15) + 1; presumably these
    # two values must be kept in sync -- confirm against the consumer.
    action_horizon: 16
    action_model_type: DiT-B
    add_pos_embed: true
    diffusion_model_cfg:
      # NOTE(review): presumably must match the hidden size of
      # framework.qwenvl.base_vlm -- confirm.
      cross_attention_dim: 2560
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
    future_action_window_size: 15
    hidden_size: 1024
    max_seq_len: 1024
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    num_timestep_buckets: 1000
    past_action_window_size: 0
    state_dim: 14
  name: QwenJAT
  qwenvl:
    # Relative path; resolved against the working directory of the launcher.
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action

# Run identity and output locations.
# output_dir equals run_root_dir + "/" + run_id; keep all three consistent
# when renaming the run.
output_dir: /mnt/workspace/lintong.lt/output/vla_pretrain/0323_pretrain_Qwen3VL4BJAT_bs2048
run_id: 0323_pretrain_Qwen3VL4BJAT_bs2048
run_root_dir: /mnt/workspace/lintong.lt/output/vla_pretrain
seed: 42

# Optimisation and checkpointing schedule.
trainer:
  eval_interval: 2000
  freeze_modules: null
  # NOTE(review): run_id says "bs2048"; per_device_batch_size (16) x
  # gradient_accumulation_steps (2) = 32 per device, which would imply
  # 64 devices -- confirm the launch setup matches.
  gradient_accumulation_steps: 2
  gradient_clipping: 1.0
  is_resume: false
  # Per-module learning rates.
  learning_rate:
    action_model: 0.0001
    base: 3.0e-05
    qwen_vl_interface: 1.0e-05
  logging_frequency: 100
  lr_scheduler_type: cosine_with_min_lr
  max_train_steps: 200000
  num_warmup_steps: 2000
  optimizer:
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    # NOTE(review): identical to eps above and effectively zero; confirm this
    # is intentional and not a copy-paste of the eps value.
    weight_decay: 1.0e-08
  repeated_diffusion_steps: 4
  save_interval: 2000
  # Extra arguments for the scheduler named in lr_scheduler_type.
  scheduler_specific_kwargs:
    min_lr: 5.0e-07

# Experiment tracking.
# NOTE(review): `your_wandb_entity` looks like a placeholder -- replace it
# before enabling logging.
wandb_entity: your_wandb_entity
wandb_project: llavavla