run_id: 0428_liberoall
run_root_dir: ./checkpoints
seed: 42
trackers:
- jsonl
- wandb
wandb_entity: junjin
wandb_project: 0428_liberoall
is_debug: false
framework:
  name: QwenAML
  qwenvl:
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 14
    state_dim: 14
    future_action_window_size: 9
    action_horizon: 10
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    use_state: false
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
  spatial_model:
    model_name_or_path: ./checkpoints/vggt
    output_dim: 2048
  spatial_projector:
    hidden_dim: 2048
    output_dim: 2560
  fuser:
    type: cross_attention
  reduce_in_full_precision: true
  use_mv_images: false
  layer_qformer:
    num_layers: 4
    num_query_tokens: 128
    input_dim: 2560
    ouptput_dim: 2560
  image_edit_model:
    model_name_or_path: ./checkpoints/LongCat-Image-Edit
    lora_path: ./checkpoints/Multi-view-VLA/LongCat-lora
    view_num: 2
    fuser_type: mlp_gated_tranformer
    read_from_local: true
    num_inference_steps: 8
datasets:
  vlm_data:
    dataset_py: vlm_datasets
    dataformat: llava_json
    dataset_use: sharegpt4v_coco
    eval_dataset: sharegpt4v_coco
    data_flatten: false
    base_interval: 2
    max_pixels: 307200
    min_pixels: 784
    model_max_length: 2048
    model_type: qwen2.5vl
    per_device_batch_size: 4
  vla_data:
    num_workers: 4
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/xlab-nas-2/vla_dataset/benchmark/libero
    mv_data_root_dir: ./dataset/libero_mv_feats
    data_mix: libero_all_ration
    action_type: delta_qpos
    CoT_prompt: Your task is {instruction}. To identify the key objects for your task. Locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    default_image_resolution:
    - 3
    - 224
    - 224
    per_device_batch_size: 16
    load_all_data_for_training: true
    obs:
    - image_0
    video_backend: torchvision_av
trainer:
  epochs: 100
  max_train_steps: 40000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 1000000
  learning_rate:
    base: 2.5e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
  freeze_modules: spatial_model,image_edit_model
  loss_scale:
    vla: 1.0
    vlm: 0.1
    forcing: 0.2
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_from_checkpoint: null
  pretrained_checkpoint: ./checkpoints/Multi-view-VLA/pretrained_model/checkpoints/steps_14000_pytorch_model.pt
  reload_modules: qwen_vl_interface,action_model
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
vla_data:
  video_backend: torchvision_av
output_dir: ./checkpoints/0428_liberoall_Qwen3vlGR00TAML_vggt_longcat_view2_mlp_gated_tranformer_bs16_4gpus_reload_vlm_action_ration
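# The sketch below shows one way the `trainer` block could be consumed: per-module
# learning rates from `trainer.learning_rate`, `freeze_modules` excluding the spatial
# and image-edit models from optimization, and the cosine_with_min_lr schedule built
# via transformers' `get_scheduler`. It is a minimal sketch, not the repository's
# actual training code; the file name `config.yaml`, the submodule prefixes
# (`qwen_vl_interface`, `action_model`, ...), and the surrounding `model` object are
# assumptions used for illustration only.
#
# ```python
# import yaml
# import torch
# from transformers import get_scheduler  # cosine_with_min_lr needs a recent transformers release
#
# with open("config.yaml") as f:          # assumed file name for the config above
#     cfg = yaml.safe_load(f)
#
# trainer_cfg = cfg["trainer"]
# lr_cfg = trainer_cfg["learning_rate"]
# frozen = set(trainer_cfg["freeze_modules"].split(","))  # {"spatial_model", "image_edit_model"}
#
# def param_groups(model):
#     """Group parameters by submodule prefix so each gets its configured learning rate."""
#     groups = {"qwen_vl_interface": [], "action_model": [], "base": []}
#     for name, p in model.named_parameters():
#         if any(name.startswith(m) for m in frozen):
#             p.requires_grad_(False)       # frozen modules receive no updates
#             continue
#         key = next((k for k in ("qwen_vl_interface", "action_model")
#                     if name.startswith(k)), "base")
#         groups[key].append(p)
#     return [{"params": ps, "lr": lr_cfg[k]} for k, ps in groups.items() if ps]
#
# opt_cfg = trainer_cfg["optimizer"]
# optimizer = torch.optim.AdamW(
#     param_groups(model),                  # `model` is assumed to exist with these submodules
#     betas=tuple(opt_cfg["betas"]),        # (0.9, 0.95)
#     eps=opt_cfg["eps"],
#     weight_decay=opt_cfg["weight_decay"],
# )
#
# scheduler = get_scheduler(
#     trainer_cfg["lr_scheduler_type"],                  # "cosine_with_min_lr"
#     optimizer=optimizer,
#     num_warmup_steps=trainer_cfg["num_warmup_steps"],  # 5000
#     num_training_steps=trainer_cfg["max_train_steps"], # 40000
#     scheduler_specific_kwargs=trainer_cfg["scheduler_specific_kwargs"],  # {"min_lr": 1e-6}
# )
# ```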