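# Training configuration for run 0428_liberoall: a multi-view VLA policy
# (Qwen3-VL backbone, DiT action head, VGGT spatial features, LongCat image
# editing) fine-tuned on the LIBERO benchmark, logging to JSONL and W&B.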
run_id: 0428_liberoall
run_root_dir: ./checkpoints
seed: 42
trackers:
- jsonl
- wandb
wandb_entity: junjin
wandb_project: 0428_liberoall
is_debug: false
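# Model composition: QwenAML wires together a Qwen3-VL-4B interface, a
# DINOv2 ViT-S/14 encoder, a DiT-B diffusion action head, a VGGT spatial
# model, and a LongCat image-edit model.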
framework:
  name: QwenAML
  qwenvl:
    base_vlm: ./checkpoints/Qwen3-VL-4B-Instruct-Action
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 14
    state_dim: 14
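    # Horizon bookkeeping: the 10-step action chunk presumably covers the
    # current action plus the 9 future actions below, with no past-action
    # history.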
    future_action_window_size: 9
    action_horizon: 10
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
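    # Few-step sampling: time is discretized into 1000 buckets for training,
    # but inference denoises in only 4 steps (a common few-step diffusion
    # setup for low-latency control).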
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    use_state: false
    diffusion_model_cfg:
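      # cross_attention_dim matches qwenvl.vl_hidden_dim (2048), so the DiT
      # presumably cross-attends over VLM hidden states for conditioning.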
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
  spatial_model:
    model_name_or_path: ./checkpoints/vggt
    output_dim: 2048
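  # Projects the 2048-dim VGGT features up to 2560, the width the
  # layer_qformer below consumes and emits.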
  spatial_projector:
    hidden_dim: 2048
    output_dim: 2560
  fuser:
    type: cross_attention
  reduce_in_full_precision: true
  use_mv_images: false
  layer_qformer:
    num_layers: 4
    num_query_tokens: 128
    input_dim: 2560
    output_dim: 2560
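  # LongCat-Image-Edit with a LoRA adapter renders 2 views in 8 diffusion
  # steps; it stays frozen throughout training (see trainer.freeze_modules).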
  image_edit_model:
    model_name_or_path: ./checkpoints/LongCat-Image-Edit
    lora_path: ./checkpoints/Multi-view-VLA/LongCat-lora
    view_num: 2
    fuser_type: mlp_gated_tranformer
    read_from_local: true
    num_inference_steps: 8
datasets:
  vlm_data:
    dataset_py: vlm_datasets
    dataformat: llava_json
    dataset_use: sharegpt4v_coco
    eval_dataset: sharegpt4v_coco
    data_flatten: false
    base_interval: 2
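    # Per-image pixel budget: max_pixels 307200 = 640x480; min_pixels 784 =
    # 28x28, i.e. one merged vision patch, assuming the standard Qwen-VL
    # 14px patches with 2x2 merging.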
    max_pixels: 307200
    min_pixels: 784
    model_max_length: 2048
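    # model_type stays qwen2.5vl despite the Qwen3-VL backbone, presumably
    # because the dataset pipeline reuses the Qwen2.5-VL preprocessing path.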
    model_type: qwen2.5vl
    per_device_batch_size: 4
  vla_data:
    num_workers: 4
    dataset_py: lerobot_datasets
    data_root_dir: /mnt/xlab-nas-2/vla_dataset/benchmark/libero
    mv_data_root_dir: ./dataset/libero_mv_feats
    data_mix: libero_all_ration
    action_type: delta_qpos
    CoT_prompt: Your task is {instruction}. Identify the key objects for your
      task and locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
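    # Channels-first (C, H, W) frames at 224x224.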
    default_image_resolution:
    - 3
    - 224
    - 224
    per_device_batch_size: 16
    load_all_data_for_training: true
    obs:
    - image_0
    video_backend: torchvision_av
trainer:
  epochs: 100
  max_train_steps: 40000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 1000000
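  # Per-module learning rates: the VLM interface trains at a gentle 1e-5,
  # the action head at a faster 1e-4, everything else at the 2.5e-5 base.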
  learning_rate:
    base: 2.5e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
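  # Cosine decay to a 1e-6 floor; these fields mirror the name and
  # scheduler_specific_kwargs arguments of transformers' get_scheduler.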
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
  freeze_modules: spatial_model,image_edit_model
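  # Loss weighting, presumably summed as:
  #   total = 1.0 * vla + 0.1 * vlm + 0.2 * forcing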
  loss_scale:
    vla: 1.0
    vlm: 0.1
    forcing: 0.2
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
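  # Note: num_warmup_steps (5000) and warmup_ratio (0.1 of 40000 = 4000)
  # disagree, and max_grad_norm duplicates gradient_clipping; presumably the
  # explicit step count and a single clip value of 1.0 are what apply.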
  optimizer:
    name: AdamW
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
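    # Note: weight_decay also appears at trainer level (0.0); presumably
    # this optimizer-level value is the one actually applied.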
    weight_decay: 1.0e-08
  is_resume: false
  resume_from_checkpoint: null
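  # Warm start: load the 14k-step pretraining checkpoint but reload only the
  # VLM interface and action head; the frozen spatial and image-edit models
  # keep their own pretrained weights.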
  pretrained_checkpoint: ./checkpoints/Multi-view-VLA/pretrained_model/checkpoints/steps_14000_pytorch_model.pt
  reload_modules: qwen_vl_interface,action_model
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
  vla_data:
    video_backend: torchvision_av
output_dir: ./checkpoints/0428_liberoall_Qwen3vlGR00TAML_vggt_longcat_view2_mlp_gated_tranformer_bs16_4gpus_reload_vlm_action_ration