Cognition2ActionLab committed on
Commit
c96216d
·
verified ·
1 Parent(s): 619cc46

Upload folder using huggingface_hub

Browse files
libero_10_2B/starvla_qwen_dual/config.yaml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_dual
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_10_2B
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: false
10
+ mee_weight: 0.01
11
+ framework:
12
+ name: Qwen-Dual
13
+ qwenvl:
14
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
15
+ attn_implementation: flash_attention_2
16
+ vl_hidden_dim: 2048
17
+ dino:
18
+ dino_backbone: dinov2_vits14
19
+ action_model:
20
+ action_model_type: DiT-B
21
+ action_hidden_dim: 2
22
+ hidden_size: 1024
23
+ add_pos_embed: true
24
+ max_seq_len: 1024
25
+ action_dim: 7
26
+ state_dim: 7
27
+ future_action_window_size: 7
28
+ action_horizon: 8
29
+ past_action_window_size: 0
30
+ repeated_diffusion_steps: 8
31
+ noise_beta_alpha: 1.5
32
+ noise_beta_beta: 1.0
33
+ noise_s: 0.999
34
+ num_timestep_buckets: 1000
35
+ num_inference_timesteps: 4
36
+ num_target_vision_tokens: 32
37
+ diffusion_model_cfg:
38
+ cross_attention_dim: 2048
39
+ dropout: 0.2
40
+ final_dropout: true
41
+ interleave_self_attention: true
42
+ norm_type: ada_norm
43
+ num_layers: 16
44
+ output_dim: 1024
45
+ positional_embeddings: null
46
+ reduce_in_full_precision: true
47
+ datasets:
48
+ vlm_data:
49
+ dataset_py: vlm_datasets
50
+ dataformat: llava_json
51
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
52
+ eval_dataset: aokvqa_cauldron_llava_format
53
+ data_flatten: false
54
+ base_interval: 2
55
+ max_pixels: 12845056
56
+ min_pixels: 3136
57
+ model_max_length: 2048
58
+ model_type: qwen2.5vl
59
+ per_device_batch_size: 4
60
+ vla_data:
61
+ dataset_py: lerobot_datasets
62
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
63
+ data_mix: libero_10
64
+ action_type: delta_qpos
65
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
66
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
67
+ CoT_answer: bbox
68
+ default_image_resolution:
69
+ - 3
70
+ - 224
71
+ - 224
72
+ per_device_batch_size: 16
73
+ load_all_data_for_training: true
74
+ obs:
75
+ - image_0
76
+ trainer:
77
+ epochs: 100
78
+ max_train_steps: 30000
79
+ num_warmup_steps: 5000
80
+ save_interval: 60000
81
+ eval_interval: 30000
82
+ learning_rate:
83
+ base: 4.0e-05
84
+ qwen_vl_interface: 1.0e-05
85
+ action_model: 0.0001
86
+ lr_scheduler_type: cosine_with_min_lr
87
+ scheduler_specific_kwargs:
88
+ min_lr: 1.0e-06
89
+ freeze_modules: null
90
+ loss_scale:
91
+ vla: 1.0
92
+ vlm: 0.1
93
+ max_grad_norm: 1.0
94
+ warmup_ratio: 0.1
95
+ weight_decay: 0.0
96
+ logging_frequency: 100
97
+ gradient_clipping: 1.0
98
+ gradient_accumulation_steps: 1
99
+ optimizer:
100
+ name: AdamW
101
+ betas:
102
+ - 0.9
103
+ - 0.95
104
+ eps: 1.0e-08
105
+ weight_decay: 1.0e-08
106
+ is_resume: false
107
+ resume_epoch: null
108
+ resume_step: null
109
+ enable_gradient_checkpointing: true
110
+ enable_mixed_precision_training: true
111
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_10_2B/starvla_qwen_dual
libero_goal_2B/starvla_qwen_dual/config.yaml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_dual
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_goal_2B
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: false
10
+ mee_weight: 0.01
11
+ framework:
12
+ name: Qwen-Dual
13
+ qwenvl:
14
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
15
+ attn_implementation: flash_attention_2
16
+ vl_hidden_dim: 2048
17
+ dino:
18
+ dino_backbone: dinov2_vits14
19
+ action_model:
20
+ action_model_type: DiT-B
21
+ action_hidden_dim: 2
22
+ hidden_size: 1024
23
+ add_pos_embed: true
24
+ max_seq_len: 1024
25
+ action_dim: 7
26
+ state_dim: 7
27
+ future_action_window_size: 7
28
+ action_horizon: 8
29
+ past_action_window_size: 0
30
+ repeated_diffusion_steps: 8
31
+ noise_beta_alpha: 1.5
32
+ noise_beta_beta: 1.0
33
+ noise_s: 0.999
34
+ num_timestep_buckets: 1000
35
+ num_inference_timesteps: 4
36
+ num_target_vision_tokens: 32
37
+ diffusion_model_cfg:
38
+ cross_attention_dim: 2048
39
+ dropout: 0.2
40
+ final_dropout: true
41
+ interleave_self_attention: true
42
+ norm_type: ada_norm
43
+ num_layers: 16
44
+ output_dim: 1024
45
+ positional_embeddings: null
46
+ reduce_in_full_precision: true
47
+ datasets:
48
+ vlm_data:
49
+ dataset_py: vlm_datasets
50
+ dataformat: llava_json
51
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
52
+ eval_dataset: aokvqa_cauldron_llava_format
53
+ data_flatten: false
54
+ base_interval: 2
55
+ max_pixels: 12845056
56
+ min_pixels: 3136
57
+ model_max_length: 2048
58
+ model_type: qwen2.5vl
59
+ per_device_batch_size: 4
60
+ vla_data:
61
+ dataset_py: lerobot_datasets
62
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
63
+ data_mix: libero_goal
64
+ action_type: delta_qpos
65
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
66
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
67
+ CoT_answer: bbox
68
+ default_image_resolution:
69
+ - 3
70
+ - 224
71
+ - 224
72
+ per_device_batch_size: 16
73
+ load_all_data_for_training: true
74
+ obs:
75
+ - image_0
76
+ trainer:
77
+ epochs: 100
78
+ max_train_steps: 30000
79
+ num_warmup_steps: 5000
80
+ save_interval: 60000
81
+ eval_interval: 30000
82
+ learning_rate:
83
+ base: 4.0e-05
84
+ qwen_vl_interface: 1.0e-05
85
+ action_model: 0.0001
86
+ lr_scheduler_type: cosine_with_min_lr
87
+ scheduler_specific_kwargs:
88
+ min_lr: 1.0e-06
89
+ freeze_modules: null
90
+ loss_scale:
91
+ vla: 1.0
92
+ vlm: 0.1
93
+ max_grad_norm: 1.0
94
+ warmup_ratio: 0.1
95
+ weight_decay: 0.0
96
+ logging_frequency: 100
97
+ gradient_clipping: 1.0
98
+ gradient_accumulation_steps: 1
99
+ optimizer:
100
+ name: AdamW
101
+ betas:
102
+ - 0.9
103
+ - 0.95
104
+ eps: 1.0e-08
105
+ weight_decay: 1.0e-08
106
+ is_resume: false
107
+ resume_epoch: null
108
+ resume_step: null
109
+ enable_gradient_checkpointing: true
110
+ enable_mixed_precision_training: true
111
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_goal_2B/starvla_qwen_dual
libero_object_2B/starvla_qwen_dual/config.yaml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_dual
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_object_2B
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: false
10
+ mee_weight: 0.01
11
+ framework:
12
+ name: Qwen-Dual
13
+ qwenvl:
14
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
15
+ attn_implementation: flash_attention_2
16
+ vl_hidden_dim: 2048
17
+ dino:
18
+ dino_backbone: dinov2_vits14
19
+ action_model:
20
+ action_model_type: DiT-B
21
+ action_hidden_dim: 2
22
+ hidden_size: 1024
23
+ add_pos_embed: true
24
+ max_seq_len: 1024
25
+ action_dim: 7
26
+ state_dim: 7
27
+ future_action_window_size: 7
28
+ action_horizon: 8
29
+ past_action_window_size: 0
30
+ repeated_diffusion_steps: 8
31
+ noise_beta_alpha: 1.5
32
+ noise_beta_beta: 1.0
33
+ noise_s: 0.999
34
+ num_timestep_buckets: 1000
35
+ num_inference_timesteps: 4
36
+ num_target_vision_tokens: 32
37
+ diffusion_model_cfg:
38
+ cross_attention_dim: 2048
39
+ dropout: 0.2
40
+ final_dropout: true
41
+ interleave_self_attention: true
42
+ norm_type: ada_norm
43
+ num_layers: 16
44
+ output_dim: 1024
45
+ positional_embeddings: null
46
+ reduce_in_full_precision: true
47
+ datasets:
48
+ vlm_data:
49
+ dataset_py: vlm_datasets
50
+ dataformat: llava_json
51
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
52
+ eval_dataset: aokvqa_cauldron_llava_format
53
+ data_flatten: false
54
+ base_interval: 2
55
+ max_pixels: 12845056
56
+ min_pixels: 3136
57
+ model_max_length: 2048
58
+ model_type: qwen2.5vl
59
+ per_device_batch_size: 4
60
+ vla_data:
61
+ dataset_py: lerobot_datasets
62
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
63
+ data_mix: libero_object
64
+ action_type: delta_qpos
65
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
66
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
67
+ CoT_answer: bbox
68
+ default_image_resolution:
69
+ - 3
70
+ - 224
71
+ - 224
72
+ per_device_batch_size: 16
73
+ load_all_data_for_training: true
74
+ obs:
75
+ - image_0
76
+ trainer:
77
+ epochs: 100
78
+ max_train_steps: 30000
79
+ num_warmup_steps: 5000
80
+ save_interval: 60000
81
+ eval_interval: 30000
82
+ learning_rate:
83
+ base: 4.0e-05
84
+ qwen_vl_interface: 1.0e-05
85
+ action_model: 0.0001
86
+ lr_scheduler_type: cosine_with_min_lr
87
+ scheduler_specific_kwargs:
88
+ min_lr: 1.0e-06
89
+ freeze_modules: null
90
+ loss_scale:
91
+ vla: 1.0
92
+ vlm: 0.1
93
+ max_grad_norm: 1.0
94
+ warmup_ratio: 0.1
95
+ weight_decay: 0.0
96
+ logging_frequency: 100
97
+ gradient_clipping: 1.0
98
+ gradient_accumulation_steps: 1
99
+ optimizer:
100
+ name: AdamW
101
+ betas:
102
+ - 0.9
103
+ - 0.95
104
+ eps: 1.0e-08
105
+ weight_decay: 1.0e-08
106
+ is_resume: false
107
+ resume_epoch: null
108
+ resume_step: null
109
+ enable_gradient_checkpointing: true
110
+ enable_mixed_precision_training: true
111
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_object_2B/starvla_qwen_dual
libero_spatial_2B/starvla_qwen_dual/config.yaml ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_dual
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_spatial_2B
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: false
10
+ mee_weight: 0.01
11
+ framework:
12
+ name: Qwen-Dual
13
+ qwenvl:
14
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
15
+ attn_implementation: flash_attention_2
16
+ vl_hidden_dim: 2048
17
+ dino:
18
+ dino_backbone: dinov2_vits14
19
+ action_model:
20
+ action_model_type: DiT-B
21
+ action_hidden_dim: 2
22
+ hidden_size: 1024
23
+ add_pos_embed: true
24
+ max_seq_len: 1024
25
+ action_dim: 7
26
+ state_dim: 7
27
+ future_action_window_size: 7
28
+ action_horizon: 8
29
+ past_action_window_size: 0
30
+ repeated_diffusion_steps: 8
31
+ noise_beta_alpha: 1.5
32
+ noise_beta_beta: 1.0
33
+ noise_s: 0.999
34
+ num_timestep_buckets: 1000
35
+ num_inference_timesteps: 4
36
+ num_target_vision_tokens: 32
37
+ diffusion_model_cfg:
38
+ cross_attention_dim: 2048
39
+ dropout: 0.2
40
+ final_dropout: true
41
+ interleave_self_attention: true
42
+ norm_type: ada_norm
43
+ num_layers: 16
44
+ output_dim: 1024
45
+ positional_embeddings: null
46
+ reduce_in_full_precision: true
47
+ datasets:
48
+ vlm_data:
49
+ dataset_py: vlm_datasets
50
+ dataformat: llava_json
51
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
52
+ eval_dataset: aokvqa_cauldron_llava_format
53
+ data_flatten: false
54
+ base_interval: 2
55
+ max_pixels: 12845056
56
+ min_pixels: 3136
57
+ model_max_length: 2048
58
+ model_type: qwen2.5vl
59
+ per_device_batch_size: 4
60
+ vla_data:
61
+ dataset_py: lerobot_datasets
62
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
63
+ data_mix: libero_spatial
64
+ action_type: delta_qpos
65
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
66
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
67
+ CoT_answer: bbox
68
+ default_image_resolution:
69
+ - 3
70
+ - 224
71
+ - 224
72
+ per_device_batch_size: 16
73
+ load_all_data_for_training: true
74
+ obs:
75
+ - image_0
76
+ trainer:
77
+ epochs: 100
78
+ max_train_steps: 30000
79
+ num_warmup_steps: 5000
80
+ save_interval: 60000
81
+ eval_interval: 30000
82
+ learning_rate:
83
+ base: 4.0e-05
84
+ qwen_vl_interface: 1.0e-05
85
+ action_model: 0.0001
86
+ lr_scheduler_type: cosine_with_min_lr
87
+ scheduler_specific_kwargs:
88
+ min_lr: 1.0e-06
89
+ freeze_modules: null
90
+ loss_scale:
91
+ vla: 1.0
92
+ vlm: 0.1
93
+ max_grad_norm: 1.0
94
+ warmup_ratio: 0.1
95
+ weight_decay: 0.0
96
+ logging_frequency: 100
97
+ gradient_clipping: 1.0
98
+ gradient_accumulation_steps: 1
99
+ optimizer:
100
+ name: AdamW
101
+ betas:
102
+ - 0.9
103
+ - 0.95
104
+ eps: 1.0e-08
105
+ weight_decay: 1.0e-08
106
+ is_resume: false
107
+ resume_epoch: null
108
+ resume_step: null
109
+ enable_gradient_checkpointing: true
110
+ enable_mixed_precision_training: true
111
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_2/libero_spatial_2B/starvla_qwen_dual