Cognition2ActionLab committed on
Commit
e4e4b43
·
verified ·
1 Parent(s): 723903a

Upload folder using huggingface_hub

Browse files
libero_10_2B_mee1e-2/starvla_qwen_oft/config.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_oft
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_10_2B_mee1e-2
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: true
10
+ mee_weight: 0.01
11
+ enable_adaptive: true
12
+ framework:
13
+ name: QwenOFT
14
+ qwenvl:
15
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
16
+ attn_implementation: flash_attention_2
17
+ vl_hidden_dim: 2048
18
+ dino:
19
+ dino_backbone: dinov2_vits14
20
+ action_model:
21
+ action_model_type: DiT-B
22
+ action_hidden_dim: 2
23
+ hidden_size: 1024
24
+ add_pos_embed: true
25
+ max_seq_len: 1024
26
+ action_dim: 7
27
+ state_dim: 7
28
+ future_action_window_size: 7
29
+ action_horizon: 8
30
+ past_action_window_size: 0
31
+ repeated_diffusion_steps: 8
32
+ noise_beta_alpha: 1.5
33
+ noise_beta_beta: 1.0
34
+ noise_s: 0.999
35
+ num_timestep_buckets: 1000
36
+ num_inference_timesteps: 4
37
+ num_target_vision_tokens: 32
38
+ diffusion_model_cfg:
39
+ cross_attention_dim: 2048
40
+ dropout: 0.2
41
+ final_dropout: true
42
+ interleave_self_attention: true
43
+ norm_type: ada_norm
44
+ num_layers: 16
45
+ output_dim: 1024
46
+ positional_embeddings: null
47
+ reduce_in_full_precision: true
48
+ datasets:
49
+ vlm_data:
50
+ dataset_py: vlm_datasets
51
+ dataformat: llava_json
52
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
53
+ eval_dataset: aokvqa_cauldron_llava_format
54
+ data_flatten: false
55
+ base_interval: 2
56
+ max_pixels: 12845056
57
+ min_pixels: 3136
58
+ model_max_length: 2048
59
+ model_type: qwen2.5vl
60
+ per_device_batch_size: 4
61
+ vla_data:
62
+ dataset_py: lerobot_datasets
63
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
64
+ data_mix: libero_10
65
+ action_type: delta_qpos
66
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
67
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
68
+ CoT_answer: bbox
69
+ default_image_resolution:
70
+ - 3
71
+ - 224
72
+ - 224
73
+ per_device_batch_size: 16
74
+ load_all_data_for_training: true
75
+ obs:
76
+ - image_0
77
+ trainer:
78
+ epochs: 100
79
+ max_train_steps: 30000
80
+ num_warmup_steps: 5000
81
+ save_interval: 60000
82
+ eval_interval: 60000
83
+ learning_rate:
84
+ base: 4.0e-05
85
+ qwen_vl_interface: 1.0e-05
86
+ action_model: 0.0001
87
+ lr_scheduler_type: cosine_with_min_lr
88
+ scheduler_specific_kwargs:
89
+ min_lr: 1.0e-06
90
+ freeze_modules: null
91
+ loss_scale:
92
+ vla: 1.0
93
+ vlm: 0.1
94
+ max_grad_norm: 1.0
95
+ warmup_ratio: 0.1
96
+ weight_decay: 0.0
97
+ logging_frequency: 100
98
+ gradient_clipping: 1.0
99
+ gradient_accumulation_steps: 1
100
+ optimizer:
101
+ name: AdamW
102
+ betas:
103
+ - 0.9
104
+ - 0.95
105
+ eps: 1.0e-08
106
+ weight_decay: 1.0e-08
107
+ is_resume: false
108
+ resume_epoch: null
109
+ resume_step: null
110
+ enable_gradient_checkpointing: true
111
+ enable_mixed_precision_training: true
112
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_10_2B_mee1e-2/starvla_qwen_oft
libero_goal_2B_mee1e-0/starvla_qwen_oft/config.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_oft
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_goal_2B_mee1e-0
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: true
10
+ mee_weight: 1.0
11
+ enable_adaptive: true
12
+ framework:
13
+ name: QwenOFT
14
+ qwenvl:
15
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
16
+ attn_implementation: flash_attention_2
17
+ vl_hidden_dim: 2048
18
+ dino:
19
+ dino_backbone: dinov2_vits14
20
+ action_model:
21
+ action_model_type: DiT-B
22
+ action_hidden_dim: 2
23
+ hidden_size: 1024
24
+ add_pos_embed: true
25
+ max_seq_len: 1024
26
+ action_dim: 7
27
+ state_dim: 7
28
+ future_action_window_size: 7
29
+ action_horizon: 8
30
+ past_action_window_size: 0
31
+ repeated_diffusion_steps: 8
32
+ noise_beta_alpha: 1.5
33
+ noise_beta_beta: 1.0
34
+ noise_s: 0.999
35
+ num_timestep_buckets: 1000
36
+ num_inference_timesteps: 4
37
+ num_target_vision_tokens: 32
38
+ diffusion_model_cfg:
39
+ cross_attention_dim: 2048
40
+ dropout: 0.2
41
+ final_dropout: true
42
+ interleave_self_attention: true
43
+ norm_type: ada_norm
44
+ num_layers: 16
45
+ output_dim: 1024
46
+ positional_embeddings: null
47
+ reduce_in_full_precision: true
48
+ datasets:
49
+ vlm_data:
50
+ dataset_py: vlm_datasets
51
+ dataformat: llava_json
52
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
53
+ eval_dataset: aokvqa_cauldron_llava_format
54
+ data_flatten: false
55
+ base_interval: 2
56
+ max_pixels: 12845056
57
+ min_pixels: 3136
58
+ model_max_length: 2048
59
+ model_type: qwen2.5vl
60
+ per_device_batch_size: 4
61
+ vla_data:
62
+ dataset_py: lerobot_datasets
63
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
64
+ data_mix: libero_goal
65
+ action_type: delta_qpos
66
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
67
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
68
+ CoT_answer: bbox
69
+ default_image_resolution:
70
+ - 3
71
+ - 224
72
+ - 224
73
+ per_device_batch_size: 16
74
+ load_all_data_for_training: true
75
+ obs:
76
+ - image_0
77
+ trainer:
78
+ epochs: 100
79
+ max_train_steps: 30000
80
+ num_warmup_steps: 5000
81
+ save_interval: 60000
82
+ eval_interval: 60000
83
+ learning_rate:
84
+ base: 4.0e-05
85
+ qwen_vl_interface: 1.0e-05
86
+ action_model: 0.0001
87
+ lr_scheduler_type: cosine_with_min_lr
88
+ scheduler_specific_kwargs:
89
+ min_lr: 1.0e-06
90
+ freeze_modules: null
91
+ loss_scale:
92
+ vla: 1.0
93
+ vlm: 0.1
94
+ max_grad_norm: 1.0
95
+ warmup_ratio: 0.1
96
+ weight_decay: 0.0
97
+ logging_frequency: 100
98
+ gradient_clipping: 1.0
99
+ gradient_accumulation_steps: 1
100
+ optimizer:
101
+ name: AdamW
102
+ betas:
103
+ - 0.9
104
+ - 0.95
105
+ eps: 1.0e-08
106
+ weight_decay: 1.0e-08
107
+ is_resume: false
108
+ resume_epoch: null
109
+ resume_step: null
110
+ enable_gradient_checkpointing: true
111
+ enable_mixed_precision_training: true
112
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_goal_2B_mee1e-0/starvla_qwen_oft
libero_object_2B_mee1e-2/starvla_qwen_oft/config.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_oft
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_object_2B_mee1e-2
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: true
10
+ mee_weight: 0.01
11
+ enable_adaptive: true
12
+ framework:
13
+ name: QwenOFT
14
+ qwenvl:
15
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
16
+ attn_implementation: flash_attention_2
17
+ vl_hidden_dim: 2048
18
+ dino:
19
+ dino_backbone: dinov2_vits14
20
+ action_model:
21
+ action_model_type: DiT-B
22
+ action_hidden_dim: 2
23
+ hidden_size: 1024
24
+ add_pos_embed: true
25
+ max_seq_len: 1024
26
+ action_dim: 7
27
+ state_dim: 7
28
+ future_action_window_size: 7
29
+ action_horizon: 8
30
+ past_action_window_size: 0
31
+ repeated_diffusion_steps: 8
32
+ noise_beta_alpha: 1.5
33
+ noise_beta_beta: 1.0
34
+ noise_s: 0.999
35
+ num_timestep_buckets: 1000
36
+ num_inference_timesteps: 4
37
+ num_target_vision_tokens: 32
38
+ diffusion_model_cfg:
39
+ cross_attention_dim: 2048
40
+ dropout: 0.2
41
+ final_dropout: true
42
+ interleave_self_attention: true
43
+ norm_type: ada_norm
44
+ num_layers: 16
45
+ output_dim: 1024
46
+ positional_embeddings: null
47
+ reduce_in_full_precision: true
48
+ datasets:
49
+ vlm_data:
50
+ dataset_py: vlm_datasets
51
+ dataformat: llava_json
52
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
53
+ eval_dataset: aokvqa_cauldron_llava_format
54
+ data_flatten: false
55
+ base_interval: 2
56
+ max_pixels: 12845056
57
+ min_pixels: 3136
58
+ model_max_length: 2048
59
+ model_type: qwen2.5vl
60
+ per_device_batch_size: 4
61
+ vla_data:
62
+ dataset_py: lerobot_datasets
63
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
64
+ data_mix: libero_object
65
+ action_type: delta_qpos
66
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
67
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
68
+ CoT_answer: bbox
69
+ default_image_resolution:
70
+ - 3
71
+ - 224
72
+ - 224
73
+ per_device_batch_size: 16
74
+ load_all_data_for_training: true
75
+ obs:
76
+ - image_0
77
+ trainer:
78
+ epochs: 100
79
+ max_train_steps: 30000
80
+ num_warmup_steps: 5000
81
+ save_interval: 60000
82
+ eval_interval: 60000
83
+ learning_rate:
84
+ base: 4.0e-05
85
+ qwen_vl_interface: 1.0e-05
86
+ action_model: 0.0001
87
+ lr_scheduler_type: cosine_with_min_lr
88
+ scheduler_specific_kwargs:
89
+ min_lr: 1.0e-06
90
+ freeze_modules: null
91
+ loss_scale:
92
+ vla: 1.0
93
+ vlm: 0.1
94
+ max_grad_norm: 1.0
95
+ warmup_ratio: 0.1
96
+ weight_decay: 0.0
97
+ logging_frequency: 100
98
+ gradient_clipping: 1.0
99
+ gradient_accumulation_steps: 1
100
+ optimizer:
101
+ name: AdamW
102
+ betas:
103
+ - 0.9
104
+ - 0.95
105
+ eps: 1.0e-08
106
+ weight_decay: 1.0e-08
107
+ is_resume: false
108
+ resume_epoch: null
109
+ resume_step: null
110
+ enable_gradient_checkpointing: true
111
+ enable_mixed_precision_training: true
112
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_object_2B_mee1e-2/starvla_qwen_oft
libero_spatial_2B_mee1e1/starvla_qwen_oft/config.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: starvla_qwen_oft
2
+ run_root_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_spatial_2B_mee1e1
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_project: starVLA
8
+ is_debug: false
9
+ enable_mee: true
10
+ mee_weight: 10.0
11
+ enable_adaptive: true
12
+ framework:
13
+ name: QwenOFT
14
+ qwenvl:
15
+ base_vlm: /share/project/baishuanghao/code/HLM-VLA/models/Qwen3-VL-2B-Instruct
16
+ attn_implementation: flash_attention_2
17
+ vl_hidden_dim: 2048
18
+ dino:
19
+ dino_backbone: dinov2_vits14
20
+ action_model:
21
+ action_model_type: DiT-B
22
+ action_hidden_dim: 2
23
+ hidden_size: 1024
24
+ add_pos_embed: true
25
+ max_seq_len: 1024
26
+ action_dim: 7
27
+ state_dim: 7
28
+ future_action_window_size: 7
29
+ action_horizon: 8
30
+ past_action_window_size: 0
31
+ repeated_diffusion_steps: 8
32
+ noise_beta_alpha: 1.5
33
+ noise_beta_beta: 1.0
34
+ noise_s: 0.999
35
+ num_timestep_buckets: 1000
36
+ num_inference_timesteps: 4
37
+ num_target_vision_tokens: 32
38
+ diffusion_model_cfg:
39
+ cross_attention_dim: 2048
40
+ dropout: 0.2
41
+ final_dropout: true
42
+ interleave_self_attention: true
43
+ norm_type: ada_norm
44
+ num_layers: 16
45
+ output_dim: 1024
46
+ positional_embeddings: null
47
+ reduce_in_full_precision: true
48
+ datasets:
49
+ vlm_data:
50
+ dataset_py: vlm_datasets
51
+ dataformat: llava_json
52
+ dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
53
+ eval_dataset: aokvqa_cauldron_llava_format
54
+ data_flatten: false
55
+ base_interval: 2
56
+ max_pixels: 12845056
57
+ min_pixels: 3136
58
+ model_max_length: 2048
59
+ model_type: qwen2.5vl
60
+ per_device_batch_size: 4
61
+ vla_data:
62
+ dataset_py: lerobot_datasets
63
+ data_root_dir: /share/project/baishuanghao/data/libero_lerobot
64
+ data_mix: libero_spatial
65
+ action_type: delta_qpos
66
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
67
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
68
+ CoT_answer: bbox
69
+ default_image_resolution:
70
+ - 3
71
+ - 224
72
+ - 224
73
+ per_device_batch_size: 16
74
+ load_all_data_for_training: true
75
+ obs:
76
+ - image_0
77
+ trainer:
78
+ epochs: 100
79
+ max_train_steps: 30000
80
+ num_warmup_steps: 5000
81
+ save_interval: 60000
82
+ eval_interval: 60000
83
+ learning_rate:
84
+ base: 4.0e-05
85
+ qwen_vl_interface: 1.0e-05
86
+ action_model: 0.0001
87
+ lr_scheduler_type: cosine_with_min_lr
88
+ scheduler_specific_kwargs:
89
+ min_lr: 1.0e-06
90
+ freeze_modules: null
91
+ loss_scale:
92
+ vla: 1.0
93
+ vlm: 0.1
94
+ max_grad_norm: 1.0
95
+ warmup_ratio: 0.1
96
+ weight_decay: 0.0
97
+ logging_frequency: 100
98
+ gradient_clipping: 1.0
99
+ gradient_accumulation_steps: 1
100
+ optimizer:
101
+ name: AdamW
102
+ betas:
103
+ - 0.9
104
+ - 0.95
105
+ eps: 1.0e-08
106
+ weight_decay: 1.0e-08
107
+ is_resume: false
108
+ resume_epoch: null
109
+ resume_step: null
110
+ enable_gradient_checkpointing: true
111
+ enable_mixed_precision_training: true
112
+ output_dir: /share/project/baishuanghao/code/starVLA/pretrained_models_3/libero_spatial_2B_mee1e1/starvla_qwen_oft