ginwind committed on
Commit
ccfcdff
·
verified ·
1 Parent(s): 85a3090

Upload folder using huggingface_hub

Browse files
LIBERO/checkpoints/VLA-JEPA-LIBERO.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b46c8f268905d50944aea9dc0a087400cdfbc401a8ec9ca22921fa91c0dcb841
3
+ size 6163579855
LIBERO/config.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "LIBERO",
3
+ "run_root_dir": "checkpoints",
4
+ "seed": 42,
5
+ "trackers": [
6
+ "json"
7
+ ],
8
+ "is_debug": false,
9
+ "framework": {
10
+ "name": "VLA_JEPA",
11
+ "qwenvl": {
12
+ "base_vlm": "/home/dataset-local/models/Qwen3-VL-2B-Instruct",
13
+ "attn_implementation": "flash_attention_2",
14
+ "vl_hidden_dim": 2048
15
+ },
16
+ "action_model": {
17
+ "action_model_type": "DiT-B",
18
+ "action_hidden_dim": 1024,
19
+ "hidden_size": 1024,
20
+ "add_pos_embed": true,
21
+ "max_seq_len": 1024,
22
+ "action_dim": 7,
23
+ "state_dim": 8,
24
+ "future_action_window_size": 6,
25
+ "action_horizon": 7,
26
+ "past_action_window_size": 0,
27
+ "repeated_diffusion_steps": 8,
28
+ "noise_beta_alpha": 1.5,
29
+ "noise_beta_beta": 1.0,
30
+ "noise_s": 0.999,
31
+ "num_timestep_buckets": 1000,
32
+ "num_inference_timesteps": 4,
33
+ "num_target_vision_tokens": 32,
34
+ "diffusion_model_cfg": {
35
+ "cross_attention_dim": 2048,
36
+ "dropout": 0.2,
37
+ "final_dropout": true,
38
+ "interleave_self_attention": true,
39
+ "norm_type": "ada_norm",
40
+ "num_layers": 16,
41
+ "output_dim": 1024,
42
+ "positional_embeddings": null
43
+ }
44
+ },
45
+ "vj2_model": {
46
+ "base_encoder": "/home/dataset-local/models/vjepa2-vitl-fpc64-256",
47
+ "depth": 12,
48
+ "num_heads": 8,
49
+ "special_action_token": "<|action_{}|>",
50
+ "num_action_tokens_per_timestep": 8,
51
+ "embodied_action_token": "<|embodied_action|>",
52
+ "num_embodied_action_tokens_per_instruction": 32,
53
+ "num_frames": 8
54
+ },
55
+ "reduce_in_full_precision": true
56
+ },
57
+ "datasets": {
58
+ "vla_data": {
59
+ "dataset_py": "lerobot_datasets",
60
+ "data_root_dir": "/home/dataset-local/datasets/LeRobot/LEROBOT_LIBERO_DATA",
61
+ "data_mix": "libero_all",
62
+ "action_type": "delta_qpos",
63
+ "CoT_prompt": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.",
64
+ "resolution_size": 224,
65
+ "per_device_batch_size": 32,
66
+ "video_resolution_size": 256,
67
+ "load_all_data_for_training": true,
68
+ "with_state": true
69
+ }
70
+ },
71
+ "trainer": {
72
+ "epochs": 100,
73
+ "max_train_steps": 30000,
74
+ "num_warmup_steps": 5000,
75
+ "save_interval": 10000,
76
+ "eval_interval": 100,
77
+ "learning_rate": {
78
+ "base": 3e-05,
79
+ "qwen_vl_interface": 1e-05,
80
+ "action_model": 0.0001
81
+ },
82
+ "lr_scheduler_type": "cosine_with_min_lr",
83
+ "scheduler_specific_kwargs": {
84
+ "min_lr": 1e-06
85
+ },
86
+ "freeze_modules": "",
87
+ "loss_scale": {
88
+ "vla": 1.0,
89
+ "vlm": 0.1
90
+ },
91
+ "max_grad_norm": 1.0,
92
+ "warmup_ratio": 0.1,
93
+ "weight_decay": 0.0,
94
+ "logging_frequency": 10,
95
+ "gradient_clipping": 1.0,
96
+ "gradient_accumulation_steps": 1,
97
+ "pretrained_checkpoint": "/home/dataset-local/VLA_JEPA/checkpoints/pretrain/VLA-JEPA-pretrain.pt",
98
+ "optimizer": {
99
+ "name": "AdamW",
100
+ "betas": [
101
+ 0.9,
102
+ 0.95
103
+ ],
104
+ "eps": 1e-08,
105
+ "weight_decay": 1e-08
106
+ },
107
+ "is_resume": false,
108
+ "resume_epoch": null,
109
+ "resume_step": null,
110
+ "enable_gradient_checkpointing": true,
111
+ "enable_mixed_precision_training": true
112
+ },
113
+ "output_dir": "checkpoints/LIBERO"
114
+ }
LIBERO/config.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: LIBERO
2
+ run_root_dir: checkpoints
3
+ seed: 42
4
+ trackers:
5
+ - json
6
+ is_debug: false
7
+ framework:
8
+ name: VLA_JEPA
9
+ qwenvl:
10
+ base_vlm: /home/dataset-local/models/Qwen3-VL-2B-Instruct
11
+ attn_implementation: flash_attention_2
12
+ vl_hidden_dim: 2048
13
+ action_model:
14
+ action_model_type: DiT-B
15
+ action_hidden_dim: 1024
16
+ hidden_size: 1024
17
+ add_pos_embed: true
18
+ max_seq_len: 1024
19
+ action_dim: 7
20
+ state_dim: 8
21
+ future_action_window_size: 6
22
+ action_horizon: 7
23
+ past_action_window_size: 0
24
+ repeated_diffusion_steps: 8
25
+ noise_beta_alpha: 1.5
26
+ noise_beta_beta: 1.0
27
+ noise_s: 0.999
28
+ num_timestep_buckets: 1000
29
+ num_inference_timesteps: 4
30
+ num_target_vision_tokens: 32
31
+ diffusion_model_cfg:
32
+ cross_attention_dim: 2048
33
+ dropout: 0.2
34
+ final_dropout: true
35
+ interleave_self_attention: true
36
+ norm_type: ada_norm
37
+ num_layers: 16
38
+ output_dim: 1024
39
+ positional_embeddings: null
40
+ vj2_model:
41
+ base_encoder: /home/dataset-local/models/vjepa2-vitl-fpc64-256
42
+ depth: 12
43
+ num_heads: 8
44
+ special_action_token: <|action_{}|>
45
+ num_action_tokens_per_timestep: 8
46
+ embodied_action_token: <|embodied_action|>
47
+ num_embodied_action_tokens_per_instruction: 32
48
+ num_frames: 8
49
+ reduce_in_full_precision: true
50
+ datasets:
51
+ vla_data:
52
+ dataset_py: lerobot_datasets
53
+ data_root_dir: /home/dataset-local/datasets/LeRobot/LEROBOT_LIBERO_DATA
54
+ data_mix: libero_all
55
+ action_type: delta_qpos
56
+ CoT_prompt: Your task is {instruction}. Infer the temporal dynamics from frames
57
+ {actions} and produce the corresponding policy actions {e_actions}.
58
+ resolution_size: 224
59
+ per_device_batch_size: 32
60
+ video_resolution_size: 256
61
+ load_all_data_for_training: true
62
+ with_state: true
63
+ trainer:
64
+ epochs: 100
65
+ max_train_steps: 30000
66
+ num_warmup_steps: 5000
67
+ save_interval: 10000
68
+ eval_interval: 100
69
+ learning_rate:
70
+ base: 3.0e-05
71
+ qwen_vl_interface: 1.0e-05
72
+ action_model: 0.0001
73
+ lr_scheduler_type: cosine_with_min_lr
74
+ scheduler_specific_kwargs:
75
+ min_lr: 1.0e-06
76
+ freeze_modules: ''
77
+ loss_scale:
78
+ vla: 1.0
79
+ vlm: 0.1
80
+ max_grad_norm: 1.0
81
+ warmup_ratio: 0.1
82
+ weight_decay: 0.0
83
+ logging_frequency: 10
84
+ gradient_clipping: 1.0
85
+ gradient_accumulation_steps: 1
86
+ pretrained_checkpoint: /home/dataset-local/VLA_JEPA/checkpoints/pretrain/VLA-JEPA-pretrain.pt
87
+ optimizer:
88
+ name: AdamW
89
+ betas:
90
+ - 0.9
91
+ - 0.95
92
+ eps: 1.0e-08
93
+ weight_decay: 1.0e-08
94
+ is_resume: false
95
+ resume_epoch: null
96
+ resume_step: null
97
+ enable_gradient_checkpointing: true
98
+ enable_mixed_precision_training: true
99
+ output_dir: checkpoints/LIBERO
LIBERO/dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "franka": {
3
+ "action": {
4
+ "mean": [
5
+ 0.07237596483901143,
6
+ 0.08987006871029735,
7
+ -0.10144743137061596,
8
+ -0.00045383188989944756,
9
+ 0.006273590726777911,
10
+ -0.003878799732774496,
11
+ 0.524486355483532
12
+ ],
13
+ "std": [
14
+ 0.3498823308902479,
15
+ 0.37794140366375184,
16
+ 0.460084266976933,
17
+ 0.0403885784928603,
18
+ 0.06616144248501059,
19
+ 0.07763074391911857,
20
+ 0.4994683356809767
21
+ ],
22
+ "max": [
23
+ 0.9375,
24
+ 0.9375,
25
+ 0.9375,
26
+ 0.3557142913341522,
27
+ 0.375,
28
+ 0.375,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.9375,
33
+ -0.9375,
34
+ -0.9375,
35
+ -0.2582142949104309,
36
+ -0.375,
37
+ -0.3675000071525574,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.8785714507102966,
42
+ -0.8758928775787354,
43
+ -0.9375,
44
+ -0.1510714292526245,
45
+ -0.20678570866584778,
46
+ -0.2742857038974762,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.9375,
51
+ 0.9107142686843872,
52
+ 0.9375,
53
+ 0.20357142388820648,
54
+ 0.26357144117355347,
55
+ 0.375,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "state": {
69
+ "mean": [
70
+ -0.04889854742214084,
71
+ 0.03689368185587227,
72
+ 0.7890402488410473,
73
+ 2.9771945476531982,
74
+ -0.1417286954820156,
75
+ -0.11769362539052963,
76
+ 0.026436020154505968,
77
+ -0.02665513101965189
78
+ ],
79
+ "std": [
80
+ 0.10639013941746686,
81
+ 0.15115733130675715,
82
+ 0.38406895599530033,
83
+ 0.3530238395244304,
84
+ 0.8227341427331599,
85
+ 0.32357567121520087,
86
+ 0.014583991652936385,
87
+ 0.014467005007200339
88
+ ],
89
+ "max": [
90
+ 0.21031762659549713,
91
+ 0.39128610491752625,
92
+ 1.3660105466842651,
93
+ 3.6714255809783936,
94
+ 3.560650587081909,
95
+ 1.386339545249939,
96
+ 0.04233968257904053,
97
+ 0.0013633022317662835
98
+ ],
99
+ "min": [
100
+ -0.4828203022480011,
101
+ -0.3255046010017395,
102
+ 0.008128180168569088,
103
+ 0.35277295112609863,
104
+ -3.641430377960205,
105
+ -1.842738389968872,
106
+ -0.0013586411951109767,
107
+ -0.042040832340717316
108
+ ],
109
+ "q01": [
110
+ -0.4240104854106903,
111
+ -0.28383004665374756,
112
+ 0.009925739839673042,
113
+ 1.3085840940475464,
114
+ -2.8866775035858154,
115
+ -1.159900426864624,
116
+ 0.001503719249740243,
117
+ -0.040336400270462036
118
+ ],
119
+ "q99": [
120
+ 0.15302616357803345,
121
+ 0.362916499376297,
122
+ 1.2910678386688232,
123
+ 3.3035426139831543,
124
+ 2.7496531009674072,
125
+ 0.6893712878227234,
126
+ 0.04061093553900719,
127
+ -0.0015016930410638452
128
+ ]
129
+ },
130
+ "num_transitions": 272104,
131
+ "num_trajectories": 1693
132
+ }
133
+ }
LIBERO/summary.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"steps": 10000}
2
+ {"steps": 20000}
3
+ {"steps": 30000}
Pretrain/checkpoints/VLA-JEPA-pretrain.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd929c79d9bbd0bda56c0b952c7acb470d93c6241a519013fe5248c3f3ea5fab
3
+ size 6163578232
Pretrain/config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "pretrain",
3
+ "run_root_dir": "checkpoints",
4
+ "seed": 42,
5
+ "trackers": [
6
+ "json"
7
+ ],
8
+ "is_debug": false,
9
+ "framework": {
10
+ "name": "VLA_JEPA",
11
+ "qwenvl": {
12
+ "base_vlm": "/home/dataset-assist-0/algorithm/ginwind/models/Qwen3-VL-2B-Instruct",
13
+ "attn_implementation": "flash_attention_2",
14
+ "vl_hidden_dim": 2048
15
+ },
16
+ "action_model": {
17
+ "action_model_type": "DiT-B",
18
+ "action_hidden_dim": 1024,
19
+ "hidden_size": 1024,
20
+ "add_pos_embed": true,
21
+ "max_seq_len": 1024,
22
+ "action_dim": 7,
23
+ "state_dim": 8,
24
+ "future_action_window_size": 6,
25
+ "action_horizon": 7,
26
+ "past_action_window_size": 0,
27
+ "repeated_diffusion_steps": 8,
28
+ "noise_beta_alpha": 1.5,
29
+ "noise_beta_beta": 1.0,
30
+ "noise_s": 0.999,
31
+ "num_timestep_buckets": 1000,
32
+ "num_inference_timesteps": 4,
33
+ "num_target_vision_tokens": 32,
34
+ "diffusion_model_cfg": {
35
+ "cross_attention_dim": 2048,
36
+ "dropout": 0.2,
37
+ "final_dropout": true,
38
+ "interleave_self_attention": true,
39
+ "norm_type": "ada_norm",
40
+ "num_layers": 16,
41
+ "output_dim": 1024,
42
+ "positional_embeddings": null
43
+ }
44
+ },
45
+ "vj2_model": {
46
+ "base_encoder": "/home/dataset-assist-0/algorithm/ginwind/models/vjepa2-vitl-fpc64-256",
47
+ "depth": 12,
48
+ "num_heads": 8,
49
+ "special_action_token": "<|action_{}|>",
50
+ "num_action_tokens_per_timestep": 8,
51
+ "embodied_action_token": "<|embodied_action|>",
52
+ "num_embodied_action_tokens_per_instruction": 32,
53
+ "num_frames": 8
54
+ },
55
+ "reduce_in_full_precision": true
56
+ },
57
+ "datasets": {
58
+ "vla_data": {
59
+ "dataset_py": "lerobot_datasets",
60
+ "data_root_dir": "/home/dataset-local/datasets/LeRobot/droid",
61
+ "data_mix": "droid",
62
+ "action_type": "delta_qpos",
63
+ "CoT_prompt": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.",
64
+ "resolution_size": 224,
65
+ "video_resolution_size": 256,
66
+ "per_device_batch_size": 16,
67
+ "load_all_data_for_training": true,
68
+ "with_state": false
69
+ },
70
+ "video_data": {
71
+ "dataset_py": "video_datasets",
72
+ "video_dir": "/home/dataset-local/datasets/ssv2/20bn-something-something-v2",
73
+ "text_file": "/home/dataset-local/datasets/ssv2/test-answers.csv",
74
+ "CoT_prompt": "Your task is {instruction}. Infer the temporal dynamics of future frames {actions}.",
75
+ "extensions": [
76
+ "webm"
77
+ ],
78
+ "resolution_size": 224,
79
+ "video_resolution_size": 256,
80
+ "per_device_batch_size": 16
81
+ }
82
+ },
83
+ "trainer": {
84
+ "epochs": 100,
85
+ "max_train_steps": 50000,
86
+ "num_warmup_steps": 5000,
87
+ "save_interval": 10000,
88
+ "eval_interval": 100,
89
+ "learning_rate": {
90
+ "base": 3e-05,
91
+ "qwen_vl_interface": 1e-05,
92
+ "action_model": 0.0001
93
+ },
94
+ "lr_scheduler_type": "cosine_with_min_lr",
95
+ "scheduler_specific_kwargs": {
96
+ "min_lr": 1e-06
97
+ },
98
+ "freeze_modules": "",
99
+ "loss_scale": {
100
+ "vla": 1.0,
101
+ "vlm": 0.1
102
+ },
103
+ "max_grad_norm": 1.0,
104
+ "warmup_ratio": 0.1,
105
+ "weight_decay": 0.0,
106
+ "logging_frequency": 10,
107
+ "gradient_clipping": 1.0,
108
+ "gradient_accumulation_steps": 1,
109
+ "optimizer": {
110
+ "name": "AdamW",
111
+ "betas": [
112
+ 0.9,
113
+ 0.95
114
+ ],
115
+ "eps": 1e-08,
116
+ "weight_decay": 1e-08
117
+ },
118
+ "is_resume": false,
119
+ "resume_epoch": null,
120
+ "resume_step": null,
121
+ "enable_gradient_checkpointing": true,
122
+ "enable_mixed_precision_training": true
123
+ },
124
+ "output_dir": "checkpoints/pretrain"
125
+ }
Pretrain/config.yaml ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: pretrain
2
+ run_root_dir: checkpoints
3
+ seed: 42
4
+ trackers:
5
+ - json
6
+ is_debug: false
7
+ framework:
8
+ name: VLA_JEPA
9
+ qwenvl:
10
+ base_vlm: /home/dataset-assist-0/algorithm/ginwind/models/Qwen3-VL-2B-Instruct
11
+ attn_implementation: flash_attention_2
12
+ vl_hidden_dim: 2048
13
+ action_model:
14
+ action_model_type: DiT-B
15
+ action_hidden_dim: 1024
16
+ hidden_size: 1024
17
+ add_pos_embed: true
18
+ max_seq_len: 1024
19
+ action_dim: 7
20
+ state_dim: 8
21
+ future_action_window_size: 6
22
+ action_horizon: 7
23
+ past_action_window_size: 0
24
+ repeated_diffusion_steps: 8
25
+ noise_beta_alpha: 1.5
26
+ noise_beta_beta: 1.0
27
+ noise_s: 0.999
28
+ num_timestep_buckets: 1000
29
+ num_inference_timesteps: 4
30
+ num_target_vision_tokens: 32
31
+ diffusion_model_cfg:
32
+ cross_attention_dim: 2048
33
+ dropout: 0.2
34
+ final_dropout: true
35
+ interleave_self_attention: true
36
+ norm_type: ada_norm
37
+ num_layers: 16
38
+ output_dim: 1024
39
+ positional_embeddings: null
40
+ vj2_model:
41
+ base_encoder: /home/dataset-assist-0/algorithm/ginwind/models/vjepa2-vitl-fpc64-256
42
+ depth: 12
43
+ num_heads: 8
44
+ special_action_token: <|action_{}|>
45
+ num_action_tokens_per_timestep: 8
46
+ embodied_action_token: <|embodied_action|>
47
+ num_embodied_action_tokens_per_instruction: 32
48
+ num_frames: 8
49
+ reduce_in_full_precision: true
50
+ datasets:
51
+ vla_data:
52
+ dataset_py: lerobot_datasets
53
+ data_root_dir: /home/dataset-local/datasets/LeRobot/droid
54
+ data_mix: droid
55
+ action_type: delta_qpos
56
+ CoT_prompt: Your task is {instruction}. Infer the temporal dynamics from frames
57
+ {actions} and produce the corresponding policy actions {e_actions}.
58
+ resolution_size: 224
59
+ video_resolution_size: 256
60
+ per_device_batch_size: 16
61
+ load_all_data_for_training: true
62
+ with_state: false
63
+ video_data:
64
+ dataset_py: video_datasets
65
+ video_dir: /home/dataset-local/datasets/ssv2/20bn-something-something-v2
66
+ text_file: /home/dataset-local/datasets/ssv2/test-answers.csv
67
+ CoT_prompt: Your task is {instruction}. Infer the temporal dynamics of future
68
+ frames {actions}.
69
+ extensions:
70
+ - webm
71
+ resolution_size: 224
72
+ video_resolution_size: 256
73
+ per_device_batch_size: 16
74
+ trainer:
75
+ epochs: 100
76
+ max_train_steps: 50000
77
+ num_warmup_steps: 5000
78
+ save_interval: 10000
79
+ eval_interval: 100
80
+ learning_rate:
81
+ base: 3.0e-05
82
+ qwen_vl_interface: 1.0e-05
83
+ action_model: 0.0001
84
+ lr_scheduler_type: cosine_with_min_lr
85
+ scheduler_specific_kwargs:
86
+ min_lr: 1.0e-06
87
+ freeze_modules: ''
88
+ loss_scale:
89
+ vla: 1.0
90
+ vlm: 0.1
91
+ max_grad_norm: 1.0
92
+ warmup_ratio: 0.1
93
+ weight_decay: 0.0
94
+ logging_frequency: 10
95
+ gradient_clipping: 1.0
96
+ gradient_accumulation_steps: 1
97
+ optimizer:
98
+ name: AdamW
99
+ betas:
100
+ - 0.9
101
+ - 0.95
102
+ eps: 1.0e-08
103
+ weight_decay: 1.0e-08
104
+ is_resume: false
105
+ resume_epoch: null
106
+ resume_step: null
107
+ enable_gradient_checkpointing: true
108
+ enable_mixed_precision_training: true
109
+ output_dir: checkpoints/pretrain
Pretrain/dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "franka": {
3
+ "action": {
4
+ "mean": [
5
+ 0.02742478996515274,
6
+ -0.0026808488182723522,
7
+ 0.015953252092003822,
8
+ 0.003548798616975546,
9
+ -0.030532976612448692,
10
+ -0.006683542393147945,
11
+ 0.5860324501991272
12
+ ],
13
+ "std": [
14
+ 0.25387799739837646,
15
+ 0.1842699646949768,
16
+ 0.22532877326011658,
17
+ 0.2175685167312622,
18
+ 0.22572855651378632,
19
+ 0.28678369522094727,
20
+ 0.4287617802619934
21
+ ],
22
+ "max": [
23
+ 0.9999998211860657,
24
+ 0.999991774559021,
25
+ 0.9999973177909851,
26
+ 0.9999874830245972,
27
+ 0.9999954104423523,
28
+ 0.9999998807907104,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.9999999403953552,
33
+ -0.9999951124191284,
34
+ -0.9999960660934448,
35
+ -0.9999980330467224,
36
+ -0.9999982118606567,
37
+ -0.9999998807907104,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.7776405811309814,
42
+ -0.5803528428077698,
43
+ -0.5795133113861084,
44
+ -0.6464062333106995,
45
+ -0.7041175365447998,
46
+ -0.8895133137702942,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.7597945332527161,
51
+ 0.5726332068443298,
52
+ 0.7351094484329224,
53
+ 0.6705538630485535,
54
+ 0.6465045213699341,
55
+ 0.8897575736045837,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "state": {
69
+ "mean": [
70
+ 0.5353796482086182,
71
+ 0.0015366104198619723,
72
+ 0.3146370053291321,
73
+ 0.3269118070602417,
74
+ -0.08703453093767166,
75
+ -0.04832201823592186,
76
+ 0.0,
77
+ 0.3697895407676697
78
+ ],
79
+ "std": [
80
+ 0.11646675318479538,
81
+ 0.17390793561935425,
82
+ 0.1611657738685608,
83
+ 2.7484281063079834,
84
+ 0.3465787172317505,
85
+ 0.7527366280555725,
86
+ 0.0,
87
+ 0.4125189185142517
88
+ ],
89
+ "max": [
90
+ 0.8575563430786133,
91
+ 0.8407337069511414,
92
+ 1.0439032316207886,
93
+ 3.1415927410125732,
94
+ 1.5705928802490234,
95
+ 3.1415927410125732,
96
+ 0.0,
97
+ 1.0
98
+ ],
99
+ "min": [
100
+ -0.2824079692363739,
101
+ -0.8556680083274841,
102
+ -0.24001094698905945,
103
+ -3.141592502593994,
104
+ -1.5703768730163574,
105
+ -3.141592025756836,
106
+ 0.0,
107
+ 0.0
108
+ ],
109
+ "q01": [
110
+ 0.2667418420314789,
111
+ -0.4394981265068054,
112
+ -0.04718969017267227,
113
+ -3.1373939514160156,
114
+ -1.2159388065338135,
115
+ -2.173978328704834,
116
+ 0.0,
117
+ 0.0
118
+ ],
119
+ "q99": [
120
+ 0.7826385498046875,
121
+ 0.4409940540790558,
122
+ 0.7858326435089111,
123
+ 3.1374692916870117,
124
+ 0.8910249471664429,
125
+ 2.0517754554748535,
126
+ 0.0,
127
+ 0.9911894202232361
128
+ ]
129
+ },
130
+ "num_transitions": 23834441,
131
+ "num_trajectories": 92233
132
+ }
133
+ }
Pretrain/summary.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"steps": 10000}
2
+ {"steps": 20000}
3
+ {"steps": 30000}
4
+ {"steps": 40000}
5
+ {"steps": 50000}