theconstruct-ai committed on
Commit
ed56b37
·
verified ·
1 Parent(s): 8045fd0

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_horizon": 50,
3
+ "add_pos_embed": true,
4
+ "apply_sincos_state_encoding": true,
5
+ "architectures": [
6
+ "Gr00tN1d6"
7
+ ],
8
+ "attn_dropout": 0.2,
9
+ "attn_implementation": null,
10
+ "backbone_embedding_dim": 2048,
11
+ "backbone_model_type": "eagle",
12
+ "backbone_trainable_params_fp32": true,
13
+ "collator_overwrite_image_inputs": false,
14
+ "color_jitter_params": {
15
+ "brightness": 0.1,
16
+ "contrast": 0.1,
17
+ "hue": 0.1,
18
+ "saturation": 0.1
19
+ },
20
+ "crop_fraction": 0.95,
21
+ "diffusion_model_cfg": {
22
+ "attention_head_dim": 48,
23
+ "dropout": 0.2,
24
+ "final_dropout": true,
25
+ "interleave_self_attention": true,
26
+ "norm_type": "ada_norm",
27
+ "num_attention_heads": 32,
28
+ "num_layers": 32,
29
+ "output_dim": 1024,
30
+ "positional_embeddings": null
31
+ },
32
+ "eagle_collator": true,
33
+ "formalize_language": true,
34
+ "gemma_collator": false,
35
+ "hidden_size": 1024,
36
+ "image_crop_size": null,
37
+ "image_target_size": null,
38
+ "input_embedding_dim": 1536,
39
+ "load_bf16": true,
40
+ "max_action_dim": 128,
41
+ "max_num_embodiments": 32,
42
+ "max_seq_len": 1024,
43
+ "max_state_dim": 128,
44
+ "model_dtype": "bfloat16",
45
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
46
+ "model_type": "Gr00tN1d6",
47
+ "noise_beta_alpha": 1.5,
48
+ "noise_beta_beta": 1.0,
49
+ "noise_s": 0.999,
50
+ "num_inference_timesteps": 4,
51
+ "num_timestep_buckets": 1000,
52
+ "random_rotation_angle": null,
53
+ "reproject_vision": false,
54
+ "select_layer": 16,
55
+ "shortest_image_edge": 256,
56
+ "state_dropout_prob": 0.0,
57
+ "torch_dtype": "bfloat16",
58
+ "transformers_version": "4.51.3",
59
+ "tune_diffusion_model": false,
60
+ "tune_llm": false,
61
+ "tune_projector": true,
62
+ "tune_top_llm_layers": 4,
63
+ "tune_visual": false,
64
+ "tune_vlln": true,
65
+ "use_albumentations_transforms": true,
66
+ "use_alternate_vl_dit": true,
67
+ "use_flash_attention": true,
68
+ "use_relative_action": true,
69
+ "use_vlln": true
70
+ }
embodiment_id.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "robocasa_panda_omron": 13,
3
+ "gr1": 20,
4
+ "behavior_r1_pro": 24,
5
+ "unitree_g1": 8,
6
+ "oxe_google": 0,
7
+ "oxe_widowx": 1,
8
+ "libero_panda": 2,
9
+ "oxe_droid": 16,
10
+ "new_embodiment": 10
11
+ }
experiment_cfg/conf.yaml ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ load_config_path: null
2
+ model:
3
+ model_type: Gr00tN1d6
4
+ model_dtype: bfloat16
5
+ model_name: nvidia/Eagle-Block2A-2B-v2
6
+ backbone_model_type: eagle
7
+ model_revision: null
8
+ tune_top_llm_layers: 4
9
+ backbone_embedding_dim: 2048
10
+ tune_llm: false
11
+ tune_visual: false
12
+ select_layer: 16
13
+ reproject_vision: false
14
+ use_flash_attention: true
15
+ load_bf16: false
16
+ collator_overwrite_image_inputs: false
17
+ eagle_collator: true
18
+ backbone_trainable_params_fp32: true
19
+ image_crop_size: null
20
+ image_target_size: null
21
+ shortest_image_edge: 256
22
+ crop_fraction: 0.95
23
+ random_rotation_angle: null
24
+ color_jitter_params:
25
+ brightness: 0.3
26
+ contrast: 0.4
27
+ saturation: 0.5
28
+ hue: 0.08
29
+ use_albumentations_transforms: true
30
+ extra_augmentation_config: null
31
+ formalize_language: true
32
+ apply_sincos_state_encoding: false
33
+ use_relative_action: true
34
+ max_state_dim: 29
35
+ max_action_dim: 29
36
+ action_horizon: 16
37
+ hidden_size: 1024
38
+ input_embedding_dim: 1536
39
+ add_pos_embed: true
40
+ attn_dropout: 0.2
41
+ use_vlln: true
42
+ max_seq_len: 1024
43
+ use_alternate_vl_dit: true
44
+ attend_text_every_n_blocks: 2
45
+ diffusion_model_cfg:
46
+ positional_embeddings: null
47
+ num_layers: 32
48
+ num_attention_heads: 32
49
+ attention_head_dim: 48
50
+ norm_type: ada_norm
51
+ dropout: 0.2
52
+ final_dropout: true
53
+ output_dim: 1024
54
+ interleave_self_attention: true
55
+ num_inference_timesteps: 4
56
+ noise_beta_alpha: 1.5
57
+ noise_beta_beta: 1.0
58
+ noise_s: 0.999
59
+ num_timestep_buckets: 1000
60
+ tune_projector: true
61
+ tune_diffusion_model: false
62
+ tune_vlln: true
63
+ state_dropout_prob: 0.0
64
+ state_additive_noise_scale: 0.0
65
+ max_num_embodiments: 32
66
+ data:
67
+ datasets:
68
+ - dataset_paths:
69
+ - datasets/theconstruct-ai_push_left_test
70
+ - datasets/theconstruct-ai_push_right_test
71
+ - datasets/theconstruct-ai_push_left_test_2
72
+ - datasets/theconstruct-ai_push_right_test_2
73
+ embodiment_tag: unitree_g1
74
+ mix_ratio: 1.0
75
+ dataset_type: physical_embodiment
76
+ val_dataset_path: null
77
+ modality_configs:
78
+ unitree_g1:
79
+ video:
80
+ delta_indices:
81
+ - 0
82
+ modality_keys:
83
+ - ego_view
84
+ sin_cos_embedding_keys: null
85
+ mean_std_embedding_keys: null
86
+ action_configs: null
87
+ state:
88
+ delta_indices:
89
+ - 0
90
+ modality_keys:
91
+ - left_leg
92
+ - right_leg
93
+ - waist
94
+ - left_arm
95
+ - right_arm
96
+ - left_hand
97
+ - right_hand
98
+ sin_cos_embedding_keys: null
99
+ mean_std_embedding_keys: null
100
+ action_configs: null
101
+ action:
102
+ delta_indices:
103
+ - 0
104
+ - 1
105
+ - 2
106
+ - 3
107
+ - 4
108
+ - 5
109
+ - 6
110
+ - 7
111
+ - 8
112
+ - 9
113
+ - 10
114
+ - 11
115
+ - 12
116
+ - 13
117
+ - 14
118
+ - 15
119
+ - 16
120
+ - 17
121
+ - 18
122
+ - 19
123
+ - 20
124
+ - 21
125
+ - 22
126
+ - 23
127
+ - 24
128
+ - 25
129
+ - 26
130
+ - 27
131
+ - 28
132
+ - 29
133
+ modality_keys:
134
+ - left_arm
135
+ - right_arm
136
+ - left_hand
137
+ - right_hand
138
+ - waist
139
+ - base_height_command
140
+ - navigate_command
141
+ sin_cos_embedding_keys: null
142
+ mean_std_embedding_keys: null
143
+ action_configs:
144
+ - rep: RELATIVE
145
+ type: NON_EEF
146
+ format: DEFAULT
147
+ state_key: null
148
+ - rep: RELATIVE
149
+ type: NON_EEF
150
+ format: DEFAULT
151
+ state_key: null
152
+ - rep: ABSOLUTE
153
+ type: NON_EEF
154
+ format: DEFAULT
155
+ state_key: null
156
+ - rep: ABSOLUTE
157
+ type: NON_EEF
158
+ format: DEFAULT
159
+ state_key: null
160
+ - rep: ABSOLUTE
161
+ type: NON_EEF
162
+ format: DEFAULT
163
+ state_key: null
164
+ - rep: ABSOLUTE
165
+ type: NON_EEF
166
+ format: DEFAULT
167
+ state_key: null
168
+ - rep: ABSOLUTE
169
+ type: NON_EEF
170
+ format: DEFAULT
171
+ state_key: null
172
+ language:
173
+ delta_indices:
174
+ - 0
175
+ modality_keys:
176
+ - annotation.human.task_description
177
+ sin_cos_embedding_keys: null
178
+ mean_std_embedding_keys: null
179
+ action_configs: null
180
+ download_cache: false
181
+ shard_size: 1024
182
+ episode_sampling_rate: 0.1
183
+ num_shards_per_epoch: 100000
184
+ override_pretraining_statistics: false
185
+ mode: single_turn
186
+ random_chop: 0.0
187
+ mock_dataset_mode: false
188
+ shuffle: true
189
+ seed: 42
190
+ multiprocessing_context: fork
191
+ allow_padding: false
192
+ subsample_ratio: 1.0
193
+ image_crop_size:
194
+ - 244
195
+ - 244
196
+ image_target_size:
197
+ - 224
198
+ - 224
199
+ video_backend: torchcodec
200
+ training:
201
+ output_dir: ./outputs
202
+ experiment_name: null
203
+ max_steps: 10
204
+ global_batch_size: 4
205
+ batch_size: null
206
+ gradient_accumulation_steps: 4
207
+ learning_rate: 0.0001
208
+ lr_scheduler_type: cosine
209
+ weight_decay: 1.0e-05
210
+ warmup_ratio: 0.05
211
+ warmup_steps: 0
212
+ max_grad_norm: 1.0
213
+ optim: adamw_torch
214
+ start_from_checkpoint: nvidia/GR00T-N1.6-3B
215
+ tf32: true
216
+ fp16: false
217
+ bf16: true
218
+ eval_bf16: true
219
+ logging_steps: 10
220
+ save_steps: 1000
221
+ save_total_limit: 5
222
+ save_vl_model: false
223
+ upload_checkpoints: false
224
+ upload_every: 1000
225
+ upload_last_n_checkpoints: 5
226
+ max_concurrent_uploads: 2
227
+ eval_strategy: 'no'
228
+ eval_steps: 500
229
+ eval_set_split_ratio: 0.1
230
+ eval_batch_size: 2
231
+ save_best_eval_metric_name: ''
232
+ save_best_eval_metric_greater_is_better: true
233
+ deepspeed_stage: 2
234
+ gradient_checkpointing: false
235
+ transformers_trust_remote_code: true
236
+ transformers_local_files_only: false
237
+ transformers_cache_dir: null
238
+ transformers_access_token: null
239
+ use_ddp: false
240
+ ddp_bucket_cap_mb: 100
241
+ num_gpus: 1
242
+ dataloader_num_workers: 4
243
+ remove_unused_columns: false
244
+ use_wandb: true
245
+ wandb_project: finetune-gr00t-n1d6
246
+ enable_profiling: false
247
+ max_retries: 3
248
+ assert_loss_less_than: null
249
+ add_rl_callback: false
250
+ enable_open_loop_eval: false
251
+ open_loop_eval_traj_ids:
252
+ - 0
253
+ open_loop_eval_steps_per_traj: 100
254
+ open_loop_eval_plot_indices: null
255
+ max_steps: 10
256
+ save_steps: 1000
experiment_cfg/config.yaml ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !!python/object:gr00t.configs.base_config.Config
2
+ data: !!python/object:gr00t.configs.data.data_config.DataConfig
3
+ allow_padding: false
4
+ datasets:
5
+ - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
6
+ dataset_paths:
7
+ - datasets/theconstruct-ai_push_left_test
8
+ - datasets/theconstruct-ai_push_right_test
9
+ - datasets/theconstruct-ai_push_left_test_2
10
+ - datasets/theconstruct-ai_push_right_test_2
11
+ dataset_type: physical_embodiment
12
+ embodiment_tag: unitree_g1
13
+ mix_ratio: 1.0
14
+ val_dataset_path: null
15
+ download_cache: false
16
+ episode_sampling_rate: 0.1
17
+ image_crop_size:
18
+ - 244
19
+ - 244
20
+ image_target_size:
21
+ - 224
22
+ - 224
23
+ mock_dataset_mode: false
24
+ modality_configs:
25
+ unitree_g1:
26
+ action: !!python/object:gr00t.data.types.ModalityConfig
27
+ action_configs:
28
+ - !!python/object:gr00t.data.types.ActionConfig
29
+ format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
30
+ - default
31
+ rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
32
+ - relative
33
+ state_key: null
34
+ type: &id003 !!python/object/apply:gr00t.data.types.ActionType
35
+ - non_eef
36
+ - !!python/object:gr00t.data.types.ActionConfig
37
+ format: *id001
38
+ rep: *id002
39
+ state_key: null
40
+ type: *id003
41
+ - !!python/object:gr00t.data.types.ActionConfig
42
+ format: *id001
43
+ rep: &id004 !!python/object/apply:gr00t.data.types.ActionRepresentation
44
+ - absolute
45
+ state_key: null
46
+ type: *id003
47
+ - !!python/object:gr00t.data.types.ActionConfig
48
+ format: *id001
49
+ rep: *id004
50
+ state_key: null
51
+ type: *id003
52
+ - !!python/object:gr00t.data.types.ActionConfig
53
+ format: *id001
54
+ rep: *id004
55
+ state_key: null
56
+ type: *id003
57
+ - !!python/object:gr00t.data.types.ActionConfig
58
+ format: *id001
59
+ rep: *id004
60
+ state_key: null
61
+ type: *id003
62
+ - !!python/object:gr00t.data.types.ActionConfig
63
+ format: *id001
64
+ rep: *id004
65
+ state_key: null
66
+ type: *id003
67
+ delta_indices:
68
+ - 0
69
+ - 1
70
+ - 2
71
+ - 3
72
+ - 4
73
+ - 5
74
+ - 6
75
+ - 7
76
+ - 8
77
+ - 9
78
+ - 10
79
+ - 11
80
+ - 12
81
+ - 13
82
+ - 14
83
+ - 15
84
+ - 16
85
+ - 17
86
+ - 18
87
+ - 19
88
+ - 20
89
+ - 21
90
+ - 22
91
+ - 23
92
+ - 24
93
+ - 25
94
+ - 26
95
+ - 27
96
+ - 28
97
+ - 29
98
+ mean_std_embedding_keys: null
99
+ modality_keys:
100
+ - left_arm
101
+ - right_arm
102
+ - left_hand
103
+ - right_hand
104
+ - waist
105
+ - base_height_command
106
+ - navigate_command
107
+ sin_cos_embedding_keys: null
108
+ language: !!python/object:gr00t.data.types.ModalityConfig
109
+ action_configs: null
110
+ delta_indices:
111
+ - 0
112
+ mean_std_embedding_keys: null
113
+ modality_keys:
114
+ - annotation.human.task_description
115
+ sin_cos_embedding_keys: null
116
+ state: !!python/object:gr00t.data.types.ModalityConfig
117
+ action_configs: null
118
+ delta_indices:
119
+ - 0
120
+ mean_std_embedding_keys: null
121
+ modality_keys:
122
+ - left_leg
123
+ - right_leg
124
+ - waist
125
+ - left_arm
126
+ - right_arm
127
+ - left_hand
128
+ - right_hand
129
+ sin_cos_embedding_keys: null
130
+ video: !!python/object:gr00t.data.types.ModalityConfig
131
+ action_configs: null
132
+ delta_indices:
133
+ - 0
134
+ mean_std_embedding_keys: null
135
+ modality_keys:
136
+ - ego_view
137
+ sin_cos_embedding_keys: null
138
+ mode: single_turn
139
+ multiprocessing_context: fork
140
+ num_shards_per_epoch: 100000
141
+ override_pretraining_statistics: false
142
+ random_chop: 0.0
143
+ seed: 42
144
+ shard_size: 1024
145
+ shuffle: true
146
+ subsample_ratio: 1.0
147
+ video_backend: torchcodec
148
+ load_config_path: null
149
+ model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
150
+ _attn_implementation_autoset: false
151
+ _attn_implementation_internal: null
152
+ _commit_hash: null
153
+ _name_or_path: ''
154
+ add_cross_attention: false
155
+ architectures: null
156
+ backbone_model_type: eagle
157
+ backbone_trainable_params_fp32: true
158
+ bad_words_ids: null
159
+ begin_suppress_tokens: null
160
+ bos_token_id: null
161
+ chunk_size_feed_forward: 0
162
+ color_jitter_params:
163
+ brightness: 0.3
164
+ contrast: 0.4
165
+ hue: 0.08
166
+ saturation: 0.5
167
+ cross_attention_hidden_size: null
168
+ decoder_start_token_id: null
169
+ diffusion_model_cfg:
170
+ attention_head_dim: 48
171
+ dropout: 0.2
172
+ final_dropout: true
173
+ interleave_self_attention: true
174
+ norm_type: ada_norm
175
+ num_attention_heads: 32
176
+ num_layers: 32
177
+ output_dim: 1024
178
+ positional_embeddings: null
179
+ diversity_penalty: 0.0
180
+ do_sample: false
181
+ eagle_collator: true
182
+ early_stopping: false
183
+ encoder_no_repeat_ngram_size: 0
184
+ eos_token_id: null
185
+ exponential_decay_length_penalty: null
186
+ extra_augmentation_config: null
187
+ finetuning_task: null
188
+ forced_bos_token_id: null
189
+ forced_eos_token_id: null
190
+ id2label:
191
+ 0: LABEL_0
192
+ 1: LABEL_1
193
+ is_decoder: false
194
+ is_encoder_decoder: false
195
+ label2id:
196
+ LABEL_0: 0
197
+ LABEL_1: 1
198
+ length_penalty: 1.0
199
+ load_bf16: false
200
+ max_length: 20
201
+ min_length: 0
202
+ model_name: nvidia/Eagle-Block2A-2B-v2
203
+ no_repeat_ngram_size: 0
204
+ num_beam_groups: 1
205
+ num_beams: 1
206
+ num_return_sequences: 1
207
+ output_attentions: false
208
+ output_hidden_states: false
209
+ output_scores: false
210
+ pad_token_id: null
211
+ prefix: null
212
+ problem_type: null
213
+ pruned_heads: {}
214
+ random_rotation_angle: null
215
+ remove_invalid_values: false
216
+ repetition_penalty: 1.0
217
+ reproject_vision: false
218
+ return_dict: true
219
+ return_dict_in_generate: false
220
+ sep_token_id: null
221
+ state_dropout_prob: 0.0
222
+ suppress_tokens: null
223
+ task_specific_params: null
224
+ temperature: 1.0
225
+ tf_legacy_loss: false
226
+ tie_encoder_decoder: false
227
+ tie_word_embeddings: true
228
+ tokenizer_class: null
229
+ top_k: 50
230
+ top_p: 1.0
231
+ torch_dtype: null
232
+ torchscript: false
233
+ transformers_version: null
234
+ tune_diffusion_model: false
235
+ tune_llm: false
236
+ tune_projector: true
237
+ tune_visual: false
238
+ typical_p: 1.0
239
+ use_bfloat16: false
240
+ use_relative_action: true
241
+ training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
242
+ add_rl_callback: false
243
+ assert_loss_less_than: null
244
+ batch_size: null
245
+ bf16: true
246
+ dataloader_num_workers: 4
247
+ ddp_bucket_cap_mb: 100
248
+ deepspeed_stage: 2
249
+ enable_open_loop_eval: false
250
+ enable_profiling: false
251
+ eval_batch_size: 2
252
+ eval_bf16: true
253
+ eval_set_split_ratio: 0.1
254
+ eval_steps: 500
255
+ eval_strategy: 'no'
256
+ experiment_name: null
257
+ fp16: false
258
+ global_batch_size: 4
259
+ gradient_accumulation_steps: 4
260
+ gradient_checkpointing: false
261
+ learning_rate: 0.0001
262
+ logging_steps: 10
263
+ lr_scheduler_type: cosine
264
+ max_concurrent_uploads: 2
265
+ max_grad_norm: 1.0
266
+ max_retries: 3
267
+ max_steps: 10
268
+ num_gpus: 1
269
+ open_loop_eval_plot_indices: null
270
+ open_loop_eval_steps_per_traj: 100
271
+ open_loop_eval_traj_ids:
272
+ - 0
273
+ optim: adamw_torch
274
+ output_dir: ./outputs
275
+ remove_unused_columns: false
276
+ save_best_eval_metric_greater_is_better: true
277
+ save_best_eval_metric_name: ''
278
+ save_steps: 1000
279
+ save_total_limit: 5
280
+ save_vl_model: false
281
+ start_from_checkpoint: nvidia/GR00T-N1.6-3B
282
+ tf32: true
283
+ transformers_access_token: null
284
+ transformers_cache_dir: null
285
+ transformers_local_files_only: false
286
+ transformers_trust_remote_code: true
287
+ upload_checkpoints: false
288
+ upload_every: 1000
289
+ upload_last_n_checkpoints: 5
290
+ use_ddp: false
291
+ use_wandb: true
292
+ wandb_project: finetune-gr00t-n1d6
293
+ warmup_ratio: 0.05
294
+ warmup_steps: 0
295
+ weight_decay: 1.0e-05
experiment_cfg/dataset_statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
experiment_cfg/final_model_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "Gr00tN1d6",
3
+ "model_dtype": "bfloat16",
4
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
5
+ "backbone_model_type": "eagle",
6
+ "model_revision": null,
7
+ "tune_top_llm_layers": 4,
8
+ "backbone_embedding_dim": 2048,
9
+ "tune_llm": false,
10
+ "tune_visual": false,
11
+ "select_layer": 16,
12
+ "reproject_vision": false,
13
+ "use_flash_attention": true,
14
+ "load_bf16": true,
15
+ "collator_overwrite_image_inputs": false,
16
+ "eagle_collator": true,
17
+ "backbone_trainable_params_fp32": true,
18
+ "extra_augmentation_config": null,
19
+ "apply_sincos_state_encoding": true,
20
+ "use_relative_action": true,
21
+ "max_state_dim": 128,
22
+ "max_action_dim": 128,
23
+ "action_horizon": 50,
24
+ "hidden_size": 1024,
25
+ "input_embedding_dim": 1536,
26
+ "add_pos_embed": true,
27
+ "attn_dropout": 0.2,
28
+ "use_vlln": true,
29
+ "max_seq_len": 1024,
30
+ "use_alternate_vl_dit": true,
31
+ "attend_text_every_n_blocks": 2,
32
+ "diffusion_model_cfg": {
33
+ "attention_head_dim": 48,
34
+ "dropout": 0.2,
35
+ "final_dropout": true,
36
+ "interleave_self_attention": true,
37
+ "norm_type": "ada_norm",
38
+ "num_attention_heads": 32,
39
+ "num_layers": 32,
40
+ "output_dim": 1024,
41
+ "positional_embeddings": null
42
+ },
43
+ "num_inference_timesteps": 4,
44
+ "noise_beta_alpha": 1.5,
45
+ "noise_beta_beta": 1.0,
46
+ "noise_s": 0.999,
47
+ "num_timestep_buckets": 1000,
48
+ "tune_projector": true,
49
+ "tune_diffusion_model": false,
50
+ "tune_vlln": true,
51
+ "state_dropout_prob": 0.0,
52
+ "state_additive_noise_scale": 0.0,
53
+ "max_num_embodiments": 32
54
+ }
experiment_cfg/final_processor_config.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e110ce2057e354e8d2b5ab598f1703c8739b9682c6e53c956bccdbcd4d36a0
3
+ size 4990120184
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71d6ae4204a83c9fcf99fe4b5f548f7453e5a96adbfe8bb615e3ae0847e3320f
3
+ size 4823190320
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dab500ad7c9a60cf6533e3233cf5d09433ff0852c947cbb1181a5f671511fb57
3
+ size 4226018251
processor_config.json ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "Gr00tN1d6Processor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "behavior_r1_pro": {
6
+ "video": {
7
+ "delta_indices": [
8
+ 0
9
+ ],
10
+ "modality_keys": [
11
+ "observation.images.rgb.head_256_256",
12
+ "observation.images.rgb.left_wrist_256_256",
13
+ "observation.images.rgb.right_wrist_256_256"
14
+ ],
15
+ "sin_cos_embedding_keys": null,
16
+ "mean_std_embedding_keys": null,
17
+ "action_configs": null
18
+ },
19
+ "state": {
20
+ "delta_indices": [
21
+ 0
22
+ ],
23
+ "modality_keys": [
24
+ "robot_pos",
25
+ "robot_ori_cos",
26
+ "robot_ori_sin",
27
+ "robot_2d_ori",
28
+ "robot_2d_ori_cos",
29
+ "robot_2d_ori_sin",
30
+ "robot_lin_vel",
31
+ "robot_ang_vel",
32
+ "arm_left_qpos",
33
+ "arm_left_qpos_sin",
34
+ "arm_left_qpos_cos",
35
+ "eef_left_pos",
36
+ "eef_left_quat",
37
+ "gripper_left_qpos",
38
+ "arm_right_qpos",
39
+ "arm_right_qpos_sin",
40
+ "arm_right_qpos_cos",
41
+ "eef_right_pos",
42
+ "eef_right_quat",
43
+ "gripper_right_qpos",
44
+ "trunk_qpos"
45
+ ],
46
+ "sin_cos_embedding_keys": null,
47
+ "mean_std_embedding_keys": null,
48
+ "action_configs": null
49
+ },
50
+ "action": {
51
+ "delta_indices": [
52
+ 0,
53
+ 1,
54
+ 2,
55
+ 3,
56
+ 4,
57
+ 5,
58
+ 6,
59
+ 7,
60
+ 8,
61
+ 9,
62
+ 10,
63
+ 11,
64
+ 12,
65
+ 13,
66
+ 14,
67
+ 15,
68
+ 16,
69
+ 17,
70
+ 18,
71
+ 19,
72
+ 20,
73
+ 21,
74
+ 22,
75
+ 23,
76
+ 24,
77
+ 25,
78
+ 26,
79
+ 27,
80
+ 28,
81
+ 29,
82
+ 30,
83
+ 31
84
+ ],
85
+ "modality_keys": [
86
+ "base",
87
+ "torso",
88
+ "left_arm",
89
+ "left_gripper",
90
+ "right_arm",
91
+ "right_gripper"
92
+ ],
93
+ "sin_cos_embedding_keys": null,
94
+ "mean_std_embedding_keys": null,
95
+ "action_configs": [
96
+ {
97
+ "rep": "ABSOLUTE",
98
+ "type": "NON_EEF",
99
+ "format": "DEFAULT",
100
+ "state_key": null
101
+ },
102
+ {
103
+ "rep": "RELATIVE",
104
+ "type": "NON_EEF",
105
+ "format": "DEFAULT",
106
+ "state_key": "trunk_qpos"
107
+ },
108
+ {
109
+ "rep": "RELATIVE",
110
+ "type": "NON_EEF",
111
+ "format": "DEFAULT",
112
+ "state_key": "arm_left_qpos"
113
+ },
114
+ {
115
+ "rep": "ABSOLUTE",
116
+ "type": "NON_EEF",
117
+ "format": "DEFAULT",
118
+ "state_key": null
119
+ },
120
+ {
121
+ "rep": "RELATIVE",
122
+ "type": "NON_EEF",
123
+ "format": "DEFAULT",
124
+ "state_key": "arm_right_qpos"
125
+ },
126
+ {
127
+ "rep": "ABSOLUTE",
128
+ "type": "NON_EEF",
129
+ "format": "DEFAULT",
130
+ "state_key": null
131
+ }
132
+ ]
133
+ },
134
+ "language": {
135
+ "delta_indices": [
136
+ 0
137
+ ],
138
+ "modality_keys": [
139
+ "annotation.human.coarse_action"
140
+ ],
141
+ "sin_cos_embedding_keys": null,
142
+ "mean_std_embedding_keys": null,
143
+ "action_configs": null
144
+ }
145
+ },
146
+ "gr1": {
147
+ "video": {
148
+ "delta_indices": [
149
+ 0
150
+ ],
151
+ "modality_keys": [
152
+ "ego_view_bg_crop_pad_res256_freq20"
153
+ ],
154
+ "sin_cos_embedding_keys": null,
155
+ "mean_std_embedding_keys": null,
156
+ "action_configs": null
157
+ },
158
+ "state": {
159
+ "delta_indices": [
160
+ 0
161
+ ],
162
+ "modality_keys": [
163
+ "left_arm",
164
+ "right_arm",
165
+ "left_hand",
166
+ "right_hand",
167
+ "waist"
168
+ ],
169
+ "sin_cos_embedding_keys": [
170
+ "left_arm",
171
+ "right_arm",
172
+ "left_hand",
173
+ "right_hand",
174
+ "waist"
175
+ ],
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ },
179
+ "action": {
180
+ "delta_indices": [
181
+ 0,
182
+ 1,
183
+ 2,
184
+ 3,
185
+ 4,
186
+ 5,
187
+ 6,
188
+ 7,
189
+ 8,
190
+ 9,
191
+ 10,
192
+ 11,
193
+ 12,
194
+ 13,
195
+ 14,
196
+ 15
197
+ ],
198
+ "modality_keys": [
199
+ "left_arm",
200
+ "right_arm",
201
+ "left_hand",
202
+ "right_hand",
203
+ "waist"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": [
208
+ {
209
+ "rep": "RELATIVE",
210
+ "type": "NON_EEF",
211
+ "format": "DEFAULT",
212
+ "state_key": null
213
+ },
214
+ {
215
+ "rep": "RELATIVE",
216
+ "type": "NON_EEF",
217
+ "format": "DEFAULT",
218
+ "state_key": null
219
+ },
220
+ {
221
+ "rep": "RELATIVE",
222
+ "type": "NON_EEF",
223
+ "format": "DEFAULT",
224
+ "state_key": null
225
+ },
226
+ {
227
+ "rep": "RELATIVE",
228
+ "type": "NON_EEF",
229
+ "format": "DEFAULT",
230
+ "state_key": null
231
+ },
232
+ {
233
+ "rep": "ABSOLUTE",
234
+ "type": "NON_EEF",
235
+ "format": "DEFAULT",
236
+ "state_key": null
237
+ }
238
+ ]
239
+ },
240
+ "language": {
241
+ "delta_indices": [
242
+ 0
243
+ ],
244
+ "modality_keys": [
245
+ "task"
246
+ ],
247
+ "sin_cos_embedding_keys": null,
248
+ "mean_std_embedding_keys": null,
249
+ "action_configs": null
250
+ }
251
+ },
252
+ "robocasa_panda_omron": {
253
+ "video": {
254
+ "delta_indices": [
255
+ 0
256
+ ],
257
+ "modality_keys": [
258
+ "res256_image_side_0",
259
+ "res256_image_side_1",
260
+ "res256_image_wrist_0"
261
+ ],
262
+ "sin_cos_embedding_keys": null,
263
+ "mean_std_embedding_keys": null,
264
+ "action_configs": null
265
+ },
266
+ "state": {
267
+ "delta_indices": [
268
+ 0
269
+ ],
270
+ "modality_keys": [
271
+ "end_effector_position_relative",
272
+ "end_effector_rotation_relative",
273
+ "gripper_qpos",
274
+ "base_position",
275
+ "base_rotation"
276
+ ],
277
+ "sin_cos_embedding_keys": null,
278
+ "mean_std_embedding_keys": null,
279
+ "action_configs": null
280
+ },
281
+ "action": {
282
+ "delta_indices": [
283
+ 0,
284
+ 1,
285
+ 2,
286
+ 3,
287
+ 4,
288
+ 5,
289
+ 6,
290
+ 7,
291
+ 8,
292
+ 9,
293
+ 10,
294
+ 11,
295
+ 12,
296
+ 13,
297
+ 14,
298
+ 15
299
+ ],
300
+ "modality_keys": [
301
+ "end_effector_position",
302
+ "end_effector_rotation",
303
+ "gripper_close",
304
+ "base_motion",
305
+ "control_mode"
306
+ ],
307
+ "sin_cos_embedding_keys": null,
308
+ "mean_std_embedding_keys": null,
309
+ "action_configs": [
310
+ {
311
+ "rep": "ABSOLUTE",
312
+ "type": "NON_EEF",
313
+ "format": "DEFAULT",
314
+ "state_key": null
315
+ },
316
+ {
317
+ "rep": "ABSOLUTE",
318
+ "type": "NON_EEF",
319
+ "format": "DEFAULT",
320
+ "state_key": null
321
+ },
322
+ {
323
+ "rep": "ABSOLUTE",
324
+ "type": "NON_EEF",
325
+ "format": "DEFAULT",
326
+ "state_key": null
327
+ },
328
+ {
329
+ "rep": "ABSOLUTE",
330
+ "type": "NON_EEF",
331
+ "format": "DEFAULT",
332
+ "state_key": null
333
+ },
334
+ {
335
+ "rep": "ABSOLUTE",
336
+ "type": "NON_EEF",
337
+ "format": "DEFAULT",
338
+ "state_key": null
339
+ }
340
+ ]
341
+ },
342
+ "language": {
343
+ "delta_indices": [
344
+ 0
345
+ ],
346
+ "modality_keys": [
347
+ "annotation.human.action.task_description"
348
+ ],
349
+ "sin_cos_embedding_keys": null,
350
+ "mean_std_embedding_keys": null,
351
+ "action_configs": null
352
+ }
353
+ },
354
+ "unitree_g1": {
355
+ "video": {
356
+ "delta_indices": [
357
+ 0
358
+ ],
359
+ "modality_keys": [
360
+ "ego_view"
361
+ ],
362
+ "sin_cos_embedding_keys": null,
363
+ "mean_std_embedding_keys": null,
364
+ "action_configs": null
365
+ },
366
+ "state": {
367
+ "delta_indices": [
368
+ 0
369
+ ],
370
+ "modality_keys": [
371
+ "left_leg",
372
+ "right_leg",
373
+ "waist",
374
+ "left_arm",
375
+ "right_arm",
376
+ "left_hand",
377
+ "right_hand"
378
+ ],
379
+ "sin_cos_embedding_keys": null,
380
+ "mean_std_embedding_keys": null,
381
+ "action_configs": null
382
+ },
383
+ "action": {
384
+ "delta_indices": [
385
+ 0,
386
+ 1,
387
+ 2,
388
+ 3,
389
+ 4,
390
+ 5,
391
+ 6,
392
+ 7,
393
+ 8,
394
+ 9,
395
+ 10,
396
+ 11,
397
+ 12,
398
+ 13,
399
+ 14,
400
+ 15,
401
+ 16,
402
+ 17,
403
+ 18,
404
+ 19,
405
+ 20,
406
+ 21,
407
+ 22,
408
+ 23,
409
+ 24,
410
+ 25,
411
+ 26,
412
+ 27,
413
+ 28,
414
+ 29
415
+ ],
416
+ "modality_keys": [
417
+ "left_arm",
418
+ "right_arm",
419
+ "left_hand",
420
+ "right_hand",
421
+ "waist",
422
+ "base_height_command",
423
+ "navigate_command"
424
+ ],
425
+ "sin_cos_embedding_keys": null,
426
+ "mean_std_embedding_keys": null,
427
+ "action_configs": [
428
+ {
429
+ "rep": "RELATIVE",
430
+ "type": "NON_EEF",
431
+ "format": "DEFAULT",
432
+ "state_key": null
433
+ },
434
+ {
435
+ "rep": "RELATIVE",
436
+ "type": "NON_EEF",
437
+ "format": "DEFAULT",
438
+ "state_key": null
439
+ },
440
+ {
441
+ "rep": "ABSOLUTE",
442
+ "type": "NON_EEF",
443
+ "format": "DEFAULT",
444
+ "state_key": null
445
+ },
446
+ {
447
+ "rep": "ABSOLUTE",
448
+ "type": "NON_EEF",
449
+ "format": "DEFAULT",
450
+ "state_key": null
451
+ },
452
+ {
453
+ "rep": "ABSOLUTE",
454
+ "type": "NON_EEF",
455
+ "format": "DEFAULT",
456
+ "state_key": null
457
+ },
458
+ {
459
+ "rep": "ABSOLUTE",
460
+ "type": "NON_EEF",
461
+ "format": "DEFAULT",
462
+ "state_key": null
463
+ },
464
+ {
465
+ "rep": "ABSOLUTE",
466
+ "type": "NON_EEF",
467
+ "format": "DEFAULT",
468
+ "state_key": null
469
+ }
470
+ ]
471
+ },
472
+ "language": {
473
+ "delta_indices": [
474
+ 0
475
+ ],
476
+ "modality_keys": [
477
+ "annotation.human.task_description"
478
+ ],
479
+ "sin_cos_embedding_keys": null,
480
+ "mean_std_embedding_keys": null,
481
+ "action_configs": null
482
+ }
483
+ }
484
+ },
485
+ "image_crop_size": null,
486
+ "image_target_size": null,
487
+ "use_albumentations": true,
488
+ "random_rotation_angle": null,
489
+ "color_jitter_params": {
490
+ "brightness": 0.3,
491
+ "contrast": 0.4,
492
+ "saturation": 0.5,
493
+ "hue": 0.08
494
+ },
495
+ "shortest_image_edge": 256,
496
+ "crop_fraction": 0.95,
497
+ "model_name": "nvidia/Eagle-Block2A-2B-v2",
498
+ "model_type": "eagle",
499
+ "formalize_language": true,
500
+ "max_state_dim": 128,
501
+ "max_action_dim": 128,
502
+ "max_action_horizon": 50,
503
+ "use_percentiles": false,
504
+ "clip_outliers": true,
505
+ "apply_sincos_state_encoding": true,
506
+ "use_relative_action": true
507
+ }
508
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af8bd8e05457c51c966e8c3b7b1f70a92789f7d712a20ce0d26482224c96d635
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f29b14e4fe3333c764817ba3914df120adf512a79675e9ce6747be6f9beaad8f
3
+ size 1465
statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 10,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "grad_norm": 0.18987387418746948,
14
+ "learning_rate": 3.0153689607045845e-06,
15
+ "loss": 1.0835,
16
+ "step": 10
17
+ }
18
+ ],
19
+ "logging_steps": 10,
20
+ "max_steps": 10,
21
+ "num_input_tokens_seen": 0,
22
+ "num_train_epochs": 9223372036854775807,
23
+ "save_steps": 1000,
24
+ "stateful_callbacks": {
25
+ "TrainerControl": {
26
+ "args": {
27
+ "should_epoch_stop": false,
28
+ "should_evaluate": false,
29
+ "should_log": false,
30
+ "should_save": true,
31
+ "should_training_stop": true
32
+ },
33
+ "attributes": {}
34
+ }
35
+ },
36
+ "total_flos": 0.0,
37
+ "train_batch_size": 4,
38
+ "trial_name": null,
39
+ "trial_params": null
40
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:299da16ba2d50f8a07db1c1e846990897563c619c257f0f80bea79fc62c1251e
3
+ size 5713
wandb_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"project": "finetune-gr00t-n1d6", "run_id": "outputs"}