theconstruct-ai committed 12 days ago

Commit

ed56b37

verified ·

1 Parent(s): 8045fd0

Upload folder using huggingface_hub

Browse files

Files changed (18) hide show

config.json +70 -0
embodiment_id.json +11 -0
experiment_cfg/conf.yaml +256 -0
experiment_cfg/config.yaml +295 -0
experiment_cfg/dataset_statistics.json +0 -0
experiment_cfg/final_model_config.json +54 -0
experiment_cfg/final_processor_config.json +0 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0
optimizer.pt +3 -0
processor_config.json +508 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
statistics.json +0 -0
trainer_state.json +40 -0
training_args.bin +3 -0
wandb_config.json +1 -0

config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+  "action_horizon": 50,
+  "add_pos_embed": true,
+  "apply_sincos_state_encoding": true,
+  "architectures": [
+    "Gr00tN1d6"
+  ],
+  "attn_dropout": 0.2,
+  "attn_implementation": null,
+  "backbone_embedding_dim": 2048,
+  "backbone_model_type": "eagle",
+  "backbone_trainable_params_fp32": true,
+  "collator_overwrite_image_inputs": false,
+  "color_jitter_params": {
+    "brightness": 0.1,
+    "contrast": 0.1,
+    "hue": 0.1,
+    "saturation": 0.1
+  },
+  "crop_fraction": 0.95,
+  "diffusion_model_cfg": {
+    "attention_head_dim": 48,
+    "dropout": 0.2,
+    "final_dropout": true,
+    "interleave_self_attention": true,
+    "norm_type": "ada_norm",
+    "num_attention_heads": 32,
+    "num_layers": 32,
+    "output_dim": 1024,
+    "positional_embeddings": null
+  },
+  "eagle_collator": true,
+  "formalize_language": true,
+  "gemma_collator": false,
+  "hidden_size": 1024,
+  "image_crop_size": null,
+  "image_target_size": null,
+  "input_embedding_dim": 1536,
+  "load_bf16": true,
+  "max_action_dim": 128,
+  "max_num_embodiments": 32,
+  "max_seq_len": 1024,
+  "max_state_dim": 128,
+  "model_dtype": "bfloat16",
+  "model_name": "nvidia/Eagle-Block2A-2B-v2",
+  "model_type": "Gr00tN1d6",
+  "noise_beta_alpha": 1.5,
+  "noise_beta_beta": 1.0,
+  "noise_s": 0.999,
+  "num_inference_timesteps": 4,
+  "num_timestep_buckets": 1000,
+  "random_rotation_angle": null,
+  "reproject_vision": false,
+  "select_layer": 16,
+  "shortest_image_edge": 256,
+  "state_dropout_prob": 0.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "tune_diffusion_model": false,
+  "tune_llm": false,
+  "tune_projector": true,
+  "tune_top_llm_layers": 4,
+  "tune_visual": false,
+  "tune_vlln": true,
+  "use_albumentations_transforms": true,
+  "use_alternate_vl_dit": true,
+  "use_flash_attention": true,
+  "use_relative_action": true,
+  "use_vlln": true
+}

embodiment_id.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "robocasa_panda_omron": 13,
+  "gr1": 20,
+  "behavior_r1_pro": 24,
+  "unitree_g1": 8,
+  "oxe_google": 0,
+  "oxe_widowx": 1,
+  "libero_panda": 2,
+  "oxe_droid": 16,
+  "new_embodiment": 10
+}

experiment_cfg/conf.yaml ADDED Viewed

	@@ -0,0 +1,256 @@

+load_config_path: null
+model:
+  model_type: Gr00tN1d6
+  model_dtype: bfloat16
+  model_name: nvidia/Eagle-Block2A-2B-v2
+  backbone_model_type: eagle
+  model_revision: null
+  tune_top_llm_layers: 4
+  backbone_embedding_dim: 2048
+  tune_llm: false
+  tune_visual: false
+  select_layer: 16
+  reproject_vision: false
+  use_flash_attention: true
+  load_bf16: false
+  collator_overwrite_image_inputs: false
+  eagle_collator: true
+  backbone_trainable_params_fp32: true
+  image_crop_size: null
+  image_target_size: null
+  shortest_image_edge: 256
+  crop_fraction: 0.95
+  random_rotation_angle: null
+  color_jitter_params:
+    brightness: 0.3
+    contrast: 0.4
+    saturation: 0.5
+    hue: 0.08
+  use_albumentations_transforms: true
+  extra_augmentation_config: null
+  formalize_language: true
+  apply_sincos_state_encoding: false
+  use_relative_action: true
+  max_state_dim: 29
+  max_action_dim: 29
+  action_horizon: 16
+  hidden_size: 1024
+  input_embedding_dim: 1536
+  add_pos_embed: true
+  attn_dropout: 0.2
+  use_vlln: true
+  max_seq_len: 1024
+  use_alternate_vl_dit: true
+  attend_text_every_n_blocks: 2
+  diffusion_model_cfg:
+    positional_embeddings: null
+    num_layers: 32
+    num_attention_heads: 32
+    attention_head_dim: 48
+    norm_type: ada_norm
+    dropout: 0.2
+    final_dropout: true
+    output_dim: 1024
+    interleave_self_attention: true
+  num_inference_timesteps: 4
+  noise_beta_alpha: 1.5
+  noise_beta_beta: 1.0
+  noise_s: 0.999
+  num_timestep_buckets: 1000
+  tune_projector: true
+  tune_diffusion_model: false
+  tune_vlln: true
+  state_dropout_prob: 0.0
+  state_additive_noise_scale: 0.0
+  max_num_embodiments: 32
+data:
+  datasets:
+  - dataset_paths:
+    - datasets/theconstruct-ai_push_left_test
+    - datasets/theconstruct-ai_push_right_test
+    - datasets/theconstruct-ai_push_left_test_2
+    - datasets/theconstruct-ai_push_right_test_2
+    embodiment_tag: unitree_g1
+    mix_ratio: 1.0
+    dataset_type: physical_embodiment
+    val_dataset_path: null
+  modality_configs:
+    unitree_g1:
+      video:
+        delta_indices:
+        - 0
+        modality_keys:
+        - ego_view
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs: null
+      state:
+        delta_indices:
+        - 0
+        modality_keys:
+        - left_leg
+        - right_leg
+        - waist
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs: null
+      action:
+        delta_indices:
+        - 0
+        - 1
+        - 2
+        - 3
+        - 4
+        - 5
+        - 6
+        - 7
+        - 8
+        - 9
+        - 10
+        - 11
+        - 12
+        - 13
+        - 14
+        - 15
+        - 16
+        - 17
+        - 18
+        - 19
+        - 20
+        - 21
+        - 22
+        - 23
+        - 24
+        - 25
+        - 26
+        - 27
+        - 28
+        - 29
+        modality_keys:
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        - waist
+        - base_height_command
+        - navigate_command
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs:
+        - rep: RELATIVE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: RELATIVE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+        - rep: ABSOLUTE
+          type: NON_EEF
+          format: DEFAULT
+          state_key: null
+      language:
+        delta_indices:
+        - 0
+        modality_keys:
+        - annotation.human.task_description
+        sin_cos_embedding_keys: null
+        mean_std_embedding_keys: null
+        action_configs: null
+  download_cache: false
+  shard_size: 1024
+  episode_sampling_rate: 0.1
+  num_shards_per_epoch: 100000
+  override_pretraining_statistics: false
+  mode: single_turn
+  random_chop: 0.0
+  mock_dataset_mode: false
+  shuffle: true
+  seed: 42
+  multiprocessing_context: fork
+  allow_padding: false
+  subsample_ratio: 1.0
+  image_crop_size:
+  - 244
+  - 244
+  image_target_size:
+  - 224
+  - 224
+  video_backend: torchcodec
+training:
+  output_dir: ./outputs
+  experiment_name: null
+  max_steps: 10
+  global_batch_size: 4
+  batch_size: null
+  gradient_accumulation_steps: 4
+  learning_rate: 0.0001
+  lr_scheduler_type: cosine
+  weight_decay: 1.0e-05
+  warmup_ratio: 0.05
+  warmup_steps: 0
+  max_grad_norm: 1.0
+  optim: adamw_torch
+  start_from_checkpoint: nvidia/GR00T-N1.6-3B
+  tf32: true
+  fp16: false
+  bf16: true
+  eval_bf16: true
+  logging_steps: 10
+  save_steps: 1000
+  save_total_limit: 5
+  save_vl_model: false
+  upload_checkpoints: false
+  upload_every: 1000
+  upload_last_n_checkpoints: 5
+  max_concurrent_uploads: 2
+  eval_strategy: 'no'
+  eval_steps: 500
+  eval_set_split_ratio: 0.1
+  eval_batch_size: 2
+  save_best_eval_metric_name: ''
+  save_best_eval_metric_greater_is_better: true
+  deepspeed_stage: 2
+  gradient_checkpointing: false
+  transformers_trust_remote_code: true
+  transformers_local_files_only: false
+  transformers_cache_dir: null
+  transformers_access_token: null
+  use_ddp: false
+  ddp_bucket_cap_mb: 100
+  num_gpus: 1
+  dataloader_num_workers: 4
+  remove_unused_columns: false
+  use_wandb: true
+  wandb_project: finetune-gr00t-n1d6
+  enable_profiling: false
+  max_retries: 3
+  assert_loss_less_than: null
+  add_rl_callback: false
+  enable_open_loop_eval: false
+  open_loop_eval_traj_ids:
+  - 0
+  open_loop_eval_steps_per_traj: 100
+  open_loop_eval_plot_indices: null
+max_steps: 10
+save_steps: 1000

experiment_cfg/config.yaml ADDED Viewed

	@@ -0,0 +1,295 @@

+!!python/object:gr00t.configs.base_config.Config
+data: !!python/object:gr00t.configs.data.data_config.DataConfig
+  allow_padding: false
+  datasets:
+  - !!python/object:gr00t.configs.data.data_config.SingleDatasetConfig
+    dataset_paths:
+    - datasets/theconstruct-ai_push_left_test
+    - datasets/theconstruct-ai_push_right_test
+    - datasets/theconstruct-ai_push_left_test_2
+    - datasets/theconstruct-ai_push_right_test_2
+    dataset_type: physical_embodiment
+    embodiment_tag: unitree_g1
+    mix_ratio: 1.0
+    val_dataset_path: null
+  download_cache: false
+  episode_sampling_rate: 0.1
+  image_crop_size:
+  - 244
+  - 244
+  image_target_size:
+  - 224
+  - 224
+  mock_dataset_mode: false
+  modality_configs:
+    unitree_g1:
+      action: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs:
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: &id001 !!python/object/apply:gr00t.data.types.ActionFormat
+          - default
+          rep: &id002 !!python/object/apply:gr00t.data.types.ActionRepresentation
+          - relative
+          state_key: null
+          type: &id003 !!python/object/apply:gr00t.data.types.ActionType
+          - non_eef
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id002
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: &id004 !!python/object/apply:gr00t.data.types.ActionRepresentation
+          - absolute
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id004
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id004
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id004
+          state_key: null
+          type: *id003
+        - !!python/object:gr00t.data.types.ActionConfig
+          format: *id001
+          rep: *id004
+          state_key: null
+          type: *id003
+        delta_indices:
+        - 0
+        - 1
+        - 2
+        - 3
+        - 4
+        - 5
+        - 6
+        - 7
+        - 8
+        - 9
+        - 10
+        - 11
+        - 12
+        - 13
+        - 14
+        - 15
+        - 16
+        - 17
+        - 18
+        - 19
+        - 20
+        - 21
+        - 22
+        - 23
+        - 24
+        - 25
+        - 26
+        - 27
+        - 28
+        - 29
+        mean_std_embedding_keys: null
+        modality_keys:
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        - waist
+        - base_height_command
+        - navigate_command
+        sin_cos_embedding_keys: null
+      language: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs: null
+        delta_indices:
+        - 0
+        mean_std_embedding_keys: null
+        modality_keys:
+        - annotation.human.task_description
+        sin_cos_embedding_keys: null
+      state: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs: null
+        delta_indices:
+        - 0
+        mean_std_embedding_keys: null
+        modality_keys:
+        - left_leg
+        - right_leg
+        - waist
+        - left_arm
+        - right_arm
+        - left_hand
+        - right_hand
+        sin_cos_embedding_keys: null
+      video: !!python/object:gr00t.data.types.ModalityConfig
+        action_configs: null
+        delta_indices:
+        - 0
+        mean_std_embedding_keys: null
+        modality_keys:
+        - ego_view
+        sin_cos_embedding_keys: null
+  mode: single_turn
+  multiprocessing_context: fork
+  num_shards_per_epoch: 100000
+  override_pretraining_statistics: false
+  random_chop: 0.0
+  seed: 42
+  shard_size: 1024
+  shuffle: true
+  subsample_ratio: 1.0
+  video_backend: torchcodec
+load_config_path: null
+model: !!python/object:gr00t.configs.model.gr00t_n1d6.Gr00tN1d6Config
+  _attn_implementation_autoset: false
+  _attn_implementation_internal: null
+  _commit_hash: null
+  _name_or_path: ''
+  add_cross_attention: false
+  architectures: null
+  backbone_model_type: eagle
+  backbone_trainable_params_fp32: true
+  bad_words_ids: null
+  begin_suppress_tokens: null
+  bos_token_id: null
+  chunk_size_feed_forward: 0
+  color_jitter_params:
+    brightness: 0.3
+    contrast: 0.4
+    hue: 0.08
+    saturation: 0.5
+  cross_attention_hidden_size: null
+  decoder_start_token_id: null
+  diffusion_model_cfg:
+    attention_head_dim: 48
+    dropout: 0.2
+    final_dropout: true
+    interleave_self_attention: true
+    norm_type: ada_norm
+    num_attention_heads: 32
+    num_layers: 32
+    output_dim: 1024
+    positional_embeddings: null
+  diversity_penalty: 0.0
+  do_sample: false
+  eagle_collator: true
+  early_stopping: false
+  encoder_no_repeat_ngram_size: 0
+  eos_token_id: null
+  exponential_decay_length_penalty: null
+  extra_augmentation_config: null
+  finetuning_task: null
+  forced_bos_token_id: null
+  forced_eos_token_id: null
+  id2label:
+    0: LABEL_0
+    1: LABEL_1
+  is_decoder: false
+  is_encoder_decoder: false
+  label2id:
+    LABEL_0: 0
+    LABEL_1: 1
+  length_penalty: 1.0
+  load_bf16: false
+  max_length: 20
+  min_length: 0
+  model_name: nvidia/Eagle-Block2A-2B-v2
+  no_repeat_ngram_size: 0
+  num_beam_groups: 1
+  num_beams: 1
+  num_return_sequences: 1
+  output_attentions: false
+  output_hidden_states: false
+  output_scores: false
+  pad_token_id: null
+  prefix: null
+  problem_type: null
+  pruned_heads: {}
+  random_rotation_angle: null
+  remove_invalid_values: false
+  repetition_penalty: 1.0
+  reproject_vision: false
+  return_dict: true
+  return_dict_in_generate: false
+  sep_token_id: null
+  state_dropout_prob: 0.0
+  suppress_tokens: null
+  task_specific_params: null
+  temperature: 1.0
+  tf_legacy_loss: false
+  tie_encoder_decoder: false
+  tie_word_embeddings: true
+  tokenizer_class: null
+  top_k: 50
+  top_p: 1.0
+  torch_dtype: null
+  torchscript: false
+  transformers_version: null
+  tune_diffusion_model: false
+  tune_llm: false
+  tune_projector: true
+  tune_visual: false
+  typical_p: 1.0
+  use_bfloat16: false
+  use_relative_action: true
+training: !!python/object:gr00t.configs.training.training_config.TrainingConfig
+  add_rl_callback: false
+  assert_loss_less_than: null
+  batch_size: null
+  bf16: true
+  dataloader_num_workers: 4
+  ddp_bucket_cap_mb: 100
+  deepspeed_stage: 2
+  enable_open_loop_eval: false
+  enable_profiling: false
+  eval_batch_size: 2
+  eval_bf16: true
+  eval_set_split_ratio: 0.1
+  eval_steps: 500
+  eval_strategy: 'no'
+  experiment_name: null
+  fp16: false
+  global_batch_size: 4
+  gradient_accumulation_steps: 4
+  gradient_checkpointing: false
+  learning_rate: 0.0001
+  logging_steps: 10
+  lr_scheduler_type: cosine
+  max_concurrent_uploads: 2
+  max_grad_norm: 1.0
+  max_retries: 3
+  max_steps: 10
+  num_gpus: 1
+  open_loop_eval_plot_indices: null
+  open_loop_eval_steps_per_traj: 100
+  open_loop_eval_traj_ids:
+  - 0
+  optim: adamw_torch
+  output_dir: ./outputs
+  remove_unused_columns: false
+  save_best_eval_metric_greater_is_better: true
+  save_best_eval_metric_name: ''
+  save_steps: 1000
+  save_total_limit: 5
+  save_vl_model: false
+  start_from_checkpoint: nvidia/GR00T-N1.6-3B
+  tf32: true
+  transformers_access_token: null
+  transformers_cache_dir: null
+  transformers_local_files_only: false
+  transformers_trust_remote_code: true
+  upload_checkpoints: false
+  upload_every: 1000
+  upload_last_n_checkpoints: 5
+  use_ddp: false
+  use_wandb: true
+  wandb_project: finetune-gr00t-n1d6
+  warmup_ratio: 0.05
+  warmup_steps: 0
+  weight_decay: 1.0e-05

experiment_cfg/dataset_statistics.json ADDED Viewed

The diff for this file is too large to render. See raw diff

experiment_cfg/final_model_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "model_type": "Gr00tN1d6",
+  "model_dtype": "bfloat16",
+  "model_name": "nvidia/Eagle-Block2A-2B-v2",
+  "backbone_model_type": "eagle",
+  "model_revision": null,
+  "tune_top_llm_layers": 4,
+  "backbone_embedding_dim": 2048,
+  "tune_llm": false,
+  "tune_visual": false,
+  "select_layer": 16,
+  "reproject_vision": false,
+  "use_flash_attention": true,
+  "load_bf16": true,
+  "collator_overwrite_image_inputs": false,
+  "eagle_collator": true,
+  "backbone_trainable_params_fp32": true,
+  "extra_augmentation_config": null,
+  "apply_sincos_state_encoding": true,
+  "use_relative_action": true,
+  "max_state_dim": 128,
+  "max_action_dim": 128,
+  "action_horizon": 50,
+  "hidden_size": 1024,
+  "input_embedding_dim": 1536,
+  "add_pos_embed": true,
+  "attn_dropout": 0.2,
+  "use_vlln": true,
+  "max_seq_len": 1024,
+  "use_alternate_vl_dit": true,
+  "attend_text_every_n_blocks": 2,
+  "diffusion_model_cfg": {
+    "attention_head_dim": 48,
+    "dropout": 0.2,
+    "final_dropout": true,
+    "interleave_self_attention": true,
+    "norm_type": "ada_norm",
+    "num_attention_heads": 32,
+    "num_layers": 32,
+    "output_dim": 1024,
+    "positional_embeddings": null
+  },
+  "num_inference_timesteps": 4,
+  "noise_beta_alpha": 1.5,
+  "noise_beta_beta": 1.0,
+  "noise_s": 0.999,
+  "num_timestep_buckets": 1000,
+  "tune_projector": true,
+  "tune_diffusion_model": false,
+  "tune_vlln": true,
+  "state_dropout_prob": 0.0,
+  "state_additive_noise_scale": 0.0,
+  "max_num_embodiments": 32
+}

experiment_cfg/final_processor_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01e110ce2057e354e8d2b5ab598f1703c8739b9682c6e53c956bccdbcd4d36a0
+size 4990120184

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71d6ae4204a83c9fcf99fe4b5f548f7453e5a96adbfe8bb615e3ae0847e3320f
+size 4823190320

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dab500ad7c9a60cf6533e3233cf5d09433ff0852c947cbb1181a5f671511fb57
+size 4226018251

processor_config.json ADDED Viewed

	@@ -0,0 +1,508 @@

+{
+  "processor_class": "Gr00tN1d6Processor",
+  "processor_kwargs": {
+    "modality_configs": {
+      "behavior_r1_pro": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "observation.images.rgb.head_256_256",
+            "observation.images.rgb.left_wrist_256_256",
+            "observation.images.rgb.right_wrist_256_256"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "robot_pos",
+            "robot_ori_cos",
+            "robot_ori_sin",
+            "robot_2d_ori",
+            "robot_2d_ori_cos",
+            "robot_2d_ori_sin",
+            "robot_lin_vel",
+            "robot_ang_vel",
+            "arm_left_qpos",
+            "arm_left_qpos_sin",
+            "arm_left_qpos_cos",
+            "eef_left_pos",
+            "eef_left_quat",
+            "gripper_left_qpos",
+            "arm_right_qpos",
+            "arm_right_qpos_sin",
+            "arm_right_qpos_cos",
+            "eef_right_pos",
+            "eef_right_quat",
+            "gripper_right_qpos",
+            "trunk_qpos"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31
+          ],
+          "modality_keys": [
+            "base",
+            "torso",
+            "left_arm",
+            "left_gripper",
+            "right_arm",
+            "right_gripper"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": "trunk_qpos"
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": "arm_left_qpos"
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": "arm_right_qpos"
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "annotation.human.coarse_action"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      },
+      "gr1": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "ego_view_bg_crop_pad_res256_freq20"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist"
+          ],
+          "sin_cos_embedding_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist"
+          ],
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15
+          ],
+          "modality_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "task"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      },
+      "robocasa_panda_omron": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "res256_image_side_0",
+            "res256_image_side_1",
+            "res256_image_wrist_0"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "end_effector_position_relative",
+            "end_effector_rotation_relative",
+            "gripper_qpos",
+            "base_position",
+            "base_rotation"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15
+          ],
+          "modality_keys": [
+            "end_effector_position",
+            "end_effector_rotation",
+            "gripper_close",
+            "base_motion",
+            "control_mode"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "annotation.human.action.task_description"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      },
+      "unitree_g1": {
+        "video": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "ego_view"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "state": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "left_leg",
+            "right_leg",
+            "waist",
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        },
+        "action": {
+          "delta_indices": [
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29
+          ],
+          "modality_keys": [
+            "left_arm",
+            "right_arm",
+            "left_hand",
+            "right_hand",
+            "waist",
+            "base_height_command",
+            "navigate_command"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": [
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "RELATIVE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            },
+            {
+              "rep": "ABSOLUTE",
+              "type": "NON_EEF",
+              "format": "DEFAULT",
+              "state_key": null
+            }
+          ]
+        },
+        "language": {
+          "delta_indices": [
+            0
+          ],
+          "modality_keys": [
+            "annotation.human.task_description"
+          ],
+          "sin_cos_embedding_keys": null,
+          "mean_std_embedding_keys": null,
+          "action_configs": null
+        }
+      }
+    },
+    "image_crop_size": null,
+    "image_target_size": null,
+    "use_albumentations": true,
+    "random_rotation_angle": null,
+    "color_jitter_params": {
+      "brightness": 0.3,
+      "contrast": 0.4,
+      "saturation": 0.5,
+      "hue": 0.08
+    },
+    "shortest_image_edge": 256,
+    "crop_fraction": 0.95,
+    "model_name": "nvidia/Eagle-Block2A-2B-v2",
+    "model_type": "eagle",
+    "formalize_language": true,
+    "max_state_dim": 128,
+    "max_action_dim": 128,
+    "max_action_horizon": 50,
+    "use_percentiles": false,
+    "clip_outliers": true,
+    "apply_sincos_state_encoding": true,
+    "use_relative_action": true
+  }
+}

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af8bd8e05457c51c966e8c3b7b1f70a92789f7d712a20ce0d26482224c96d635
+size 14645

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f29b14e4fe3333c764817ba3914df120adf512a79675e9ce6747be6f9beaad8f
+size 1465

statistics.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 10,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "grad_norm": 0.18987387418746948,
+      "learning_rate": 3.0153689607045845e-06,
+      "loss": 1.0835,
+      "step": 10
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:299da16ba2d50f8a07db1c1e846990897563c619c257f0f80bea79fc62c1251e
+size 5713

wandb_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"project": "finetune-gr00t-n1d6", "run_id": "outputs"}