Upload HEX EAI real-world 2B checkpoint

Browse files

Files changed (7) hide show

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/EAI_real_world.log +0 -0
EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/checkpoints/steps_300000_pytorch_model.pt +3 -0
EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/config.json +169 -0
EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/config.yaml +148 -0
EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/dataset_statistics.json +0 -0
EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/embodiment_registry.json +394 -0
EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/summary.jsonl +6 -0

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/EAI_real_world.log ADDED Viewed

The diff for this file is too large to render. See raw diff

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/checkpoints/steps_300000_pytorch_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:345ffdda686a5941b6ee0fe4214ed909bdbf7fb2106cdaaef1cfd846be306b2f
+size 5533777864

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/config.json ADDED Viewed

	@@ -0,0 +1,169 @@

+{
+  "run_id": "hex_ac100_300k_8gpu_state_query_history2",
+  "run_root_dir": "./pretrained_models/hex/EAI_real_world_2B",
+  "seed": 42,
+  "trackers": [
+    "jsonl",
+    "wandb"
+  ],
+  "wandb_entity": "HEXTeam",
+  "wandb_project": "hex",
+  "is_debug": false,
+  "enable_mee": false,
+  "mee_weight": 0.01,
+  "framework": {
+    "name": "HEX",
+    "qwenvl": {
+      "base_vlm": "/mnt/dataset/vnwy44/model/Qwen3-VL-2B-Instruct",
+      "attn_implementation": "flash_attention_2",
+      "vl_hidden_dim": 2048,
+      "add_query": true
+    },
+    "dino": {
+      "dino_backbone": "dinov2_vits14"
+    },
+    "action_model": {
+      "action_model_type": "DiT-B",
+      "action_hidden_dim": 2,
+      "hidden_size": 1024,
+      "add_pos_embed": true,
+      "max_seq_len": 1024,
+      "action_dim": 32,
+      "state_dim": 36,
+      "future_action_window_size": 99,
+      "action_horizon": 100,
+      "past_action_window_size": 0,
+      "repeated_diffusion_steps": 8,
+      "noise_beta_alpha": 1.5,
+      "noise_beta_beta": 1.0,
+      "noise_s": 0.999,
+      "num_timestep_buckets": 1000,
+      "num_inference_timesteps": 4,
+      "num_target_vision_tokens": 32,
+      "diffusion_model_cfg": {
+        "cross_attention_dim": 2048,
+        "cross_attention_dim_vl": 2048,
+        "cross_attention_dim_state": 768,
+        "dropout": 0.2,
+        "final_dropout": true,
+        "interleave_self_attention": true,
+        "norm_type": "ada_norm",
+        "num_layers": 16,
+        "output_dim": 1024,
+        "positional_embeddings": null
+      }
+    },
+    "state_model": {
+      "state_horizon": 50,
+      "transformer_block": {
+        "state_dim": 36,
+        "input_dim": 768,
+        "hidden_dim": 768,
+        "num_layers": 4,
+        "num_heads": 8,
+        "head_dim": 96,
+        "mlp_hidden_size": 256,
+        "dropout": 0.1,
+        "cross_attention_dim": 2048
+      },
+      "MoE_block": {
+        "num_experts_per_tok": 1,
+        "n_routed_experts": 16,
+        "scoring_func": "softmax",
+        "aux_loss_alpha": 0.01,
+        "seq_aux": true,
+        "norm_topk_prob": true,
+        "condition_dim": 7,
+        "moe_intermediate_size": 1536,
+        "n_shared_experts": 2,
+        "hidden_size": 768,
+        "intermediate_size": 1536,
+        "hidden_act": "silu",
+        "pretraining_tp": 1
+      }
+    },
+    "reduce_in_full_precision": true
+  },
+  "datasets": {
+    "vlm_data": {
+      "dataset_py": "vlm_datasets",
+      "dataformat": "llava_json",
+      "dataset_use": "asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en",
+      "eval_dataset": "aokvqa_cauldron_llava_format",
+      "data_flatten": false,
+      "base_interval": 2,
+      "max_pixels": 12845056,
+      "min_pixels": 3136,
+      "model_max_length": 2048,
+      "model_type": "qwen2.5vl",
+      "per_device_batch_size": 4
+    },
+    "vla_data": {
+      "dataset_py": "lerobot_datasets",
+      "data_root_dir": "/mnt/dataset/vnwy44/data/eai_real_world",
+      "data_mix": "EAI_real_world",
+      "action_type": "delta_qpos",
+      "CoT_prompt": "Your task is {instruction}. To identify the key objects for your task. Locate their bounding boxes in [x1,y1,x2,y2] format.",
+      "CoT_answer": "bbox",
+      "default_image_resolution": [
+        3,
+        224,
+        224
+      ],
+      "per_device_batch_size": 16,
+      "load_all_data_for_training": true,
+      "obs": [
+        "image_0"
+      ],
+      "delete_pause_frame": false,
+      "need_state": true,
+      "need_tag": true,
+      "vision_history_length": 2,
+      "action_chunk_size": 100
+    }
+  },
+  "trainer": {
+    "epochs": 100,
+    "num_warmup_steps_state": 2000,
+    "max_train_steps": 300000,
+    "num_warmup_steps": 5000,
+    "save_interval": 50000,
+    "eval_interval": 10000,
+    "learning_rate": {
+      "base": 2.5e-05,
+      "qwen_vl_interface": 1e-05,
+      "state_model": 2e-05,
+      "action_model": 2e-05
+    },
+    "lr_scheduler_type": "cosine_with_min_lr",
+    "scheduler_specific_kwargs": {
+      "min_lr": 1e-06
+    },
+    "freeze_modules": null,
+    "loss_scale": {
+      "vla": 1.0,
+      "vlm": 0.1
+    },
+    "max_grad_norm": 1.0,
+    "warmup_ratio": 0.1,
+    "weight_decay": 0.0,
+    "logging_frequency": 100,
+    "gradient_clipping": 1.0,
+    "gradient_accumulation_steps": 1,
+    "optimizer": {
+      "name": "AdamW",
+      "betas": [
+        0.9,
+        0.95
+      ],
+      "eps": 1e-08,
+      "weight_decay": 1e-08
+    },
+    "is_resume": false,
+    "resume_epoch": null,
+    "resume_step": null,
+    "enable_gradient_checkpointing": true,
+    "enable_mixed_precision_training": true
+  },
+  "output_dir": "./pretrained_models/hex/EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2"
+}

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/config.yaml ADDED Viewed

	@@ -0,0 +1,148 @@

+run_id: hex_ac100_300k_8gpu_state_query_history2
+run_root_dir: ./pretrained_models/hex/EAI_real_world_2B
+seed: 42
+trackers:
+- jsonl
+- wandb
+wandb_entity: HEXTeam
+wandb_project: hex
+is_debug: false
+enable_mee: false
+mee_weight: 0.01
+framework:
+  name: HEX
+  qwenvl:
+    base_vlm: /mnt/dataset/vnwy44/model/Qwen3-VL-2B-Instruct
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048
+    add_query: true
+  dino:
+    dino_backbone: dinov2_vits14
+  action_model:
+    action_model_type: DiT-B
+    action_hidden_dim: 2
+    hidden_size: 1024
+    add_pos_embed: true
+    max_seq_len: 1024
+    action_dim: 32
+    state_dim: 36
+    future_action_window_size: 99
+    action_horizon: 100
+    past_action_window_size: 0
+    repeated_diffusion_steps: 8
+    noise_beta_alpha: 1.5
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048
+      cross_attention_dim_vl: 2048
+      cross_attention_dim_state: 768
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024
+      positional_embeddings: null
+  state_model:
+    state_horizon: 50
+    transformer_block:
+      state_dim: 36
+      input_dim: 768
+      hidden_dim: 768
+      num_layers: 4
+      num_heads: 8
+      head_dim: 96
+      mlp_hidden_size: 256
+      dropout: 0.1
+      cross_attention_dim: 2048
+    MoE_block:
+      num_experts_per_tok: 1
+      n_routed_experts: 16
+      scoring_func: softmax
+      aux_loss_alpha: 0.01
+      seq_aux: true
+      norm_topk_prob: true
+      condition_dim: 7
+      moe_intermediate_size: 1536
+      n_shared_experts: 2
+      hidden_size: 768
+      intermediate_size: 1536
+      hidden_act: silu
+      pretraining_tp: 1
+  reduce_in_full_precision: true
+datasets:
+  vlm_data:
+    dataset_py: vlm_datasets
+    dataformat: llava_json
+    dataset_use: asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en
+    eval_dataset: aokvqa_cauldron_llava_format
+    data_flatten: false
+    base_interval: 2
+    max_pixels: 12845056
+    min_pixels: 3136
+    model_max_length: 2048
+    model_type: qwen2.5vl
+    per_device_batch_size: 4
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: /mnt/dataset/vnwy44/data/eai_real_world
+    data_mix: EAI_real_world
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
+      Locate their bounding boxes in [x1,y1,x2,y2] format.
+    CoT_answer: bbox
+    default_image_resolution:
+    - 3
+    - 224
+    - 224
+    per_device_batch_size: 16
+    load_all_data_for_training: true
+    obs:
+    - image_0
+    delete_pause_frame: false
+    need_state: true
+    need_tag: true
+    vision_history_length: 2
+    action_chunk_size: 100
+trainer:
+  epochs: 100
+  num_warmup_steps_state: 2000
+  max_train_steps: 300000
+  num_warmup_steps: 5000
+  save_interval: 50000
+  eval_interval: 10000
+  learning_rate:
+    base: 2.5e-05
+    qwen_vl_interface: 1.0e-05
+    state_model: 2.0e-05
+    action_model: 2.0e-05
+  lr_scheduler_type: cosine_with_min_lr
+  scheduler_specific_kwargs:
+    min_lr: 1.0e-06
+  freeze_modules: null
+  loss_scale:
+    vla: 1.0
+    vlm: 0.1
+  max_grad_norm: 1.0
+  warmup_ratio: 0.1
+  weight_decay: 0.0
+  logging_frequency: 100
+  gradient_clipping: 1.0
+  gradient_accumulation_steps: 1
+  optimizer:
+    name: AdamW
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08
+  is_resume: false
+  resume_epoch: null
+  resume_step: null
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+output_dir: ./pretrained_models/hex/EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/dataset_statistics.json ADDED Viewed

The diff for this file is too large to render. See raw diff

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/embodiment_registry.json ADDED Viewed

	@@ -0,0 +1,394 @@

+{
+  "state_registry": {
+    "tienkung2_v1": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 7,
+        "end": 13,
+        "dim": 6
+      },
+      "right_arm": {
+        "start": 13,
+        "end": 20,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 20,
+        "end": 26,
+        "dim": 6
+      },
+      "head": {
+        "start": 26,
+        "end": 29,
+        "dim": 3
+      },
+      "waist": {
+        "start": 29,
+        "end": 33,
+        "dim": 4
+      }
+    },
+    "tienkung2_v2": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 7,
+        "end": 13,
+        "dim": 6
+      },
+      "right_arm": {
+        "start": 13,
+        "end": 20,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 20,
+        "end": 26,
+        "dim": 6
+      },
+      "waist": {
+        "start": 26,
+        "end": 30,
+        "dim": 4
+      }
+    },
+    "tienkung2_v3": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 7,
+        "end": 13,
+        "dim": 6
+      },
+      "right_arm": {
+        "start": 13,
+        "end": 20,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 20,
+        "end": 26,
+        "dim": 6
+      },
+      "waist": {
+        "start": 26,
+        "end": 30,
+        "dim": 4
+      },
+      "others": {
+        "start": 30,
+        "end": 36,
+        "dim": 6
+      }
+    },
+    "tienkung3_v1": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 7,
+        "end": 13,
+        "dim": 6
+      },
+      "right_arm": {
+        "start": 13,
+        "end": 20,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 20,
+        "end": 26,
+        "dim": 6
+      },
+      "waist": {
+        "start": 26,
+        "end": 29,
+        "dim": 3
+      },
+      "left_leg": {
+        "start": 29,
+        "end": 35,
+        "dim": 6
+      },
+      "right_leg": {
+        "start": 35,
+        "end": 41,
+        "dim": 6
+      },
+      "others": {
+        "start": 41,
+        "end": 51,
+        "dim": 10
+      }
+    },
+    "tienkung3_v2": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 7,
+        "end": 8,
+        "dim": 1
+      },
+      "right_arm": {
+        "start": 8,
+        "end": 15,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 15,
+        "end": 21,
+        "dim": 6
+      },
+      "waist": {
+        "start": 21,
+        "end": 24,
+        "dim": 3
+      },
+      "left_leg": {
+        "start": 24,
+        "end": 30,
+        "dim": 6
+      },
+      "right_leg": {
+        "start": 30,
+        "end": 36,
+        "dim": 6
+      },
+      "others": {
+        "start": 36,
+        "end": 46,
+        "dim": 10
+      }
+    },
+    "tienkung3_v3": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "right_arm": {
+        "start": 7,
+        "end": 14,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 14,
+        "end": 20,
+        "dim": 6
+      },
+      "others": {
+        "start": 20,
+        "end": 40,
+        "dim": 20
+      },
+      "waist": {
+        "start": 40,
+        "end": 43,
+        "dim": 3
+      },
+      "head": {
+        "start": 43,
+        "end": 46,
+        "dim": 3
+      }
+    },
+    "tianyi_v1": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 7,
+        "end": 8,
+        "dim": 1
+      },
+      "right_arm": {
+        "start": 8,
+        "end": 15,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 15,
+        "end": 16,
+        "dim": 1
+      }
+    },
+    "unitree_g1_v2": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "right_arm": {
+        "start": 7,
+        "end": 14,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 14,
+        "end": 21,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 21,
+        "end": 28,
+        "dim": 7
+      },
+      "left_leg": {
+        "start": 28,
+        "end": 34,
+        "dim": 6
+      },
+      "right_leg": {
+        "start": 34,
+        "end": 40,
+        "dim": 6
+      }
+    },
+    "unitree_g1_v1": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "right_arm": {
+        "start": 7,
+        "end": 14,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 14,
+        "end": 21,
+        "dim": 7
+      },
+      "right_hand": {
+        "start": 21,
+        "end": 28,
+        "dim": 7
+      },
+      "left_leg": {
+        "start": 28,
+        "end": 34,
+        "dim": 6
+      },
+      "right_leg": {
+        "start": 34,
+        "end": 40,
+        "dim": 6
+      },
+      "waist": {
+        "start": 40,
+        "end": 43,
+        "dim": 3
+      }
+    },
+    "unitree_h1_v1": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "right_arm": {
+        "start": 7,
+        "end": 14,
+        "dim": 7
+      },
+      "left_hand": {
+        "start": 14,
+        "end": 20,
+        "dim": 6
+      },
+      "right_hand": {
+        "start": 20,
+        "end": 26,
+        "dim": 6
+      },
+      "left_leg": {
+        "start": 26,
+        "end": 32,
+        "dim": 6
+      },
+      "right_leg": {
+        "start": 32,
+        "end": 38,
+        "dim": 6
+      },
+      "waist": {
+        "start": 38,
+        "end": 39,
+        "dim": 1
+      }
+    },
+    "leju_kuavo_v1": {
+      "left_arm": {
+        "start": 0,
+        "end": 7,
+        "dim": 7
+      },
+      "right_arm": {
+        "start": 7,
+        "end": 14,
+        "dim": 7
+      },
+      "left_leg": {
+        "start": 14,
+        "end": 20,
+        "dim": 6
+      },
+      "right_leg": {
+        "start": 20,
+        "end": 26,
+        "dim": 6
+      },
+      "left_hand": {
+        "start": 26,
+        "end": 32,
+        "dim": 6
+      },
+      "right_hand": {
+        "start": 32,
+        "end": 38,
+        "dim": 6
+      },
+      "head": {
+        "start": 38,
+        "end": 40,
+        "dim": 2
+      },
+      "others": {
+        "start": 40,
+        "end": 118,
+        "dim": 78
+      }
+    }
+  },
+  "action_registry": {
+    "tienkung2_v1": 23,
+    "tienkung2_v2": 20,
+    "tienkung2_v3": 32,
+    "tienkung3_v1": 20,
+    "tienkung3_v2": 20,
+    "tienkung3_v3": 8,
+    "tianyi_v1": 16,
+    "unitree_g1_v2": 32,
+    "unitree_g1_v1": 28,
+    "unitree_h1_v1": 26,
+    "leju_kuavo_v1": 54
+  }
+}

EAI_real_world_2B/hex_ac100_300k_8gpu_state_query_history2/summary.jsonl ADDED Viewed

	@@ -0,0 +1,6 @@

+{"steps": 50000}
+{"steps": 100000}
+{"steps": 150000}
+{"steps": 200000}
+{"steps": 250000}
+{"steps": 300000}