Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

LIBERO/checkpoints/VLA-JEPA-LIBERO.pt +3 -0
LIBERO/config.json +114 -0
LIBERO/config.yaml +99 -0
LIBERO/dataset_statistics.json +133 -0
LIBERO/summary.jsonl +3 -0
Pretrain/checkpoints/VLA-JEPA-pretrain.pt +3 -0
Pretrain/config.json +125 -0
Pretrain/config.yaml +109 -0
Pretrain/dataset_statistics.json +133 -0
Pretrain/summary.jsonl +5 -0

LIBERO/checkpoints/VLA-JEPA-LIBERO.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b46c8f268905d50944aea9dc0a087400cdfbc401a8ec9ca22921fa91c0dcb841
+size 6163579855

LIBERO/config.json ADDED Viewed

	@@ -0,0 +1,114 @@

+{
+  "run_id": "LIBERO",
+  "run_root_dir": "checkpoints",
+  "seed": 42,
+  "trackers": [
+    "json"
+  ],
+  "is_debug": false,
+  "framework": {
+    "name": "VLA_JEPA",
+    "qwenvl": {
+      "base_vlm": "/home/dataset-local/models/Qwen3-VL-2B-Instruct",
+      "attn_implementation": "flash_attention_2",
+      "vl_hidden_dim": 2048
+    },
+    "action_model": {
+      "action_model_type": "DiT-B",
+      "action_hidden_dim": 1024,
+      "hidden_size": 1024,
+      "add_pos_embed": true,
+      "max_seq_len": 1024,
+      "action_dim": 7,
+      "state_dim": 8,
+      "future_action_window_size": 6,
+      "action_horizon": 7,
+      "past_action_window_size": 0,
+      "repeated_diffusion_steps": 8,
+      "noise_beta_alpha": 1.5,
+      "noise_beta_beta": 1.0,
+      "noise_s": 0.999,
+      "num_timestep_buckets": 1000,
+      "num_inference_timesteps": 4,
+      "num_target_vision_tokens": 32,
+      "diffusion_model_cfg": {
+        "cross_attention_dim": 2048,
+        "dropout": 0.2,
+        "final_dropout": true,
+        "interleave_self_attention": true,
+        "norm_type": "ada_norm",
+        "num_layers": 16,
+        "output_dim": 1024,
+        "positional_embeddings": null
+      }
+    },
+    "vj2_model": {
+      "base_encoder": "/home/dataset-local/models/vjepa2-vitl-fpc64-256",
+      "depth": 12,
+      "num_heads": 8,
+      "special_action_token": "<|action_{}|>",
+      "num_action_tokens_per_timestep": 8,
+      "embodied_action_token": "<|embodied_action|>",
+      "num_embodied_action_tokens_per_instruction": 32,
+      "num_frames": 8
+    },
+    "reduce_in_full_precision": true
+  },
+  "datasets": {
+    "vla_data": {
+      "dataset_py": "lerobot_datasets",
+      "data_root_dir": "/home/dataset-local/datasets/LeRobot/LEROBOT_LIBERO_DATA",
+      "data_mix": "libero_all",
+      "action_type": "delta_qpos",
+      "CoT_prompt": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.",
+      "resolution_size": 224,
+      "per_device_batch_size": 32,
+      "video_resolution_size": 256,
+      "load_all_data_for_training": true,
+      "with_state": true
+    }
+  },
+  "trainer": {
+    "epochs": 100,
+    "max_train_steps": 30000,
+    "num_warmup_steps": 5000,
+    "save_interval": 10000,
+    "eval_interval": 100,
+    "learning_rate": {
+      "base": 3e-05,
+      "qwen_vl_interface": 1e-05,
+      "action_model": 0.0001
+    },
+    "lr_scheduler_type": "cosine_with_min_lr",
+    "scheduler_specific_kwargs": {
+      "min_lr": 1e-06
+    },
+    "freeze_modules": "",
+    "loss_scale": {
+      "vla": 1.0,
+      "vlm": 0.1
+    },
+    "max_grad_norm": 1.0,
+    "warmup_ratio": 0.1,
+    "weight_decay": 0.0,
+    "logging_frequency": 10,
+    "gradient_clipping": 1.0,
+    "gradient_accumulation_steps": 1,
+    "pretrained_checkpoint": "/home/dataset-local/VLA_JEPA/checkpoints/pretrain/VLA-JEPA-pretrain.pt",
+    "optimizer": {
+      "name": "AdamW",
+      "betas": [
+        0.9,
+        0.95
+      ],
+      "eps": 1e-08,
+      "weight_decay": 1e-08
+    },
+    "is_resume": false,
+    "resume_epoch": null,
+    "resume_step": null,
+    "enable_gradient_checkpointing": true,
+    "enable_mixed_precision_training": true
+  },
+  "output_dir": "checkpoints/LIBERO"
+}

LIBERO/config.yaml ADDED Viewed

	@@ -0,0 +1,99 @@

+run_id: LIBERO
+run_root_dir: checkpoints
+seed: 42
+trackers:
+- json
+is_debug: false
+framework:
+  name: VLA_JEPA
+  qwenvl:
+    base_vlm: /home/dataset-local/models/Qwen3-VL-2B-Instruct
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048
+  action_model:
+    action_model_type: DiT-B
+    action_hidden_dim: 1024
+    hidden_size: 1024
+    add_pos_embed: true
+    max_seq_len: 1024
+    action_dim: 7
+    state_dim: 8
+    future_action_window_size: 6
+    action_horizon: 7
+    past_action_window_size: 0
+    repeated_diffusion_steps: 8
+    noise_beta_alpha: 1.5
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024
+      positional_embeddings: null
+  vj2_model:
+    base_encoder: /home/dataset-local/models/vjepa2-vitl-fpc64-256
+    depth: 12
+    num_heads: 8
+    special_action_token: <|action_{}|>
+    num_action_tokens_per_timestep: 8
+    embodied_action_token: <|embodied_action|>
+    num_embodied_action_tokens_per_instruction: 32
+    num_frames: 8
+  reduce_in_full_precision: true
+datasets:
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: /home/dataset-local/datasets/LeRobot/LEROBOT_LIBERO_DATA
+    data_mix: libero_all
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. Infer the temporal dynamics from frames
+      {actions} and produce the corresponding policy actions {e_actions}.
+    resolution_size: 224
+    per_device_batch_size: 32
+    video_resolution_size: 256
+    load_all_data_for_training: true
+    with_state: true
+trainer:
+  epochs: 100
+  max_train_steps: 30000
+  num_warmup_steps: 5000
+  save_interval: 10000
+  eval_interval: 100
+  learning_rate:
+    base: 3.0e-05
+    qwen_vl_interface: 1.0e-05
+    action_model: 0.0001
+  lr_scheduler_type: cosine_with_min_lr
+  scheduler_specific_kwargs:
+    min_lr: 1.0e-06
+  freeze_modules: ''
+  loss_scale:
+    vla: 1.0
+    vlm: 0.1
+  max_grad_norm: 1.0
+  warmup_ratio: 0.1
+  weight_decay: 0.0
+  logging_frequency: 10
+  gradient_clipping: 1.0
+  gradient_accumulation_steps: 1
+  pretrained_checkpoint: /home/dataset-local/VLA_JEPA/checkpoints/pretrain/VLA-JEPA-pretrain.pt
+  optimizer:
+    name: AdamW
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08
+  is_resume: false
+  resume_epoch: null
+  resume_step: null
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+output_dir: checkpoints/LIBERO

LIBERO/dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,133 @@

+{
+  "franka": {
+    "action": {
+      "mean": [
+        0.07237596483901143,
+        0.08987006871029735,
+        -0.10144743137061596,
+        -0.00045383188989944756,
+        0.006273590726777911,
+        -0.003878799732774496,
+        0.524486355483532
+      ],
+      "std": [
+        0.3498823308902479,
+        0.37794140366375184,
+        0.460084266976933,
+        0.0403885784928603,
+        0.06616144248501059,
+        0.07763074391911857,
+        0.4994683356809767
+      ],
+      "max": [
+        0.9375,
+        0.9375,
+        0.9375,
+        0.3557142913341522,
+        0.375,
+        0.375,
+        1.0
+      ],
+      "min": [
+        -0.9375,
+        -0.9375,
+        -0.9375,
+        -0.2582142949104309,
+        -0.375,
+        -0.3675000071525574,
+        0.0
+      ],
+      "q01": [
+        -0.8785714507102966,
+        -0.8758928775787354,
+        -0.9375,
+        -0.1510714292526245,
+        -0.20678570866584778,
+        -0.2742857038974762,
+        0.0
+      ],
+      "q99": [
+        0.9375,
+        0.9107142686843872,
+        0.9375,
+        0.20357142388820648,
+        0.26357144117355347,
+        0.375,
+        1.0
+      ],
+      "mask": [
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        false
+      ]
+    },
+    "state": {
+      "mean": [
+        -0.04889854742214084,
+        0.03689368185587227,
+        0.7890402488410473,
+        2.9771945476531982,
+        -0.1417286954820156,
+        -0.11769362539052963,
+        0.026436020154505968,
+        -0.02665513101965189
+      ],
+      "std": [
+        0.10639013941746686,
+        0.15115733130675715,
+        0.38406895599530033,
+        0.3530238395244304,
+        0.8227341427331599,
+        0.32357567121520087,
+        0.014583991652936385,
+        0.014467005007200339
+      ],
+      "max": [
+        0.21031762659549713,
+        0.39128610491752625,
+        1.3660105466842651,
+        3.6714255809783936,
+        3.560650587081909,
+        1.386339545249939,
+        0.04233968257904053,
+        0.0013633022317662835
+      ],
+      "min": [
+        -0.4828203022480011,
+        -0.3255046010017395,
+        0.008128180168569088,
+        0.35277295112609863,
+        -3.641430377960205,
+        -1.842738389968872,
+        -0.0013586411951109767,
+        -0.042040832340717316
+      ],
+      "q01": [
+        -0.4240104854106903,
+        -0.28383004665374756,
+        0.009925739839673042,
+        1.3085840940475464,
+        -2.8866775035858154,
+        -1.159900426864624,
+        0.001503719249740243,
+        -0.040336400270462036
+      ],
+      "q99": [
+        0.15302616357803345,
+        0.362916499376297,
+        1.2910678386688232,
+        3.3035426139831543,
+        2.7496531009674072,
+        0.6893712878227234,
+        0.04061093553900719,
+        -0.0015016930410638452
+      ]
+    },
+    "num_transitions": 272104,
+    "num_trajectories": 1693
+  }
+}

LIBERO/summary.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+{"steps": 10000}
+{"steps": 20000}
+{"steps": 30000}

Pretrain/checkpoints/VLA-JEPA-pretrain.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd929c79d9bbd0bda56c0b952c7acb470d93c6241a519013fe5248c3f3ea5fab
+size 6163578232

Pretrain/config.json ADDED Viewed

	@@ -0,0 +1,125 @@

+{
+  "run_id": "pretrain",
+  "run_root_dir": "checkpoints",
+  "seed": 42,
+  "trackers": [
+    "json"
+  ],
+  "is_debug": false,
+  "framework": {
+    "name": "VLA_JEPA",
+    "qwenvl": {
+      "base_vlm": "/home/dataset-assist-0/algorithm/ginwind/models/Qwen3-VL-2B-Instruct",
+      "attn_implementation": "flash_attention_2",
+      "vl_hidden_dim": 2048
+    },
+    "action_model": {
+      "action_model_type": "DiT-B",
+      "action_hidden_dim": 1024,
+      "hidden_size": 1024,
+      "add_pos_embed": true,
+      "max_seq_len": 1024,
+      "action_dim": 7,
+      "state_dim": 8,
+      "future_action_window_size": 6,
+      "action_horizon": 7,
+      "past_action_window_size": 0,
+      "repeated_diffusion_steps": 8,
+      "noise_beta_alpha": 1.5,
+      "noise_beta_beta": 1.0,
+      "noise_s": 0.999,
+      "num_timestep_buckets": 1000,
+      "num_inference_timesteps": 4,
+      "num_target_vision_tokens": 32,
+      "diffusion_model_cfg": {
+        "cross_attention_dim": 2048,
+        "dropout": 0.2,
+        "final_dropout": true,
+        "interleave_self_attention": true,
+        "norm_type": "ada_norm",
+        "num_layers": 16,
+        "output_dim": 1024,
+        "positional_embeddings": null
+      }
+    },
+    "vj2_model": {
+      "base_encoder": "/home/dataset-assist-0/algorithm/ginwind/models/vjepa2-vitl-fpc64-256",
+      "depth": 12,
+      "num_heads": 8,
+      "special_action_token": "<|action_{}|>",
+      "num_action_tokens_per_timestep": 8,
+      "embodied_action_token": "<|embodied_action|>",
+      "num_embodied_action_tokens_per_instruction": 32,
+      "num_frames": 8
+    },
+    "reduce_in_full_precision": true
+  },
+  "datasets": {
+    "vla_data": {
+      "dataset_py": "lerobot_datasets",
+      "data_root_dir": "/home/dataset-local/datasets/LeRobot/droid",
+      "data_mix": "droid",
+      "action_type": "delta_qpos",
+      "CoT_prompt": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.",
+      "resolution_size": 224,
+      "video_resolution_size": 256,
+      "per_device_batch_size": 16,
+      "load_all_data_for_training": true,
+      "with_state": false
+    },
+    "video_data": {
+      "dataset_py": "video_datasets",
+      "video_dir": "/home/dataset-local/datasets/ssv2/20bn-something-something-v2",
+      "text_file": "/home/dataset-local/datasets/ssv2/test-answers.csv",
+      "CoT_prompt": "Your task is {instruction}. Infer the temporal dynamics of future frames {actions}.",
+      "extensions": [
+        "webm"
+      ],
+      "resolution_size": 224,
+      "video_resolution_size": 256,
+      "per_device_batch_size": 16
+    }
+  },
+  "trainer": {
+    "epochs": 100,
+    "max_train_steps": 50000,
+    "num_warmup_steps": 5000,
+    "save_interval": 10000,
+    "eval_interval": 100,
+    "learning_rate": {
+      "base": 3e-05,
+      "qwen_vl_interface": 1e-05,
+      "action_model": 0.0001
+    },
+    "lr_scheduler_type": "cosine_with_min_lr",
+    "scheduler_specific_kwargs": {
+      "min_lr": 1e-06
+    },
+    "freeze_modules": "",
+    "loss_scale": {
+      "vla": 1.0,
+      "vlm": 0.1
+    },
+    "max_grad_norm": 1.0,
+    "warmup_ratio": 0.1,
+    "weight_decay": 0.0,
+    "logging_frequency": 10,
+    "gradient_clipping": 1.0,
+    "gradient_accumulation_steps": 1,
+    "optimizer": {
+      "name": "AdamW",
+      "betas": [
+        0.9,
+        0.95
+      ],
+      "eps": 1e-08,
+      "weight_decay": 1e-08
+    },
+    "is_resume": false,
+    "resume_epoch": null,
+    "resume_step": null,
+    "enable_gradient_checkpointing": true,
+    "enable_mixed_precision_training": true
+  },
+  "output_dir": "checkpoints/pretrain"
+}

Pretrain/config.yaml ADDED Viewed

	@@ -0,0 +1,109 @@

+run_id: pretrain
+run_root_dir: checkpoints
+seed: 42
+trackers:
+- json
+is_debug: false
+framework:
+  name: VLA_JEPA
+  qwenvl:
+    base_vlm: /home/dataset-assist-0/algorithm/ginwind/models/Qwen3-VL-2B-Instruct
+    attn_implementation: flash_attention_2
+    vl_hidden_dim: 2048
+  action_model:
+    action_model_type: DiT-B
+    action_hidden_dim: 1024
+    hidden_size: 1024
+    add_pos_embed: true
+    max_seq_len: 1024
+    action_dim: 7
+    state_dim: 8
+    future_action_window_size: 6
+    action_horizon: 7
+    past_action_window_size: 0
+    repeated_diffusion_steps: 8
+    noise_beta_alpha: 1.5
+    noise_beta_beta: 1.0
+    noise_s: 0.999
+    num_timestep_buckets: 1000
+    num_inference_timesteps: 4
+    num_target_vision_tokens: 32
+    diffusion_model_cfg:
+      cross_attention_dim: 2048
+      dropout: 0.2
+      final_dropout: true
+      interleave_self_attention: true
+      norm_type: ada_norm
+      num_layers: 16
+      output_dim: 1024
+      positional_embeddings: null
+  vj2_model:
+    base_encoder: /home/dataset-assist-0/algorithm/ginwind/models/vjepa2-vitl-fpc64-256
+    depth: 12
+    num_heads: 8
+    special_action_token: <|action_{}|>
+    num_action_tokens_per_timestep: 8
+    embodied_action_token: <|embodied_action|>
+    num_embodied_action_tokens_per_instruction: 32
+    num_frames: 8
+  reduce_in_full_precision: true
+datasets:
+  vla_data:
+    dataset_py: lerobot_datasets
+    data_root_dir: /home/dataset-local/datasets/LeRobot/droid
+    data_mix: droid
+    action_type: delta_qpos
+    CoT_prompt: Your task is {instruction}. Infer the temporal dynamics from frames
+      {actions} and produce the corresponding policy actions {e_actions}.
+    resolution_size: 224
+    video_resolution_size: 256
+    per_device_batch_size: 16
+    load_all_data_for_training: true
+    with_state: false
+  video_data:
+    dataset_py: video_datasets
+    video_dir: /home/dataset-local/datasets/ssv2/20bn-something-something-v2
+    text_file: /home/dataset-local/datasets/ssv2/test-answers.csv
+    CoT_prompt: Your task is {instruction}. Infer the temporal dynamics of future
+      frames {actions}.
+    extensions:
+    - webm
+    resolution_size: 224
+    video_resolution_size: 256
+    per_device_batch_size: 16
+trainer:
+  epochs: 100
+  max_train_steps: 50000
+  num_warmup_steps: 5000
+  save_interval: 10000
+  eval_interval: 100
+  learning_rate:
+    base: 3.0e-05
+    qwen_vl_interface: 1.0e-05
+    action_model: 0.0001
+  lr_scheduler_type: cosine_with_min_lr
+  scheduler_specific_kwargs:
+    min_lr: 1.0e-06
+  freeze_modules: ''
+  loss_scale:
+    vla: 1.0
+    vlm: 0.1
+  max_grad_norm: 1.0
+  warmup_ratio: 0.1
+  weight_decay: 0.0
+  logging_frequency: 10
+  gradient_clipping: 1.0
+  gradient_accumulation_steps: 1
+  optimizer:
+    name: AdamW
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 1.0e-08
+  is_resume: false
+  resume_epoch: null
+  resume_step: null
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+output_dir: checkpoints/pretrain

Pretrain/dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,133 @@

+{
+  "franka": {
+    "action": {
+      "mean": [
+        0.02742478996515274,
+        -0.0026808488182723522,
+        0.015953252092003822,
+        0.003548798616975546,
+        -0.030532976612448692,
+        -0.006683542393147945,
+        0.5860324501991272
+      ],
+      "std": [
+        0.25387799739837646,
+        0.1842699646949768,
+        0.22532877326011658,
+        0.2175685167312622,
+        0.22572855651378632,
+        0.28678369522094727,
+        0.4287617802619934
+      ],
+      "max": [
+        0.9999998211860657,
+        0.999991774559021,
+        0.9999973177909851,
+        0.9999874830245972,
+        0.9999954104423523,
+        0.9999998807907104,
+        1.0
+      ],
+      "min": [
+        -0.9999999403953552,
+        -0.9999951124191284,
+        -0.9999960660934448,
+        -0.9999980330467224,
+        -0.9999982118606567,
+        -0.9999998807907104,
+        0.0
+      ],
+      "q01": [
+        -0.7776405811309814,
+        -0.5803528428077698,
+        -0.5795133113861084,
+        -0.6464062333106995,
+        -0.7041175365447998,
+        -0.8895133137702942,
+        0.0
+      ],
+      "q99": [
+        0.7597945332527161,
+        0.5726332068443298,
+        0.7351094484329224,
+        0.6705538630485535,
+        0.6465045213699341,
+        0.8897575736045837,
+        1.0
+      ],
+      "mask": [
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        false
+      ]
+    },
+    "state": {
+      "mean": [
+        0.5353796482086182,
+        0.0015366104198619723,
+        0.3146370053291321,
+        0.3269118070602417,
+        -0.08703453093767166,
+        -0.04832201823592186,
+        0.0,
+        0.3697895407676697
+      ],
+      "std": [
+        0.11646675318479538,
+        0.17390793561935425,
+        0.1611657738685608,
+        2.7484281063079834,
+        0.3465787172317505,
+        0.7527366280555725,
+        0.0,
+        0.4125189185142517
+      ],
+      "max": [
+        0.8575563430786133,
+        0.8407337069511414,
+        1.0439032316207886,
+        3.1415927410125732,
+        1.5705928802490234,
+        3.1415927410125732,
+        0.0,
+        1.0
+      ],
+      "min": [
+        -0.2824079692363739,
+        -0.8556680083274841,
+        -0.24001094698905945,
+        -3.141592502593994,
+        -1.5703768730163574,
+        -3.141592025756836,
+        0.0,
+        0.0
+      ],
+      "q01": [
+        0.2667418420314789,
+        -0.4394981265068054,
+        -0.04718969017267227,
+        -3.1373939514160156,
+        -1.2159388065338135,
+        -2.173978328704834,
+        0.0,
+        0.0
+      ],
+      "q99": [
+        0.7826385498046875,
+        0.4409940540790558,
+        0.7858326435089111,
+        3.1374692916870117,
+        0.8910249471664429,
+        2.0517754554748535,
+        0.0,
+        0.9911894202232361
+      ]
+    },
+    "num_transitions": 23834441,
+    "num_trajectories": 92233
+  }
+}

Pretrain/summary.jsonl ADDED Viewed

	@@ -0,0 +1,5 @@

+{"steps": 10000}
+{"steps": 20000}
+{"steps": 30000}
+{"steps": 40000}
+{"steps": 50000}