Add model weights

Files changed (5) hide show

README.md +14 -0
checkpoints/pretrain_ckpt.pt +3 -0
config.json +59 -0
config.yaml +55 -0
dataset_statistics.json +136 -0

README.md CHANGED Viewed

@@ -1,3 +1,17 @@
 ---
 license: mit
 ---

 ---
 license: mit
 ---
+# MLA: A Multisensory Language-Action Model for Multimodal Understanding and Forecasting in Robotic Manipulation
+![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
+![PyTorch](https://img.shields.io/badge/PyTorch-%23EE4C2C.svg?style=for-the-badge&logo=PyTorch&logoColor=white)
+[🌐**Project Page**](https://sites.google.com/view/open-mla) | [✍️**Paper (arXiv)**](https://arxiv.org/abs/2509.26642) | [🎥**Demo**](https://sites.google.com/view/open-mla)
+Zhuoyang Liu*, Jiaming Liu*, Jiadong Xu, Nuowei Han, Chenyang Gu, Hao Chen, Kaichen Zhou, Renrui Zhang, Kai Chin Hsieh, Kun Wu, Zhengping Che, Jian Tang, Shanghang Zhang
+We introduce a multisensory language–action (MLA) model that collaboratively perceives heterogeneous sensory modalities and predicts future multisensory objectives to facilitate physical world modeling.
+Specifically, to enhance perceptual representations, we propose an encoder-free multimodal alignment scheme that innovatively repurposes the large language model itself as a perception module, directly interpreting multimodal cues by aligning 2D images, 3D point clouds, and tactile tokens through positional correspondence.
+To further enhance MLA’s understanding of physical dynamics, we design a future multisensory generation post-training strategy that enables MLA to reason about semantic, geometric, and interaction information, providing more robust conditions for action generation.

checkpoints/pretrain_ckpt.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79b761a59bacc7ab068f2477577397654519c40a3950de107862381e7e41ec82
+size 27491564615

config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "action_dim": 7,
+  "action_tokenizer_exist": false,
+  "class_dropout_prob": 0.0,
+  "data_root_dir": "/media/liuzhuoyang/data/rtx/rlds/rtx_0812",
+  "future_action_window_size": 0,
+  "hf_token": ".hf_token",
+  "image_aug": false,
+  "is_resume": false,
+  "llm_vision_layers": 8,
+  "load_all_data_for_training": true,
+  "past_action_window_size": 0,
+  "pretrained_checkpoint": "/media/huggingface/hub/models--openvla--openvla-7b/snapshots/31f090d05236101ebfc381b61c674dd4746d4ce0",
+  "recon_image": false,
+  "recon_pointcloud": false,
+  "repeated_diffusion_steps": 4,
+  "resume_epoch": null,
+  "resume_step": null,
+  "run_id": "exp_rtx_0812_Pretrainopenvla_FreezeVisfalse_Window0_Difftrue_Recfalse2d_Contrastive_Vislayer8_1024_0403_0818",
+  "run_id_note": null,
+  "run_root_dir": "/media/liuzhuoyang/new_vla/Rec_Diff_beta/pretrain-exp",
+  "save_interval": 1,
+  "seed": 42,
+  "trackers": [
+    "jsonl",
+    "wandb"
+  ],
+  "use_diff": true,
+  "use_ema": false,
+  "use_pointcloud": false,
+  "use_reconstruction": false,
+  "use_roi": false,
+  "vla": {
+    "base_vlm": "prism-dinosiglip-224px+7b",
+    "data_mix": "rtx_dataset",
+    "enable_gradient_checkpointing": true,
+    "enable_mixed_precision_training": true,
+    "epochs": 10,
+    "expected_world_size": 32,
+    "freeze_llm_backbone": false,
+    "freeze_vision_tower": false,
+    "global_batch_size": 256,
+    "learning_rate": 2e-05,
+    "lr_scheduler_type": "constant",
+    "max_grad_norm": 1.0,
+    "max_steps": null,
+    "per_device_batch_size": 8,
+    "reduce_in_full_precision": true,
+    "shuffle_buffer_size": 10000,
+    "train_strategy": "fsdp-full-shard",
+    "type": "prism-dinosiglip-224px+oxe+diffusion",
+    "unfreeze_last_llm_layer": false,
+    "vla_id": "prism-dinosiglip-224px+oxe+diffusion",
+    "warmup_ratio": 0.0,
+    "weight_decay": 0.0
+  },
+  "wandb_entity": "liumail2023-peking-university",
+  "wandb_project": "one_model_vla_pretrain"
+}

config.yaml ADDED Viewed

	@@ -0,0 +1,55 @@

+action_dim: 7
+action_tokenizer_exist: false
+class_dropout_prob: 0.0
+data_root_dir: /media/liuzhuoyang/data/rtx/rlds/rtx_0812
+future_action_window_size: 0
+hf_token: .hf_token
+image_aug: false
+is_resume: false
+llm_vision_layers: 8
+load_all_data_for_training: true
+past_action_window_size: 0
+pretrained_checkpoint: /media/huggingface/hub/models--openvla--openvla-7b/snapshots/31f090d05236101ebfc381b61c674dd4746d4ce0
+recon_image: false
+recon_pointcloud: false
+repeated_diffusion_steps: 4
+resume_epoch: null
+resume_step: null
+run_id: exp_rtx_0812_Pretrainopenvla_FreezeVisfalse_Window0_Difftrue_Recfalse2d_Contrastive_Vislayer8_1024_0403_0818
+run_id_note: null
+run_root_dir: /media/liuzhuoyang/new_vla/Rec_Diff_beta/pretrain-exp
+save_interval: 1
+seed: 42
+trackers:
+- jsonl
+- wandb
+use_diff: true
+use_ema: false
+use_pointcloud: false
+use_reconstruction: false
+use_roi: false
+vla:
+  base_vlm: prism-dinosiglip-224px+7b
+  data_mix: rtx_dataset
+  enable_gradient_checkpointing: true
+  enable_mixed_precision_training: true
+  epochs: 10
+  expected_world_size: 32
+  freeze_llm_backbone: false
+  freeze_vision_tower: false
+  global_batch_size: 256
+  learning_rate: 2.0e-05
+  lr_scheduler_type: constant
+  max_grad_norm: 1.0
+  max_steps: null
+  per_device_batch_size: 8
+  reduce_in_full_precision: true
+  shuffle_buffer_size: 10000
+  train_strategy: fsdp-full-shard
+  type: prism-dinosiglip-224px+oxe+diffusion
+  unfreeze_last_llm_layer: false
+  vla_id: prism-dinosiglip-224px+oxe+diffusion
+  warmup_ratio: 0.0
+  weight_decay: 0.0
+wandb_entity: liumail2023-peking-university
+wandb_project: one_model_vla_pretrain

dataset_statistics.json ADDED Viewed

	@@ -0,0 +1,136 @@

+{
+  "rtx_dataset": {
+    "action": {
+      "mean": [
+        1.739593608363066e-05,
+        0.004589446354657412,
+        0.002011711709201336,
+        -0.0006840903661213815,
+        0.005076114553958178,
+        -0.005238891579210758,
+        0.4615870714187622
+      ],
+      "std": [
+        0.3626190721988678,
+        0.4572296738624573,
+        0.33315929770469666,
+        0.8355317115783691,
+        0.054371483623981476,
+        0.5975595116615295,
+        0.48040977120399475
+      ],
+      "max": [
+        96.34329223632812,
+        129.48878479003906,
+        158.0499267578125,
+        6.2831830978393555,
+        1.8618112802505493,
+        6.2831854820251465,
+        1.0
+      ],
+      "min": [
+        -157.32989501953125,
+        -161.42481994628906,
+        -123.72489929199219,
+        -6.283183574676514,
+        -1.8618112802505493,
+        -6.2831854820251465,
+        0.0
+      ],
+      "q01": [
+        -0.09744655042886735,
+        -0.1314285695552826,
+        -0.16288121417164803,
+        -0.6645961225032807,
+        -0.09883208796381951,
+        -0.2489599719643593,
+        0.0
+      ],
+      "q99": [
+        0.11538894884288375,
+        0.27978515625,
+        0.16587213799357436,
+        0.678488978743613,
+        0.2778055757284177,
+        0.3031894564628601,
+        1.0
+      ],
+      "mask": [
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        false
+      ]
+    },
+    "proprio": {
+      "mean": [
+        0.7892308831214905,
+        0.04583429917693138,
+        1.1704102754592896,
+        -0.001955259358510375,
+        0.003115077270194888,
+        -0.22162029147148132,
+        0.4615870714187622
+      ],
+      "std": [
+        18.893327713012695,
+        6.578038215637207,
+        30.723447799682617,
+        2.2716941833496094,
+        0.39969319105148315,
+        1.445878505706787,
+        0.48040977120399475
+      ],
+      "max": [
+        938.3041381835938,
+        623.6038818359375,
+        1441.9671630859375,
+        6.2831830978393555,
+        1.8151572942733765,
+        3.1415927410125732,
+        1.0
+      ],
+      "min": [
+        -3.4371097087860107,
+        -531.4224243164062,
+        -132.0138397216797,
+        -3.1415927410125732,
+        -2.500656843185425,
+        -3.169050455093384,
+        -8.864999836077914e-05
+      ],
+      "q01": [
+        -0.28570315092802046,
+        -0.3549496218562126,
+        -0.06616472341120243,
+        -3.1397440433502197,
+        -1.168001264333725,
+        -3.1413214206695557,
+        0.0
+      ],
+      "q99": [
+        0.8912830322980883,
+        0.8644397854804993,
+        1.0340391397476196,
+        4.148796057701141,
+        1.286495512723926,
+        3.141319990158081,
+        1.0
+      ],
+      "mask": [
+        true,
+        true,
+        true,
+        true,
+        true,
+        true,
+        false
+      ]
+    },
+    "num_transitions": 36346806,
+    "num_trajectories": 574875
+  }
+}