lzy committed on
Commit ·
460896b
1
Parent(s): 5ace1d0
Add model weights
Browse files
- README.md +14 -0
- checkpoints/pretrain_ckpt.pt +3 -0
- config.json +59 -0
- config.yaml +55 -0
- dataset_statistics.json +136 -0
README.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
| 4 |
+
|
| 5 |
+
# MLA: A Multisensory Language-Action Model for Multimodal Understanding and Forecasting in Robotic Manipulation
|
| 6 |
+
|
| 7 |
+

|
| 8 |
+

|
| 9 |
+
|
| 10 |
+
[🌐**Project Page**](https://sites.google.com/view/open-mla) | [✍️**Paper (arXiv)**](https://arxiv.org/abs/2509.26642) | [🎥**Demo**](https://sites.google.com/view/open-mla)
|
| 11 |
+
|
| 12 |
+
Zhuoyang Liu*, Jiaming Liu*, Jiadong Xu, Nuowei Han, Chenyang Gu, Hao Chen, Kaichen Zhou, Renrui Zhang, Kai Chin Hsieh, Kun Wu, Zhengping Che, Jian Tang, Shanghang Zhang
|
| 13 |
+
|
| 14 |
+
We introduce a multisensory language–action (MLA) model that collaboratively perceives heterogeneous sensory modalities and predicts future multisensory objectives to facilitate physical world modeling.
|
| 15 |
+
Specifically, to enhance perceptual representations, we propose an encoder-free multimodal alignment scheme that innovatively repurposes the large language model itself as a perception module, directly interpreting multimodal cues by aligning 2D images, 3D point clouds, and tactile tokens through positional correspondence.
|
| 16 |
+
To further enhance MLA’s understanding of physical dynamics, we design a future multisensory generation post-training strategy that enables MLA to reason about semantic, geometric, and interaction information, providing more robust conditions for action generation.
|
| 17 |
+
|
checkpoints/pretrain_ckpt.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79b761a59bacc7ab068f2477577397654519c40a3950de107862381e7e41ec82
|
| 3 |
+
size 27491564615
|
config.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"action_dim": 7,
|
| 3 |
+
"action_tokenizer_exist": false,
|
| 4 |
+
"class_dropout_prob": 0.0,
|
| 5 |
+
"data_root_dir": "/media/liuzhuoyang/data/rtx/rlds/rtx_0812",
|
| 6 |
+
"future_action_window_size": 0,
|
| 7 |
+
"hf_token": ".hf_token",
|
| 8 |
+
"image_aug": false,
|
| 9 |
+
"is_resume": false,
|
| 10 |
+
"llm_vision_layers": 8,
|
| 11 |
+
"load_all_data_for_training": true,
|
| 12 |
+
"past_action_window_size": 0,
|
| 13 |
+
"pretrained_checkpoint": "/media/huggingface/hub/models--openvla--openvla-7b/snapshots/31f090d05236101ebfc381b61c674dd4746d4ce0",
|
| 14 |
+
"recon_image": false,
|
| 15 |
+
"recon_pointcloud": false,
|
| 16 |
+
"repeated_diffusion_steps": 4,
|
| 17 |
+
"resume_epoch": null,
|
| 18 |
+
"resume_step": null,
|
| 19 |
+
"run_id": "exp_rtx_0812_Pretrainopenvla_FreezeVisfalse_Window0_Difftrue_Recfalse2d_Contrastive_Vislayer8_1024_0403_0818",
|
| 20 |
+
"run_id_note": null,
|
| 21 |
+
"run_root_dir": "/media/liuzhuoyang/new_vla/Rec_Diff_beta/pretrain-exp",
|
| 22 |
+
"save_interval": 1,
|
| 23 |
+
"seed": 42,
|
| 24 |
+
"trackers": [
|
| 25 |
+
"jsonl",
|
| 26 |
+
"wandb"
|
| 27 |
+
],
|
| 28 |
+
"use_diff": true,
|
| 29 |
+
"use_ema": false,
|
| 30 |
+
"use_pointcloud": false,
|
| 31 |
+
"use_reconstruction": false,
|
| 32 |
+
"use_roi": false,
|
| 33 |
+
"vla": {
|
| 34 |
+
"base_vlm": "prism-dinosiglip-224px+7b",
|
| 35 |
+
"data_mix": "rtx_dataset",
|
| 36 |
+
"enable_gradient_checkpointing": true,
|
| 37 |
+
"enable_mixed_precision_training": true,
|
| 38 |
+
"epochs": 10,
|
| 39 |
+
"expected_world_size": 32,
|
| 40 |
+
"freeze_llm_backbone": false,
|
| 41 |
+
"freeze_vision_tower": false,
|
| 42 |
+
"global_batch_size": 256,
|
| 43 |
+
"learning_rate": 2e-05,
|
| 44 |
+
"lr_scheduler_type": "constant",
|
| 45 |
+
"max_grad_norm": 1.0,
|
| 46 |
+
"max_steps": null,
|
| 47 |
+
"per_device_batch_size": 8,
|
| 48 |
+
"reduce_in_full_precision": true,
|
| 49 |
+
"shuffle_buffer_size": 10000,
|
| 50 |
+
"train_strategy": "fsdp-full-shard",
|
| 51 |
+
"type": "prism-dinosiglip-224px+oxe+diffusion",
|
| 52 |
+
"unfreeze_last_llm_layer": false,
|
| 53 |
+
"vla_id": "prism-dinosiglip-224px+oxe+diffusion",
|
| 54 |
+
"warmup_ratio": 0.0,
|
| 55 |
+
"weight_decay": 0.0
|
| 56 |
+
},
|
| 57 |
+
"wandb_entity": "liumail2023-peking-university",
|
| 58 |
+
"wandb_project": "one_model_vla_pretrain"
|
| 59 |
+
}
|
config.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
action_dim: 7
|
| 2 |
+
action_tokenizer_exist: false
|
| 3 |
+
class_dropout_prob: 0.0
|
| 4 |
+
data_root_dir: /media/liuzhuoyang/data/rtx/rlds/rtx_0812
|
| 5 |
+
future_action_window_size: 0
|
| 6 |
+
hf_token: .hf_token
|
| 7 |
+
image_aug: false
|
| 8 |
+
is_resume: false
|
| 9 |
+
llm_vision_layers: 8
|
| 10 |
+
load_all_data_for_training: true
|
| 11 |
+
past_action_window_size: 0
|
| 12 |
+
pretrained_checkpoint: /media/huggingface/hub/models--openvla--openvla-7b/snapshots/31f090d05236101ebfc381b61c674dd4746d4ce0
|
| 13 |
+
recon_image: false
|
| 14 |
+
recon_pointcloud: false
|
| 15 |
+
repeated_diffusion_steps: 4
|
| 16 |
+
resume_epoch: null
|
| 17 |
+
resume_step: null
|
| 18 |
+
run_id: exp_rtx_0812_Pretrainopenvla_FreezeVisfalse_Window0_Difftrue_Recfalse2d_Contrastive_Vislayer8_1024_0403_0818
|
| 19 |
+
run_id_note: null
|
| 20 |
+
run_root_dir: /media/liuzhuoyang/new_vla/Rec_Diff_beta/pretrain-exp
|
| 21 |
+
save_interval: 1
|
| 22 |
+
seed: 42
|
| 23 |
+
trackers:
|
| 24 |
+
- jsonl
|
| 25 |
+
- wandb
|
| 26 |
+
use_diff: true
|
| 27 |
+
use_ema: false
|
| 28 |
+
use_pointcloud: false
|
| 29 |
+
use_reconstruction: false
|
| 30 |
+
use_roi: false
|
| 31 |
+
vla:
|
| 32 |
+
base_vlm: prism-dinosiglip-224px+7b
|
| 33 |
+
data_mix: rtx_dataset
|
| 34 |
+
enable_gradient_checkpointing: true
|
| 35 |
+
enable_mixed_precision_training: true
|
| 36 |
+
epochs: 10
|
| 37 |
+
expected_world_size: 32
|
| 38 |
+
freeze_llm_backbone: false
|
| 39 |
+
freeze_vision_tower: false
|
| 40 |
+
global_batch_size: 256
|
| 41 |
+
learning_rate: 2.0e-05
|
| 42 |
+
lr_scheduler_type: constant
|
| 43 |
+
max_grad_norm: 1.0
|
| 44 |
+
max_steps: null
|
| 45 |
+
per_device_batch_size: 8
|
| 46 |
+
reduce_in_full_precision: true
|
| 47 |
+
shuffle_buffer_size: 10000
|
| 48 |
+
train_strategy: fsdp-full-shard
|
| 49 |
+
type: prism-dinosiglip-224px+oxe+diffusion
|
| 50 |
+
unfreeze_last_llm_layer: false
|
| 51 |
+
vla_id: prism-dinosiglip-224px+oxe+diffusion
|
| 52 |
+
warmup_ratio: 0.0
|
| 53 |
+
weight_decay: 0.0
|
| 54 |
+
wandb_entity: liumail2023-peking-university
|
| 55 |
+
wandb_project: one_model_vla_pretrain
|
dataset_statistics.json
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"rtx_dataset": {
|
| 3 |
+
"action": {
|
| 4 |
+
"mean": [
|
| 5 |
+
1.739593608363066e-05,
|
| 6 |
+
0.004589446354657412,
|
| 7 |
+
0.002011711709201336,
|
| 8 |
+
-0.0006840903661213815,
|
| 9 |
+
0.005076114553958178,
|
| 10 |
+
-0.005238891579210758,
|
| 11 |
+
0.4615870714187622
|
| 12 |
+
],
|
| 13 |
+
"std": [
|
| 14 |
+
0.3626190721988678,
|
| 15 |
+
0.4572296738624573,
|
| 16 |
+
0.33315929770469666,
|
| 17 |
+
0.8355317115783691,
|
| 18 |
+
0.054371483623981476,
|
| 19 |
+
0.5975595116615295,
|
| 20 |
+
0.48040977120399475
|
| 21 |
+
],
|
| 22 |
+
"max": [
|
| 23 |
+
96.34329223632812,
|
| 24 |
+
129.48878479003906,
|
| 25 |
+
158.0499267578125,
|
| 26 |
+
6.2831830978393555,
|
| 27 |
+
1.8618112802505493,
|
| 28 |
+
6.2831854820251465,
|
| 29 |
+
1.0
|
| 30 |
+
],
|
| 31 |
+
"min": [
|
| 32 |
+
-157.32989501953125,
|
| 33 |
+
-161.42481994628906,
|
| 34 |
+
-123.72489929199219,
|
| 35 |
+
-6.283183574676514,
|
| 36 |
+
-1.8618112802505493,
|
| 37 |
+
-6.2831854820251465,
|
| 38 |
+
0.0
|
| 39 |
+
],
|
| 40 |
+
"q01": [
|
| 41 |
+
-0.09744655042886735,
|
| 42 |
+
-0.1314285695552826,
|
| 43 |
+
-0.16288121417164803,
|
| 44 |
+
-0.6645961225032807,
|
| 45 |
+
-0.09883208796381951,
|
| 46 |
+
-0.2489599719643593,
|
| 47 |
+
0.0
|
| 48 |
+
],
|
| 49 |
+
"q99": [
|
| 50 |
+
0.11538894884288375,
|
| 51 |
+
0.27978515625,
|
| 52 |
+
0.16587213799357436,
|
| 53 |
+
0.678488978743613,
|
| 54 |
+
0.2778055757284177,
|
| 55 |
+
0.3031894564628601,
|
| 56 |
+
1.0
|
| 57 |
+
],
|
| 58 |
+
"mask": [
|
| 59 |
+
true,
|
| 60 |
+
true,
|
| 61 |
+
true,
|
| 62 |
+
true,
|
| 63 |
+
true,
|
| 64 |
+
true,
|
| 65 |
+
false
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
"proprio": {
|
| 69 |
+
"mean": [
|
| 70 |
+
0.7892308831214905,
|
| 71 |
+
0.04583429917693138,
|
| 72 |
+
1.1704102754592896,
|
| 73 |
+
-0.001955259358510375,
|
| 74 |
+
0.003115077270194888,
|
| 75 |
+
-0.22162029147148132,
|
| 76 |
+
0.4615870714187622
|
| 77 |
+
],
|
| 78 |
+
"std": [
|
| 79 |
+
18.893327713012695,
|
| 80 |
+
6.578038215637207,
|
| 81 |
+
30.723447799682617,
|
| 82 |
+
2.2716941833496094,
|
| 83 |
+
0.39969319105148315,
|
| 84 |
+
1.445878505706787,
|
| 85 |
+
0.48040977120399475
|
| 86 |
+
],
|
| 87 |
+
"max": [
|
| 88 |
+
938.3041381835938,
|
| 89 |
+
623.6038818359375,
|
| 90 |
+
1441.9671630859375,
|
| 91 |
+
6.2831830978393555,
|
| 92 |
+
1.8151572942733765,
|
| 93 |
+
3.1415927410125732,
|
| 94 |
+
1.0
|
| 95 |
+
],
|
| 96 |
+
"min": [
|
| 97 |
+
-3.4371097087860107,
|
| 98 |
+
-531.4224243164062,
|
| 99 |
+
-132.0138397216797,
|
| 100 |
+
-3.1415927410125732,
|
| 101 |
+
-2.500656843185425,
|
| 102 |
+
-3.169050455093384,
|
| 103 |
+
-8.864999836077914e-05
|
| 104 |
+
],
|
| 105 |
+
"q01": [
|
| 106 |
+
-0.28570315092802046,
|
| 107 |
+
-0.3549496218562126,
|
| 108 |
+
-0.06616472341120243,
|
| 109 |
+
-3.1397440433502197,
|
| 110 |
+
-1.168001264333725,
|
| 111 |
+
-3.1413214206695557,
|
| 112 |
+
0.0
|
| 113 |
+
],
|
| 114 |
+
"q99": [
|
| 115 |
+
0.8912830322980883,
|
| 116 |
+
0.8644397854804993,
|
| 117 |
+
1.0340391397476196,
|
| 118 |
+
4.148796057701141,
|
| 119 |
+
1.286495512723926,
|
| 120 |
+
3.141319990158081,
|
| 121 |
+
1.0
|
| 122 |
+
],
|
| 123 |
+
"mask": [
|
| 124 |
+
true,
|
| 125 |
+
true,
|
| 126 |
+
true,
|
| 127 |
+
true,
|
| 128 |
+
true,
|
| 129 |
+
true,
|
| 130 |
+
false
|
| 131 |
+
]
|
| 132 |
+
},
|
| 133 |
+
"num_transitions": 36346806,
|
| 134 |
+
"num_trajectories": 574875
|
| 135 |
+
}
|
| 136 |
+
}
|