lzy committed on
Commit
460896b
·
1 Parent(s): 5ace1d0

Add model weights

Browse files
Files changed (5) hide show
  1. README.md +14 -0
  2. checkpoints/pretrain_ckpt.pt +3 -0
  3. config.json +59 -0
  4. config.yaml +55 -0
  5. dataset_statistics.json +136 -0
README.md CHANGED
@@ -1,3 +1,17 @@
1
  ---
2
  license: mit
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
  ---
4
+
5
+ # MLA: A Multisensory Language-Action Model for Multimodal Understanding and Forecasting in Robotic Manipulation
6
+
7
+ ![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
8
+ ![PyTorch](https://img.shields.io/badge/PyTorch-%23EE4C2C.svg?style=for-the-badge&logo=PyTorch&logoColor=white)
9
+
10
+ [🌐**Project Page**](https://sites.google.com/view/open-mla) | [✍️**Paper (arXiv)**](http://arxiv.org/abs/2509.26642) | [🎥**Demo**](https://sites.google.com/view/open-mla)
11
+
12
+ Zhuoyang Liu*, Jiaming Liu*, Jiadong Xu, Nuowei Han, Chenyang Gu, Hao Chen, Kaichen Zhou, Renrui Zhang, Kai Chin Hsieh, Kun Wu, Zhengping Che, Jian Tang, Shanghang Zhang
13
+
14
+ We introduce a multisensory language–action (MLA) model that collaboratively perceives heterogeneous sensory modalities and predicts future multisensory objectives to facilitate physical world modeling.
15
+ Specifically, to enhance perceptual representations, we propose an encoder-free multimodal alignment scheme that innovatively repurposes the large language model itself as a perception module, directly interpreting multimodal cues by aligning 2D images, 3D point clouds, and tactile tokens through positional correspondence.
16
+ To further enhance MLA’s understanding of physical dynamics, we design a future multisensory generation post-training strategy that enables MLA to reason about semantic, geometric, and interaction information, providing more robust conditions for action generation.
17
+
checkpoints/pretrain_ckpt.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b761a59bacc7ab068f2477577397654519c40a3950de107862381e7e41ec82
3
+ size 27491564615
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 7,
3
+ "action_tokenizer_exist": false,
4
+ "class_dropout_prob": 0.0,
5
+ "data_root_dir": "/media/liuzhuoyang/data/rtx/rlds/rtx_0812",
6
+ "future_action_window_size": 0,
7
+ "hf_token": ".hf_token",
8
+ "image_aug": false,
9
+ "is_resume": false,
10
+ "llm_vision_layers": 8,
11
+ "load_all_data_for_training": true,
12
+ "past_action_window_size": 0,
13
+ "pretrained_checkpoint": "/media/huggingface/hub/models--openvla--openvla-7b/snapshots/31f090d05236101ebfc381b61c674dd4746d4ce0",
14
+ "recon_image": false,
15
+ "recon_pointcloud": false,
16
+ "repeated_diffusion_steps": 4,
17
+ "resume_epoch": null,
18
+ "resume_step": null,
19
+ "run_id": "exp_rtx_0812_Pretrainopenvla_FreezeVisfalse_Window0_Difftrue_Recfalse2d_Contrastive_Vislayer8_1024_0403_0818",
20
+ "run_id_note": null,
21
+ "run_root_dir": "/media/liuzhuoyang/new_vla/Rec_Diff_beta/pretrain-exp",
22
+ "save_interval": 1,
23
+ "seed": 42,
24
+ "trackers": [
25
+ "jsonl",
26
+ "wandb"
27
+ ],
28
+ "use_diff": true,
29
+ "use_ema": false,
30
+ "use_pointcloud": false,
31
+ "use_reconstruction": false,
32
+ "use_roi": false,
33
+ "vla": {
34
+ "base_vlm": "prism-dinosiglip-224px+7b",
35
+ "data_mix": "rtx_dataset",
36
+ "enable_gradient_checkpointing": true,
37
+ "enable_mixed_precision_training": true,
38
+ "epochs": 10,
39
+ "expected_world_size": 32,
40
+ "freeze_llm_backbone": false,
41
+ "freeze_vision_tower": false,
42
+ "global_batch_size": 256,
43
+ "learning_rate": 2e-05,
44
+ "lr_scheduler_type": "constant",
45
+ "max_grad_norm": 1.0,
46
+ "max_steps": null,
47
+ "per_device_batch_size": 8,
48
+ "reduce_in_full_precision": true,
49
+ "shuffle_buffer_size": 10000,
50
+ "train_strategy": "fsdp-full-shard",
51
+ "type": "prism-dinosiglip-224px+oxe+diffusion",
52
+ "unfreeze_last_llm_layer": false,
53
+ "vla_id": "prism-dinosiglip-224px+oxe+diffusion",
54
+ "warmup_ratio": 0.0,
55
+ "weight_decay": 0.0
56
+ },
57
+ "wandb_entity": "liumail2023-peking-university",
58
+ "wandb_project": "one_model_vla_pretrain"
59
+ }
config.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ action_dim: 7
2
+ action_tokenizer_exist: false
3
+ class_dropout_prob: 0.0
4
+ data_root_dir: /media/liuzhuoyang/data/rtx/rlds/rtx_0812
5
+ future_action_window_size: 0
6
+ hf_token: .hf_token
7
+ image_aug: false
8
+ is_resume: false
9
+ llm_vision_layers: 8
10
+ load_all_data_for_training: true
11
+ past_action_window_size: 0
12
+ pretrained_checkpoint: /media/huggingface/hub/models--openvla--openvla-7b/snapshots/31f090d05236101ebfc381b61c674dd4746d4ce0
13
+ recon_image: false
14
+ recon_pointcloud: false
15
+ repeated_diffusion_steps: 4
16
+ resume_epoch: null
17
+ resume_step: null
18
+ run_id: exp_rtx_0812_Pretrainopenvla_FreezeVisfalse_Window0_Difftrue_Recfalse2d_Contrastive_Vislayer8_1024_0403_0818
19
+ run_id_note: null
20
+ run_root_dir: /media/liuzhuoyang/new_vla/Rec_Diff_beta/pretrain-exp
21
+ save_interval: 1
22
+ seed: 42
23
+ trackers:
24
+ - jsonl
25
+ - wandb
26
+ use_diff: true
27
+ use_ema: false
28
+ use_pointcloud: false
29
+ use_reconstruction: false
30
+ use_roi: false
31
+ vla:
32
+ base_vlm: prism-dinosiglip-224px+7b
33
+ data_mix: rtx_dataset
34
+ enable_gradient_checkpointing: true
35
+ enable_mixed_precision_training: true
36
+ epochs: 10
37
+ expected_world_size: 32
38
+ freeze_llm_backbone: false
39
+ freeze_vision_tower: false
40
+ global_batch_size: 256
41
+ learning_rate: 2.0e-05
42
+ lr_scheduler_type: constant
43
+ max_grad_norm: 1.0
44
+ max_steps: null
45
+ per_device_batch_size: 8
46
+ reduce_in_full_precision: true
47
+ shuffle_buffer_size: 10000
48
+ train_strategy: fsdp-full-shard
49
+ type: prism-dinosiglip-224px+oxe+diffusion
50
+ unfreeze_last_llm_layer: false
51
+ vla_id: prism-dinosiglip-224px+oxe+diffusion
52
+ warmup_ratio: 0.0
53
+ weight_decay: 0.0
54
+ wandb_entity: liumail2023-peking-university
55
+ wandb_project: one_model_vla_pretrain
dataset_statistics.json ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "rtx_dataset": {
3
+ "action": {
4
+ "mean": [
5
+ 1.739593608363066e-05,
6
+ 0.004589446354657412,
7
+ 0.002011711709201336,
8
+ -0.0006840903661213815,
9
+ 0.005076114553958178,
10
+ -0.005238891579210758,
11
+ 0.4615870714187622
12
+ ],
13
+ "std": [
14
+ 0.3626190721988678,
15
+ 0.4572296738624573,
16
+ 0.33315929770469666,
17
+ 0.8355317115783691,
18
+ 0.054371483623981476,
19
+ 0.5975595116615295,
20
+ 0.48040977120399475
21
+ ],
22
+ "max": [
23
+ 96.34329223632812,
24
+ 129.48878479003906,
25
+ 158.0499267578125,
26
+ 6.2831830978393555,
27
+ 1.8618112802505493,
28
+ 6.2831854820251465,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -157.32989501953125,
33
+ -161.42481994628906,
34
+ -123.72489929199219,
35
+ -6.283183574676514,
36
+ -1.8618112802505493,
37
+ -6.2831854820251465,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.09744655042886735,
42
+ -0.1314285695552826,
43
+ -0.16288121417164803,
44
+ -0.6645961225032807,
45
+ -0.09883208796381951,
46
+ -0.2489599719643593,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.11538894884288375,
51
+ 0.27978515625,
52
+ 0.16587213799357436,
53
+ 0.678488978743613,
54
+ 0.2778055757284177,
55
+ 0.3031894564628601,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "proprio": {
69
+ "mean": [
70
+ 0.7892308831214905,
71
+ 0.04583429917693138,
72
+ 1.1704102754592896,
73
+ -0.001955259358510375,
74
+ 0.003115077270194888,
75
+ -0.22162029147148132,
76
+ 0.4615870714187622
77
+ ],
78
+ "std": [
79
+ 18.893327713012695,
80
+ 6.578038215637207,
81
+ 30.723447799682617,
82
+ 2.2716941833496094,
83
+ 0.39969319105148315,
84
+ 1.445878505706787,
85
+ 0.48040977120399475
86
+ ],
87
+ "max": [
88
+ 938.3041381835938,
89
+ 623.6038818359375,
90
+ 1441.9671630859375,
91
+ 6.2831830978393555,
92
+ 1.8151572942733765,
93
+ 3.1415927410125732,
94
+ 1.0
95
+ ],
96
+ "min": [
97
+ -3.4371097087860107,
98
+ -531.4224243164062,
99
+ -132.0138397216797,
100
+ -3.1415927410125732,
101
+ -2.500656843185425,
102
+ -3.169050455093384,
103
+ -8.864999836077914e-05
104
+ ],
105
+ "q01": [
106
+ -0.28570315092802046,
107
+ -0.3549496218562126,
108
+ -0.06616472341120243,
109
+ -3.1397440433502197,
110
+ -1.168001264333725,
111
+ -3.1413214206695557,
112
+ 0.0
113
+ ],
114
+ "q99": [
115
+ 0.8912830322980883,
116
+ 0.8644397854804993,
117
+ 1.0340391397476196,
118
+ 4.148796057701141,
119
+ 1.286495512723926,
120
+ 3.141319990158081,
121
+ 1.0
122
+ ],
123
+ "mask": [
124
+ true,
125
+ true,
126
+ true,
127
+ true,
128
+ true,
129
+ true,
130
+ false
131
+ ]
132
+ },
133
+ "num_transitions": 36346806,
134
+ "num_trajectories": 574875
135
+ }
136
+ }