bryce-gl jadechoghari commited on
Commit
07a2660
·
verified ·
0 Parent(s):

Duplicate from lerobot/xvla-folding

Browse files

Co-authored-by: Jade Choghari <jadechoghari@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ library_name: lerobot
5
+ pipeline_tag: robotics
6
+ tags:
7
+ - vision-language-action
8
+ - imitation-learning
9
+ - lerobot
10
+ inference: false
11
+ license: apache-2.0
12
+ ---
13
+
14
+ # X-VLA (LeRobot)
15
+
16
+ X-VLA is a Vision-Language-Action foundation model that uses soft prompts to handle cross-embodiment and cross-domain robot control within a unified Transformer architecture.
17
+
18
+ A fine-tuned dexterous manipulation model trained on the high-quality Soft-FOLD cloth folding dataset. Achieves a 100% success rate over 2 hours of continuous cloth folding.
19
+
20
+ **Original paper:** [X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model](https://arxiv.org/abs/2510.10274)
21
+ **Reference implementation:** https://github.com/2toinf/X-VLA
22
+ **LeRobot implementation:** Follows the original reference code for compatibility.
23
+
24
+
25
+ ## Model description
26
+
27
+ - **Inputs:** images (multi-view), proprio/state, optional language instruction
28
+ - **Outputs:** continuous actions
29
+ - **Training objective:** flow matching
30
+ - **Action representation:** continuous
31
+ - **Intended use:** Base model to fine-tune on your specific use case
32
+
33
+
34
+ ## Quick start (inference on a real batch)
35
+
36
+ ### Installation
37
+
38
+ ```bash
39
+ pip install "lerobot[xvla]"
40
+ ```
41
+ For full installation details (including optional video dependencies such as ffmpeg for torchcodec), see the official documentation: https://huggingface.co/docs/lerobot/installation
42
+
43
+ ### Load model + dataset, run `select_action`
44
+
45
+ ```python
46
+ import torch
47
+ from lerobot.datasets.lerobot_dataset import LeRobotDataset
48
+ from lerobot.policies.factory import make_pre_post_processors
49
+
50
+ # Swap this import per-policy
51
+ from lerobot.policies.xvla.modeling_xvla import XVLAPolicy
52
+
53
+ # load a policy
54
+ model_id = "lerobot/xvla-folding" # <- swap checkpoint
55
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
56
+
57
+ policy = XVLAPolicy.from_pretrained(model_id).to(device).eval()
58
+
59
+ preprocess, postprocess = make_pre_post_processors(
60
+ policy.config,
61
+ model_id,
62
+ preprocessor_overrides={"device_processor": {"device": str(device)}},
63
+ )
64
+ # load a LeRobotDataset (we will replace this with a simpler dataset)
65
+ dataset = LeRobotDataset("lerobot/libero")
66
+
67
+ # pick an episode
68
+ episode_index = 0
69
+
70
+ # each episode corresponds to a contiguous range of frame indices
71
+ from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
72
+ to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]
73
+
74
+ # get a single frame from that episode (e.g. the first frame)
75
+ frame_index = from_idx
76
+ frame = dict(dataset[frame_index])
77
+
78
+ batch = preprocess(frame)
79
+ with torch.inference_mode():
80
+ pred_action = policy.select_action(batch)
81
+ # use your policy postprocessor; this post-processes the action,
82
+ # for instance unnormalize the actions, detokenize it etc..
83
+ pred_action = postprocess(pred_action)
84
+ ```
85
+
86
+
87
+ ## Training step (loss + backward)
88
+
89
+ If you’re training / fine-tuning, you typically call `forward(...)` to get a loss and then:
90
+
91
+ ```python
92
+ policy.train()
93
+ batch = dict(dataset[0])
94
+ batch = preprocess(batch)
95
+
96
+ loss, outputs = policy.forward(batch)
97
+ loss.backward()
98
+
99
+ ```
100
+
101
+ > Notes:
102
+ >
103
+ > - Some policies expose `policy(**batch)` or return a dict; keep this snippet aligned with the policy API.
104
+ > - Use your trainer script (`lerobot-train`) for full training loops.
105
+
106
+
107
+ ## How to train / fine-tune
108
+
109
+ ```bash
110
+ lerobot-train \
111
+ --dataset.repo_id=${HF_USER}/<dataset> \
112
+ --output_dir=./outputs/[RUN_NAME] \
113
+ --job_name=[RUN_NAME] \
114
+ --policy.repo_id=${HF_USER}/<desired_policy_repo_id> \
115
+ --policy.path=lerobot/[BASE_CHECKPOINT] \
116
+ --policy.dtype=bfloat16 \
117
+ --policy.device=cuda \
118
+ --steps=100000 \
119
+ --batch_size=4
120
+ ```
121
+
122
+ Add policy-specific flags below:
123
+
124
+ - `--policy.chunk_size=...`
125
+ - `--policy.n_action_steps=...`
126
+ - `--policy.max_action_tokens=...`
127
+ - `--policy.gradient_checkpointing=true`
128
+
129
+
130
+ ## Real-World Inference & Evaluation
131
+
132
+ You can use the `record` script from [**`lerobot-record`**](https://github.com/huggingface/lerobot/blob/main/src/lerobot/scripts/lerobot_record.py) with a policy checkpoint as input, to run inference and evaluate your policy.
133
+
134
+ For instance, run this command or API example to run inference and record 10 evaluation episodes:
135
+
136
+ ```
137
+ lerobot-record \
138
+ --robot.type=so100_follower \
139
+ --robot.port=/dev/ttyACM1 \
140
+ --robot.cameras="{ up: {type: opencv, index_or_path: /dev/video10, width: 640, height: 480, fps: 30}, side: {type: intelrealsense, serial_number_or_name: 233522074606, width: 640, height: 480, fps: 30}}" \
141
+ --robot.id=my_awesome_follower_arm \
142
+ --display_data=false \
143
+ --dataset.repo_id=${HF_USER}/eval_so100 \
144
+ --dataset.single_task="Put lego brick into the transparent box" \
145
+ # <- Teleop optional if you want to teleoperate in between episodes \
146
+ # --teleop.type=so100_leader \
147
+ # --teleop.port=/dev/ttyACM0 \
148
+ # --teleop.id=my_awesome_leader_arm \
149
+ --policy.path=${HF_USER}/my_policy
150
+ ```
config.json ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "xvla",
3
+ "n_obs_steps": 1,
4
+ "input_features": {
5
+ "observation.images.image": {
6
+ "type": "VISUAL",
7
+ "shape": [
8
+ 3,
9
+ 256,
10
+ 256
11
+ ]
12
+ },
13
+ "observation.images.image2": {
14
+ "type": "VISUAL",
15
+ "shape": [
16
+ 3,
17
+ 256,
18
+ 256
19
+ ]
20
+ },
21
+ "observation.state": {
22
+ "type": "STATE",
23
+ "shape": [
24
+ 8
25
+ ]
26
+ },
27
+ "observation.images.empty_camera_0": {
28
+ "type": "VISUAL",
29
+ "shape": [
30
+ 3,
31
+ 224,
32
+ 224
33
+ ]
34
+ }
35
+ },
36
+ "output_features": {
37
+ "action": {
38
+ "type": "ACTION",
39
+ "shape": [
40
+ 20
41
+ ]
42
+ }
43
+ },
44
+ "device": "cuda",
45
+ "use_amp": false,
46
+ "push_to_hub": true,
47
+ "repo_id": null,
48
+ "private": null,
49
+ "tags": null,
50
+ "license": null,
51
+ "pretrained_path": null,
52
+ "chunk_size": 30,
53
+ "n_action_steps": 30,
54
+ "normalization_mapping": {
55
+ "STATE": "IDENTITY",
56
+ "ACTION": "MEAN_STD",
57
+ "VISUAL": "IDENTITY"
58
+ },
59
+ "florence_config": {
60
+ "model_type": "florence2",
61
+ "bos_token_id": 0,
62
+ "eos_token_id": 2,
63
+ "ignore_index": -100,
64
+ "pad_token_id": 1,
65
+ "projection_dim": 1024,
66
+ "text_config": {
67
+ "vocab_size": 51289,
68
+ "activation_dropout": 0.1,
69
+ "activation_function": "gelu",
70
+ "attention_dropout": 0.1,
71
+ "d_model": 1024,
72
+ "decoder_attention_heads": 16,
73
+ "decoder_layers": 12,
74
+ "encoder_attention_heads": 16,
75
+ "encoder_layers": 12,
76
+ "dropout": 0.1,
77
+ "max_position_embeddings": 4096,
78
+ "num_hidden_layers": 12,
79
+ "num_beams": 3
80
+ },
81
+ "vision_config": {
82
+ "model_type": "davit",
83
+ "drop_path_rate": 0.1,
84
+ "patch_size": [
85
+ 7,
86
+ 3,
87
+ 3,
88
+ 3
89
+ ],
90
+ "patch_stride": [
91
+ 4,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "patch_padding": [
97
+ 3,
98
+ 1,
99
+ 1,
100
+ 1
101
+ ],
102
+ "patch_prenorm": [
103
+ false,
104
+ true,
105
+ true,
106
+ true
107
+ ],
108
+ "enable_checkpoint": false,
109
+ "dim_embed": [
110
+ 256,
111
+ 512,
112
+ 1024,
113
+ 2048
114
+ ],
115
+ "num_heads": [
116
+ 8,
117
+ 16,
118
+ 32,
119
+ 64
120
+ ],
121
+ "num_groups": [
122
+ 8,
123
+ 16,
124
+ 32,
125
+ 64
126
+ ],
127
+ "depths": [
128
+ 1,
129
+ 1,
130
+ 9,
131
+ 1
132
+ ],
133
+ "window_size": 12,
134
+ "projection_dim": 1024,
135
+ "visual_temporal_embedding": {
136
+ "type": "COSINE",
137
+ "max_temporal_embeddings": 100
138
+ },
139
+ "image_pos_embed": {
140
+ "type": "learned_abs_2d",
141
+ "max_pos_embeddings": 50
142
+ },
143
+ "image_feature_source": [
144
+ "spatial_avg_pool",
145
+ "temporal_avg_pool"
146
+ ]
147
+ },
148
+ "vocab_size": 51289,
149
+ "torch_dtype": "float16",
150
+ "is_encoder_decoder": true
151
+ },
152
+ "tokenizer_name": "facebook/bart-large",
153
+ "tokenizer_max_length": 1024,
154
+ "tokenizer_padding_side": "right",
155
+ "pad_language_to": "max_length",
156
+ "hidden_size": 1024,
157
+ "depth": 24,
158
+ "num_heads": 16,
159
+ "mlp_ratio": 4.0,
160
+ "num_domains": 30,
161
+ "len_soft_prompts": 32,
162
+ "dim_time": 32,
163
+ "max_len_seq": 512,
164
+ "use_hetero_proj": false,
165
+ "action_mode": "ee6d",
166
+ "num_denoising_steps": 10,
167
+ "use_proprio": true,
168
+ "max_state_dim": 20,
169
+ "domain_feature_key": null,
170
+ "resize_imgs_with_padding": [
171
+ 224,
172
+ 224
173
+ ],
174
+ "num_image_views": 3,
175
+ "empty_cameras": 1,
176
+ "optimizer_lr": 0.0001,
177
+ "optimizer_betas": [
178
+ 0.9,
179
+ 0.95
180
+ ],
181
+ "optimizer_eps": 1e-08,
182
+ "optimizer_weight_decay": 0.0001,
183
+ "optimizer_grad_clip_norm": 10.0,
184
+ "scheduler_warmup_steps": 1000,
185
+ "scheduler_decay_steps": 30000,
186
+ "scheduler_decay_lr": 2.5e-06
187
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d9abbcffd042e75347c11b569b8f4f8993062a3c5995b364ac4094d378c1a91
3
+ size 3519073692
policy_postprocessor.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "policy_postprocessor",
3
+ "steps": [
4
+ {
5
+ "registry_name": "unnormalizer_processor",
6
+ "config": {
7
+ "eps": 1e-08,
8
+ "features": {
9
+ "action": {
10
+ "type": "ACTION",
11
+ "shape": [
12
+ 20
13
+ ]
14
+ }
15
+ },
16
+ "norm_map": {
17
+ "STATE": "IDENTITY",
18
+ "ACTION": "MEAN_STD",
19
+ "VISUAL": "MEAN_STD"
20
+ }
21
+ }
22
+ },
23
+ {
24
+ "registry_name": "device_processor",
25
+ "config": {
26
+ "device": "cpu",
27
+ "float_dtype": null
28
+ }
29
+ }
30
+ ]
31
+ }
policy_preprocessor.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "policy_preprocessor",
3
+ "steps": [
4
+ {
5
+ "registry_name": "rename_observations_processor",
6
+ "config": {
7
+ "rename_map": {}
8
+ }
9
+ },
10
+ {
11
+ "registry_name": "to_batch_processor",
12
+ "config": {}
13
+ },
14
+ {
15
+ "registry_name": "tokenizer_processor",
16
+ "config": {
17
+ "max_length": 50,
18
+ "task_key": "task",
19
+ "padding_side": "right",
20
+ "padding": "max_length",
21
+ "truncation": true,
22
+ "tokenizer_name": "facebook/bart-large"
23
+ }
24
+ },
25
+ {
26
+ "registry_name": "device_processor",
27
+ "config": {
28
+ "device": "cuda",
29
+ "float_dtype": null
30
+ }
31
+ },
32
+ {
33
+ "registry_name": "normalizer_processor",
34
+ "config": {
35
+ "eps": 1e-08,
36
+ "features": {
37
+ "observation.images.image": {
38
+ "type": "VISUAL",
39
+ "shape": [
40
+ 3,
41
+ 256,
42
+ 256
43
+ ]
44
+ },
45
+ "observation.images.image2": {
46
+ "type": "VISUAL",
47
+ "shape": [
48
+ 3,
49
+ 256,
50
+ 256
51
+ ]
52
+ },
53
+ "observation.state": {
54
+ "type": "STATE",
55
+ "shape": [
56
+ 8
57
+ ]
58
+ },
59
+ "observation.images.empty_camera_0": {
60
+ "type": "VISUAL",
61
+ "shape": [
62
+ 3,
63
+ 224,
64
+ 224
65
+ ]
66
+ },
67
+ "action": {
68
+ "type": "ACTION",
69
+ "shape": [
70
+ 20
71
+ ]
72
+ }
73
+ },
74
+ "norm_map": {
75
+ "STATE": "IDENTITY",
76
+ "ACTION": "MEAN_STD",
77
+ "VISUAL": "IDENTITY"
78
+ }
79
+ }
80
+ }
81
+ ]
82
+ }