jaehyunkang committed on
Commit
13d439a
·
0 Parent(s):

RLDX-1 Release

Browse files
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ architecture.png filter=lfs diff=lfs merge=lfs -text
37
+ teaser.png filter=lfs diff=lfs merge=lfs -text
LICENSE.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RLWRLD Model License v1.0
2
+
3
+ ## 1. Definitions
4
+
5
+ "Licensor" means RLWRLD, INC. and its affiliates.
6
+
7
+ "Model" means the machine learning model, including learnt weights, parameters, configuration files, and documentation made available under this license.
8
+
9
+ "Derivative Model" means all (a) modifications to the Model, (b) works based on the Model, and (c) any other derivative works of the Model, including models fine-tuned from the Model.
10
+
11
+ "You" means an individual or legal entity exercising permissions granted by this license.
12
+
13
+ ## 2. License Grant
14
+
15
+ Subject to the terms and conditions of this license, Licensor grants to You a perpetual, worldwide, non-exclusive, royalty-free license to use, reproduce, prepare derivative works of, publicly display, publicly perform, and distribute the Model and any Derivative Models.
16
+
17
+ ## 3. Conditions and Limitations
18
+
19
+ **3.1 Non-Commercial Use.** The Model and any Derivative Models may only be used for non-commercial purposes. "Non-commercial" means for academic research, educational, personal, or evaluation purposes only, and does not include any use primarily intended for or directed toward commercial advantage or monetary compensation.
20
+
21
+ **3.2 Attribution.** You must give appropriate credit to Licensor, provide a link to this license, and indicate if changes were made. You must include the following attribution notice with any distribution of the Model or Derivative Model:
22
+
23
+ > "Licensed under the RLWRLD Model License v1.0"
24
+
25
+ **3.3 Share-Alike.** If You distribute a Derivative Model, You must do so under this same license, or another license that includes at minimum (a) a non-commercial use limitation no less restrictive than Section 3.1 and (b) a share-alike requirement no less restrictive than this Section 3.3.
26
+
27
+ **3.4 Redistribution.** You may distribute copies of the Model or Derivative Models provided that You (a) include a complete copy of this license, (b) retain all copyright, trademark, and attribution notices, and (c) comply with all conditions in this Section 3.
28
+
29
+ **3.5 Use Restrictions.** The Model and any Derivative Models shall not be used for: (a) military, weapons development, or defense applications; (b) surveillance or monitoring of individuals without their consent; or (c) any use that violates applicable laws or regulations.
30
+
31
+ **3.6 Trademarks.** This license does not grant any rights to use Licensor's names, logos, or trademarks, except as required for reasonable and customary use in describing the origin of the Model and reproducing the notices described in this license.
32
+
33
+ **3.7 Patent Claims.** If You or Your affiliate(s) bring or threaten to bring any claim or litigation (including any claim, cross-claim, or counterclaim in a lawsuit) against any entity to enforce any patents that You allege are infringed by the Model, then any rights granted to You under this license will terminate immediately.
34
+
35
+ **3.8 Termination.** If You violate any term of this license, Your rights under this license will terminate immediately.
36
+
37
+ ## 4. Third-Party Components
38
+
39
+ The Model may include or be distributed with third-party components that are subject to separate license terms and notices. Such components are subject to their respective licenses, including any notices and disclaimers contained therein. Licensor does not grant any rights with respect to third-party components beyond those provided under the applicable third-party licenses.
40
+
41
+ ## 5. Disclaimer of Warranty
42
+
43
+ THE MODEL IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NONINFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE MODEL, DERIVATIVE MODELS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MODEL, DERIVATIVE MODELS AND ANY OUTPUT AND RESULTS.
44
+
45
+ ## 6. Limitation of Liability
46
+
47
+ IN NO EVENT SHALL LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE OR THE USE OR INABILITY TO USE THE MODEL, DERIVATIVE MODELS, OR ANY OUTPUTS THEREOF, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48
+
49
+ ## 7. Indemnity
50
+
51
+ You shall indemnify and hold harmless Licensor from and against any claim by any third party arising out of or related to Your use or distribution of the Model, Derivative Models, or any outputs thereof.
52
+
53
+ ## 8. Feedback
54
+
55
+ If You provide feedback, suggestions, or improvements regarding the Model, Licensor may use such feedback without restriction or compensation to You.
56
+
57
+ ## 9. General Provisions
58
+
59
+ **9.1 Governing Law.** This license will be governed by and construed in accordance with the laws of the State of Delaware, United States, without regard to its conflict of laws rules. The UN Convention on Contracts for the International Sale of Goods does not apply to this license.
60
+
61
+ **9.2 License Updates.** Licensor may update this license to comply with legal and regulatory requirements at any time. You agree to either comply with any updated license or cease Your use and distribution of the Model and any Derivative Model.
README.md ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: rlwrld-model-license-v1.0
4
+ license_link: LICENSE.md
5
+ library_name: transformers
6
+ pipeline_tag: robotics
7
+ tags:
8
+ - robotics
9
+ - vla
10
+ - vision-language-action
11
+ - manipulation
12
+ - flow-matching
13
+ - rldx
14
+ base_model: Qwen/Qwen3-VL-8B-Instruct
15
+ ---
16
+
17
+ # RLDX-1
18
+
19
+ [Paper](https://arxiv.org/abs/2605.03269)  ·  [Project page](https://rlwrld.ai/rldx-1)  ·  [Code](https://github.com/RLWRLD/RLDX-1)  ·  [Models](https://huggingface.co/collections/RLWRLD/rldx-1)
20
+
21
+ <p align="center">
22
+ <img src="teaser.png" width="100%" alt="RLDX-1 teaser">
23
+ </p>
24
+
25
+ **RLDX-1** is a general-purpose Robot Foundation Model designed for dexterous
26
+ manipulation. Powered by a **Multi-Stream Action Transformer (MSAT)**, it
27
+ seamlessly unifies multimodal perception (visual + tactile), high-DoF
28
+ actuation, and memory-aware decision-making in a single architecture. RLDX-1
29
+ achieves state-of-the-art performance across diverse simulation benchmarks
30
+ and is fully validated on real-world hardware.
31
+
32
+ This repository hosts **`RLDX-1-PT`** — a foundation checkpoint pretrained on
33
+ a broad mixture of public manipulation corpora, from which all downstream
34
+ `RLDX-1-{FT,MT}-*` releases finetune. Use it as your starting point for new
35
+ embodiments and tasks.
36
+
37
+ <p align="center">
38
+ <img src="architecture.png" width="90%" alt="RLDX-1 architecture">
39
+ </p>
40
+
41
+ ## Highlights
42
+
43
+ - **Multi-Stream Action Transformer (MSAT).** Cognition, physics, and
44
+ action each get a dedicated stream coupled by joint self-attention —
45
+ an extension of MM-DiT to action modeling.
46
+ - **Motion awareness.** Multi-frame observations + a motion module
47
+ capture temporal dynamics; intermediate VLM layers compress video
48
+ tokens to keep the policy efficient.
49
+ - **Long-term memory.** A memory module fuses past cognition features
50
+ with the current ones for history-grounded decisions beyond a short
51
+ multi-frame window.
52
+ - **Physical sensing.** Tactile and torque enter as a dedicated physics
53
+ stream; the decoder is jointly trained to predict future physical
54
+ signals.
55
+ - **Three-stage training.** Pre-training (generalization) → mid-training
56
+ (functionality) → post-training (task adaptation), with synthetic data
57
+ augmenting rare manipulation scenarios.
58
+ - **Real-time inference.** Static graph capture + custom fused kernels
59
+ bring the all-modality model to **43.7 ms / step on RTX 5090
60
+ (1.63× speedup, >22 Hz)**.
61
+
62
+ ## Released Checkpoints
63
+
64
+ This card describes `RLDX-1-PT` (foundation). The full RLDX-1 model family:
65
+
66
+ | Checkpoint | Description | Params | Embodiment Tag |
67
+ |---|---|---|---|
68
+ | [`RLDX-1-PT`](https://huggingface.co/RLWRLD/RLDX-1-PT) | Multi-source pretrained foundation (this repo) | 6.9B | per-dataset |
69
+ | [`RLDX-1-VLM`](https://huggingface.co/RLWRLD/RLDX-1-VLM) | Qwen3-VL-8B vision-language backbone | 8B | — |
70
+ | [`RLDX-1-FT-ROBOCASA`](https://huggingface.co/RLWRLD/RLDX-1-FT-ROBOCASA) | RoboCasa Kitchen 24-task finetune | 6.9B | `GENERAL_EMBODIMENT` |
71
+ | [`RLDX-1-FT-RC365`](https://huggingface.co/RLWRLD/RLDX-1-FT-RC365) | RoboCasa-365 cross-task finetune | 6.9B | `GENERAL_EMBODIMENT` |
72
+ | [`RLDX-1-FT-LIBERO`](https://huggingface.co/RLWRLD/RLDX-1-FT-LIBERO) | LIBERO 4-suite (goal, object, spatial, long) finetune | 6.9B | `GENERAL_EMBODIMENT` |
73
+ | [`RLDX-1-FT-SIMPLER-GOOGLE`](https://huggingface.co/RLWRLD/RLDX-1-FT-SIMPLER-GOOGLE) | SIMPLER Google VM/VA finetune | 6.9B | `OXE_FRACTAL` |
74
+ | [`RLDX-1-FT-SIMPLER-WIDOWX`](https://huggingface.co/RLWRLD/RLDX-1-FT-SIMPLER-WIDOWX) | SIMPLER WidowX finetune | 6.9B | `OXE_BRIDGE_ORIG` |
75
+ | [`RLDX-1-FT-GR1`](https://huggingface.co/RLWRLD/RLDX-1-FT-GR1) | GR-1 Tabletop finetune | 6.9B | `GENERAL_EMBODIMENT` |
76
+ | [`RLDX-1-MT-DROID`](https://huggingface.co/RLWRLD/RLDX-1-MT-DROID) | DROID mid-train | 8.1B | `OXE_DROID` |
77
+ | [`RLDX-1-MT-ALLEX`](https://huggingface.co/RLWRLD/RLDX-1-MT-ALLEX) | All add-ons (memory + motion + physics + video) | 8.1B | `GENERAL_EMBODIMENT` |
78
+
79
+ ## Performance
80
+
81
+ Success rate (%) of RLDX-1 finetuned on each benchmark's training set,
82
+ evaluated with the checkpoint listed in each row.
83
+
84
+ | Benchmark | Success Rate | Checkpoint |
85
+ |---|---|---|
86
+ | LIBERO (Avg) | 97.8 | `RLDX-1-FT-LIBERO` |
87
+ | LIBERO-Plus | 87.6 | `RLDX-1-FT-LIBERO` |
88
+ | SIMPLER Google-VM | 81.5 | `RLDX-1-FT-SIMPLER-GOOGLE` |
89
+ | SIMPLER Google-VA | 77.4 | `RLDX-1-FT-SIMPLER-GOOGLE` |
90
+ | SIMPLER WidowX | 71.9 | `RLDX-1-FT-SIMPLER-WIDOWX` |
91
+ | RoboCasa Kitchen (24 tasks) | 70.6 | `RLDX-1-FT-ROBOCASA` |
92
+ | GR-1 Tabletop | 58.7 | `RLDX-1-FT-GR1` |
93
+ | RoboCasa365 (Avg) | 31.5 | `RLDX-1-FT-RC365` |
94
+
95
+ ## Quick start
96
+
97
+ ```bash
98
+ git clone https://github.com/RLWRLD/RLDX-1.git
99
+ cd RLDX-1
100
+ uv sync --python 3.10
101
+ uv pip install -e .
102
+ ```
103
+
104
+ ### Inference (single step)
105
+
106
+ ```python
107
+ from rldx.policy.rldx_policy import RLDXPolicy
108
+ from rldx.data.embodiment_tags import EmbodimentTag
109
+
110
+ policy = RLDXPolicy(
111
+ model_path="RLWRLD/RLDX-1-FT-ROBOCASA",
112
+ embodiment_tag=EmbodimentTag.GENERAL_EMBODIMENT,
113
+ device="cuda:0",
114
+ )
115
+
116
+ action = policy.get_action(observation)
117
+ ```
118
+
119
+ `RLDX-1-PT` is pretrained on a multi-source mixture, so for direct inference
120
+ pair it with the embodiment tag matching your data source — e.g.
121
+ `OXE_FRACTAL`, `OXE_BRIDGE_ORIG`, `OXE_DROID`, `GALAXEA`, `AGIBOT_GRIPPER`,
122
+ `AGIBOT_DEXHAND`, `NEURAL_GR1`, `HUMANOID_EVERYDAY_G1`,
123
+ `HUMANOID_EVERYDAY_H1`, etc. For custom robots, finetune.
124
+
125
+ ### Real-time serving (ZeroMQ)
126
+
127
+ ```bash
128
+ uv run python rldx/eval/run_rldx_server.py \
129
+ --model-path RLWRLD/RLDX-1-FT-ROBOCASA \
130
+ --embodiment-tag GENERAL_EMBODIMENT \
131
+ --host 0.0.0.0 --port 20000
132
+ ```
133
+
134
+ A WebSocket server (`run_rldx_server_pi.py`) is also available for
135
+ openpi-compatible clients.
136
+
137
+ ### Finetune from `RLDX-1-PT`
138
+
139
+ ```bash
140
+ uv run python rldx/experiment/launch_train.py \
141
+ --base-model-path RLWRLD/RLDX-1-PT \
142
+ --dataset-path /path/to/your/dataset \
143
+ --embodiment-tag GENERAL_EMBODIMENT \
144
+ --video-length 4 --n-cog-tokens 64 \
145
+ --global-batch-size 64 --learning-rate 1e-4 \
146
+ --max-steps 60000 --save-steps 5000 \
147
+ --output-dir ./outputs/my_finetune
148
+ ```
149
+
150
+ To enable add-ons (memory / motion / physics) see the recipes in the
151
+ [main README](https://github.com/RLWRLD/RLDX-1#finetuning) and the
152
+ [`training.md`](https://github.com/RLWRLD/RLDX-1/blob/main/docs/training.md)
153
+ guide.
154
+
155
+ ## Model details
156
+
157
+ - **Architecture:** Multi-Stream Action Transformer (MSAT) policy with a
158
+ Qwen3-VL vision-language backbone, cognition-token perceptual summary,
159
+ optional Transformer memory, motion module, and tactile/torque physics
160
+ encoder/decoder. Trained with flow matching.
161
+ - **Inputs:** RGB video (default 4 frames), state proprioception, optional
162
+ tactile / torque signals, language instruction.
163
+ - **Outputs:** Action chunks of length 16 (default `--action-horizon 16`).
164
+ - **Backbone:** [`Qwen/Qwen3-VL-8B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct).
165
+ - **Pretraining data:** A mixture of public manipulation corpora, covering
166
+ 27 [Open X-Embodiment (OXE)](https://robotics-transformer-x.github.io/)
167
+ datasets (DROID, Bridge, Fractal, Language Table, …) plus
168
+ [Galaxea](https://galaxea.ai/), [AgiBot World](https://agibot-world.com/)
169
+ (Gripper + Dexhand), ActionNet, Neural-Curated GR-1 humanoid trajectories,
170
+ and Unitree G1 / H1 from
171
+ [HumanoidEveryday](https://lipeng-zhou.github.io/HumanoidEveryday/).
172
+
173
+ For a full architectural walkthrough see
174
+ [`docs/architecture.md`](https://github.com/RLWRLD/RLDX-1/blob/main/docs/architecture.md).
175
+
176
+ ## Intended use & limitations
177
+
178
+ **Intended use.** Research on robotic manipulation, finetuning on custom
179
+ embodiments, simulation benchmarking, and non-commercial real-robot
180
+ deployment under the conditions of the RLWRLD Model License v1.0.
181
+
182
+ **Out of scope.** Commercial deployment, military or weapons applications,
183
+ non-consensual surveillance, and any use that violates applicable laws or
184
+ regulations. See [`LICENSE.md`](LICENSE.md) §3.1 and §3.5 for the full terms.
185
+
186
+ **Limitations.** Performance depends heavily on embodiment match and data
187
+ distribution. The pretrained checkpoint is OXE-conditioned and is not
188
+ guaranteed to work zero-shot on novel embodiments without finetuning.
189
+ Memory, motion, and physics modules are dormant in `RLDX-1-PT` and only
190
+ activate when the corresponding flags are wired during finetuning (see
191
+ `RLDX-1-MT-ALLEX`).
192
+
193
+ ## Citation
194
+
195
+ ```bibtex
196
+ @article{rldx2026,
197
+ title={RLDX-1 Technical Report},
198
+ author={Kim, Dongyoung and Jang, Huiwon and Koo, Myungkyu and Jang, Suhyeok and Kim, Taeyoung and others},
199
+ year={2026},
200
+ note={RLWRLD},
201
+ eprint={2605.03269},
202
+ archivePrefix={arXiv},
203
+ url={https://arxiv.org/abs/2605.03269}
204
+ }
205
+ ```
206
+
207
+ ## License
208
+
209
+ Released under the **RLWRLD Model License v1.0** — a non-commercial license
210
+ with attribution and share-alike requirements. See [`LICENSE.md`](LICENSE.md) for
211
+ the full text. By using this model you agree to those terms, including the
212
+ use restrictions in §3.5.
architecture.png ADDED

Git LFS Details

  • SHA256: 8d0e305139502965d4289446add15e9e11c34dcc8106ad526fa8c957c12595d3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RLDX"
4
+ ],
5
+ "backbone_model_type": "vtc_qwen3_vl",
6
+ "backbone_trainable_params_fp32": true,
7
+ "color_jitter_params": {
8
+ "brightness": 0.3,
9
+ "contrast": 0.4,
10
+ "hue": 0.08,
11
+ "saturation": 0.5
12
+ },
13
+ "conversation_image_first": false,
14
+ "diffusion_model_cfg": {
15
+ "action_model_max_seq_len": 512,
16
+ "attention_head_dim": 64,
17
+ "depth_multi_stream": 4,
18
+ "depth_single_stream": 8,
19
+ "dropout": 0.2,
20
+ "final_dropout": true,
21
+ "num_attention_heads": 24,
22
+ "output_dim": 1024,
23
+ "positional_embeddings": "rope_sa_only",
24
+ "pre_norm": "layer_norm",
25
+ "qk_norm": "rms_norm",
26
+ "rope_theta": 10000.0,
27
+ "sa_dim": 1536,
28
+ "set_triple_stream_for_mq": false,
29
+ "set_triple_stream_for_state": false,
30
+ "temb_type": "input_token",
31
+ "use_swiglu": true,
32
+ "vl_dim": 4096
33
+ },
34
+ "dtype": "bfloat16",
35
+ "load_bf16": true,
36
+ "memory_cfg": {
37
+ "hidden_size": 4096,
38
+ "intermediate_size": 16384,
39
+ "max_position_embeddings": 32,
40
+ "num_attention_heads": 16,
41
+ "num_hidden_layers": 2,
42
+ "num_key_value_heads": 16,
43
+ "rms_norm_eps": 1e-05,
44
+ "use_causal_attn": true,
45
+ "use_rope": true
46
+ },
47
+ "memory_video_delta_indices": [
48
+ -48,
49
+ -32,
50
+ -16,
51
+ 0
52
+ ],
53
+ "model_name": "RLWRLD/RLDX-1-VLM",
54
+ "model_type": "RLDX-1",
55
+ "n_cog_tokens": 64,
56
+ "general_embodiment_train_ratio": 0.03125,
57
+ "qwen3_collator": true,
58
+ "random_rotation_angle": null,
59
+ "reproject_vision": false,
60
+ "state_dropout_prob": 0.0,
61
+ "transformers_version": "4.57.0",
62
+ "tune_diffusion_model": true,
63
+ "tune_llm": false,
64
+ "tune_projector": true,
65
+ "tune_top_llm_layers": 4,
66
+ "tune_visual": false,
67
+ "use_relative_action": true,
68
+ "use_video": true,
69
+ "video_length": 4
70
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85d7605c902b7afac343d1dfe184cd8cfb5091b39ee87f38ae507374454cf58e
3
+ size 4912540968
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a51ca423831c94b07fd50f41201294bbe0d974d2dbc9fcd10d09af39bc940a50
3
+ size 4446192352
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d8da2877f549486e7b48c1fe43bea65cfab85bc31cf39c91d375ddb3c7cc222
3
+ size 4467155576
model.safetensors.index.json ADDED
@@ -0,0 +1,730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 6912894784,
4
+ "total_size": 13825888896
5
+ },
6
+ "weight_map": {
7
+ "backbone.cog_emb": "model-00001-of-00003.safetensors",
8
+ "backbone.qwen_model.model.language_model.embed_tokens.weight": "model-00001-of-00003.safetensors",
9
+ "backbone.qwen_model.model.language_model.layers.0.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
10
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
11
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
12
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "backbone.qwen_model.model.language_model.layers.0.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
14
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
15
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
16
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
17
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
18
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
19
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
20
+ "backbone.qwen_model.model.language_model.layers.1.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
21
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
22
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
23
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
24
+ "backbone.qwen_model.model.language_model.layers.1.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
25
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
26
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
27
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
28
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
29
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
30
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
31
+ "backbone.qwen_model.model.language_model.layers.2.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
32
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
33
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
34
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
35
+ "backbone.qwen_model.model.language_model.layers.2.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
36
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
37
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
38
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
39
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
40
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
41
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
42
+ "backbone.qwen_model.model.language_model.layers.3.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
43
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
44
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
45
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
46
+ "backbone.qwen_model.model.language_model.layers.3.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
47
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
48
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
49
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
50
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
51
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
52
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
53
+ "backbone.qwen_model.model.language_model.layers.4.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
54
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
55
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
56
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
57
+ "backbone.qwen_model.model.language_model.layers.4.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
58
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
59
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
60
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
61
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
62
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
63
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
64
+ "backbone.qwen_model.model.language_model.layers.5.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
65
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
66
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
67
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
68
+ "backbone.qwen_model.model.language_model.layers.5.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
69
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
70
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
71
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
72
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
73
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
74
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
75
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
76
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
77
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
78
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
79
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
80
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
81
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
82
+ "backbone.qwen_model.model.visual.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors",
83
+ "backbone.qwen_model.model.visual.blocks.0.attn.proj.weight": "model-00001-of-00003.safetensors",
84
+ "backbone.qwen_model.model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors",
85
+ "backbone.qwen_model.model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors",
86
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
87
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
88
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
89
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
90
+ "backbone.qwen_model.model.visual.blocks.0.norm1.bias": "model-00001-of-00003.safetensors",
91
+ "backbone.qwen_model.model.visual.blocks.0.norm1.weight": "model-00001-of-00003.safetensors",
92
+ "backbone.qwen_model.model.visual.blocks.0.norm2.bias": "model-00001-of-00003.safetensors",
93
+ "backbone.qwen_model.model.visual.blocks.0.norm2.weight": "model-00001-of-00003.safetensors",
94
+ "backbone.qwen_model.model.visual.blocks.1.attn.proj.bias": "model-00001-of-00003.safetensors",
95
+ "backbone.qwen_model.model.visual.blocks.1.attn.proj.weight": "model-00001-of-00003.safetensors",
96
+ "backbone.qwen_model.model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00003.safetensors",
97
+ "backbone.qwen_model.model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00003.safetensors",
98
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
99
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
100
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
101
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
102
+ "backbone.qwen_model.model.visual.blocks.1.norm1.bias": "model-00001-of-00003.safetensors",
103
+ "backbone.qwen_model.model.visual.blocks.1.norm1.weight": "model-00001-of-00003.safetensors",
104
+ "backbone.qwen_model.model.visual.blocks.1.norm2.bias": "model-00001-of-00003.safetensors",
105
+ "backbone.qwen_model.model.visual.blocks.1.norm2.weight": "model-00001-of-00003.safetensors",
106
+ "backbone.qwen_model.model.visual.blocks.10.attn.proj.bias": "model-00001-of-00003.safetensors",
107
+ "backbone.qwen_model.model.visual.blocks.10.attn.proj.weight": "model-00001-of-00003.safetensors",
108
+ "backbone.qwen_model.model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00003.safetensors",
109
+ "backbone.qwen_model.model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00003.safetensors",
110
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
111
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
112
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
113
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
114
+ "backbone.qwen_model.model.visual.blocks.10.norm1.bias": "model-00001-of-00003.safetensors",
115
+ "backbone.qwen_model.model.visual.blocks.10.norm1.weight": "model-00001-of-00003.safetensors",
116
+ "backbone.qwen_model.model.visual.blocks.10.norm2.bias": "model-00001-of-00003.safetensors",
117
+ "backbone.qwen_model.model.visual.blocks.10.norm2.weight": "model-00001-of-00003.safetensors",
118
+ "backbone.qwen_model.model.visual.blocks.11.attn.proj.bias": "model-00001-of-00003.safetensors",
119
+ "backbone.qwen_model.model.visual.blocks.11.attn.proj.weight": "model-00001-of-00003.safetensors",
120
+ "backbone.qwen_model.model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00003.safetensors",
121
+ "backbone.qwen_model.model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00003.safetensors",
122
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
123
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
124
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
125
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
126
+ "backbone.qwen_model.model.visual.blocks.11.norm1.bias": "model-00001-of-00003.safetensors",
127
+ "backbone.qwen_model.model.visual.blocks.11.norm1.weight": "model-00001-of-00003.safetensors",
128
+ "backbone.qwen_model.model.visual.blocks.11.norm2.bias": "model-00001-of-00003.safetensors",
129
+ "backbone.qwen_model.model.visual.blocks.11.norm2.weight": "model-00001-of-00003.safetensors",
130
+ "backbone.qwen_model.model.visual.blocks.12.attn.proj.bias": "model-00001-of-00003.safetensors",
131
+ "backbone.qwen_model.model.visual.blocks.12.attn.proj.weight": "model-00001-of-00003.safetensors",
132
+ "backbone.qwen_model.model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00003.safetensors",
133
+ "backbone.qwen_model.model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00003.safetensors",
134
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
135
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
136
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
137
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
138
+ "backbone.qwen_model.model.visual.blocks.12.norm1.bias": "model-00001-of-00003.safetensors",
139
+ "backbone.qwen_model.model.visual.blocks.12.norm1.weight": "model-00001-of-00003.safetensors",
140
+ "backbone.qwen_model.model.visual.blocks.12.norm2.bias": "model-00001-of-00003.safetensors",
141
+ "backbone.qwen_model.model.visual.blocks.12.norm2.weight": "model-00001-of-00003.safetensors",
142
+ "backbone.qwen_model.model.visual.blocks.13.attn.proj.bias": "model-00001-of-00003.safetensors",
143
+ "backbone.qwen_model.model.visual.blocks.13.attn.proj.weight": "model-00001-of-00003.safetensors",
144
+ "backbone.qwen_model.model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00003.safetensors",
145
+ "backbone.qwen_model.model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00003.safetensors",
146
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
147
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
148
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
149
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
150
+ "backbone.qwen_model.model.visual.blocks.13.norm1.bias": "model-00001-of-00003.safetensors",
151
+ "backbone.qwen_model.model.visual.blocks.13.norm1.weight": "model-00001-of-00003.safetensors",
152
+ "backbone.qwen_model.model.visual.blocks.13.norm2.bias": "model-00001-of-00003.safetensors",
153
+ "backbone.qwen_model.model.visual.blocks.13.norm2.weight": "model-00001-of-00003.safetensors",
154
+ "backbone.qwen_model.model.visual.blocks.14.attn.proj.bias": "model-00001-of-00003.safetensors",
155
+ "backbone.qwen_model.model.visual.blocks.14.attn.proj.weight": "model-00001-of-00003.safetensors",
156
+ "backbone.qwen_model.model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00003.safetensors",
157
+ "backbone.qwen_model.model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00003.safetensors",
158
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
159
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
160
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
161
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
162
+ "backbone.qwen_model.model.visual.blocks.14.norm1.bias": "model-00001-of-00003.safetensors",
163
+ "backbone.qwen_model.model.visual.blocks.14.norm1.weight": "model-00001-of-00003.safetensors",
164
+ "backbone.qwen_model.model.visual.blocks.14.norm2.bias": "model-00001-of-00003.safetensors",
165
+ "backbone.qwen_model.model.visual.blocks.14.norm2.weight": "model-00001-of-00003.safetensors",
166
+ "backbone.qwen_model.model.visual.blocks.15.attn.proj.bias": "model-00001-of-00003.safetensors",
167
+ "backbone.qwen_model.model.visual.blocks.15.attn.proj.weight": "model-00001-of-00003.safetensors",
168
+ "backbone.qwen_model.model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00003.safetensors",
169
+ "backbone.qwen_model.model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00003.safetensors",
170
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
171
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
172
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
173
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
174
+ "backbone.qwen_model.model.visual.blocks.15.norm1.bias": "model-00001-of-00003.safetensors",
175
+ "backbone.qwen_model.model.visual.blocks.15.norm1.weight": "model-00001-of-00003.safetensors",
176
+ "backbone.qwen_model.model.visual.blocks.15.norm2.bias": "model-00001-of-00003.safetensors",
177
+ "backbone.qwen_model.model.visual.blocks.15.norm2.weight": "model-00001-of-00003.safetensors",
178
+ "backbone.qwen_model.model.visual.blocks.16.attn.proj.bias": "model-00001-of-00003.safetensors",
179
+ "backbone.qwen_model.model.visual.blocks.16.attn.proj.weight": "model-00001-of-00003.safetensors",
180
+ "backbone.qwen_model.model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00003.safetensors",
181
+ "backbone.qwen_model.model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00003.safetensors",
182
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
183
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
184
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
185
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
186
+ "backbone.qwen_model.model.visual.blocks.16.norm1.bias": "model-00001-of-00003.safetensors",
187
+ "backbone.qwen_model.model.visual.blocks.16.norm1.weight": "model-00001-of-00003.safetensors",
188
+ "backbone.qwen_model.model.visual.blocks.16.norm2.bias": "model-00001-of-00003.safetensors",
189
+ "backbone.qwen_model.model.visual.blocks.16.norm2.weight": "model-00001-of-00003.safetensors",
190
+ "backbone.qwen_model.model.visual.blocks.17.attn.proj.bias": "model-00001-of-00003.safetensors",
191
+ "backbone.qwen_model.model.visual.blocks.17.attn.proj.weight": "model-00001-of-00003.safetensors",
192
+ "backbone.qwen_model.model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00003.safetensors",
193
+ "backbone.qwen_model.model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00003.safetensors",
194
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
195
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
196
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
197
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
198
+ "backbone.qwen_model.model.visual.blocks.17.norm1.bias": "model-00001-of-00003.safetensors",
199
+ "backbone.qwen_model.model.visual.blocks.17.norm1.weight": "model-00001-of-00003.safetensors",
200
+ "backbone.qwen_model.model.visual.blocks.17.norm2.bias": "model-00001-of-00003.safetensors",
201
+ "backbone.qwen_model.model.visual.blocks.17.norm2.weight": "model-00001-of-00003.safetensors",
202
+ "backbone.qwen_model.model.visual.blocks.18.attn.proj.bias": "model-00001-of-00003.safetensors",
203
+ "backbone.qwen_model.model.visual.blocks.18.attn.proj.weight": "model-00001-of-00003.safetensors",
204
+ "backbone.qwen_model.model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00003.safetensors",
205
+ "backbone.qwen_model.model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00003.safetensors",
206
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
207
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
208
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
209
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
210
+ "backbone.qwen_model.model.visual.blocks.18.norm1.bias": "model-00001-of-00003.safetensors",
211
+ "backbone.qwen_model.model.visual.blocks.18.norm1.weight": "model-00001-of-00003.safetensors",
212
+ "backbone.qwen_model.model.visual.blocks.18.norm2.bias": "model-00001-of-00003.safetensors",
213
+ "backbone.qwen_model.model.visual.blocks.18.norm2.weight": "model-00001-of-00003.safetensors",
214
+ "backbone.qwen_model.model.visual.blocks.19.attn.proj.bias": "model-00001-of-00003.safetensors",
215
+ "backbone.qwen_model.model.visual.blocks.19.attn.proj.weight": "model-00001-of-00003.safetensors",
216
+ "backbone.qwen_model.model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00003.safetensors",
217
+ "backbone.qwen_model.model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00003.safetensors",
218
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
219
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
220
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
221
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
222
+ "backbone.qwen_model.model.visual.blocks.19.norm1.bias": "model-00001-of-00003.safetensors",
223
+ "backbone.qwen_model.model.visual.blocks.19.norm1.weight": "model-00001-of-00003.safetensors",
224
+ "backbone.qwen_model.model.visual.blocks.19.norm2.bias": "model-00001-of-00003.safetensors",
225
+ "backbone.qwen_model.model.visual.blocks.19.norm2.weight": "model-00001-of-00003.safetensors",
226
+ "backbone.qwen_model.model.visual.blocks.2.attn.proj.bias": "model-00001-of-00003.safetensors",
227
+ "backbone.qwen_model.model.visual.blocks.2.attn.proj.weight": "model-00001-of-00003.safetensors",
228
+ "backbone.qwen_model.model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00003.safetensors",
229
+ "backbone.qwen_model.model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00003.safetensors",
230
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
231
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
232
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
233
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
234
+ "backbone.qwen_model.model.visual.blocks.2.norm1.bias": "model-00001-of-00003.safetensors",
235
+ "backbone.qwen_model.model.visual.blocks.2.norm1.weight": "model-00001-of-00003.safetensors",
236
+ "backbone.qwen_model.model.visual.blocks.2.norm2.bias": "model-00001-of-00003.safetensors",
237
+ "backbone.qwen_model.model.visual.blocks.2.norm2.weight": "model-00001-of-00003.safetensors",
238
+ "backbone.qwen_model.model.visual.blocks.20.attn.proj.bias": "model-00001-of-00003.safetensors",
239
+ "backbone.qwen_model.model.visual.blocks.20.attn.proj.weight": "model-00001-of-00003.safetensors",
240
+ "backbone.qwen_model.model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00003.safetensors",
241
+ "backbone.qwen_model.model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00003.safetensors",
242
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
243
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
244
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
245
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
246
+ "backbone.qwen_model.model.visual.blocks.20.norm1.bias": "model-00001-of-00003.safetensors",
247
+ "backbone.qwen_model.model.visual.blocks.20.norm1.weight": "model-00001-of-00003.safetensors",
248
+ "backbone.qwen_model.model.visual.blocks.20.norm2.bias": "model-00001-of-00003.safetensors",
249
+ "backbone.qwen_model.model.visual.blocks.20.norm2.weight": "model-00001-of-00003.safetensors",
250
+ "backbone.qwen_model.model.visual.blocks.21.attn.proj.bias": "model-00001-of-00003.safetensors",
251
+ "backbone.qwen_model.model.visual.blocks.21.attn.proj.weight": "model-00001-of-00003.safetensors",
252
+ "backbone.qwen_model.model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00003.safetensors",
253
+ "backbone.qwen_model.model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00003.safetensors",
254
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
255
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
256
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
257
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
258
+ "backbone.qwen_model.model.visual.blocks.21.norm1.bias": "model-00001-of-00003.safetensors",
259
+ "backbone.qwen_model.model.visual.blocks.21.norm1.weight": "model-00001-of-00003.safetensors",
260
+ "backbone.qwen_model.model.visual.blocks.21.norm2.bias": "model-00001-of-00003.safetensors",
261
+ "backbone.qwen_model.model.visual.blocks.21.norm2.weight": "model-00001-of-00003.safetensors",
262
+ "backbone.qwen_model.model.visual.blocks.22.attn.proj.bias": "model-00001-of-00003.safetensors",
263
+ "backbone.qwen_model.model.visual.blocks.22.attn.proj.weight": "model-00001-of-00003.safetensors",
264
+ "backbone.qwen_model.model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00003.safetensors",
265
+ "backbone.qwen_model.model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00003.safetensors",
266
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
267
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
268
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
269
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
270
+ "backbone.qwen_model.model.visual.blocks.22.norm1.bias": "model-00001-of-00003.safetensors",
271
+ "backbone.qwen_model.model.visual.blocks.22.norm1.weight": "model-00001-of-00003.safetensors",
272
+ "backbone.qwen_model.model.visual.blocks.22.norm2.bias": "model-00001-of-00003.safetensors",
273
+ "backbone.qwen_model.model.visual.blocks.22.norm2.weight": "model-00001-of-00003.safetensors",
274
+ "backbone.qwen_model.model.visual.blocks.23.attn.proj.bias": "model-00001-of-00003.safetensors",
275
+ "backbone.qwen_model.model.visual.blocks.23.attn.proj.weight": "model-00001-of-00003.safetensors",
276
+ "backbone.qwen_model.model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00003.safetensors",
277
+ "backbone.qwen_model.model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00003.safetensors",
278
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
279
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
280
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
281
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
282
+ "backbone.qwen_model.model.visual.blocks.23.norm1.bias": "model-00001-of-00003.safetensors",
283
+ "backbone.qwen_model.model.visual.blocks.23.norm1.weight": "model-00001-of-00003.safetensors",
284
+ "backbone.qwen_model.model.visual.blocks.23.norm2.bias": "model-00001-of-00003.safetensors",
285
+ "backbone.qwen_model.model.visual.blocks.23.norm2.weight": "model-00001-of-00003.safetensors",
286
+ "backbone.qwen_model.model.visual.blocks.24.attn.proj.bias": "model-00001-of-00003.safetensors",
287
+ "backbone.qwen_model.model.visual.blocks.24.attn.proj.weight": "model-00001-of-00003.safetensors",
288
+ "backbone.qwen_model.model.visual.blocks.24.attn.qkv.bias": "model-00001-of-00003.safetensors",
289
+ "backbone.qwen_model.model.visual.blocks.24.attn.qkv.weight": "model-00001-of-00003.safetensors",
290
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
291
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
292
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
293
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
294
+ "backbone.qwen_model.model.visual.blocks.24.norm1.bias": "model-00001-of-00003.safetensors",
295
+ "backbone.qwen_model.model.visual.blocks.24.norm1.weight": "model-00001-of-00003.safetensors",
296
+ "backbone.qwen_model.model.visual.blocks.24.norm2.bias": "model-00001-of-00003.safetensors",
297
+ "backbone.qwen_model.model.visual.blocks.24.norm2.weight": "model-00001-of-00003.safetensors",
298
+ "backbone.qwen_model.model.visual.blocks.25.attn.proj.bias": "model-00001-of-00003.safetensors",
299
+ "backbone.qwen_model.model.visual.blocks.25.attn.proj.weight": "model-00001-of-00003.safetensors",
300
+ "backbone.qwen_model.model.visual.blocks.25.attn.qkv.bias": "model-00001-of-00003.safetensors",
301
+ "backbone.qwen_model.model.visual.blocks.25.attn.qkv.weight": "model-00001-of-00003.safetensors",
302
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
303
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
304
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
305
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
306
+ "backbone.qwen_model.model.visual.blocks.25.norm1.bias": "model-00001-of-00003.safetensors",
307
+ "backbone.qwen_model.model.visual.blocks.25.norm1.weight": "model-00001-of-00003.safetensors",
308
+ "backbone.qwen_model.model.visual.blocks.25.norm2.bias": "model-00001-of-00003.safetensors",
309
+ "backbone.qwen_model.model.visual.blocks.25.norm2.weight": "model-00001-of-00003.safetensors",
310
+ "backbone.qwen_model.model.visual.blocks.26.attn.proj.bias": "model-00001-of-00003.safetensors",
311
+ "backbone.qwen_model.model.visual.blocks.26.attn.proj.weight": "model-00001-of-00003.safetensors",
312
+ "backbone.qwen_model.model.visual.blocks.26.attn.qkv.bias": "model-00001-of-00003.safetensors",
313
+ "backbone.qwen_model.model.visual.blocks.26.attn.qkv.weight": "model-00001-of-00003.safetensors",
314
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
315
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
316
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
317
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
318
+ "backbone.qwen_model.model.visual.blocks.26.norm1.bias": "model-00001-of-00003.safetensors",
319
+ "backbone.qwen_model.model.visual.blocks.26.norm1.weight": "model-00001-of-00003.safetensors",
320
+ "backbone.qwen_model.model.visual.blocks.26.norm2.bias": "model-00001-of-00003.safetensors",
321
+ "backbone.qwen_model.model.visual.blocks.26.norm2.weight": "model-00001-of-00003.safetensors",
322
+ "backbone.qwen_model.model.visual.blocks.3.attn.proj.bias": "model-00001-of-00003.safetensors",
323
+ "backbone.qwen_model.model.visual.blocks.3.attn.proj.weight": "model-00001-of-00003.safetensors",
324
+ "backbone.qwen_model.model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00003.safetensors",
325
+ "backbone.qwen_model.model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00003.safetensors",
326
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
327
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
328
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
329
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
330
+ "backbone.qwen_model.model.visual.blocks.3.norm1.bias": "model-00001-of-00003.safetensors",
331
+ "backbone.qwen_model.model.visual.blocks.3.norm1.weight": "model-00001-of-00003.safetensors",
332
+ "backbone.qwen_model.model.visual.blocks.3.norm2.bias": "model-00001-of-00003.safetensors",
333
+ "backbone.qwen_model.model.visual.blocks.3.norm2.weight": "model-00001-of-00003.safetensors",
334
+ "backbone.qwen_model.model.visual.blocks.4.attn.proj.bias": "model-00001-of-00003.safetensors",
335
+ "backbone.qwen_model.model.visual.blocks.4.attn.proj.weight": "model-00001-of-00003.safetensors",
336
+ "backbone.qwen_model.model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00003.safetensors",
337
+ "backbone.qwen_model.model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00003.safetensors",
338
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
339
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
340
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
341
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
342
+ "backbone.qwen_model.model.visual.blocks.4.norm1.bias": "model-00001-of-00003.safetensors",
343
+ "backbone.qwen_model.model.visual.blocks.4.norm1.weight": "model-00001-of-00003.safetensors",
344
+ "backbone.qwen_model.model.visual.blocks.4.norm2.bias": "model-00001-of-00003.safetensors",
345
+ "backbone.qwen_model.model.visual.blocks.4.norm2.weight": "model-00001-of-00003.safetensors",
346
+ "backbone.qwen_model.model.visual.blocks.5.attn.proj.bias": "model-00001-of-00003.safetensors",
347
+ "backbone.qwen_model.model.visual.blocks.5.attn.proj.weight": "model-00001-of-00003.safetensors",
348
+ "backbone.qwen_model.model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00003.safetensors",
349
+ "backbone.qwen_model.model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00003.safetensors",
350
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
351
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
352
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
353
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
354
+ "backbone.qwen_model.model.visual.blocks.5.norm1.bias": "model-00001-of-00003.safetensors",
355
+ "backbone.qwen_model.model.visual.blocks.5.norm1.weight": "model-00001-of-00003.safetensors",
356
+ "backbone.qwen_model.model.visual.blocks.5.norm2.bias": "model-00001-of-00003.safetensors",
357
+ "backbone.qwen_model.model.visual.blocks.5.norm2.weight": "model-00001-of-00003.safetensors",
358
+ "backbone.qwen_model.model.visual.blocks.6.attn.proj.bias": "model-00001-of-00003.safetensors",
359
+ "backbone.qwen_model.model.visual.blocks.6.attn.proj.weight": "model-00001-of-00003.safetensors",
360
+ "backbone.qwen_model.model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00003.safetensors",
361
+ "backbone.qwen_model.model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00003.safetensors",
362
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
363
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
364
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
365
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
366
+ "backbone.qwen_model.model.visual.blocks.6.norm1.bias": "model-00001-of-00003.safetensors",
367
+ "backbone.qwen_model.model.visual.blocks.6.norm1.weight": "model-00001-of-00003.safetensors",
368
+ "backbone.qwen_model.model.visual.blocks.6.norm2.bias": "model-00001-of-00003.safetensors",
369
+ "backbone.qwen_model.model.visual.blocks.6.norm2.weight": "model-00001-of-00003.safetensors",
370
+ "backbone.qwen_model.model.visual.blocks.7.attn.proj.bias": "model-00001-of-00003.safetensors",
371
+ "backbone.qwen_model.model.visual.blocks.7.attn.proj.weight": "model-00001-of-00003.safetensors",
372
+ "backbone.qwen_model.model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00003.safetensors",
373
+ "backbone.qwen_model.model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00003.safetensors",
374
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
375
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
376
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
377
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
378
+ "backbone.qwen_model.model.visual.blocks.7.norm1.bias": "model-00001-of-00003.safetensors",
379
+ "backbone.qwen_model.model.visual.blocks.7.norm1.weight": "model-00001-of-00003.safetensors",
380
+ "backbone.qwen_model.model.visual.blocks.7.norm2.bias": "model-00001-of-00003.safetensors",
381
+ "backbone.qwen_model.model.visual.blocks.7.norm2.weight": "model-00001-of-00003.safetensors",
382
+ "backbone.qwen_model.model.visual.blocks.8.attn.proj.bias": "model-00001-of-00003.safetensors",
383
+ "backbone.qwen_model.model.visual.blocks.8.attn.proj.weight": "model-00001-of-00003.safetensors",
384
+ "backbone.qwen_model.model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00003.safetensors",
385
+ "backbone.qwen_model.model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00003.safetensors",
386
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
387
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
388
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
389
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
390
+ "backbone.qwen_model.model.visual.blocks.8.norm1.bias": "model-00001-of-00003.safetensors",
391
+ "backbone.qwen_model.model.visual.blocks.8.norm1.weight": "model-00001-of-00003.safetensors",
392
+ "backbone.qwen_model.model.visual.blocks.8.norm2.bias": "model-00001-of-00003.safetensors",
393
+ "backbone.qwen_model.model.visual.blocks.8.norm2.weight": "model-00001-of-00003.safetensors",
394
+ "backbone.qwen_model.model.visual.blocks.9.attn.proj.bias": "model-00001-of-00003.safetensors",
395
+ "backbone.qwen_model.model.visual.blocks.9.attn.proj.weight": "model-00001-of-00003.safetensors",
396
+ "backbone.qwen_model.model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00003.safetensors",
397
+ "backbone.qwen_model.model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00003.safetensors",
398
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
399
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
400
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
401
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
402
+ "backbone.qwen_model.model.visual.blocks.9.norm1.bias": "model-00001-of-00003.safetensors",
403
+ "backbone.qwen_model.model.visual.blocks.9.norm1.weight": "model-00001-of-00003.safetensors",
404
+ "backbone.qwen_model.model.visual.blocks.9.norm2.bias": "model-00001-of-00003.safetensors",
405
+ "backbone.qwen_model.model.visual.blocks.9.norm2.weight": "model-00001-of-00003.safetensors",
406
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00003.safetensors",
407
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00003.safetensors",
408
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00003.safetensors",
409
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00003.safetensors",
410
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00003.safetensors",
411
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00003.safetensors",
412
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00003.safetensors",
413
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00003.safetensors",
414
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00003.safetensors",
415
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00003.safetensors",
416
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00003.safetensors",
417
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00003.safetensors",
418
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00003.safetensors",
419
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00003.safetensors",
420
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00003.safetensors",
421
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00003.safetensors",
422
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00003.safetensors",
423
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00003.safetensors",
424
+ "backbone.qwen_model.model.visual.merger.linear_fc1.bias": "model-00001-of-00003.safetensors",
425
+ "backbone.qwen_model.model.visual.merger.linear_fc1.weight": "model-00001-of-00003.safetensors",
426
+ "backbone.qwen_model.model.visual.merger.linear_fc2.bias": "model-00001-of-00003.safetensors",
427
+ "backbone.qwen_model.model.visual.merger.linear_fc2.weight": "model-00001-of-00003.safetensors",
428
+ "backbone.qwen_model.model.visual.merger.norm.bias": "model-00001-of-00003.safetensors",
429
+ "backbone.qwen_model.model.visual.merger.norm.weight": "model-00001-of-00003.safetensors",
430
+ "backbone.qwen_model.model.visual.patch_embed.proj.bias": "model-00001-of-00003.safetensors",
431
+ "backbone.qwen_model.model.visual.patch_embed.proj.weight": "model-00001-of-00003.safetensors",
432
+ "backbone.qwen_model.model.visual.pos_embed.weight": "model-00001-of-00003.safetensors",
433
+ "backbone.qwen_model.model.language_model.layers.10.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
434
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
435
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
436
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
437
+ "backbone.qwen_model.model.language_model.layers.10.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
438
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
439
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
440
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
441
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
442
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
443
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
444
+ "backbone.qwen_model.model.language_model.layers.11.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
445
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
446
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
447
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
448
+ "backbone.qwen_model.model.language_model.layers.11.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
449
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
450
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
451
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
452
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
453
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
454
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
455
+ "backbone.qwen_model.model.language_model.layers.12.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
456
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
457
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
458
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
459
+ "backbone.qwen_model.model.language_model.layers.12.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
460
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
461
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
462
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
463
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
464
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
465
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
466
+ "backbone.qwen_model.model.language_model.layers.13.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
467
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
468
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
469
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
470
+ "backbone.qwen_model.model.language_model.layers.13.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
471
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
472
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
473
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
474
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
475
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
476
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
477
+ "backbone.qwen_model.model.language_model.layers.14.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
478
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
479
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
480
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
481
+ "backbone.qwen_model.model.language_model.layers.14.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
482
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
483
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
484
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
485
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
486
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
487
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
488
+ "backbone.qwen_model.model.language_model.layers.15.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
489
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
490
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
491
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
492
+ "backbone.qwen_model.model.language_model.layers.15.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
493
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
494
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
495
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
496
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
497
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
498
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
499
+ "backbone.qwen_model.model.language_model.layers.16.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
500
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
501
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
502
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
503
+ "backbone.qwen_model.model.language_model.layers.16.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
504
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
505
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
506
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
507
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
508
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
509
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
510
+ "backbone.qwen_model.model.language_model.layers.17.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
511
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
512
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
513
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
514
+ "backbone.qwen_model.model.language_model.layers.17.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
515
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
516
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
517
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
518
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
519
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
520
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
521
+ "backbone.qwen_model.model.language_model.layers.6.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
522
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
523
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
524
+ "backbone.qwen_model.model.language_model.layers.6.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
525
+ "backbone.qwen_model.model.language_model.layers.7.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
526
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
527
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
528
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
529
+ "backbone.qwen_model.model.language_model.layers.7.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
530
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
531
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
532
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
533
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
534
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
535
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
536
+ "backbone.qwen_model.model.language_model.layers.8.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
537
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
538
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
539
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
540
+ "backbone.qwen_model.model.language_model.layers.8.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
541
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
542
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
543
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
544
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
545
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
546
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
547
+ "backbone.qwen_model.model.language_model.layers.9.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
548
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
549
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
550
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
551
+ "backbone.qwen_model.model.language_model.layers.9.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
552
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
553
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
554
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
555
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
556
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
557
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
558
+ "backbone.qwen_model.model.language_model.norm.weight": "model-00002-of-00003.safetensors",
559
+ "action_model.action_decoder.layer1.W": "model-00003-of-00003.safetensors",
560
+ "action_model.action_decoder.layer1.b": "model-00003-of-00003.safetensors",
561
+ "action_model.action_decoder.layer2.W": "model-00003-of-00003.safetensors",
562
+ "action_model.action_decoder.layer2.b": "model-00003-of-00003.safetensors",
563
+ "action_model.action_encoder.W1.W": "model-00003-of-00003.safetensors",
564
+ "action_model.action_encoder.W1.b": "model-00003-of-00003.safetensors",
565
+ "action_model.action_encoder.W2.W": "model-00003-of-00003.safetensors",
566
+ "action_model.action_encoder.W2.b": "model-00003-of-00003.safetensors",
567
+ "action_model.action_encoder.W3.W": "model-00003-of-00003.safetensors",
568
+ "action_model.action_encoder.W3.b": "model-00003-of-00003.safetensors",
569
+ "action_model.model.double_blocks.0.k_norm_sa.weight": "model-00003-of-00003.safetensors",
570
+ "action_model.model.double_blocks.0.k_norm_vl.weight": "model-00003-of-00003.safetensors",
571
+ "action_model.model.double_blocks.0.q_norm_sa.weight": "model-00003-of-00003.safetensors",
572
+ "action_model.model.double_blocks.0.q_norm_vl.weight": "model-00003-of-00003.safetensors",
573
+ "action_model.model.double_blocks.0.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
574
+ "action_model.model.double_blocks.0.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
575
+ "action_model.model.double_blocks.0.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
576
+ "action_model.model.double_blocks.0.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
577
+ "action_model.model.double_blocks.0.sa_proj.bias": "model-00003-of-00003.safetensors",
578
+ "action_model.model.double_blocks.0.sa_proj.weight": "model-00003-of-00003.safetensors",
579
+ "action_model.model.double_blocks.0.sa_qkv.bias": "model-00003-of-00003.safetensors",
580
+ "action_model.model.double_blocks.0.sa_qkv.weight": "model-00003-of-00003.safetensors",
581
+ "action_model.model.double_blocks.0.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
582
+ "action_model.model.double_blocks.0.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
583
+ "action_model.model.double_blocks.0.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
584
+ "action_model.model.double_blocks.0.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
585
+ "action_model.model.double_blocks.0.vl_proj.bias": "model-00003-of-00003.safetensors",
586
+ "action_model.model.double_blocks.0.vl_proj.weight": "model-00003-of-00003.safetensors",
587
+ "action_model.model.double_blocks.0.vl_qkv.bias": "model-00003-of-00003.safetensors",
588
+ "action_model.model.double_blocks.0.vl_qkv.weight": "model-00003-of-00003.safetensors",
589
+ "action_model.model.double_blocks.1.k_norm_sa.weight": "model-00003-of-00003.safetensors",
590
+ "action_model.model.double_blocks.1.k_norm_vl.weight": "model-00003-of-00003.safetensors",
591
+ "action_model.model.double_blocks.1.q_norm_sa.weight": "model-00003-of-00003.safetensors",
592
+ "action_model.model.double_blocks.1.q_norm_vl.weight": "model-00003-of-00003.safetensors",
593
+ "action_model.model.double_blocks.1.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
594
+ "action_model.model.double_blocks.1.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
595
+ "action_model.model.double_blocks.1.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
596
+ "action_model.model.double_blocks.1.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
597
+ "action_model.model.double_blocks.1.sa_proj.bias": "model-00003-of-00003.safetensors",
598
+ "action_model.model.double_blocks.1.sa_proj.weight": "model-00003-of-00003.safetensors",
599
+ "action_model.model.double_blocks.1.sa_qkv.bias": "model-00003-of-00003.safetensors",
600
+ "action_model.model.double_blocks.1.sa_qkv.weight": "model-00003-of-00003.safetensors",
601
+ "action_model.model.double_blocks.1.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
602
+ "action_model.model.double_blocks.1.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
603
+ "action_model.model.double_blocks.1.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
604
+ "action_model.model.double_blocks.1.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
605
+ "action_model.model.double_blocks.1.vl_proj.bias": "model-00003-of-00003.safetensors",
606
+ "action_model.model.double_blocks.1.vl_proj.weight": "model-00003-of-00003.safetensors",
607
+ "action_model.model.double_blocks.1.vl_qkv.bias": "model-00003-of-00003.safetensors",
608
+ "action_model.model.double_blocks.1.vl_qkv.weight": "model-00003-of-00003.safetensors",
609
+ "action_model.model.double_blocks.2.k_norm_sa.weight": "model-00003-of-00003.safetensors",
610
+ "action_model.model.double_blocks.2.k_norm_vl.weight": "model-00003-of-00003.safetensors",
611
+ "action_model.model.double_blocks.2.q_norm_sa.weight": "model-00003-of-00003.safetensors",
612
+ "action_model.model.double_blocks.2.q_norm_vl.weight": "model-00003-of-00003.safetensors",
613
+ "action_model.model.double_blocks.2.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
614
+ "action_model.model.double_blocks.2.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
615
+ "action_model.model.double_blocks.2.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
616
+ "action_model.model.double_blocks.2.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
617
+ "action_model.model.double_blocks.2.sa_proj.bias": "model-00003-of-00003.safetensors",
618
+ "action_model.model.double_blocks.2.sa_proj.weight": "model-00003-of-00003.safetensors",
619
+ "action_model.model.double_blocks.2.sa_qkv.bias": "model-00003-of-00003.safetensors",
620
+ "action_model.model.double_blocks.2.sa_qkv.weight": "model-00003-of-00003.safetensors",
621
+ "action_model.model.double_blocks.2.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
622
+ "action_model.model.double_blocks.2.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
623
+ "action_model.model.double_blocks.2.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
624
+ "action_model.model.double_blocks.2.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
625
+ "action_model.model.double_blocks.2.vl_proj.bias": "model-00003-of-00003.safetensors",
626
+ "action_model.model.double_blocks.2.vl_proj.weight": "model-00003-of-00003.safetensors",
627
+ "action_model.model.double_blocks.2.vl_qkv.bias": "model-00003-of-00003.safetensors",
628
+ "action_model.model.double_blocks.2.vl_qkv.weight": "model-00003-of-00003.safetensors",
629
+ "action_model.model.double_blocks.3.k_norm_sa.weight": "model-00003-of-00003.safetensors",
630
+ "action_model.model.double_blocks.3.k_norm_vl.weight": "model-00003-of-00003.safetensors",
631
+ "action_model.model.double_blocks.3.q_norm_sa.weight": "model-00003-of-00003.safetensors",
632
+ "action_model.model.double_blocks.3.q_norm_vl.weight": "model-00003-of-00003.safetensors",
633
+ "action_model.model.double_blocks.3.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
634
+ "action_model.model.double_blocks.3.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
635
+ "action_model.model.double_blocks.3.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
636
+ "action_model.model.double_blocks.3.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
637
+ "action_model.model.double_blocks.3.sa_proj.bias": "model-00003-of-00003.safetensors",
638
+ "action_model.model.double_blocks.3.sa_proj.weight": "model-00003-of-00003.safetensors",
639
+ "action_model.model.double_blocks.3.sa_qkv.bias": "model-00003-of-00003.safetensors",
640
+ "action_model.model.double_blocks.3.sa_qkv.weight": "model-00003-of-00003.safetensors",
641
+ "action_model.model.double_blocks.3.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
642
+ "action_model.model.double_blocks.3.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
643
+ "action_model.model.double_blocks.3.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
644
+ "action_model.model.double_blocks.3.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
645
+ "action_model.model.double_blocks.3.vl_proj.bias": "model-00003-of-00003.safetensors",
646
+ "action_model.model.double_blocks.3.vl_proj.weight": "model-00003-of-00003.safetensors",
647
+ "action_model.model.double_blocks.3.vl_qkv.bias": "model-00003-of-00003.safetensors",
648
+ "action_model.model.double_blocks.3.vl_qkv.weight": "model-00003-of-00003.safetensors",
649
+ "action_model.model.proj_out_1.bias": "model-00003-of-00003.safetensors",
650
+ "action_model.model.proj_out_1.weight": "model-00003-of-00003.safetensors",
651
+ "action_model.model.proj_out_2.bias": "model-00003-of-00003.safetensors",
652
+ "action_model.model.proj_out_2.weight": "model-00003-of-00003.safetensors",
653
+ "action_model.model.single_blocks.0.k_norm.weight": "model-00003-of-00003.safetensors",
654
+ "action_model.model.single_blocks.0.linear1.bias": "model-00003-of-00003.safetensors",
655
+ "action_model.model.single_blocks.0.linear1.weight": "model-00003-of-00003.safetensors",
656
+ "action_model.model.single_blocks.0.linear2.bias": "model-00003-of-00003.safetensors",
657
+ "action_model.model.single_blocks.0.linear2.weight": "model-00003-of-00003.safetensors",
658
+ "action_model.model.single_blocks.0.mlp_proj.bias": "model-00003-of-00003.safetensors",
659
+ "action_model.model.single_blocks.0.mlp_proj.weight": "model-00003-of-00003.safetensors",
660
+ "action_model.model.single_blocks.0.q_norm.weight": "model-00003-of-00003.safetensors",
661
+ "action_model.model.single_blocks.1.k_norm.weight": "model-00003-of-00003.safetensors",
662
+ "action_model.model.single_blocks.1.linear1.bias": "model-00003-of-00003.safetensors",
663
+ "action_model.model.single_blocks.1.linear1.weight": "model-00003-of-00003.safetensors",
664
+ "action_model.model.single_blocks.1.linear2.bias": "model-00003-of-00003.safetensors",
665
+ "action_model.model.single_blocks.1.linear2.weight": "model-00003-of-00003.safetensors",
666
+ "action_model.model.single_blocks.1.mlp_proj.bias": "model-00003-of-00003.safetensors",
667
+ "action_model.model.single_blocks.1.mlp_proj.weight": "model-00003-of-00003.safetensors",
668
+ "action_model.model.single_blocks.1.q_norm.weight": "model-00003-of-00003.safetensors",
669
+ "action_model.model.single_blocks.2.k_norm.weight": "model-00003-of-00003.safetensors",
670
+ "action_model.model.single_blocks.2.linear1.bias": "model-00003-of-00003.safetensors",
671
+ "action_model.model.single_blocks.2.linear1.weight": "model-00003-of-00003.safetensors",
672
+ "action_model.model.single_blocks.2.linear2.bias": "model-00003-of-00003.safetensors",
673
+ "action_model.model.single_blocks.2.linear2.weight": "model-00003-of-00003.safetensors",
674
+ "action_model.model.single_blocks.2.mlp_proj.bias": "model-00003-of-00003.safetensors",
675
+ "action_model.model.single_blocks.2.mlp_proj.weight": "model-00003-of-00003.safetensors",
676
+ "action_model.model.single_blocks.2.q_norm.weight": "model-00003-of-00003.safetensors",
677
+ "action_model.model.single_blocks.3.k_norm.weight": "model-00003-of-00003.safetensors",
678
+ "action_model.model.single_blocks.3.linear1.bias": "model-00003-of-00003.safetensors",
679
+ "action_model.model.single_blocks.3.linear1.weight": "model-00003-of-00003.safetensors",
680
+ "action_model.model.single_blocks.3.linear2.bias": "model-00003-of-00003.safetensors",
681
+ "action_model.model.single_blocks.3.linear2.weight": "model-00003-of-00003.safetensors",
682
+ "action_model.model.single_blocks.3.mlp_proj.bias": "model-00003-of-00003.safetensors",
683
+ "action_model.model.single_blocks.3.mlp_proj.weight": "model-00003-of-00003.safetensors",
684
+ "action_model.model.single_blocks.3.q_norm.weight": "model-00003-of-00003.safetensors",
685
+ "action_model.model.single_blocks.4.k_norm.weight": "model-00003-of-00003.safetensors",
686
+ "action_model.model.single_blocks.4.linear1.bias": "model-00003-of-00003.safetensors",
687
+ "action_model.model.single_blocks.4.linear1.weight": "model-00003-of-00003.safetensors",
688
+ "action_model.model.single_blocks.4.linear2.bias": "model-00003-of-00003.safetensors",
689
+ "action_model.model.single_blocks.4.linear2.weight": "model-00003-of-00003.safetensors",
690
+ "action_model.model.single_blocks.4.mlp_proj.bias": "model-00003-of-00003.safetensors",
691
+ "action_model.model.single_blocks.4.mlp_proj.weight": "model-00003-of-00003.safetensors",
692
+ "action_model.model.single_blocks.4.q_norm.weight": "model-00003-of-00003.safetensors",
693
+ "action_model.model.single_blocks.5.k_norm.weight": "model-00003-of-00003.safetensors",
694
+ "action_model.model.single_blocks.5.linear1.bias": "model-00003-of-00003.safetensors",
695
+ "action_model.model.single_blocks.5.linear1.weight": "model-00003-of-00003.safetensors",
696
+ "action_model.model.single_blocks.5.linear2.bias": "model-00003-of-00003.safetensors",
697
+ "action_model.model.single_blocks.5.linear2.weight": "model-00003-of-00003.safetensors",
698
+ "action_model.model.single_blocks.5.mlp_proj.bias": "model-00003-of-00003.safetensors",
699
+ "action_model.model.single_blocks.5.mlp_proj.weight": "model-00003-of-00003.safetensors",
700
+ "action_model.model.single_blocks.5.q_norm.weight": "model-00003-of-00003.safetensors",
701
+ "action_model.model.single_blocks.6.k_norm.weight": "model-00003-of-00003.safetensors",
702
+ "action_model.model.single_blocks.6.linear1.bias": "model-00003-of-00003.safetensors",
703
+ "action_model.model.single_blocks.6.linear1.weight": "model-00003-of-00003.safetensors",
704
+ "action_model.model.single_blocks.6.linear2.bias": "model-00003-of-00003.safetensors",
705
+ "action_model.model.single_blocks.6.linear2.weight": "model-00003-of-00003.safetensors",
706
+ "action_model.model.single_blocks.6.mlp_proj.bias": "model-00003-of-00003.safetensors",
707
+ "action_model.model.single_blocks.6.mlp_proj.weight": "model-00003-of-00003.safetensors",
708
+ "action_model.model.single_blocks.6.q_norm.weight": "model-00003-of-00003.safetensors",
709
+ "action_model.model.single_blocks.7.k_norm.weight": "model-00003-of-00003.safetensors",
710
+ "action_model.model.single_blocks.7.linear1.bias": "model-00003-of-00003.safetensors",
711
+ "action_model.model.single_blocks.7.linear1.weight": "model-00003-of-00003.safetensors",
712
+ "action_model.model.single_blocks.7.linear2.bias": "model-00003-of-00003.safetensors",
713
+ "action_model.model.single_blocks.7.linear2.weight": "model-00003-of-00003.safetensors",
714
+ "action_model.model.single_blocks.7.mlp_proj.bias": "model-00003-of-00003.safetensors",
715
+ "action_model.model.single_blocks.7.mlp_proj.weight": "model-00003-of-00003.safetensors",
716
+ "action_model.model.single_blocks.7.q_norm.weight": "model-00003-of-00003.safetensors",
717
+ "action_model.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00003-of-00003.safetensors",
718
+ "action_model.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00003-of-00003.safetensors",
719
+ "action_model.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00003-of-00003.safetensors",
720
+ "action_model.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00003-of-00003.safetensors",
721
+ "action_model.model.vl_proj_to_sa.bias": "model-00003-of-00003.safetensors",
722
+ "action_model.model.vl_proj_to_sa.weight": "model-00003-of-00003.safetensors",
723
+ "action_model.position_embedding.weight": "model-00003-of-00003.safetensors",
724
+ "action_model.state_encoder.layer1.W": "model-00003-of-00003.safetensors",
725
+ "action_model.state_encoder.layer1.b": "model-00003-of-00003.safetensors",
726
+ "action_model.state_encoder.layer2.W": "model-00003-of-00003.safetensors",
727
+ "action_model.state_encoder.layer2.b": "model-00003-of-00003.safetensors",
728
+ "backbone.qwen_model.lm_head.weight": "model-00003-of-00003.safetensors"
729
+ }
730
+ }
processor/embodiment_id.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "general_embodiment": 0,
3
+ "fractal20220817_data": 1,
4
+ "kuka": 2,
5
+ "bridge_orig": 3,
6
+ "taco_play": 4,
7
+ "jaco_play": 5,
8
+ "berkeley_cable_routing": 6,
9
+ "roboturk": 7,
10
+ "viola": 8,
11
+ "berkeley_autolab_ur5": 9,
12
+ "toto": 10,
13
+ "language_table": 11,
14
+ "stanford_hydra_dataset_converted_externally_to_rlds": 12,
15
+ "austin_buds_dataset_converted_externally_to_rlds": 13,
16
+ "nyu_franka_play_dataset_converted_externally_to_rlds": 14,
17
+ "furniture_bench_dataset_converted_externally_to_rlds": 15,
18
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": 16,
19
+ "austin_sailor_dataset_converted_externally_to_rlds": 17,
20
+ "austin_sirius_dataset_converted_externally_to_rlds": 18,
21
+ "dlr_edan_shared_control_converted_externally_to_rlds": 19,
22
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
23
+ "utaustin_mutex": 21,
24
+ "berkeley_fanuc_manipulation": 22,
25
+ "cmu_stretch": 23,
26
+ "bc_z": 24,
27
+ "fmb_dataset": 25,
28
+ "dobbe": 26,
29
+ "droid": 27,
30
+ "agibot_dexhand": 28,
31
+ "agibot_gripper": 29,
32
+ "galaxea": 30,
33
+ "humanoid_everyday_g1": 31,
34
+ "humanoid_everyday_h1": 32,
35
+ "action_net": 33,
36
+ "neural_gr1": 34,
37
+ "new_embodiment": 35
38
+ }
processor/processor_config.json ADDED
@@ -0,0 +1,2976 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "RLDXProcessor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "bridge_orig": {
6
+ "video": {
7
+ "delta_indices": [
8
+ -6,
9
+ -4,
10
+ -2,
11
+ 0
12
+ ],
13
+ "modality_keys": [
14
+ "primary",
15
+ "secondary"
16
+ ],
17
+ "sin_cos_embedding_keys": null,
18
+ "mean_std_embedding_keys": null,
19
+ "action_configs": null
20
+ },
21
+ "state": {
22
+ "delta_indices": [
23
+ 0
24
+ ],
25
+ "modality_keys": [
26
+ "end_effector_position",
27
+ "end_effector_rotation",
28
+ "gripper_position"
29
+ ],
30
+ "sin_cos_embedding_keys": null,
31
+ "mean_std_embedding_keys": null,
32
+ "action_configs": null
33
+ },
34
+ "action": {
35
+ "delta_indices": [
36
+ 0,
37
+ 1,
38
+ 2,
39
+ 3,
40
+ 4,
41
+ 5,
42
+ 6,
43
+ 7,
44
+ 8,
45
+ 9,
46
+ 10,
47
+ 11,
48
+ 12,
49
+ 13,
50
+ 14,
51
+ 15
52
+ ],
53
+ "modality_keys": [
54
+ "end_effector_position",
55
+ "end_effector_rotation",
56
+ "gripper_close"
57
+ ],
58
+ "sin_cos_embedding_keys": null,
59
+ "mean_std_embedding_keys": null,
60
+ "action_configs": [
61
+ {
62
+ "rep": "DELTA",
63
+ "type": "EEF",
64
+ "format": "DEFAULT",
65
+ "state_key": null
66
+ },
67
+ {
68
+ "rep": "DELTA",
69
+ "type": "EEF",
70
+ "format": "DEFAULT",
71
+ "state_key": null
72
+ },
73
+ {
74
+ "rep": "ABSOLUTE",
75
+ "type": "NON_EEF",
76
+ "format": "DEFAULT",
77
+ "state_key": null
78
+ }
79
+ ]
80
+ },
81
+ "language": {
82
+ "delta_indices": [
83
+ 0
84
+ ],
85
+ "modality_keys": [
86
+ "annotation.human.action.task_description"
87
+ ],
88
+ "sin_cos_embedding_keys": null,
89
+ "mean_std_embedding_keys": null,
90
+ "action_configs": null
91
+ }
92
+ },
93
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
94
+ "video": {
95
+ "delta_indices": [
96
+ -6,
97
+ -4,
98
+ -2,
99
+ 0
100
+ ],
101
+ "modality_keys": [
102
+ "primary",
103
+ "wrist"
104
+ ],
105
+ "sin_cos_embedding_keys": null,
106
+ "mean_std_embedding_keys": null,
107
+ "action_configs": null
108
+ },
109
+ "state": {
110
+ "delta_indices": [
111
+ 0
112
+ ],
113
+ "modality_keys": [
114
+ "end_effector_position",
115
+ "end_effector_rotation",
116
+ "gripper_position"
117
+ ],
118
+ "sin_cos_embedding_keys": null,
119
+ "mean_std_embedding_keys": null,
120
+ "action_configs": null
121
+ },
122
+ "action": {
123
+ "delta_indices": [
124
+ 0,
125
+ 1,
126
+ 2,
127
+ 3,
128
+ 4,
129
+ 5,
130
+ 6,
131
+ 7,
132
+ 8,
133
+ 9,
134
+ 10,
135
+ 11,
136
+ 12,
137
+ 13,
138
+ 14,
139
+ 15
140
+ ],
141
+ "modality_keys": [
142
+ "end_effector_position",
143
+ "end_effector_rotation",
144
+ "gripper_close"
145
+ ],
146
+ "sin_cos_embedding_keys": null,
147
+ "mean_std_embedding_keys": null,
148
+ "action_configs": [
149
+ {
150
+ "rep": "DELTA",
151
+ "type": "EEF",
152
+ "format": "DEFAULT",
153
+ "state_key": null
154
+ },
155
+ {
156
+ "rep": "DELTA",
157
+ "type": "EEF",
158
+ "format": "DEFAULT",
159
+ "state_key": null
160
+ },
161
+ {
162
+ "rep": "ABSOLUTE",
163
+ "type": "NON_EEF",
164
+ "format": "DEFAULT",
165
+ "state_key": null
166
+ }
167
+ ]
168
+ },
169
+ "language": {
170
+ "delta_indices": [
171
+ 0
172
+ ],
173
+ "modality_keys": [
174
+ "annotation.human.action.task_description"
175
+ ],
176
+ "sin_cos_embedding_keys": null,
177
+ "mean_std_embedding_keys": null,
178
+ "action_configs": null
179
+ }
180
+ },
181
+ "humanoid_everyday_g1": {
182
+ "video": {
183
+ "delta_indices": [
184
+ -6,
185
+ -4,
186
+ -2,
187
+ 0
188
+ ],
189
+ "modality_keys": [
190
+ "egocentric_resized"
191
+ ],
192
+ "sin_cos_embedding_keys": null,
193
+ "mean_std_embedding_keys": null,
194
+ "action_configs": null
195
+ },
196
+ "state": {
197
+ "delta_indices": [
198
+ 0
199
+ ],
200
+ "modality_keys": [
201
+ "left_arm",
202
+ "left_hand",
203
+ "right_arm",
204
+ "right_hand"
205
+ ],
206
+ "sin_cos_embedding_keys": null,
207
+ "mean_std_embedding_keys": null,
208
+ "action_configs": null
209
+ },
210
+ "action": {
211
+ "delta_indices": [
212
+ 0,
213
+ 1,
214
+ 2,
215
+ 3,
216
+ 4,
217
+ 5,
218
+ 6,
219
+ 7,
220
+ 8,
221
+ 9,
222
+ 10,
223
+ 11,
224
+ 12,
225
+ 13,
226
+ 14,
227
+ 15
228
+ ],
229
+ "modality_keys": [
230
+ "left_arm",
231
+ "left_hand",
232
+ "right_arm",
233
+ "right_hand"
234
+ ],
235
+ "sin_cos_embedding_keys": null,
236
+ "mean_std_embedding_keys": null,
237
+ "action_configs": [
238
+ {
239
+ "rep": "ABSOLUTE",
240
+ "type": "NON_EEF",
241
+ "format": "DEFAULT",
242
+ "state_key": null
243
+ },
244
+ {
245
+ "rep": "ABSOLUTE",
246
+ "type": "NON_EEF",
247
+ "format": "DEFAULT",
248
+ "state_key": null
249
+ },
250
+ {
251
+ "rep": "ABSOLUTE",
252
+ "type": "NON_EEF",
253
+ "format": "DEFAULT",
254
+ "state_key": null
255
+ },
256
+ {
257
+ "rep": "ABSOLUTE",
258
+ "type": "NON_EEF",
259
+ "format": "DEFAULT",
260
+ "state_key": null
261
+ }
262
+ ]
263
+ },
264
+ "language": {
265
+ "delta_indices": [
266
+ 0
267
+ ],
268
+ "modality_keys": [
269
+ "annotation.human.action.task_description"
270
+ ],
271
+ "sin_cos_embedding_keys": null,
272
+ "mean_std_embedding_keys": null,
273
+ "action_configs": null
274
+ }
275
+ },
276
+ "dlr_edan_shared_control_converted_externally_to_rlds": {
277
+ "video": {
278
+ "delta_indices": [
279
+ -6,
280
+ -4,
281
+ -2,
282
+ 0
283
+ ],
284
+ "modality_keys": [
285
+ "primary"
286
+ ],
287
+ "sin_cos_embedding_keys": null,
288
+ "mean_std_embedding_keys": null,
289
+ "action_configs": null
290
+ },
291
+ "state": {
292
+ "delta_indices": [
293
+ 0
294
+ ],
295
+ "modality_keys": [
296
+ "end_effector_position",
297
+ "end_effector_rotation",
298
+ "gripper_position"
299
+ ],
300
+ "sin_cos_embedding_keys": null,
301
+ "mean_std_embedding_keys": null,
302
+ "action_configs": null
303
+ },
304
+ "action": {
305
+ "delta_indices": [
306
+ 0,
307
+ 1,
308
+ 2,
309
+ 3,
310
+ 4,
311
+ 5,
312
+ 6,
313
+ 7,
314
+ 8,
315
+ 9,
316
+ 10,
317
+ 11,
318
+ 12,
319
+ 13,
320
+ 14,
321
+ 15
322
+ ],
323
+ "modality_keys": [
324
+ "end_effector_position",
325
+ "end_effector_rotation",
326
+ "gripper_close"
327
+ ],
328
+ "sin_cos_embedding_keys": null,
329
+ "mean_std_embedding_keys": null,
330
+ "action_configs": [
331
+ {
332
+ "rep": "DELTA",
333
+ "type": "EEF",
334
+ "format": "DEFAULT",
335
+ "state_key": null
336
+ },
337
+ {
338
+ "rep": "DELTA",
339
+ "type": "EEF",
340
+ "format": "DEFAULT",
341
+ "state_key": null
342
+ },
343
+ {
344
+ "rep": "ABSOLUTE",
345
+ "type": "NON_EEF",
346
+ "format": "DEFAULT",
347
+ "state_key": null
348
+ }
349
+ ]
350
+ },
351
+ "language": {
352
+ "delta_indices": [
353
+ 0
354
+ ],
355
+ "modality_keys": [
356
+ "annotation.human.action.task_description"
357
+ ],
358
+ "sin_cos_embedding_keys": null,
359
+ "mean_std_embedding_keys": null,
360
+ "action_configs": null
361
+ }
362
+ },
363
+ "austin_sailor_dataset_converted_externally_to_rlds": {
364
+ "video": {
365
+ "delta_indices": [
366
+ -6,
367
+ -4,
368
+ -2,
369
+ 0
370
+ ],
371
+ "modality_keys": [
372
+ "primary",
373
+ "wrist"
374
+ ],
375
+ "sin_cos_embedding_keys": null,
376
+ "mean_std_embedding_keys": null,
377
+ "action_configs": null
378
+ },
379
+ "state": {
380
+ "delta_indices": [
381
+ 0
382
+ ],
383
+ "modality_keys": [
384
+ "end_effector_position",
385
+ "end_effector_rotation",
386
+ "gripper_position"
387
+ ],
388
+ "sin_cos_embedding_keys": null,
389
+ "mean_std_embedding_keys": null,
390
+ "action_configs": null
391
+ },
392
+ "action": {
393
+ "delta_indices": [
394
+ 0,
395
+ 1,
396
+ 2,
397
+ 3,
398
+ 4,
399
+ 5,
400
+ 6,
401
+ 7,
402
+ 8,
403
+ 9,
404
+ 10,
405
+ 11,
406
+ 12,
407
+ 13,
408
+ 14,
409
+ 15
410
+ ],
411
+ "modality_keys": [
412
+ "end_effector_position",
413
+ "end_effector_rotation",
414
+ "gripper_close"
415
+ ],
416
+ "sin_cos_embedding_keys": null,
417
+ "mean_std_embedding_keys": null,
418
+ "action_configs": [
419
+ {
420
+ "rep": "DELTA",
421
+ "type": "EEF",
422
+ "format": "DEFAULT",
423
+ "state_key": null
424
+ },
425
+ {
426
+ "rep": "DELTA",
427
+ "type": "EEF",
428
+ "format": "DEFAULT",
429
+ "state_key": null
430
+ },
431
+ {
432
+ "rep": "ABSOLUTE",
433
+ "type": "NON_EEF",
434
+ "format": "DEFAULT",
435
+ "state_key": null
436
+ }
437
+ ]
438
+ },
439
+ "language": {
440
+ "delta_indices": [
441
+ 0
442
+ ],
443
+ "modality_keys": [
444
+ "annotation.human.action.task_description"
445
+ ],
446
+ "sin_cos_embedding_keys": null,
447
+ "mean_std_embedding_keys": null,
448
+ "action_configs": null
449
+ }
450
+ },
451
+ "berkeley_autolab_ur5": {
452
+ "video": {
453
+ "delta_indices": [
454
+ -6,
455
+ -4,
456
+ -2,
457
+ 0
458
+ ],
459
+ "modality_keys": [
460
+ "primary",
461
+ "wrist"
462
+ ],
463
+ "sin_cos_embedding_keys": null,
464
+ "mean_std_embedding_keys": null,
465
+ "action_configs": null
466
+ },
467
+ "state": {
468
+ "delta_indices": [
469
+ 0
470
+ ],
471
+ "modality_keys": [
472
+ "end_effector_position",
473
+ "end_effector_rotation",
474
+ "gripper_position"
475
+ ],
476
+ "sin_cos_embedding_keys": null,
477
+ "mean_std_embedding_keys": null,
478
+ "action_configs": null
479
+ },
480
+ "action": {
481
+ "delta_indices": [
482
+ 0,
483
+ 1,
484
+ 2,
485
+ 3,
486
+ 4,
487
+ 5,
488
+ 6,
489
+ 7,
490
+ 8,
491
+ 9,
492
+ 10,
493
+ 11,
494
+ 12,
495
+ 13,
496
+ 14,
497
+ 15
498
+ ],
499
+ "modality_keys": [
500
+ "end_effector_position",
501
+ "end_effector_rotation",
502
+ "gripper_close"
503
+ ],
504
+ "sin_cos_embedding_keys": null,
505
+ "mean_std_embedding_keys": null,
506
+ "action_configs": [
507
+ {
508
+ "rep": "DELTA",
509
+ "type": "EEF",
510
+ "format": "DEFAULT",
511
+ "state_key": null
512
+ },
513
+ {
514
+ "rep": "DELTA",
515
+ "type": "EEF",
516
+ "format": "DEFAULT",
517
+ "state_key": null
518
+ },
519
+ {
520
+ "rep": "ABSOLUTE",
521
+ "type": "NON_EEF",
522
+ "format": "DEFAULT",
523
+ "state_key": null
524
+ }
525
+ ]
526
+ },
527
+ "language": {
528
+ "delta_indices": [
529
+ 0
530
+ ],
531
+ "modality_keys": [
532
+ "annotation.human.action.task_description"
533
+ ],
534
+ "sin_cos_embedding_keys": null,
535
+ "mean_std_embedding_keys": null,
536
+ "action_configs": null
537
+ }
538
+ },
539
+ "fractal20220817_data": {
540
+ "video": {
541
+ "delta_indices": [
542
+ -6,
543
+ -4,
544
+ -2,
545
+ 0
546
+ ],
547
+ "modality_keys": [
548
+ "primary"
549
+ ],
550
+ "sin_cos_embedding_keys": null,
551
+ "mean_std_embedding_keys": null,
552
+ "action_configs": null
553
+ },
554
+ "state": {
555
+ "delta_indices": [
556
+ 0
557
+ ],
558
+ "modality_keys": [
559
+ "end_effector_position",
560
+ "end_effector_rotation",
561
+ "gripper_position"
562
+ ],
563
+ "sin_cos_embedding_keys": null,
564
+ "mean_std_embedding_keys": null,
565
+ "action_configs": null
566
+ },
567
+ "action": {
568
+ "delta_indices": [
569
+ 0,
570
+ 1,
571
+ 2,
572
+ 3,
573
+ 4,
574
+ 5,
575
+ 6,
576
+ 7,
577
+ 8,
578
+ 9,
579
+ 10,
580
+ 11,
581
+ 12,
582
+ 13,
583
+ 14,
584
+ 15
585
+ ],
586
+ "modality_keys": [
587
+ "end_effector_position",
588
+ "end_effector_rotation",
589
+ "gripper_close"
590
+ ],
591
+ "sin_cos_embedding_keys": null,
592
+ "mean_std_embedding_keys": null,
593
+ "action_configs": [
594
+ {
595
+ "rep": "DELTA",
596
+ "type": "EEF",
597
+ "format": "DEFAULT",
598
+ "state_key": null
599
+ },
600
+ {
601
+ "rep": "DELTA",
602
+ "type": "EEF",
603
+ "format": "DEFAULT",
604
+ "state_key": null
605
+ },
606
+ {
607
+ "rep": "ABSOLUTE",
608
+ "type": "NON_EEF",
609
+ "format": "DEFAULT",
610
+ "state_key": null
611
+ }
612
+ ]
613
+ },
614
+ "language": {
615
+ "delta_indices": [
616
+ 0
617
+ ],
618
+ "modality_keys": [
619
+ "annotation.human.action.task_description"
620
+ ],
621
+ "sin_cos_embedding_keys": null,
622
+ "mean_std_embedding_keys": null,
623
+ "action_configs": null
624
+ }
625
+ },
626
+ "cmu_stretch": {
627
+ "video": {
628
+ "delta_indices": [
629
+ -6,
630
+ -4,
631
+ -2,
632
+ 0
633
+ ],
634
+ "modality_keys": [
635
+ "primary"
636
+ ],
637
+ "sin_cos_embedding_keys": null,
638
+ "mean_std_embedding_keys": null,
639
+ "action_configs": null
640
+ },
641
+ "state": {
642
+ "delta_indices": [
643
+ 0
644
+ ],
645
+ "modality_keys": [
646
+ "end_effector_position",
647
+ "end_effector_rotation",
648
+ "gripper_position"
649
+ ],
650
+ "sin_cos_embedding_keys": null,
651
+ "mean_std_embedding_keys": null,
652
+ "action_configs": null
653
+ },
654
+ "action": {
655
+ "delta_indices": [
656
+ 0,
657
+ 1,
658
+ 2,
659
+ 3,
660
+ 4,
661
+ 5,
662
+ 6,
663
+ 7,
664
+ 8,
665
+ 9,
666
+ 10,
667
+ 11,
668
+ 12,
669
+ 13,
670
+ 14,
671
+ 15
672
+ ],
673
+ "modality_keys": [
674
+ "end_effector_position",
675
+ "end_effector_rotation",
676
+ "gripper_close"
677
+ ],
678
+ "sin_cos_embedding_keys": null,
679
+ "mean_std_embedding_keys": null,
680
+ "action_configs": [
681
+ {
682
+ "rep": "DELTA",
683
+ "type": "EEF",
684
+ "format": "DEFAULT",
685
+ "state_key": null
686
+ },
687
+ {
688
+ "rep": "DELTA",
689
+ "type": "EEF",
690
+ "format": "DEFAULT",
691
+ "state_key": null
692
+ },
693
+ {
694
+ "rep": "ABSOLUTE",
695
+ "type": "NON_EEF",
696
+ "format": "DEFAULT",
697
+ "state_key": null
698
+ }
699
+ ]
700
+ },
701
+ "language": {
702
+ "delta_indices": [
703
+ 0
704
+ ],
705
+ "modality_keys": [
706
+ "annotation.human.action.task_description"
707
+ ],
708
+ "sin_cos_embedding_keys": null,
709
+ "mean_std_embedding_keys": null,
710
+ "action_configs": null
711
+ }
712
+ },
713
+ "berkeley_cable_routing": {
714
+ "video": {
715
+ "delta_indices": [
716
+ -6,
717
+ -4,
718
+ -2,
719
+ 0
720
+ ],
721
+ "modality_keys": [
722
+ "primary",
723
+ "secondary",
724
+ "wrist"
725
+ ],
726
+ "sin_cos_embedding_keys": null,
727
+ "mean_std_embedding_keys": null,
728
+ "action_configs": null
729
+ },
730
+ "state": {
731
+ "delta_indices": [
732
+ 0
733
+ ],
734
+ "modality_keys": [
735
+ "joint_position"
736
+ ],
737
+ "sin_cos_embedding_keys": null,
738
+ "mean_std_embedding_keys": null,
739
+ "action_configs": null
740
+ },
741
+ "action": {
742
+ "delta_indices": [
743
+ 0,
744
+ 1,
745
+ 2,
746
+ 3,
747
+ 4,
748
+ 5,
749
+ 6,
750
+ 7,
751
+ 8,
752
+ 9,
753
+ 10,
754
+ 11,
755
+ 12,
756
+ 13,
757
+ 14,
758
+ 15
759
+ ],
760
+ "modality_keys": [
761
+ "end_effector_position",
762
+ "end_effector_rotation",
763
+ "gripper_close"
764
+ ],
765
+ "sin_cos_embedding_keys": null,
766
+ "mean_std_embedding_keys": null,
767
+ "action_configs": [
768
+ {
769
+ "rep": "DELTA",
770
+ "type": "EEF",
771
+ "format": "DEFAULT",
772
+ "state_key": null
773
+ },
774
+ {
775
+ "rep": "DELTA",
776
+ "type": "EEF",
777
+ "format": "DEFAULT",
778
+ "state_key": null
779
+ },
780
+ {
781
+ "rep": "ABSOLUTE",
782
+ "type": "NON_EEF",
783
+ "format": "DEFAULT",
784
+ "state_key": null
785
+ }
786
+ ]
787
+ },
788
+ "language": {
789
+ "delta_indices": [
790
+ 0
791
+ ],
792
+ "modality_keys": [
793
+ "annotation.human.action.task_description"
794
+ ],
795
+ "sin_cos_embedding_keys": null,
796
+ "mean_std_embedding_keys": null,
797
+ "action_configs": null
798
+ }
799
+ },
800
+ "stanford_hydra_dataset_converted_externally_to_rlds": {
801
+ "video": {
802
+ "delta_indices": [
803
+ -6,
804
+ -4,
805
+ -2,
806
+ 0
807
+ ],
808
+ "modality_keys": [
809
+ "primary",
810
+ "wrist"
811
+ ],
812
+ "sin_cos_embedding_keys": null,
813
+ "mean_std_embedding_keys": null,
814
+ "action_configs": null
815
+ },
816
+ "state": {
817
+ "delta_indices": [
818
+ 0
819
+ ],
820
+ "modality_keys": [
821
+ "end_effector_position",
822
+ "end_effector_rotation",
823
+ "gripper_position"
824
+ ],
825
+ "sin_cos_embedding_keys": null,
826
+ "mean_std_embedding_keys": null,
827
+ "action_configs": null
828
+ },
829
+ "action": {
830
+ "delta_indices": [
831
+ 0,
832
+ 1,
833
+ 2,
834
+ 3,
835
+ 4,
836
+ 5,
837
+ 6,
838
+ 7,
839
+ 8,
840
+ 9,
841
+ 10,
842
+ 11,
843
+ 12,
844
+ 13,
845
+ 14,
846
+ 15
847
+ ],
848
+ "modality_keys": [
849
+ "end_effector_position",
850
+ "end_effector_rotation",
851
+ "gripper_close"
852
+ ],
853
+ "sin_cos_embedding_keys": null,
854
+ "mean_std_embedding_keys": null,
855
+ "action_configs": [
856
+ {
857
+ "rep": "DELTA",
858
+ "type": "EEF",
859
+ "format": "DEFAULT",
860
+ "state_key": null
861
+ },
862
+ {
863
+ "rep": "DELTA",
864
+ "type": "EEF",
865
+ "format": "DEFAULT",
866
+ "state_key": null
867
+ },
868
+ {
869
+ "rep": "ABSOLUTE",
870
+ "type": "NON_EEF",
871
+ "format": "DEFAULT",
872
+ "state_key": null
873
+ }
874
+ ]
875
+ },
876
+ "language": {
877
+ "delta_indices": [
878
+ 0
879
+ ],
880
+ "modality_keys": [
881
+ "annotation.human.action.task_description"
882
+ ],
883
+ "sin_cos_embedding_keys": null,
884
+ "mean_std_embedding_keys": null,
885
+ "action_configs": null
886
+ }
887
+ },
888
+ "utaustin_mutex": {
889
+ "video": {
890
+ "delta_indices": [
891
+ -6,
892
+ -4,
893
+ -2,
894
+ 0
895
+ ],
896
+ "modality_keys": [
897
+ "primary",
898
+ "wrist"
899
+ ],
900
+ "sin_cos_embedding_keys": null,
901
+ "mean_std_embedding_keys": null,
902
+ "action_configs": null
903
+ },
904
+ "state": {
905
+ "delta_indices": [
906
+ 0
907
+ ],
908
+ "modality_keys": [
909
+ "joint_position",
910
+ "gripper_position"
911
+ ],
912
+ "sin_cos_embedding_keys": null,
913
+ "mean_std_embedding_keys": null,
914
+ "action_configs": null
915
+ },
916
+ "action": {
917
+ "delta_indices": [
918
+ 0,
919
+ 1,
920
+ 2,
921
+ 3,
922
+ 4,
923
+ 5,
924
+ 6,
925
+ 7,
926
+ 8,
927
+ 9,
928
+ 10,
929
+ 11,
930
+ 12,
931
+ 13,
932
+ 14,
933
+ 15
934
+ ],
935
+ "modality_keys": [
936
+ "end_effector_position",
937
+ "end_effector_rotation",
938
+ "gripper_close"
939
+ ],
940
+ "sin_cos_embedding_keys": null,
941
+ "mean_std_embedding_keys": null,
942
+ "action_configs": [
943
+ {
944
+ "rep": "DELTA",
945
+ "type": "EEF",
946
+ "format": "DEFAULT",
947
+ "state_key": null
948
+ },
949
+ {
950
+ "rep": "DELTA",
951
+ "type": "EEF",
952
+ "format": "DEFAULT",
953
+ "state_key": null
954
+ },
955
+ {
956
+ "rep": "ABSOLUTE",
957
+ "type": "NON_EEF",
958
+ "format": "DEFAULT",
959
+ "state_key": null
960
+ }
961
+ ]
962
+ },
963
+ "language": {
964
+ "delta_indices": [
965
+ 0
966
+ ],
967
+ "modality_keys": [
968
+ "annotation.human.action.task_description"
969
+ ],
970
+ "sin_cos_embedding_keys": null,
971
+ "mean_std_embedding_keys": null,
972
+ "action_configs": null
973
+ }
974
+ },
975
+ "furniture_bench_dataset_converted_externally_to_rlds": {
976
+ "video": {
977
+ "delta_indices": [
978
+ -6,
979
+ -4,
980
+ -2,
981
+ 0
982
+ ],
983
+ "modality_keys": [
984
+ "primary",
985
+ "wrist"
986
+ ],
987
+ "sin_cos_embedding_keys": null,
988
+ "mean_std_embedding_keys": null,
989
+ "action_configs": null
990
+ },
991
+ "state": {
992
+ "delta_indices": [
993
+ 0
994
+ ],
995
+ "modality_keys": [
996
+ "end_effector_position",
997
+ "end_effector_rotation",
998
+ "gripper_position"
999
+ ],
1000
+ "sin_cos_embedding_keys": null,
1001
+ "mean_std_embedding_keys": null,
1002
+ "action_configs": null
1003
+ },
1004
+ "action": {
1005
+ "delta_indices": [
1006
+ 0,
1007
+ 1,
1008
+ 2,
1009
+ 3,
1010
+ 4,
1011
+ 5,
1012
+ 6,
1013
+ 7,
1014
+ 8,
1015
+ 9,
1016
+ 10,
1017
+ 11,
1018
+ 12,
1019
+ 13,
1020
+ 14,
1021
+ 15
1022
+ ],
1023
+ "modality_keys": [
1024
+ "end_effector_position",
1025
+ "end_effector_rotation",
1026
+ "gripper_close"
1027
+ ],
1028
+ "sin_cos_embedding_keys": null,
1029
+ "mean_std_embedding_keys": null,
1030
+ "action_configs": [
1031
+ {
1032
+ "rep": "DELTA",
1033
+ "type": "EEF",
1034
+ "format": "DEFAULT",
1035
+ "state_key": null
1036
+ },
1037
+ {
1038
+ "rep": "DELTA",
1039
+ "type": "EEF",
1040
+ "format": "DEFAULT",
1041
+ "state_key": null
1042
+ },
1043
+ {
1044
+ "rep": "ABSOLUTE",
1045
+ "type": "NON_EEF",
1046
+ "format": "DEFAULT",
1047
+ "state_key": null
1048
+ }
1049
+ ]
1050
+ },
1051
+ "language": {
1052
+ "delta_indices": [
1053
+ 0
1054
+ ],
1055
+ "modality_keys": [
1056
+ "annotation.human.action.task_description"
1057
+ ],
1058
+ "sin_cos_embedding_keys": null,
1059
+ "mean_std_embedding_keys": null,
1060
+ "action_configs": null
1061
+ }
1062
+ },
1063
+ "neural_gr1": {
1064
+ "video": {
1065
+ "delta_indices": [
1066
+ -6,
1067
+ -4,
1068
+ -2,
1069
+ 0
1070
+ ],
1071
+ "modality_keys": [
1072
+ "ego_view"
1073
+ ],
1074
+ "sin_cos_embedding_keys": null,
1075
+ "mean_std_embedding_keys": null,
1076
+ "action_configs": null
1077
+ },
1078
+ "state": {
1079
+ "delta_indices": [
1080
+ 0
1081
+ ],
1082
+ "modality_keys": [
1083
+ "left_arm",
1084
+ "left_hand",
1085
+ "left_leg",
1086
+ "neck",
1087
+ "right_arm",
1088
+ "right_hand",
1089
+ "right_leg",
1090
+ "waist"
1091
+ ],
1092
+ "sin_cos_embedding_keys": null,
1093
+ "mean_std_embedding_keys": null,
1094
+ "action_configs": null
1095
+ },
1096
+ "action": {
1097
+ "delta_indices": [
1098
+ 0,
1099
+ 1,
1100
+ 2,
1101
+ 3,
1102
+ 4,
1103
+ 5,
1104
+ 6,
1105
+ 7,
1106
+ 8,
1107
+ 9,
1108
+ 10,
1109
+ 11,
1110
+ 12,
1111
+ 13,
1112
+ 14,
1113
+ 15
1114
+ ],
1115
+ "modality_keys": [
1116
+ "left_arm",
1117
+ "left_hand",
1118
+ "left_leg",
1119
+ "neck",
1120
+ "right_arm",
1121
+ "right_hand",
1122
+ "right_leg",
1123
+ "waist"
1124
+ ],
1125
+ "sin_cos_embedding_keys": null,
1126
+ "mean_std_embedding_keys": null,
1127
+ "action_configs": [
1128
+ {
1129
+ "rep": "ABSOLUTE",
1130
+ "type": "NON_EEF",
1131
+ "format": "DEFAULT",
1132
+ "state_key": null
1133
+ },
1134
+ {
1135
+ "rep": "ABSOLUTE",
1136
+ "type": "NON_EEF",
1137
+ "format": "DEFAULT",
1138
+ "state_key": null
1139
+ },
1140
+ {
1141
+ "rep": "ABSOLUTE",
1142
+ "type": "NON_EEF",
1143
+ "format": "DEFAULT",
1144
+ "state_key": null
1145
+ },
1146
+ {
1147
+ "rep": "ABSOLUTE",
1148
+ "type": "NON_EEF",
1149
+ "format": "DEFAULT",
1150
+ "state_key": null
1151
+ },
1152
+ {
1153
+ "rep": "ABSOLUTE",
1154
+ "type": "NON_EEF",
1155
+ "format": "DEFAULT",
1156
+ "state_key": null
1157
+ },
1158
+ {
1159
+ "rep": "ABSOLUTE",
1160
+ "type": "NON_EEF",
1161
+ "format": "DEFAULT",
1162
+ "state_key": null
1163
+ },
1164
+ {
1165
+ "rep": "ABSOLUTE",
1166
+ "type": "NON_EEF",
1167
+ "format": "DEFAULT",
1168
+ "state_key": null
1169
+ },
1170
+ {
1171
+ "rep": "ABSOLUTE",
1172
+ "type": "NON_EEF",
1173
+ "format": "DEFAULT",
1174
+ "state_key": null
1175
+ }
1176
+ ]
1177
+ },
1178
+ "language": {
1179
+ "delta_indices": [
1180
+ 0
1181
+ ],
1182
+ "modality_keys": [
1183
+ "annotation.human.action.task_description"
1184
+ ],
1185
+ "sin_cos_embedding_keys": null,
1186
+ "mean_std_embedding_keys": null,
1187
+ "action_configs": null
1188
+ }
1189
+ },
1190
+ "agibot_gripper": {
1191
+ "video": {
1192
+ "delta_indices": [
1193
+ -6,
1194
+ -4,
1195
+ -2,
1196
+ 0
1197
+ ],
1198
+ "modality_keys": [
1199
+ "primary",
1200
+ "wrist_left",
1201
+ "wrist_right"
1202
+ ],
1203
+ "sin_cos_embedding_keys": null,
1204
+ "mean_std_embedding_keys": null,
1205
+ "action_configs": null
1206
+ },
1207
+ "state": {
1208
+ "delta_indices": [
1209
+ 0
1210
+ ],
1211
+ "modality_keys": [
1212
+ "state"
1213
+ ],
1214
+ "sin_cos_embedding_keys": null,
1215
+ "mean_std_embedding_keys": null,
1216
+ "action_configs": null
1217
+ },
1218
+ "action": {
1219
+ "delta_indices": [
1220
+ 0,
1221
+ 1,
1222
+ 2,
1223
+ 3,
1224
+ 4,
1225
+ 5,
1226
+ 6,
1227
+ 7,
1228
+ 8,
1229
+ 9,
1230
+ 10,
1231
+ 11,
1232
+ 12,
1233
+ 13,
1234
+ 14,
1235
+ 15
1236
+ ],
1237
+ "modality_keys": [
1238
+ "action"
1239
+ ],
1240
+ "sin_cos_embedding_keys": null,
1241
+ "mean_std_embedding_keys": null,
1242
+ "action_configs": [
1243
+ {
1244
+ "rep": "ABSOLUTE",
1245
+ "type": "NON_EEF",
1246
+ "format": "DEFAULT",
1247
+ "state_key": null
1248
+ }
1249
+ ]
1250
+ },
1251
+ "language": {
1252
+ "delta_indices": [
1253
+ 0
1254
+ ],
1255
+ "modality_keys": [
1256
+ "annotation.human.action.task_description"
1257
+ ],
1258
+ "sin_cos_embedding_keys": null,
1259
+ "mean_std_embedding_keys": null,
1260
+ "action_configs": null
1261
+ }
1262
+ },
1263
+ "fmb_dataset": {
1264
+ "video": {
1265
+ "delta_indices": [
1266
+ -6,
1267
+ -4,
1268
+ -2,
1269
+ 0
1270
+ ],
1271
+ "modality_keys": [
1272
+ "primary",
1273
+ "secondary",
1274
+ "wrist"
1275
+ ],
1276
+ "sin_cos_embedding_keys": null,
1277
+ "mean_std_embedding_keys": null,
1278
+ "action_configs": null
1279
+ },
1280
+ "state": {
1281
+ "delta_indices": [
1282
+ 0
1283
+ ],
1284
+ "modality_keys": [
1285
+ "end_effector_position",
1286
+ "end_effector_rotation",
1287
+ "gripper_position"
1288
+ ],
1289
+ "sin_cos_embedding_keys": null,
1290
+ "mean_std_embedding_keys": null,
1291
+ "action_configs": null
1292
+ },
1293
+ "action": {
1294
+ "delta_indices": [
1295
+ 0,
1296
+ 1,
1297
+ 2,
1298
+ 3,
1299
+ 4,
1300
+ 5,
1301
+ 6,
1302
+ 7,
1303
+ 8,
1304
+ 9,
1305
+ 10,
1306
+ 11,
1307
+ 12,
1308
+ 13,
1309
+ 14,
1310
+ 15
1311
+ ],
1312
+ "modality_keys": [
1313
+ "end_effector_position",
1314
+ "end_effector_rotation",
1315
+ "gripper_close"
1316
+ ],
1317
+ "sin_cos_embedding_keys": null,
1318
+ "mean_std_embedding_keys": null,
1319
+ "action_configs": [
1320
+ {
1321
+ "rep": "DELTA",
1322
+ "type": "EEF",
1323
+ "format": "DEFAULT",
1324
+ "state_key": null
1325
+ },
1326
+ {
1327
+ "rep": "DELTA",
1328
+ "type": "EEF",
1329
+ "format": "DEFAULT",
1330
+ "state_key": null
1331
+ },
1332
+ {
1333
+ "rep": "ABSOLUTE",
1334
+ "type": "NON_EEF",
1335
+ "format": "DEFAULT",
1336
+ "state_key": null
1337
+ }
1338
+ ]
1339
+ },
1340
+ "language": {
1341
+ "delta_indices": [
1342
+ 0
1343
+ ],
1344
+ "modality_keys": [
1345
+ "annotation.human.action.task_description"
1346
+ ],
1347
+ "sin_cos_embedding_keys": null,
1348
+ "mean_std_embedding_keys": null,
1349
+ "action_configs": null
1350
+ }
1351
+ },
1352
+ "dobbe": {
1353
+ "video": {
1354
+ "delta_indices": [
1355
+ -6,
1356
+ -4,
1357
+ -2,
1358
+ 0
1359
+ ],
1360
+ "modality_keys": [
1361
+ "wrist"
1362
+ ],
1363
+ "sin_cos_embedding_keys": null,
1364
+ "mean_std_embedding_keys": null,
1365
+ "action_configs": null
1366
+ },
1367
+ "state": {
1368
+ "delta_indices": [
1369
+ 0
1370
+ ],
1371
+ "modality_keys": [
1372
+ "end_effector_position",
1373
+ "end_effector_rotation",
1374
+ "gripper_position"
1375
+ ],
1376
+ "sin_cos_embedding_keys": null,
1377
+ "mean_std_embedding_keys": null,
1378
+ "action_configs": null
1379
+ },
1380
+ "action": {
1381
+ "delta_indices": [
1382
+ 0,
1383
+ 1,
1384
+ 2,
1385
+ 3,
1386
+ 4,
1387
+ 5,
1388
+ 6,
1389
+ 7,
1390
+ 8,
1391
+ 9,
1392
+ 10,
1393
+ 11,
1394
+ 12,
1395
+ 13,
1396
+ 14,
1397
+ 15
1398
+ ],
1399
+ "modality_keys": [
1400
+ "end_effector_position",
1401
+ "end_effector_rotation",
1402
+ "gripper_close"
1403
+ ],
1404
+ "sin_cos_embedding_keys": null,
1405
+ "mean_std_embedding_keys": null,
1406
+ "action_configs": [
1407
+ {
1408
+ "rep": "DELTA",
1409
+ "type": "EEF",
1410
+ "format": "DEFAULT",
1411
+ "state_key": null
1412
+ },
1413
+ {
1414
+ "rep": "DELTA",
1415
+ "type": "EEF",
1416
+ "format": "DEFAULT",
1417
+ "state_key": null
1418
+ },
1419
+ {
1420
+ "rep": "ABSOLUTE",
1421
+ "type": "NON_EEF",
1422
+ "format": "DEFAULT",
1423
+ "state_key": null
1424
+ }
1425
+ ]
1426
+ },
1427
+ "language": {
1428
+ "delta_indices": [
1429
+ 0
1430
+ ],
1431
+ "modality_keys": [
1432
+ "annotation.human.action.task_description"
1433
+ ],
1434
+ "sin_cos_embedding_keys": null,
1435
+ "mean_std_embedding_keys": null,
1436
+ "action_configs": null
1437
+ }
1438
+ },
1439
+ "viola": {
1440
+ "video": {
1441
+ "delta_indices": [
1442
+ -6,
1443
+ -4,
1444
+ -2,
1445
+ 0
1446
+ ],
1447
+ "modality_keys": [
1448
+ "primary",
1449
+ "wrist"
1450
+ ],
1451
+ "sin_cos_embedding_keys": null,
1452
+ "mean_std_embedding_keys": null,
1453
+ "action_configs": null
1454
+ },
1455
+ "state": {
1456
+ "delta_indices": [
1457
+ 0
1458
+ ],
1459
+ "modality_keys": [
1460
+ "joint_position",
1461
+ "gripper_position"
1462
+ ],
1463
+ "sin_cos_embedding_keys": null,
1464
+ "mean_std_embedding_keys": null,
1465
+ "action_configs": null
1466
+ },
1467
+ "action": {
1468
+ "delta_indices": [
1469
+ 0,
1470
+ 1,
1471
+ 2,
1472
+ 3,
1473
+ 4,
1474
+ 5,
1475
+ 6,
1476
+ 7,
1477
+ 8,
1478
+ 9,
1479
+ 10,
1480
+ 11,
1481
+ 12,
1482
+ 13,
1483
+ 14,
1484
+ 15
1485
+ ],
1486
+ "modality_keys": [
1487
+ "end_effector_position",
1488
+ "end_effector_rotation",
1489
+ "gripper_close"
1490
+ ],
1491
+ "sin_cos_embedding_keys": null,
1492
+ "mean_std_embedding_keys": null,
1493
+ "action_configs": [
1494
+ {
1495
+ "rep": "DELTA",
1496
+ "type": "EEF",
1497
+ "format": "DEFAULT",
1498
+ "state_key": null
1499
+ },
1500
+ {
1501
+ "rep": "DELTA",
1502
+ "type": "EEF",
1503
+ "format": "DEFAULT",
1504
+ "state_key": null
1505
+ },
1506
+ {
1507
+ "rep": "ABSOLUTE",
1508
+ "type": "NON_EEF",
1509
+ "format": "DEFAULT",
1510
+ "state_key": null
1511
+ }
1512
+ ]
1513
+ },
1514
+ "language": {
1515
+ "delta_indices": [
1516
+ 0
1517
+ ],
1518
+ "modality_keys": [
1519
+ "annotation.human.action.task_description"
1520
+ ],
1521
+ "sin_cos_embedding_keys": null,
1522
+ "mean_std_embedding_keys": null,
1523
+ "action_configs": null
1524
+ }
1525
+ },
1526
+ "humanoid_everyday_h1": {
1527
+ "video": {
1528
+ "delta_indices": [
1529
+ -6,
1530
+ -4,
1531
+ -2,
1532
+ 0
1533
+ ],
1534
+ "modality_keys": [
1535
+ "egocentric_resized"
1536
+ ],
1537
+ "sin_cos_embedding_keys": null,
1538
+ "mean_std_embedding_keys": null,
1539
+ "action_configs": null
1540
+ },
1541
+ "state": {
1542
+ "delta_indices": [
1543
+ 0
1544
+ ],
1545
+ "modality_keys": [
1546
+ "left_arm",
1547
+ "left_hand",
1548
+ "right_arm",
1549
+ "right_hand"
1550
+ ],
1551
+ "sin_cos_embedding_keys": null,
1552
+ "mean_std_embedding_keys": null,
1553
+ "action_configs": null
1554
+ },
1555
+ "action": {
1556
+ "delta_indices": [
1557
+ 0,
1558
+ 1,
1559
+ 2,
1560
+ 3,
1561
+ 4,
1562
+ 5,
1563
+ 6,
1564
+ 7,
1565
+ 8,
1566
+ 9,
1567
+ 10,
1568
+ 11,
1569
+ 12,
1570
+ 13,
1571
+ 14,
1572
+ 15
1573
+ ],
1574
+ "modality_keys": [
1575
+ "left_arm",
1576
+ "left_hand",
1577
+ "right_arm",
1578
+ "right_hand"
1579
+ ],
1580
+ "sin_cos_embedding_keys": null,
1581
+ "mean_std_embedding_keys": null,
1582
+ "action_configs": [
1583
+ {
1584
+ "rep": "ABSOLUTE",
1585
+ "type": "NON_EEF",
1586
+ "format": "DEFAULT",
1587
+ "state_key": null
1588
+ },
1589
+ {
1590
+ "rep": "ABSOLUTE",
1591
+ "type": "NON_EEF",
1592
+ "format": "DEFAULT",
1593
+ "state_key": null
1594
+ },
1595
+ {
1596
+ "rep": "ABSOLUTE",
1597
+ "type": "NON_EEF",
1598
+ "format": "DEFAULT",
1599
+ "state_key": null
1600
+ },
1601
+ {
1602
+ "rep": "ABSOLUTE",
1603
+ "type": "NON_EEF",
1604
+ "format": "DEFAULT",
1605
+ "state_key": null
1606
+ }
1607
+ ]
1608
+ },
1609
+ "language": {
1610
+ "delta_indices": [
1611
+ 0
1612
+ ],
1613
+ "modality_keys": [
1614
+ "annotation.human.action.task_description"
1615
+ ],
1616
+ "sin_cos_embedding_keys": null,
1617
+ "mean_std_embedding_keys": null,
1618
+ "action_configs": null
1619
+ }
1620
+ },
1621
+ "austin_buds_dataset_converted_externally_to_rlds": {
1622
+ "video": {
1623
+ "delta_indices": [
1624
+ -6,
1625
+ -4,
1626
+ -2,
1627
+ 0
1628
+ ],
1629
+ "modality_keys": [
1630
+ "primary",
1631
+ "wrist"
1632
+ ],
1633
+ "sin_cos_embedding_keys": null,
1634
+ "mean_std_embedding_keys": null,
1635
+ "action_configs": null
1636
+ },
1637
+ "state": {
1638
+ "delta_indices": [
1639
+ 0
1640
+ ],
1641
+ "modality_keys": [
1642
+ "joint_position",
1643
+ "gripper_position"
1644
+ ],
1645
+ "sin_cos_embedding_keys": null,
1646
+ "mean_std_embedding_keys": null,
1647
+ "action_configs": null
1648
+ },
1649
+ "action": {
1650
+ "delta_indices": [
1651
+ 0,
1652
+ 1,
1653
+ 2,
1654
+ 3,
1655
+ 4,
1656
+ 5,
1657
+ 6,
1658
+ 7,
1659
+ 8,
1660
+ 9,
1661
+ 10,
1662
+ 11,
1663
+ 12,
1664
+ 13,
1665
+ 14,
1666
+ 15
1667
+ ],
1668
+ "modality_keys": [
1669
+ "end_effector_position",
1670
+ "end_effector_rotation",
1671
+ "gripper_close"
1672
+ ],
1673
+ "sin_cos_embedding_keys": null,
1674
+ "mean_std_embedding_keys": null,
1675
+ "action_configs": [
1676
+ {
1677
+ "rep": "DELTA",
1678
+ "type": "EEF",
1679
+ "format": "DEFAULT",
1680
+ "state_key": null
1681
+ },
1682
+ {
1683
+ "rep": "DELTA",
1684
+ "type": "EEF",
1685
+ "format": "DEFAULT",
1686
+ "state_key": null
1687
+ },
1688
+ {
1689
+ "rep": "ABSOLUTE",
1690
+ "type": "NON_EEF",
1691
+ "format": "DEFAULT",
1692
+ "state_key": null
1693
+ }
1694
+ ]
1695
+ },
1696
+ "language": {
1697
+ "delta_indices": [
1698
+ 0
1699
+ ],
1700
+ "modality_keys": [
1701
+ "annotation.human.action.task_description"
1702
+ ],
1703
+ "sin_cos_embedding_keys": null,
1704
+ "mean_std_embedding_keys": null,
1705
+ "action_configs": null
1706
+ }
1707
+ },
1708
+ "taco_play": {
1709
+ "video": {
1710
+ "delta_indices": [
1711
+ -6,
1712
+ -4,
1713
+ -2,
1714
+ 0
1715
+ ],
1716
+ "modality_keys": [
1717
+ "primary",
1718
+ "wrist"
1719
+ ],
1720
+ "sin_cos_embedding_keys": null,
1721
+ "mean_std_embedding_keys": null,
1722
+ "action_configs": null
1723
+ },
1724
+ "state": {
1725
+ "delta_indices": [
1726
+ 0
1727
+ ],
1728
+ "modality_keys": [
1729
+ "end_effector_position",
1730
+ "end_effector_rotation",
1731
+ "gripper_position"
1732
+ ],
1733
+ "sin_cos_embedding_keys": null,
1734
+ "mean_std_embedding_keys": null,
1735
+ "action_configs": null
1736
+ },
1737
+ "action": {
1738
+ "delta_indices": [
1739
+ 0,
1740
+ 1,
1741
+ 2,
1742
+ 3,
1743
+ 4,
1744
+ 5,
1745
+ 6,
1746
+ 7,
1747
+ 8,
1748
+ 9,
1749
+ 10,
1750
+ 11,
1751
+ 12,
1752
+ 13,
1753
+ 14,
1754
+ 15
1755
+ ],
1756
+ "modality_keys": [
1757
+ "end_effector_position",
1758
+ "end_effector_rotation",
1759
+ "gripper_close"
1760
+ ],
1761
+ "sin_cos_embedding_keys": null,
1762
+ "mean_std_embedding_keys": null,
1763
+ "action_configs": [
1764
+ {
1765
+ "rep": "DELTA",
1766
+ "type": "EEF",
1767
+ "format": "DEFAULT",
1768
+ "state_key": null
1769
+ },
1770
+ {
1771
+ "rep": "DELTA",
1772
+ "type": "EEF",
1773
+ "format": "DEFAULT",
1774
+ "state_key": null
1775
+ },
1776
+ {
1777
+ "rep": "ABSOLUTE",
1778
+ "type": "NON_EEF",
1779
+ "format": "DEFAULT",
1780
+ "state_key": null
1781
+ }
1782
+ ]
1783
+ },
1784
+ "language": {
1785
+ "delta_indices": [
1786
+ 0
1787
+ ],
1788
+ "modality_keys": [
1789
+ "annotation.human.action.task_description"
1790
+ ],
1791
+ "sin_cos_embedding_keys": null,
1792
+ "mean_std_embedding_keys": null,
1793
+ "action_configs": null
1794
+ }
1795
+ },
1796
+ "toto": {
1797
+ "video": {
1798
+ "delta_indices": [
1799
+ -6,
1800
+ -4,
1801
+ -2,
1802
+ 0
1803
+ ],
1804
+ "modality_keys": [
1805
+ "primary"
1806
+ ],
1807
+ "sin_cos_embedding_keys": null,
1808
+ "mean_std_embedding_keys": null,
1809
+ "action_configs": null
1810
+ },
1811
+ "state": {
1812
+ "delta_indices": [
1813
+ 0
1814
+ ],
1815
+ "modality_keys": [
1816
+ "joint_position",
1817
+ "gripper_position"
1818
+ ],
1819
+ "sin_cos_embedding_keys": null,
1820
+ "mean_std_embedding_keys": null,
1821
+ "action_configs": null
1822
+ },
1823
+ "action": {
1824
+ "delta_indices": [
1825
+ 0,
1826
+ 1,
1827
+ 2,
1828
+ 3,
1829
+ 4,
1830
+ 5,
1831
+ 6,
1832
+ 7,
1833
+ 8,
1834
+ 9,
1835
+ 10,
1836
+ 11,
1837
+ 12,
1838
+ 13,
1839
+ 14,
1840
+ 15
1841
+ ],
1842
+ "modality_keys": [
1843
+ "end_effector_position",
1844
+ "end_effector_rotation",
1845
+ "gripper_close"
1846
+ ],
1847
+ "sin_cos_embedding_keys": null,
1848
+ "mean_std_embedding_keys": null,
1849
+ "action_configs": [
1850
+ {
1851
+ "rep": "DELTA",
1852
+ "type": "EEF",
1853
+ "format": "DEFAULT",
1854
+ "state_key": null
1855
+ },
1856
+ {
1857
+ "rep": "DELTA",
1858
+ "type": "EEF",
1859
+ "format": "DEFAULT",
1860
+ "state_key": null
1861
+ },
1862
+ {
1863
+ "rep": "ABSOLUTE",
1864
+ "type": "NON_EEF",
1865
+ "format": "DEFAULT",
1866
+ "state_key": null
1867
+ }
1868
+ ]
1869
+ },
1870
+ "language": {
1871
+ "delta_indices": [
1872
+ 0
1873
+ ],
1874
+ "modality_keys": [
1875
+ "annotation.human.action.task_description"
1876
+ ],
1877
+ "sin_cos_embedding_keys": null,
1878
+ "mean_std_embedding_keys": null,
1879
+ "action_configs": null
1880
+ }
1881
+ },
1882
+ "language_table": {
1883
+ "video": {
1884
+ "delta_indices": [
1885
+ -6,
1886
+ -4,
1887
+ -2,
1888
+ 0
1889
+ ],
1890
+ "modality_keys": [
1891
+ "primary"
1892
+ ],
1893
+ "sin_cos_embedding_keys": null,
1894
+ "mean_std_embedding_keys": null,
1895
+ "action_configs": null
1896
+ },
1897
+ "state": {
1898
+ "delta_indices": [
1899
+ 0
1900
+ ],
1901
+ "modality_keys": [
1902
+ "end_effector_position"
1903
+ ],
1904
+ "sin_cos_embedding_keys": null,
1905
+ "mean_std_embedding_keys": null,
1906
+ "action_configs": null
1907
+ },
1908
+ "action": {
1909
+ "delta_indices": [
1910
+ 0,
1911
+ 1,
1912
+ 2,
1913
+ 3,
1914
+ 4,
1915
+ 5,
1916
+ 6,
1917
+ 7,
1918
+ 8,
1919
+ 9,
1920
+ 10,
1921
+ 11,
1922
+ 12,
1923
+ 13,
1924
+ 14,
1925
+ 15
1926
+ ],
1927
+ "modality_keys": [
1928
+ "end_effector_position"
1929
+ ],
1930
+ "sin_cos_embedding_keys": null,
1931
+ "mean_std_embedding_keys": null,
1932
+ "action_configs": [
1933
+ {
1934
+ "rep": "DELTA",
1935
+ "type": "EEF",
1936
+ "format": "DEFAULT",
1937
+ "state_key": null
1938
+ }
1939
+ ]
1940
+ },
1941
+ "language": {
1942
+ "delta_indices": [
1943
+ 0
1944
+ ],
1945
+ "modality_keys": [
1946
+ "annotation.human.action.task_description"
1947
+ ],
1948
+ "sin_cos_embedding_keys": null,
1949
+ "mean_std_embedding_keys": null,
1950
+ "action_configs": null
1951
+ }
1952
+ },
1953
+ "nyu_franka_play_dataset_converted_externally_to_rlds": {
1954
+ "video": {
1955
+ "delta_indices": [
1956
+ -6,
1957
+ -4,
1958
+ -2,
1959
+ 0
1960
+ ],
1961
+ "modality_keys": [
1962
+ "primary",
1963
+ "secondary"
1964
+ ],
1965
+ "sin_cos_embedding_keys": null,
1966
+ "mean_std_embedding_keys": null,
1967
+ "action_configs": null
1968
+ },
1969
+ "state": {
1970
+ "delta_indices": [
1971
+ 0
1972
+ ],
1973
+ "modality_keys": [
1974
+ "end_effector_position",
1975
+ "end_effector_rotation",
1976
+ "gripper_position"
1977
+ ],
1978
+ "sin_cos_embedding_keys": null,
1979
+ "mean_std_embedding_keys": null,
1980
+ "action_configs": null
1981
+ },
1982
+ "action": {
1983
+ "delta_indices": [
1984
+ 0,
1985
+ 1,
1986
+ 2,
1987
+ 3,
1988
+ 4,
1989
+ 5,
1990
+ 6,
1991
+ 7,
1992
+ 8,
1993
+ 9,
1994
+ 10,
1995
+ 11,
1996
+ 12,
1997
+ 13,
1998
+ 14,
1999
+ 15
2000
+ ],
2001
+ "modality_keys": [
2002
+ "end_effector_position",
2003
+ "end_effector_rotation",
2004
+ "gripper_close"
2005
+ ],
2006
+ "sin_cos_embedding_keys": null,
2007
+ "mean_std_embedding_keys": null,
2008
+ "action_configs": [
2009
+ {
2010
+ "rep": "DELTA",
2011
+ "type": "EEF",
2012
+ "format": "DEFAULT",
2013
+ "state_key": null
2014
+ },
2015
+ {
2016
+ "rep": "DELTA",
2017
+ "type": "EEF",
2018
+ "format": "DEFAULT",
2019
+ "state_key": null
2020
+ },
2021
+ {
2022
+ "rep": "ABSOLUTE",
2023
+ "type": "NON_EEF",
2024
+ "format": "DEFAULT",
2025
+ "state_key": null
2026
+ }
2027
+ ]
2028
+ },
2029
+ "language": {
2030
+ "delta_indices": [
2031
+ 0
2032
+ ],
2033
+ "modality_keys": [
2034
+ "annotation.human.action.task_description"
2035
+ ],
2036
+ "sin_cos_embedding_keys": null,
2037
+ "mean_std_embedding_keys": null,
2038
+ "action_configs": null
2039
+ }
2040
+ },
2041
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": {
2042
+ "video": {
2043
+ "delta_indices": [
2044
+ -6,
2045
+ -4,
2046
+ -2,
2047
+ 0
2048
+ ],
2049
+ "modality_keys": [
2050
+ "primary"
2051
+ ],
2052
+ "sin_cos_embedding_keys": null,
2053
+ "mean_std_embedding_keys": null,
2054
+ "action_configs": null
2055
+ },
2056
+ "state": {
2057
+ "delta_indices": [
2058
+ 0
2059
+ ],
2060
+ "modality_keys": [
2061
+ "joint_position"
2062
+ ],
2063
+ "sin_cos_embedding_keys": null,
2064
+ "mean_std_embedding_keys": null,
2065
+ "action_configs": null
2066
+ },
2067
+ "action": {
2068
+ "delta_indices": [
2069
+ 0,
2070
+ 1,
2071
+ 2,
2072
+ 3,
2073
+ 4,
2074
+ 5,
2075
+ 6,
2076
+ 7,
2077
+ 8,
2078
+ 9,
2079
+ 10,
2080
+ 11,
2081
+ 12,
2082
+ 13,
2083
+ 14,
2084
+ 15
2085
+ ],
2086
+ "modality_keys": [
2087
+ "end_effector_position",
2088
+ "end_effector_rotation",
2089
+ "gripper_close"
2090
+ ],
2091
+ "sin_cos_embedding_keys": null,
2092
+ "mean_std_embedding_keys": null,
2093
+ "action_configs": [
2094
+ {
2095
+ "rep": "DELTA",
2096
+ "type": "EEF",
2097
+ "format": "DEFAULT",
2098
+ "state_key": null
2099
+ },
2100
+ {
2101
+ "rep": "DELTA",
2102
+ "type": "EEF",
2103
+ "format": "DEFAULT",
2104
+ "state_key": null
2105
+ },
2106
+ {
2107
+ "rep": "ABSOLUTE",
2108
+ "type": "NON_EEF",
2109
+ "format": "DEFAULT",
2110
+ "state_key": null
2111
+ }
2112
+ ]
2113
+ },
2114
+ "language": {
2115
+ "delta_indices": [
2116
+ 0
2117
+ ],
2118
+ "modality_keys": [
2119
+ "annotation.human.action.task_description"
2120
+ ],
2121
+ "sin_cos_embedding_keys": null,
2122
+ "mean_std_embedding_keys": null,
2123
+ "action_configs": null
2124
+ }
2125
+ },
2126
+ "austin_sirius_dataset_converted_externally_to_rlds": {
2127
+ "video": {
2128
+ "delta_indices": [
2129
+ -6,
2130
+ -4,
2131
+ -2,
2132
+ 0
2133
+ ],
2134
+ "modality_keys": [
2135
+ "primary",
2136
+ "wrist"
2137
+ ],
2138
+ "sin_cos_embedding_keys": null,
2139
+ "mean_std_embedding_keys": null,
2140
+ "action_configs": null
2141
+ },
2142
+ "state": {
2143
+ "delta_indices": [
2144
+ 0
2145
+ ],
2146
+ "modality_keys": [
2147
+ "end_effector_position",
2148
+ "end_effector_rotation",
2149
+ "gripper_position"
2150
+ ],
2151
+ "sin_cos_embedding_keys": null,
2152
+ "mean_std_embedding_keys": null,
2153
+ "action_configs": null
2154
+ },
2155
+ "action": {
2156
+ "delta_indices": [
2157
+ 0,
2158
+ 1,
2159
+ 2,
2160
+ 3,
2161
+ 4,
2162
+ 5,
2163
+ 6,
2164
+ 7,
2165
+ 8,
2166
+ 9,
2167
+ 10,
2168
+ 11,
2169
+ 12,
2170
+ 13,
2171
+ 14,
2172
+ 15
2173
+ ],
2174
+ "modality_keys": [
2175
+ "end_effector_position",
2176
+ "end_effector_rotation",
2177
+ "gripper_close"
2178
+ ],
2179
+ "sin_cos_embedding_keys": null,
2180
+ "mean_std_embedding_keys": null,
2181
+ "action_configs": [
2182
+ {
2183
+ "rep": "DELTA",
2184
+ "type": "EEF",
2185
+ "format": "DEFAULT",
2186
+ "state_key": null
2187
+ },
2188
+ {
2189
+ "rep": "DELTA",
2190
+ "type": "EEF",
2191
+ "format": "DEFAULT",
2192
+ "state_key": null
2193
+ },
2194
+ {
2195
+ "rep": "ABSOLUTE",
2196
+ "type": "NON_EEF",
2197
+ "format": "DEFAULT",
2198
+ "state_key": null
2199
+ }
2200
+ ]
2201
+ },
2202
+ "language": {
2203
+ "delta_indices": [
2204
+ 0
2205
+ ],
2206
+ "modality_keys": [
2207
+ "annotation.human.action.task_description"
2208
+ ],
2209
+ "sin_cos_embedding_keys": null,
2210
+ "mean_std_embedding_keys": null,
2211
+ "action_configs": null
2212
+ }
2213
+ },
2214
+ "droid": {
2215
+ "video": {
2216
+ "delta_indices": [
2217
+ -6,
2218
+ -4,
2219
+ -2,
2220
+ 0
2221
+ ],
2222
+ "modality_keys": [
2223
+ "primary",
2224
+ "secondary",
2225
+ "wrist"
2226
+ ],
2227
+ "sin_cos_embedding_keys": null,
2228
+ "mean_std_embedding_keys": null,
2229
+ "action_configs": null
2230
+ },
2231
+ "state": {
2232
+ "delta_indices": [
2233
+ 0
2234
+ ],
2235
+ "modality_keys": [
2236
+ "end_effector_position",
2237
+ "end_effector_rotation",
2238
+ "gripper_position"
2239
+ ],
2240
+ "sin_cos_embedding_keys": null,
2241
+ "mean_std_embedding_keys": null,
2242
+ "action_configs": null
2243
+ },
2244
+ "action": {
2245
+ "delta_indices": [
2246
+ 0,
2247
+ 1,
2248
+ 2,
2249
+ 3,
2250
+ 4,
2251
+ 5,
2252
+ 6,
2253
+ 7,
2254
+ 8,
2255
+ 9,
2256
+ 10,
2257
+ 11,
2258
+ 12,
2259
+ 13,
2260
+ 14,
2261
+ 15
2262
+ ],
2263
+ "modality_keys": [
2264
+ "end_effector_position",
2265
+ "end_effector_rotation",
2266
+ "gripper_close"
2267
+ ],
2268
+ "sin_cos_embedding_keys": null,
2269
+ "mean_std_embedding_keys": null,
2270
+ "action_configs": [
2271
+ {
2272
+ "rep": "DELTA",
2273
+ "type": "EEF",
2274
+ "format": "DEFAULT",
2275
+ "state_key": null
2276
+ },
2277
+ {
2278
+ "rep": "DELTA",
2279
+ "type": "EEF",
2280
+ "format": "DEFAULT",
2281
+ "state_key": null
2282
+ },
2283
+ {
2284
+ "rep": "ABSOLUTE",
2285
+ "type": "NON_EEF",
2286
+ "format": "DEFAULT",
2287
+ "state_key": null
2288
+ }
2289
+ ]
2290
+ },
2291
+ "language": {
2292
+ "delta_indices": [
2293
+ 0
2294
+ ],
2295
+ "modality_keys": [
2296
+ "annotation.human.action.task_description"
2297
+ ],
2298
+ "sin_cos_embedding_keys": null,
2299
+ "mean_std_embedding_keys": null,
2300
+ "action_configs": null
2301
+ }
2302
+ },
2303
+ "bc_z": {
2304
+ "video": {
2305
+ "delta_indices": [
2306
+ -6,
2307
+ -4,
2308
+ -2,
2309
+ 0
2310
+ ],
2311
+ "modality_keys": [
2312
+ "primary"
2313
+ ],
2314
+ "sin_cos_embedding_keys": null,
2315
+ "mean_std_embedding_keys": null,
2316
+ "action_configs": null
2317
+ },
2318
+ "state": {
2319
+ "delta_indices": [
2320
+ 0
2321
+ ],
2322
+ "modality_keys": [
2323
+ "end_effector_position",
2324
+ "end_effector_rotation",
2325
+ "gripper_position"
2326
+ ],
2327
+ "sin_cos_embedding_keys": null,
2328
+ "mean_std_embedding_keys": null,
2329
+ "action_configs": null
2330
+ },
2331
+ "action": {
2332
+ "delta_indices": [
2333
+ 0,
2334
+ 1,
2335
+ 2,
2336
+ 3,
2337
+ 4,
2338
+ 5,
2339
+ 6,
2340
+ 7,
2341
+ 8,
2342
+ 9,
2343
+ 10,
2344
+ 11,
2345
+ 12,
2346
+ 13,
2347
+ 14,
2348
+ 15
2349
+ ],
2350
+ "modality_keys": [
2351
+ "end_effector_position",
2352
+ "end_effector_rotation",
2353
+ "gripper_close"
2354
+ ],
2355
+ "sin_cos_embedding_keys": null,
2356
+ "mean_std_embedding_keys": null,
2357
+ "action_configs": [
2358
+ {
2359
+ "rep": "DELTA",
2360
+ "type": "EEF",
2361
+ "format": "DEFAULT",
2362
+ "state_key": null
2363
+ },
2364
+ {
2365
+ "rep": "DELTA",
2366
+ "type": "EEF",
2367
+ "format": "DEFAULT",
2368
+ "state_key": null
2369
+ },
2370
+ {
2371
+ "rep": "ABSOLUTE",
2372
+ "type": "NON_EEF",
2373
+ "format": "DEFAULT",
2374
+ "state_key": null
2375
+ }
2376
+ ]
2377
+ },
2378
+ "language": {
2379
+ "delta_indices": [
2380
+ 0
2381
+ ],
2382
+ "modality_keys": [
2383
+ "annotation.human.action.task_description"
2384
+ ],
2385
+ "sin_cos_embedding_keys": null,
2386
+ "mean_std_embedding_keys": null,
2387
+ "action_configs": null
2388
+ }
2389
+ },
2390
+ "kuka": {
2391
+ "video": {
2392
+ "delta_indices": [
2393
+ -6,
2394
+ -4,
2395
+ -2,
2396
+ 0
2397
+ ],
2398
+ "modality_keys": [
2399
+ "primary"
2400
+ ],
2401
+ "sin_cos_embedding_keys": null,
2402
+ "mean_std_embedding_keys": null,
2403
+ "action_configs": null
2404
+ },
2405
+ "state": {
2406
+ "delta_indices": [
2407
+ 0
2408
+ ],
2409
+ "modality_keys": [
2410
+ "end_effector_position",
2411
+ "end_effector_rotation",
2412
+ "gripper_position"
2413
+ ],
2414
+ "sin_cos_embedding_keys": null,
2415
+ "mean_std_embedding_keys": null,
2416
+ "action_configs": null
2417
+ },
2418
+ "action": {
2419
+ "delta_indices": [
2420
+ 0,
2421
+ 1,
2422
+ 2,
2423
+ 3,
2424
+ 4,
2425
+ 5,
2426
+ 6,
2427
+ 7,
2428
+ 8,
2429
+ 9,
2430
+ 10,
2431
+ 11,
2432
+ 12,
2433
+ 13,
2434
+ 14,
2435
+ 15
2436
+ ],
2437
+ "modality_keys": [
2438
+ "end_effector_position",
2439
+ "end_effector_rotation",
2440
+ "gripper_close"
2441
+ ],
2442
+ "sin_cos_embedding_keys": null,
2443
+ "mean_std_embedding_keys": null,
2444
+ "action_configs": [
2445
+ {
2446
+ "rep": "DELTA",
2447
+ "type": "EEF",
2448
+ "format": "DEFAULT",
2449
+ "state_key": null
2450
+ },
2451
+ {
2452
+ "rep": "DELTA",
2453
+ "type": "EEF",
2454
+ "format": "DEFAULT",
2455
+ "state_key": null
2456
+ },
2457
+ {
2458
+ "rep": "ABSOLUTE",
2459
+ "type": "NON_EEF",
2460
+ "format": "DEFAULT",
2461
+ "state_key": null
2462
+ }
2463
+ ]
2464
+ },
2465
+ "language": {
2466
+ "delta_indices": [
2467
+ 0
2468
+ ],
2469
+ "modality_keys": [
2470
+ "annotation.human.action.task_description"
2471
+ ],
2472
+ "sin_cos_embedding_keys": null,
2473
+ "mean_std_embedding_keys": null,
2474
+ "action_configs": null
2475
+ }
2476
+ },
2477
+ "agibot_dexhand": {
2478
+ "video": {
2479
+ "delta_indices": [
2480
+ -6,
2481
+ -4,
2482
+ -2,
2483
+ 0
2484
+ ],
2485
+ "modality_keys": [
2486
+ "primary"
2487
+ ],
2488
+ "sin_cos_embedding_keys": null,
2489
+ "mean_std_embedding_keys": null,
2490
+ "action_configs": null
2491
+ },
2492
+ "state": {
2493
+ "delta_indices": [
2494
+ 0
2495
+ ],
2496
+ "modality_keys": [
2497
+ "state"
2498
+ ],
2499
+ "sin_cos_embedding_keys": null,
2500
+ "mean_std_embedding_keys": null,
2501
+ "action_configs": null
2502
+ },
2503
+ "action": {
2504
+ "delta_indices": [
2505
+ 0,
2506
+ 1,
2507
+ 2,
2508
+ 3,
2509
+ 4,
2510
+ 5,
2511
+ 6,
2512
+ 7,
2513
+ 8,
2514
+ 9,
2515
+ 10,
2516
+ 11,
2517
+ 12,
2518
+ 13,
2519
+ 14,
2520
+ 15
2521
+ ],
2522
+ "modality_keys": [
2523
+ "action"
2524
+ ],
2525
+ "sin_cos_embedding_keys": null,
2526
+ "mean_std_embedding_keys": null,
2527
+ "action_configs": [
2528
+ {
2529
+ "rep": "ABSOLUTE",
2530
+ "type": "NON_EEF",
2531
+ "format": "DEFAULT",
2532
+ "state_key": null
2533
+ }
2534
+ ]
2535
+ },
2536
+ "language": {
2537
+ "delta_indices": [
2538
+ 0
2539
+ ],
2540
+ "modality_keys": [
2541
+ "annotation.human.action.task_description"
2542
+ ],
2543
+ "sin_cos_embedding_keys": null,
2544
+ "mean_std_embedding_keys": null,
2545
+ "action_configs": null
2546
+ }
2547
+ },
2548
+ "action_net": {
2549
+ "video": {
2550
+ "delta_indices": [
2551
+ -6,
2552
+ -4,
2553
+ -2,
2554
+ 0
2555
+ ],
2556
+ "modality_keys": [
2557
+ "primary"
2558
+ ],
2559
+ "sin_cos_embedding_keys": null,
2560
+ "mean_std_embedding_keys": null,
2561
+ "action_configs": null
2562
+ },
2563
+ "state": {
2564
+ "delta_indices": [
2565
+ 0
2566
+ ],
2567
+ "modality_keys": [
2568
+ "state"
2569
+ ],
2570
+ "sin_cos_embedding_keys": null,
2571
+ "mean_std_embedding_keys": null,
2572
+ "action_configs": null
2573
+ },
2574
+ "action": {
2575
+ "delta_indices": [
2576
+ 0,
2577
+ 1,
2578
+ 2,
2579
+ 3,
2580
+ 4,
2581
+ 5,
2582
+ 6,
2583
+ 7,
2584
+ 8,
2585
+ 9,
2586
+ 10,
2587
+ 11,
2588
+ 12,
2589
+ 13,
2590
+ 14,
2591
+ 15
2592
+ ],
2593
+ "modality_keys": [
2594
+ "action"
2595
+ ],
2596
+ "sin_cos_embedding_keys": null,
2597
+ "mean_std_embedding_keys": null,
2598
+ "action_configs": [
2599
+ {
2600
+ "rep": "ABSOLUTE",
2601
+ "type": "NON_EEF",
2602
+ "format": "DEFAULT",
2603
+ "state_key": null
2604
+ }
2605
+ ]
2606
+ },
2607
+ "language": {
2608
+ "delta_indices": [
2609
+ 0
2610
+ ],
2611
+ "modality_keys": [
2612
+ "annotation.human.action.task_description"
2613
+ ],
2614
+ "sin_cos_embedding_keys": null,
2615
+ "mean_std_embedding_keys": null,
2616
+ "action_configs": null
2617
+ }
2618
+ },
2619
+ "galaxea": {
2620
+ "video": {
2621
+ "delta_indices": [
2622
+ -6,
2623
+ -4,
2624
+ -2,
2625
+ 0
2626
+ ],
2627
+ "modality_keys": [
2628
+ "primary",
2629
+ "wrist_left",
2630
+ "wrist_right"
2631
+ ],
2632
+ "sin_cos_embedding_keys": null,
2633
+ "mean_std_embedding_keys": null,
2634
+ "action_configs": null
2635
+ },
2636
+ "state": {
2637
+ "delta_indices": [
2638
+ 0
2639
+ ],
2640
+ "modality_keys": [
2641
+ "state"
2642
+ ],
2643
+ "sin_cos_embedding_keys": null,
2644
+ "mean_std_embedding_keys": null,
2645
+ "action_configs": null
2646
+ },
2647
+ "action": {
2648
+ "delta_indices": [
2649
+ 0,
2650
+ 1,
2651
+ 2,
2652
+ 3,
2653
+ 4,
2654
+ 5,
2655
+ 6,
2656
+ 7,
2657
+ 8,
2658
+ 9,
2659
+ 10,
2660
+ 11,
2661
+ 12,
2662
+ 13,
2663
+ 14,
2664
+ 15
2665
+ ],
2666
+ "modality_keys": [
2667
+ "action"
2668
+ ],
2669
+ "sin_cos_embedding_keys": null,
2670
+ "mean_std_embedding_keys": null,
2671
+ "action_configs": [
2672
+ {
2673
+ "rep": "ABSOLUTE",
2674
+ "type": "NON_EEF",
2675
+ "format": "DEFAULT",
2676
+ "state_key": null
2677
+ }
2678
+ ]
2679
+ },
2680
+ "language": {
2681
+ "delta_indices": [
2682
+ 0
2683
+ ],
2684
+ "modality_keys": [
2685
+ "annotation.human.action.task_description"
2686
+ ],
2687
+ "sin_cos_embedding_keys": null,
2688
+ "mean_std_embedding_keys": null,
2689
+ "action_configs": null
2690
+ }
2691
+ },
2692
+ "roboturk": {
2693
+ "video": {
2694
+ "delta_indices": [
2695
+ -6,
2696
+ -4,
2697
+ -2,
2698
+ 0
2699
+ ],
2700
+ "modality_keys": [
2701
+ "primary"
2702
+ ],
2703
+ "sin_cos_embedding_keys": null,
2704
+ "mean_std_embedding_keys": null,
2705
+ "action_configs": null
2706
+ },
2707
+ "state": {
2708
+ "delta_indices": [
2709
+ 0
2710
+ ],
2711
+ "modality_keys": [
2712
+ "none"
2713
+ ],
2714
+ "sin_cos_embedding_keys": null,
2715
+ "mean_std_embedding_keys": null,
2716
+ "action_configs": null
2717
+ },
2718
+ "action": {
2719
+ "delta_indices": [
2720
+ 0,
2721
+ 1,
2722
+ 2,
2723
+ 3,
2724
+ 4,
2725
+ 5,
2726
+ 6,
2727
+ 7,
2728
+ 8,
2729
+ 9,
2730
+ 10,
2731
+ 11,
2732
+ 12,
2733
+ 13,
2734
+ 14,
2735
+ 15
2736
+ ],
2737
+ "modality_keys": [
2738
+ "end_effector_position",
2739
+ "end_effector_rotation",
2740
+ "gripper_close"
2741
+ ],
2742
+ "sin_cos_embedding_keys": null,
2743
+ "mean_std_embedding_keys": null,
2744
+ "action_configs": [
2745
+ {
2746
+ "rep": "DELTA",
2747
+ "type": "EEF",
2748
+ "format": "DEFAULT",
2749
+ "state_key": null
2750
+ },
2751
+ {
2752
+ "rep": "DELTA",
2753
+ "type": "EEF",
2754
+ "format": "DEFAULT",
2755
+ "state_key": null
2756
+ },
2757
+ {
2758
+ "rep": "ABSOLUTE",
2759
+ "type": "NON_EEF",
2760
+ "format": "DEFAULT",
2761
+ "state_key": null
2762
+ }
2763
+ ]
2764
+ },
2765
+ "language": {
2766
+ "delta_indices": [
2767
+ 0
2768
+ ],
2769
+ "modality_keys": [
2770
+ "annotation.human.action.task_description"
2771
+ ],
2772
+ "sin_cos_embedding_keys": null,
2773
+ "mean_std_embedding_keys": null,
2774
+ "action_configs": null
2775
+ }
2776
+ },
2777
+ "berkeley_fanuc_manipulation": {
2778
+ "video": {
2779
+ "delta_indices": [
2780
+ -6,
2781
+ -4,
2782
+ -2,
2783
+ 0
2784
+ ],
2785
+ "modality_keys": [
2786
+ "primary",
2787
+ "wrist"
2788
+ ],
2789
+ "sin_cos_embedding_keys": null,
2790
+ "mean_std_embedding_keys": null,
2791
+ "action_configs": null
2792
+ },
2793
+ "state": {
2794
+ "delta_indices": [
2795
+ 0
2796
+ ],
2797
+ "modality_keys": [
2798
+ "joint_position",
2799
+ "gripper_position"
2800
+ ],
2801
+ "sin_cos_embedding_keys": null,
2802
+ "mean_std_embedding_keys": null,
2803
+ "action_configs": null
2804
+ },
2805
+ "action": {
2806
+ "delta_indices": [
2807
+ 0,
2808
+ 1,
2809
+ 2,
2810
+ 3,
2811
+ 4,
2812
+ 5,
2813
+ 6,
2814
+ 7,
2815
+ 8,
2816
+ 9,
2817
+ 10,
2818
+ 11,
2819
+ 12,
2820
+ 13,
2821
+ 14,
2822
+ 15
2823
+ ],
2824
+ "modality_keys": [
2825
+ "end_effector_position",
2826
+ "end_effector_rotation",
2827
+ "gripper_close"
2828
+ ],
2829
+ "sin_cos_embedding_keys": null,
2830
+ "mean_std_embedding_keys": null,
2831
+ "action_configs": [
2832
+ {
2833
+ "rep": "DELTA",
2834
+ "type": "EEF",
2835
+ "format": "DEFAULT",
2836
+ "state_key": null
2837
+ },
2838
+ {
2839
+ "rep": "DELTA",
2840
+ "type": "EEF",
2841
+ "format": "DEFAULT",
2842
+ "state_key": null
2843
+ },
2844
+ {
2845
+ "rep": "ABSOLUTE",
2846
+ "type": "NON_EEF",
2847
+ "format": "DEFAULT",
2848
+ "state_key": null
2849
+ }
2850
+ ]
2851
+ },
2852
+ "language": {
2853
+ "delta_indices": [
2854
+ 0
2855
+ ],
2856
+ "modality_keys": [
2857
+ "annotation.human.action.task_description"
2858
+ ],
2859
+ "sin_cos_embedding_keys": null,
2860
+ "mean_std_embedding_keys": null,
2861
+ "action_configs": null
2862
+ }
2863
+ },
2864
+ "jaco_play": {
2865
+ "video": {
2866
+ "delta_indices": [
2867
+ -6,
2868
+ -4,
2869
+ -2,
2870
+ 0
2871
+ ],
2872
+ "modality_keys": [
2873
+ "primary",
2874
+ "wrist"
2875
+ ],
2876
+ "sin_cos_embedding_keys": null,
2877
+ "mean_std_embedding_keys": null,
2878
+ "action_configs": null
2879
+ },
2880
+ "state": {
2881
+ "delta_indices": [
2882
+ 0
2883
+ ],
2884
+ "modality_keys": [
2885
+ "end_effector_position",
2886
+ "end_effector_rotation",
2887
+ "gripper_position"
2888
+ ],
2889
+ "sin_cos_embedding_keys": null,
2890
+ "mean_std_embedding_keys": null,
2891
+ "action_configs": null
2892
+ },
2893
+ "action": {
2894
+ "delta_indices": [
2895
+ 0,
2896
+ 1,
2897
+ 2,
2898
+ 3,
2899
+ 4,
2900
+ 5,
2901
+ 6,
2902
+ 7,
2903
+ 8,
2904
+ 9,
2905
+ 10,
2906
+ 11,
2907
+ 12,
2908
+ 13,
2909
+ 14,
2910
+ 15
2911
+ ],
2912
+ "modality_keys": [
2913
+ "end_effector_position",
2914
+ "end_effector_rotation",
2915
+ "gripper_close"
2916
+ ],
2917
+ "sin_cos_embedding_keys": null,
2918
+ "mean_std_embedding_keys": null,
2919
+ "action_configs": [
2920
+ {
2921
+ "rep": "DELTA",
2922
+ "type": "EEF",
2923
+ "format": "DEFAULT",
2924
+ "state_key": null
2925
+ },
2926
+ {
2927
+ "rep": "DELTA",
2928
+ "type": "EEF",
2929
+ "format": "DEFAULT",
2930
+ "state_key": null
2931
+ },
2932
+ {
2933
+ "rep": "ABSOLUTE",
2934
+ "type": "NON_EEF",
2935
+ "format": "DEFAULT",
2936
+ "state_key": null
2937
+ }
2938
+ ]
2939
+ },
2940
+ "language": {
2941
+ "delta_indices": [
2942
+ 0
2943
+ ],
2944
+ "modality_keys": [
2945
+ "annotation.human.action.task_description"
2946
+ ],
2947
+ "sin_cos_embedding_keys": null,
2948
+ "mean_std_embedding_keys": null,
2949
+ "action_configs": null
2950
+ }
2951
+ }
2952
+ },
2953
+ "random_rotation_angle": null,
2954
+ "color_jitter_params": {
2955
+ "brightness": 0.3,
2956
+ "contrast": 0.4,
2957
+ "saturation": 0.5,
2958
+ "hue": 0.08
2959
+ },
2960
+ "model_name": "RLWRLD/RLDX-1-VLM",
2961
+ "model_type": "vtc_qwen3_vl",
2962
+ "formalize_language": true,
2963
+ "max_state_dim": 64,
2964
+ "max_action_dim": 64,
2965
+ "max_action_horizon": 16,
2966
+ "use_percentiles": true,
2967
+ "clip_outliers": true,
2968
+ "apply_sincos_state_encoding": false,
2969
+ "use_relative_action": true,
2970
+ "memory_length": 1,
2971
+ "general_embodiment_train_ratio": 0.03125,
2972
+ "image_max_area": 65536,
2973
+ "image_resize_m": 32,
2974
+ "random_crop_fraction": null
2975
+ }
2976
+ }
processor/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
teaser.png ADDED

Git LFS Details

  • SHA256: 6b34b11f6c8e2699766e26aa210be9e4b3e5f3f8f45ed009ae5c7ef07c7c7cd7
  • Pointer size: 133 Bytes
  • Size of remote file: 10.4 MB