jaehyunkang committed on
Commit
e8279a3
·
0 Parent(s):

RLDX-1 Release

Browse files
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ teaser.png filter=lfs diff=lfs merge=lfs -text
37
+ architecture.png filter=lfs diff=lfs merge=lfs -text
LICENSE.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RLWRLD Model License v1.0
2
+
3
+ ## 1. Definitions
4
+
5
+ "Licensor" means RLWRLD, INC. and its affiliates.
6
+
7
+ "Model" means the machine learning model, including learnt weights, parameters, configuration files, and documentation made available under this license.
8
+
9
+ "Derivative Model" means all (a) modifications to the Model, (b) works based on the Model, and (c) any other derivative works of the Model, including models fine-tuned from the Model.
10
+
11
+ "You" means an individual or legal entity exercising permissions granted by this license.
12
+
13
+ ## 2. License Grant
14
+
15
+ Subject to the terms and conditions of this license, Licensor grants to You a perpetual, worldwide, non-exclusive, royalty-free license to use, reproduce, prepare derivative works of, publicly display, publicly perform, and distribute the Model and any Derivative Models.
16
+
17
+ ## 3. Conditions and Limitations
18
+
19
+ **3.1 Non-Commercial Use.** The Model and any Derivative Models may only be used for non-commercial purposes. "Non-commercial" means for academic research, educational, personal, or evaluation purposes only, and does not include any use primarily intended for or directed toward commercial advantage or monetary compensation.
20
+
21
+ **3.2 Attribution.** You must give appropriate credit to Licensor, provide a link to this license, and indicate if changes were made. You must include the following attribution notice with any distribution of the Model or Derivative Model:
22
+
23
+ > "Licensed under the RLWRLD Model License v1.0"
24
+
25
+ **3.3 Share-Alike.** If You distribute a Derivative Model, You must do so under this same license, or another license that includes at minimum (a) a non-commercial use limitation no less restrictive than Section 3.1 and (b) a share-alike requirement no less restrictive than this Section 3.3.
26
+
27
+ **3.4 Redistribution.** You may distribute copies of the Model or Derivative Models provided that You (a) include a complete copy of this license, (b) retain all copyright, trademark, and attribution notices, and (c) comply with all conditions in this Section 3.
28
+
29
+ **3.5 Use Restrictions.** The Model and any Derivative Models shall not be used for: (a) military, weapons development, or defense applications; (b) surveillance or monitoring of individuals without their consent; or (c) any use that violates applicable laws or regulations.
30
+
31
+ **3.6 Trademarks.** This license does not grant any rights to use Licensor's names, logos, or trademarks, except as required for reasonable and customary use in describing the origin of the Model and reproducing the notices described in this license.
32
+
33
+ **3.7 Patent Claims.** If You or Your affiliate(s) bring or threaten to bring any claim or litigation (including any claim, cross-claim, or counterclaim in a lawsuit) against any entity to enforce any patents that You allege are infringed by the Model, then any rights granted to You under this license will terminate immediately.
34
+
35
+ **3.8 Termination.** If You violate any term of this license, Your rights under this license will terminate immediately.
36
+
37
+ ## 4. Third-Party Components
38
+
39
+ The Model may include or be distributed with third-party components that are subject to separate license terms and notices. Such components are subject to their respective licenses, including any notices and disclaimers contained therein. Licensor does not grant any rights with respect to third-party components beyond those provided under the applicable third-party licenses.
40
+
41
+ ## 5. Disclaimer of Warranty
42
+
43
+ THE MODEL IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NONINFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE MODEL, DERIVATIVE MODELS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MODEL, DERIVATIVE MODELS AND ANY OUTPUT AND RESULTS.
44
+
45
+ ## 6. Limitation of Liability
46
+
47
+ IN NO EVENT SHALL LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE OR THE USE OR INABILITY TO USE THE MODEL, DERIVATIVE MODELS, OR ANY OUTPUTS THEREOF, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48
+
49
+ ## 7. Indemnity
50
+
51
+ You shall indemnify and hold harmless Licensor from and against any claim by any third party arising out of or related to Your use or distribution of the Model, Derivative Models, or any outputs thereof.
52
+
53
+ ## 8. Feedback
54
+
55
+ If You provide feedback, suggestions, or improvements regarding the Model, Licensor may use such feedback without restriction or compensation to You.
56
+
57
+ ## 9. General Provisions
58
+
59
+ **9.1 Governing Law.** This license will be governed by and construed in accordance with the laws of the State of Delaware, United States, without regard to its conflict of laws rules. The UN Convention on Contracts for International Sale of Goods does not apply to this license.
60
+
61
+ **9.2 License Updates.** Licensor may update this license to comply with legal and regulatory requirements at any time. You agree to either comply with any updated license or cease Your use and distribution of the Model and any Derivative Model.
README.md ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: rlwrld-model-license-v1.0
4
+ license_link: LICENSE.md
5
+ library_name: transformers
6
+ pipeline_tag: robotics
7
+ tags:
8
+ - robotics
9
+ - vla
10
+ - vision-language-action
11
+ - manipulation
12
+ - flow-matching
13
+ - rldx
14
+ - robocasa
15
+ base_model: RLWRLD/RLDX-1-PT
16
+ ---
17
+
18
+ # RLDX-1-FT-ROBOCASA
19
+
20
+ [Paper](https://arxiv.org/abs/2605.03269)  ·  [Project page](https://rlwrld.ai/rldx-1)  ·  [Code](https://github.com/RLWRLD/RLDX-1)  ·  [Models](https://huggingface.co/collections/RLWRLD/rldx-1)
21
+
22
+ <p align="center">
23
+ <img src="teaser.png" width="100%" alt="RLDX-1 teaser">
24
+ </p>
25
+
26
+ **RLDX-1** is a general-purpose Robot Foundation Model designed for dexterous
27
+ manipulation. Powered by a **Multi-Stream Action Transformer (MSAT)**, it
28
+ seamlessly unifies multimodal perception (visual + tactile), high-DoF
29
+ actuation, and memory-aware decision-making in a single architecture.
30
+
31
+ This repository hosts **`RLDX-1-FT-ROBOCASA`** — RLDX-1 finetuned on the
32
+ **RoboCasa Kitchen** benchmark (24 tasks). It achieves **70.6%** average
33
+ success on the held-out evaluation suite.
34
+
35
+ ## Highlights
36
+
37
+ - **Multi-Stream Action Transformer (MSAT).** Cognition, physics, and
38
+ action each get a dedicated stream coupled by joint self-attention —
39
+ an extension of MM-DiT to action modeling.
40
+ - **Motion awareness.** Multi-frame observations + a motion module
41
+ capture temporal dynamics; intermediate VLM layers compress video
42
+ tokens to keep the policy efficient.
43
+ - **Long-term memory.** A memory module fuses past cognition features
44
+ with the current ones for history-grounded decisions beyond a short
45
+ multi-frame window.
46
+ - **Physical sensing.** Tactile and torque enter as a dedicated physics
47
+ stream; the decoder is jointly trained to predict future physical
48
+ signals.
49
+ - **Three-stage training.** Pre-training (generalization) → mid-training
50
+ (functionality) → post-training (task adaptation), with synthetic data
51
+ augmenting rare manipulation scenarios.
52
+ - **Real-time inference.** Static graph capture + custom fused kernels
53
+ bring the all-modality model to **43.7 ms / step on RTX 5090
54
+ (1.63× speedup, >22 Hz)**.
55
+
56
+ ## Performance
57
+
58
+ | Benchmark | Success Rate |
59
+ |---|---|
60
+ | RoboCasa Kitchen (24-task avg) | **70.6%** |
61
+
62
+ ## Quick start
63
+
64
+ ### Installation
65
+
66
+ ```bash
67
+ git clone https://github.com/RLWRLD/RLDX-1.git
68
+ cd RLDX-1
69
+ uv sync --python 3.10
70
+ uv pip install -e .
71
+ ```
72
+
73
+ ### Inference
74
+
75
+ ```python
76
+ from rldx.policy.rldx_policy import RLDXPolicy
77
+ from rldx.data.embodiment_tags import EmbodimentTag
78
+
79
+ policy = RLDXPolicy(
80
+ model_path="RLWRLD/RLDX-1-FT-ROBOCASA",
81
+ embodiment_tag=EmbodimentTag.GENERAL_EMBODIMENT,
82
+ device="cuda:0",
83
+ )
84
+
85
+ action = policy.get_action(observation)
86
+ ```
87
+
88
+ ### Real-time serving (ZeroMQ)
89
+
90
+ For real-robot or simulator deployment, run the policy as a server and
91
+ connect with `PolicyClient`:
92
+
93
+ ```bash
94
+ uv run python rldx/eval/run_rldx_server.py \
95
+ --model-path RLWRLD/RLDX-1-FT-ROBOCASA \
96
+ --embodiment-tag GENERAL_EMBODIMENT \
97
+ --host 0.0.0.0 --port 20000
98
+ ```
99
+
100
+ To reproduce the benchmark numbers end-to-end, see
101
+ [`run_scripts/eval/robocasa_kitchen/README.md`](https://github.com/RLWRLD/RLDX-1/blob/main/run_scripts/eval/robocasa_kitchen/README.md).
102
+
103
+ ## Model details
104
+
105
+ - **Architecture:** Multi-Stream Action Transformer (MSAT) policy on a
106
+ Qwen3-VL backbone with cognition-token perceptual summary. Trained with
107
+ flow matching.
108
+ - **Inputs:** RGB video (default 4 frames), state proprioception, language
109
+ instruction.
110
+ - **Outputs:** Action chunks of length 16.
111
+ - **Embodiment tag:** `GENERAL_EMBODIMENT`.
112
+ - **Base model:** [`RLWRLD/RLDX-1-PT`](https://huggingface.co/RLWRLD/RLDX-1-PT).
113
+ - **Backbone:** [`Qwen/Qwen3-VL-8B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct).
114
+ - **Finetune data:** RoboCasa Kitchen (24 tasks).
115
+ - **Params:** 6.9B.
116
+
117
+ For the full architectural walkthrough see
118
+ [`docs/architecture.md`](https://github.com/RLWRLD/RLDX-1/blob/main/docs/architecture.md).
119
+
120
+ ## RLDX-1 model family
121
+
122
+ | Checkpoint | Description |
123
+ |---|---|
124
+ | [`RLDX-1-PT`](https://huggingface.co/RLWRLD/RLDX-1-PT) | Multi-source pretrained foundation |
125
+ | [`RLDX-1-VLM`](https://huggingface.co/RLWRLD/RLDX-1-VLM) | Qwen3-VL-8B vision-language backbone |
126
+ | [`RLDX-1-FT-ROBOCASA`](https://huggingface.co/RLWRLD/RLDX-1-FT-ROBOCASA) | RoboCasa Kitchen 24-task finetune (this repo) |
127
+ | [`RLDX-1-FT-RC365`](https://huggingface.co/RLWRLD/RLDX-1-FT-RC365) | RoboCasa-365 cross-task finetune |
128
+ | [`RLDX-1-FT-LIBERO`](https://huggingface.co/RLWRLD/RLDX-1-FT-LIBERO) | LIBERO 4-task suite (goal, object, spatial, long) finetune |
129
+ | [`RLDX-1-FT-SIMPLER-GOOGLE`](https://huggingface.co/RLWRLD/RLDX-1-FT-SIMPLER-GOOGLE) | SIMPLER Google VM/VA finetune |
130
+ | [`RLDX-1-FT-SIMPLER-WIDOWX`](https://huggingface.co/RLWRLD/RLDX-1-FT-SIMPLER-WIDOWX) | SIMPLER WidowX finetune |
131
+ | [`RLDX-1-FT-GR1`](https://huggingface.co/RLWRLD/RLDX-1-FT-GR1) | GR-1 Tabletop finetune |
132
+ | [`RLDX-1-MT-DROID`](https://huggingface.co/RLWRLD/RLDX-1-MT-DROID) | DROID mid-train |
133
+ | [`RLDX-1-MT-ALLEX`](https://huggingface.co/RLWRLD/RLDX-1-MT-ALLEX) | All add-ons (memory + motion + physics + video) |
134
+
135
+ ## Intended use & limitations
136
+
137
+ **Intended use.** Research on robotic manipulation, simulation benchmarking
138
+ on RoboCasa Kitchen, and non-commercial real-robot deployment under the
139
+ conditions of the RLWRLD Model License v1.0.
140
+
141
+ **Out of scope.** Commercial deployment, military or weapons applications,
142
+ non-consensual surveillance, and any use that violates applicable laws or
143
+ regulations. See [`LICENSE.md`](LICENSE.md) §3.1 and §3.5 for the full terms.
144
+
145
+ **Limitations.** Performance is reported on RoboCasa Kitchen's 24-task
146
+ suite; out-of-distribution kitchen layouts, novel object instances, or
147
+ non-Franka embodiments are not guaranteed to work. For other embodiments or
148
+ datasets, finetune from
149
+ [`RLDX-1-PT`](https://huggingface.co/RLWRLD/RLDX-1-PT) instead.
150
+
151
+ ## Citation
152
+
153
+ ```bibtex
154
+ @article{rldx2026,
155
+ title={RLDX-1 Technical Report},
156
+ author={Kim, Dongyoung and Jang, Huiwon and Koo, Myungkyu and Jang, Suhyeok and Kim, Taeyoung and others},
157
+ year={2026},
158
+ note={RLWRLD},
159
+ eprint={2605.03269},
160
+ archivePrefix={arXiv},
161
+ url={https://arxiv.org/abs/2605.03269}
162
+ }
163
+ ```
164
+
165
+ ## License
166
+
167
+ Released under the **RLWRLD Model License v1.0** — a non-commercial license
168
+ with attribution and share-alike requirements. See [`LICENSE.md`](LICENSE.md) for
169
+ the full text. By using this model you agree to those terms, including the
170
+ use restrictions in §3.5.
architecture.png ADDED

Git LFS Details

  • SHA256: 8d0e305139502965d4289446add15e9e11c34dcc8106ad526fa8c957c12595d3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RLDX"
4
+ ],
5
+ "attn_implementation": null,
6
+ "backbone_model_type": "vtc_qwen3_vl",
7
+ "backbone_trainable_params_fp32": true,
8
+ "color_jitter_params": {
9
+ "brightness": 0.3,
10
+ "contrast": 0.4,
11
+ "hue": 0.08,
12
+ "saturation": 0.5
13
+ },
14
+ "conversation_image_first": false,
15
+ "diffusion_model_cfg": {
16
+ "action_model_max_seq_len": 512,
17
+ "attention_head_dim": 64,
18
+ "depth_multi_stream": 4,
19
+ "depth_single_stream": 8,
20
+ "dropout": 0.2,
21
+ "final_dropout": true,
22
+ "num_attention_heads": 24,
23
+ "output_dim": 1024,
24
+ "positional_embeddings": "rope_sa_only",
25
+ "pre_norm": "layer_norm",
26
+ "qk_norm": "rms_norm",
27
+ "rope_theta": 10000.0,
28
+ "sa_dim": 1536,
29
+ "set_triple_stream_for_mq": false,
30
+ "set_triple_stream_for_state": false,
31
+ "temb_type": "input_token",
32
+ "use_swiglu": true,
33
+ "vl_dim": 4096
34
+ },
35
+ "dtype": "bfloat16",
36
+ "load_bf16": true,
37
+ "memory_cfg": {
38
+ "hidden_size": 4096,
39
+ "intermediate_size": 16384,
40
+ "max_position_embeddings": 32,
41
+ "num_attention_heads": 16,
42
+ "num_hidden_layers": 2,
43
+ "num_key_value_heads": 16,
44
+ "rms_norm_eps": 1e-05,
45
+ "use_causal_attn": true,
46
+ "use_rope": true
47
+ },
48
+ "memory_video_delta_indices": [
49
+ -48,
50
+ -32,
51
+ -16,
52
+ 0
53
+ ],
54
+ "model_name": "RLWRLD/RLDX-1-VLM",
55
+ "model_type": "RLDX-1",
56
+ "n_cog_tokens": 64,
57
+ "general_embodiment_train_ratio": 0,
58
+ "qwen3_collator": true,
59
+ "random_rotation_angle": null,
60
+ "reproject_vision": false,
61
+ "state_dropout_prob": 0.0,
62
+ "transformers_version": "4.57.0",
63
+ "tune_diffusion_model": true,
64
+ "tune_llm": false,
65
+ "tune_projector": true,
66
+ "tune_top_llm_layers": 4,
67
+ "tune_visual": false,
68
+ "tune_vlln": true,
69
+ "use_relative_action": true,
70
+ "use_video": true,
71
+ "video_length": 4
72
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae38df4a1a9f6bb51a1d8a486600829ebe40831480240e558c30837a3ad3c229
3
+ size 4912540968
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7509f02e210e87ff1c68d537e230c68d645d321e5b20c3fb1b8864bc28630967
3
+ size 4446192352
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f85534ec6cc53a99765c85ecc0a7a147655d5985095ec54643def1bc3e45cdcf
3
+ size 4467155576
model.safetensors.index.json ADDED
@@ -0,0 +1,730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 6912894784,
4
+ "total_size": 13825888896
5
+ },
6
+ "weight_map": {
7
+ "backbone.cog_emb": "model-00001-of-00003.safetensors",
8
+ "backbone.qwen_model.model.language_model.embed_tokens.weight": "model-00001-of-00003.safetensors",
9
+ "backbone.qwen_model.model.language_model.layers.0.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
10
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
11
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
12
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "backbone.qwen_model.model.language_model.layers.0.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
14
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
15
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
16
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
17
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
18
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
19
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
20
+ "backbone.qwen_model.model.language_model.layers.1.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
21
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
22
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
23
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
24
+ "backbone.qwen_model.model.language_model.layers.1.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
25
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
26
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
27
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
28
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
29
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
30
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
31
+ "backbone.qwen_model.model.language_model.layers.2.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
32
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
33
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
34
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
35
+ "backbone.qwen_model.model.language_model.layers.2.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
36
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
37
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
38
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
39
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
40
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
41
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
42
+ "backbone.qwen_model.model.language_model.layers.3.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
43
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
44
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
45
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
46
+ "backbone.qwen_model.model.language_model.layers.3.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
47
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
48
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
49
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
50
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
51
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
52
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
53
+ "backbone.qwen_model.model.language_model.layers.4.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
54
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
55
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
56
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
57
+ "backbone.qwen_model.model.language_model.layers.4.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
58
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
59
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
60
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
61
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
62
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
63
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
64
+ "backbone.qwen_model.model.language_model.layers.5.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
65
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
66
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
67
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
68
+ "backbone.qwen_model.model.language_model.layers.5.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
69
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
70
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
71
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
72
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
73
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
74
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
75
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
76
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
77
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
78
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
79
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
80
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
81
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
82
+ "backbone.qwen_model.model.visual.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors",
83
+ "backbone.qwen_model.model.visual.blocks.0.attn.proj.weight": "model-00001-of-00003.safetensors",
84
+ "backbone.qwen_model.model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors",
85
+ "backbone.qwen_model.model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors",
86
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
87
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
88
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
89
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
90
+ "backbone.qwen_model.model.visual.blocks.0.norm1.bias": "model-00001-of-00003.safetensors",
91
+ "backbone.qwen_model.model.visual.blocks.0.norm1.weight": "model-00001-of-00003.safetensors",
92
+ "backbone.qwen_model.model.visual.blocks.0.norm2.bias": "model-00001-of-00003.safetensors",
93
+ "backbone.qwen_model.model.visual.blocks.0.norm2.weight": "model-00001-of-00003.safetensors",
94
+ "backbone.qwen_model.model.visual.blocks.1.attn.proj.bias": "model-00001-of-00003.safetensors",
95
+ "backbone.qwen_model.model.visual.blocks.1.attn.proj.weight": "model-00001-of-00003.safetensors",
96
+ "backbone.qwen_model.model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00003.safetensors",
97
+ "backbone.qwen_model.model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00003.safetensors",
98
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
99
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
100
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
101
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
102
+ "backbone.qwen_model.model.visual.blocks.1.norm1.bias": "model-00001-of-00003.safetensors",
103
+ "backbone.qwen_model.model.visual.blocks.1.norm1.weight": "model-00001-of-00003.safetensors",
104
+ "backbone.qwen_model.model.visual.blocks.1.norm2.bias": "model-00001-of-00003.safetensors",
105
+ "backbone.qwen_model.model.visual.blocks.1.norm2.weight": "model-00001-of-00003.safetensors",
106
+ "backbone.qwen_model.model.visual.blocks.10.attn.proj.bias": "model-00001-of-00003.safetensors",
107
+ "backbone.qwen_model.model.visual.blocks.10.attn.proj.weight": "model-00001-of-00003.safetensors",
108
+ "backbone.qwen_model.model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00003.safetensors",
109
+ "backbone.qwen_model.model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00003.safetensors",
110
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
111
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
112
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
113
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
114
+ "backbone.qwen_model.model.visual.blocks.10.norm1.bias": "model-00001-of-00003.safetensors",
115
+ "backbone.qwen_model.model.visual.blocks.10.norm1.weight": "model-00001-of-00003.safetensors",
116
+ "backbone.qwen_model.model.visual.blocks.10.norm2.bias": "model-00001-of-00003.safetensors",
117
+ "backbone.qwen_model.model.visual.blocks.10.norm2.weight": "model-00001-of-00003.safetensors",
118
+ "backbone.qwen_model.model.visual.blocks.11.attn.proj.bias": "model-00001-of-00003.safetensors",
119
+ "backbone.qwen_model.model.visual.blocks.11.attn.proj.weight": "model-00001-of-00003.safetensors",
120
+ "backbone.qwen_model.model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00003.safetensors",
121
+ "backbone.qwen_model.model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00003.safetensors",
122
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
123
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
124
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
125
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
126
+ "backbone.qwen_model.model.visual.blocks.11.norm1.bias": "model-00001-of-00003.safetensors",
127
+ "backbone.qwen_model.model.visual.blocks.11.norm1.weight": "model-00001-of-00003.safetensors",
128
+ "backbone.qwen_model.model.visual.blocks.11.norm2.bias": "model-00001-of-00003.safetensors",
129
+ "backbone.qwen_model.model.visual.blocks.11.norm2.weight": "model-00001-of-00003.safetensors",
130
+ "backbone.qwen_model.model.visual.blocks.12.attn.proj.bias": "model-00001-of-00003.safetensors",
131
+ "backbone.qwen_model.model.visual.blocks.12.attn.proj.weight": "model-00001-of-00003.safetensors",
132
+ "backbone.qwen_model.model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00003.safetensors",
133
+ "backbone.qwen_model.model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00003.safetensors",
134
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
135
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
136
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
137
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
138
+ "backbone.qwen_model.model.visual.blocks.12.norm1.bias": "model-00001-of-00003.safetensors",
139
+ "backbone.qwen_model.model.visual.blocks.12.norm1.weight": "model-00001-of-00003.safetensors",
140
+ "backbone.qwen_model.model.visual.blocks.12.norm2.bias": "model-00001-of-00003.safetensors",
141
+ "backbone.qwen_model.model.visual.blocks.12.norm2.weight": "model-00001-of-00003.safetensors",
142
+ "backbone.qwen_model.model.visual.blocks.13.attn.proj.bias": "model-00001-of-00003.safetensors",
143
+ "backbone.qwen_model.model.visual.blocks.13.attn.proj.weight": "model-00001-of-00003.safetensors",
144
+ "backbone.qwen_model.model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00003.safetensors",
145
+ "backbone.qwen_model.model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00003.safetensors",
146
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
147
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
148
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
149
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
150
+ "backbone.qwen_model.model.visual.blocks.13.norm1.bias": "model-00001-of-00003.safetensors",
151
+ "backbone.qwen_model.model.visual.blocks.13.norm1.weight": "model-00001-of-00003.safetensors",
152
+ "backbone.qwen_model.model.visual.blocks.13.norm2.bias": "model-00001-of-00003.safetensors",
153
+ "backbone.qwen_model.model.visual.blocks.13.norm2.weight": "model-00001-of-00003.safetensors",
154
+ "backbone.qwen_model.model.visual.blocks.14.attn.proj.bias": "model-00001-of-00003.safetensors",
155
+ "backbone.qwen_model.model.visual.blocks.14.attn.proj.weight": "model-00001-of-00003.safetensors",
156
+ "backbone.qwen_model.model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00003.safetensors",
157
+ "backbone.qwen_model.model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00003.safetensors",
158
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
159
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
160
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
161
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
162
+ "backbone.qwen_model.model.visual.blocks.14.norm1.bias": "model-00001-of-00003.safetensors",
163
+ "backbone.qwen_model.model.visual.blocks.14.norm1.weight": "model-00001-of-00003.safetensors",
164
+ "backbone.qwen_model.model.visual.blocks.14.norm2.bias": "model-00001-of-00003.safetensors",
165
+ "backbone.qwen_model.model.visual.blocks.14.norm2.weight": "model-00001-of-00003.safetensors",
166
+ "backbone.qwen_model.model.visual.blocks.15.attn.proj.bias": "model-00001-of-00003.safetensors",
167
+ "backbone.qwen_model.model.visual.blocks.15.attn.proj.weight": "model-00001-of-00003.safetensors",
168
+ "backbone.qwen_model.model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00003.safetensors",
169
+ "backbone.qwen_model.model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00003.safetensors",
170
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
171
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
172
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
173
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
174
+ "backbone.qwen_model.model.visual.blocks.15.norm1.bias": "model-00001-of-00003.safetensors",
175
+ "backbone.qwen_model.model.visual.blocks.15.norm1.weight": "model-00001-of-00003.safetensors",
176
+ "backbone.qwen_model.model.visual.blocks.15.norm2.bias": "model-00001-of-00003.safetensors",
177
+ "backbone.qwen_model.model.visual.blocks.15.norm2.weight": "model-00001-of-00003.safetensors",
178
+ "backbone.qwen_model.model.visual.blocks.16.attn.proj.bias": "model-00001-of-00003.safetensors",
179
+ "backbone.qwen_model.model.visual.blocks.16.attn.proj.weight": "model-00001-of-00003.safetensors",
180
+ "backbone.qwen_model.model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00003.safetensors",
181
+ "backbone.qwen_model.model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00003.safetensors",
182
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
183
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
184
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
185
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
186
+ "backbone.qwen_model.model.visual.blocks.16.norm1.bias": "model-00001-of-00003.safetensors",
187
+ "backbone.qwen_model.model.visual.blocks.16.norm1.weight": "model-00001-of-00003.safetensors",
188
+ "backbone.qwen_model.model.visual.blocks.16.norm2.bias": "model-00001-of-00003.safetensors",
189
+ "backbone.qwen_model.model.visual.blocks.16.norm2.weight": "model-00001-of-00003.safetensors",
190
+ "backbone.qwen_model.model.visual.blocks.17.attn.proj.bias": "model-00001-of-00003.safetensors",
191
+ "backbone.qwen_model.model.visual.blocks.17.attn.proj.weight": "model-00001-of-00003.safetensors",
192
+ "backbone.qwen_model.model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00003.safetensors",
193
+ "backbone.qwen_model.model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00003.safetensors",
194
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
195
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
196
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
197
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
198
+ "backbone.qwen_model.model.visual.blocks.17.norm1.bias": "model-00001-of-00003.safetensors",
199
+ "backbone.qwen_model.model.visual.blocks.17.norm1.weight": "model-00001-of-00003.safetensors",
200
+ "backbone.qwen_model.model.visual.blocks.17.norm2.bias": "model-00001-of-00003.safetensors",
201
+ "backbone.qwen_model.model.visual.blocks.17.norm2.weight": "model-00001-of-00003.safetensors",
202
+ "backbone.qwen_model.model.visual.blocks.18.attn.proj.bias": "model-00001-of-00003.safetensors",
203
+ "backbone.qwen_model.model.visual.blocks.18.attn.proj.weight": "model-00001-of-00003.safetensors",
204
+ "backbone.qwen_model.model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00003.safetensors",
205
+ "backbone.qwen_model.model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00003.safetensors",
206
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
207
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
208
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
209
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
210
+ "backbone.qwen_model.model.visual.blocks.18.norm1.bias": "model-00001-of-00003.safetensors",
211
+ "backbone.qwen_model.model.visual.blocks.18.norm1.weight": "model-00001-of-00003.safetensors",
212
+ "backbone.qwen_model.model.visual.blocks.18.norm2.bias": "model-00001-of-00003.safetensors",
213
+ "backbone.qwen_model.model.visual.blocks.18.norm2.weight": "model-00001-of-00003.safetensors",
214
+ "backbone.qwen_model.model.visual.blocks.19.attn.proj.bias": "model-00001-of-00003.safetensors",
215
+ "backbone.qwen_model.model.visual.blocks.19.attn.proj.weight": "model-00001-of-00003.safetensors",
216
+ "backbone.qwen_model.model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00003.safetensors",
217
+ "backbone.qwen_model.model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00003.safetensors",
218
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
219
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
220
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
221
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
222
+ "backbone.qwen_model.model.visual.blocks.19.norm1.bias": "model-00001-of-00003.safetensors",
223
+ "backbone.qwen_model.model.visual.blocks.19.norm1.weight": "model-00001-of-00003.safetensors",
224
+ "backbone.qwen_model.model.visual.blocks.19.norm2.bias": "model-00001-of-00003.safetensors",
225
+ "backbone.qwen_model.model.visual.blocks.19.norm2.weight": "model-00001-of-00003.safetensors",
226
+ "backbone.qwen_model.model.visual.blocks.2.attn.proj.bias": "model-00001-of-00003.safetensors",
227
+ "backbone.qwen_model.model.visual.blocks.2.attn.proj.weight": "model-00001-of-00003.safetensors",
228
+ "backbone.qwen_model.model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00003.safetensors",
229
+ "backbone.qwen_model.model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00003.safetensors",
230
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
231
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
232
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
233
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
234
+ "backbone.qwen_model.model.visual.blocks.2.norm1.bias": "model-00001-of-00003.safetensors",
235
+ "backbone.qwen_model.model.visual.blocks.2.norm1.weight": "model-00001-of-00003.safetensors",
236
+ "backbone.qwen_model.model.visual.blocks.2.norm2.bias": "model-00001-of-00003.safetensors",
237
+ "backbone.qwen_model.model.visual.blocks.2.norm2.weight": "model-00001-of-00003.safetensors",
238
+ "backbone.qwen_model.model.visual.blocks.20.attn.proj.bias": "model-00001-of-00003.safetensors",
239
+ "backbone.qwen_model.model.visual.blocks.20.attn.proj.weight": "model-00001-of-00003.safetensors",
240
+ "backbone.qwen_model.model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00003.safetensors",
241
+ "backbone.qwen_model.model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00003.safetensors",
242
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
243
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
244
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
245
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
246
+ "backbone.qwen_model.model.visual.blocks.20.norm1.bias": "model-00001-of-00003.safetensors",
247
+ "backbone.qwen_model.model.visual.blocks.20.norm1.weight": "model-00001-of-00003.safetensors",
248
+ "backbone.qwen_model.model.visual.blocks.20.norm2.bias": "model-00001-of-00003.safetensors",
249
+ "backbone.qwen_model.model.visual.blocks.20.norm2.weight": "model-00001-of-00003.safetensors",
250
+ "backbone.qwen_model.model.visual.blocks.21.attn.proj.bias": "model-00001-of-00003.safetensors",
251
+ "backbone.qwen_model.model.visual.blocks.21.attn.proj.weight": "model-00001-of-00003.safetensors",
252
+ "backbone.qwen_model.model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00003.safetensors",
253
+ "backbone.qwen_model.model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00003.safetensors",
254
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
255
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
256
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
257
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
258
+ "backbone.qwen_model.model.visual.blocks.21.norm1.bias": "model-00001-of-00003.safetensors",
259
+ "backbone.qwen_model.model.visual.blocks.21.norm1.weight": "model-00001-of-00003.safetensors",
260
+ "backbone.qwen_model.model.visual.blocks.21.norm2.bias": "model-00001-of-00003.safetensors",
261
+ "backbone.qwen_model.model.visual.blocks.21.norm2.weight": "model-00001-of-00003.safetensors",
262
+ "backbone.qwen_model.model.visual.blocks.22.attn.proj.bias": "model-00001-of-00003.safetensors",
263
+ "backbone.qwen_model.model.visual.blocks.22.attn.proj.weight": "model-00001-of-00003.safetensors",
264
+ "backbone.qwen_model.model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00003.safetensors",
265
+ "backbone.qwen_model.model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00003.safetensors",
266
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
267
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
268
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
269
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
270
+ "backbone.qwen_model.model.visual.blocks.22.norm1.bias": "model-00001-of-00003.safetensors",
271
+ "backbone.qwen_model.model.visual.blocks.22.norm1.weight": "model-00001-of-00003.safetensors",
272
+ "backbone.qwen_model.model.visual.blocks.22.norm2.bias": "model-00001-of-00003.safetensors",
273
+ "backbone.qwen_model.model.visual.blocks.22.norm2.weight": "model-00001-of-00003.safetensors",
274
+ "backbone.qwen_model.model.visual.blocks.23.attn.proj.bias": "model-00001-of-00003.safetensors",
275
+ "backbone.qwen_model.model.visual.blocks.23.attn.proj.weight": "model-00001-of-00003.safetensors",
276
+ "backbone.qwen_model.model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00003.safetensors",
277
+ "backbone.qwen_model.model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00003.safetensors",
278
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
279
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
280
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
281
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
282
+ "backbone.qwen_model.model.visual.blocks.23.norm1.bias": "model-00001-of-00003.safetensors",
283
+ "backbone.qwen_model.model.visual.blocks.23.norm1.weight": "model-00001-of-00003.safetensors",
284
+ "backbone.qwen_model.model.visual.blocks.23.norm2.bias": "model-00001-of-00003.safetensors",
285
+ "backbone.qwen_model.model.visual.blocks.23.norm2.weight": "model-00001-of-00003.safetensors",
286
+ "backbone.qwen_model.model.visual.blocks.24.attn.proj.bias": "model-00001-of-00003.safetensors",
287
+ "backbone.qwen_model.model.visual.blocks.24.attn.proj.weight": "model-00001-of-00003.safetensors",
288
+ "backbone.qwen_model.model.visual.blocks.24.attn.qkv.bias": "model-00001-of-00003.safetensors",
289
+ "backbone.qwen_model.model.visual.blocks.24.attn.qkv.weight": "model-00001-of-00003.safetensors",
290
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
291
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
292
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
293
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
294
+ "backbone.qwen_model.model.visual.blocks.24.norm1.bias": "model-00001-of-00003.safetensors",
295
+ "backbone.qwen_model.model.visual.blocks.24.norm1.weight": "model-00001-of-00003.safetensors",
296
+ "backbone.qwen_model.model.visual.blocks.24.norm2.bias": "model-00001-of-00003.safetensors",
297
+ "backbone.qwen_model.model.visual.blocks.24.norm2.weight": "model-00001-of-00003.safetensors",
298
+ "backbone.qwen_model.model.visual.blocks.25.attn.proj.bias": "model-00001-of-00003.safetensors",
299
+ "backbone.qwen_model.model.visual.blocks.25.attn.proj.weight": "model-00001-of-00003.safetensors",
300
+ "backbone.qwen_model.model.visual.blocks.25.attn.qkv.bias": "model-00001-of-00003.safetensors",
301
+ "backbone.qwen_model.model.visual.blocks.25.attn.qkv.weight": "model-00001-of-00003.safetensors",
302
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
303
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
304
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
305
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
306
+ "backbone.qwen_model.model.visual.blocks.25.norm1.bias": "model-00001-of-00003.safetensors",
307
+ "backbone.qwen_model.model.visual.blocks.25.norm1.weight": "model-00001-of-00003.safetensors",
308
+ "backbone.qwen_model.model.visual.blocks.25.norm2.bias": "model-00001-of-00003.safetensors",
309
+ "backbone.qwen_model.model.visual.blocks.25.norm2.weight": "model-00001-of-00003.safetensors",
310
+ "backbone.qwen_model.model.visual.blocks.26.attn.proj.bias": "model-00001-of-00003.safetensors",
311
+ "backbone.qwen_model.model.visual.blocks.26.attn.proj.weight": "model-00001-of-00003.safetensors",
312
+ "backbone.qwen_model.model.visual.blocks.26.attn.qkv.bias": "model-00001-of-00003.safetensors",
313
+ "backbone.qwen_model.model.visual.blocks.26.attn.qkv.weight": "model-00001-of-00003.safetensors",
314
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
315
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
316
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
317
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
318
+ "backbone.qwen_model.model.visual.blocks.26.norm1.bias": "model-00001-of-00003.safetensors",
319
+ "backbone.qwen_model.model.visual.blocks.26.norm1.weight": "model-00001-of-00003.safetensors",
320
+ "backbone.qwen_model.model.visual.blocks.26.norm2.bias": "model-00001-of-00003.safetensors",
321
+ "backbone.qwen_model.model.visual.blocks.26.norm2.weight": "model-00001-of-00003.safetensors",
322
+ "backbone.qwen_model.model.visual.blocks.3.attn.proj.bias": "model-00001-of-00003.safetensors",
323
+ "backbone.qwen_model.model.visual.blocks.3.attn.proj.weight": "model-00001-of-00003.safetensors",
324
+ "backbone.qwen_model.model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00003.safetensors",
325
+ "backbone.qwen_model.model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00003.safetensors",
326
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
327
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
328
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
329
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
330
+ "backbone.qwen_model.model.visual.blocks.3.norm1.bias": "model-00001-of-00003.safetensors",
331
+ "backbone.qwen_model.model.visual.blocks.3.norm1.weight": "model-00001-of-00003.safetensors",
332
+ "backbone.qwen_model.model.visual.blocks.3.norm2.bias": "model-00001-of-00003.safetensors",
333
+ "backbone.qwen_model.model.visual.blocks.3.norm2.weight": "model-00001-of-00003.safetensors",
334
+ "backbone.qwen_model.model.visual.blocks.4.attn.proj.bias": "model-00001-of-00003.safetensors",
335
+ "backbone.qwen_model.model.visual.blocks.4.attn.proj.weight": "model-00001-of-00003.safetensors",
336
+ "backbone.qwen_model.model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00003.safetensors",
337
+ "backbone.qwen_model.model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00003.safetensors",
338
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
339
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
340
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
341
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
342
+ "backbone.qwen_model.model.visual.blocks.4.norm1.bias": "model-00001-of-00003.safetensors",
343
+ "backbone.qwen_model.model.visual.blocks.4.norm1.weight": "model-00001-of-00003.safetensors",
344
+ "backbone.qwen_model.model.visual.blocks.4.norm2.bias": "model-00001-of-00003.safetensors",
345
+ "backbone.qwen_model.model.visual.blocks.4.norm2.weight": "model-00001-of-00003.safetensors",
346
+ "backbone.qwen_model.model.visual.blocks.5.attn.proj.bias": "model-00001-of-00003.safetensors",
347
+ "backbone.qwen_model.model.visual.blocks.5.attn.proj.weight": "model-00001-of-00003.safetensors",
348
+ "backbone.qwen_model.model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00003.safetensors",
349
+ "backbone.qwen_model.model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00003.safetensors",
350
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
351
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
352
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
353
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
354
+ "backbone.qwen_model.model.visual.blocks.5.norm1.bias": "model-00001-of-00003.safetensors",
355
+ "backbone.qwen_model.model.visual.blocks.5.norm1.weight": "model-00001-of-00003.safetensors",
356
+ "backbone.qwen_model.model.visual.blocks.5.norm2.bias": "model-00001-of-00003.safetensors",
357
+ "backbone.qwen_model.model.visual.blocks.5.norm2.weight": "model-00001-of-00003.safetensors",
358
+ "backbone.qwen_model.model.visual.blocks.6.attn.proj.bias": "model-00001-of-00003.safetensors",
359
+ "backbone.qwen_model.model.visual.blocks.6.attn.proj.weight": "model-00001-of-00003.safetensors",
360
+ "backbone.qwen_model.model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00003.safetensors",
361
+ "backbone.qwen_model.model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00003.safetensors",
362
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
363
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
364
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
365
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
366
+ "backbone.qwen_model.model.visual.blocks.6.norm1.bias": "model-00001-of-00003.safetensors",
367
+ "backbone.qwen_model.model.visual.blocks.6.norm1.weight": "model-00001-of-00003.safetensors",
368
+ "backbone.qwen_model.model.visual.blocks.6.norm2.bias": "model-00001-of-00003.safetensors",
369
+ "backbone.qwen_model.model.visual.blocks.6.norm2.weight": "model-00001-of-00003.safetensors",
370
+ "backbone.qwen_model.model.visual.blocks.7.attn.proj.bias": "model-00001-of-00003.safetensors",
371
+ "backbone.qwen_model.model.visual.blocks.7.attn.proj.weight": "model-00001-of-00003.safetensors",
372
+ "backbone.qwen_model.model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00003.safetensors",
373
+ "backbone.qwen_model.model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00003.safetensors",
374
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
375
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
376
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
377
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
378
+ "backbone.qwen_model.model.visual.blocks.7.norm1.bias": "model-00001-of-00003.safetensors",
379
+ "backbone.qwen_model.model.visual.blocks.7.norm1.weight": "model-00001-of-00003.safetensors",
380
+ "backbone.qwen_model.model.visual.blocks.7.norm2.bias": "model-00001-of-00003.safetensors",
381
+ "backbone.qwen_model.model.visual.blocks.7.norm2.weight": "model-00001-of-00003.safetensors",
382
+ "backbone.qwen_model.model.visual.blocks.8.attn.proj.bias": "model-00001-of-00003.safetensors",
383
+ "backbone.qwen_model.model.visual.blocks.8.attn.proj.weight": "model-00001-of-00003.safetensors",
384
+ "backbone.qwen_model.model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00003.safetensors",
385
+ "backbone.qwen_model.model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00003.safetensors",
386
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
387
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
388
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
389
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
390
+ "backbone.qwen_model.model.visual.blocks.8.norm1.bias": "model-00001-of-00003.safetensors",
391
+ "backbone.qwen_model.model.visual.blocks.8.norm1.weight": "model-00001-of-00003.safetensors",
392
+ "backbone.qwen_model.model.visual.blocks.8.norm2.bias": "model-00001-of-00003.safetensors",
393
+ "backbone.qwen_model.model.visual.blocks.8.norm2.weight": "model-00001-of-00003.safetensors",
394
+ "backbone.qwen_model.model.visual.blocks.9.attn.proj.bias": "model-00001-of-00003.safetensors",
395
+ "backbone.qwen_model.model.visual.blocks.9.attn.proj.weight": "model-00001-of-00003.safetensors",
396
+ "backbone.qwen_model.model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00003.safetensors",
397
+ "backbone.qwen_model.model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00003.safetensors",
398
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
399
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
400
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
401
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
402
+ "backbone.qwen_model.model.visual.blocks.9.norm1.bias": "model-00001-of-00003.safetensors",
403
+ "backbone.qwen_model.model.visual.blocks.9.norm1.weight": "model-00001-of-00003.safetensors",
404
+ "backbone.qwen_model.model.visual.blocks.9.norm2.bias": "model-00001-of-00003.safetensors",
405
+ "backbone.qwen_model.model.visual.blocks.9.norm2.weight": "model-00001-of-00003.safetensors",
406
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00003.safetensors",
407
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00003.safetensors",
408
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00003.safetensors",
409
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00003.safetensors",
410
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00003.safetensors",
411
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00003.safetensors",
412
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00003.safetensors",
413
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00003.safetensors",
414
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00003.safetensors",
415
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00003.safetensors",
416
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00003.safetensors",
417
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00003.safetensors",
418
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00003.safetensors",
419
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00003.safetensors",
420
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00003.safetensors",
421
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00003.safetensors",
422
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00003.safetensors",
423
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00003.safetensors",
424
+ "backbone.qwen_model.model.visual.merger.linear_fc1.bias": "model-00001-of-00003.safetensors",
425
+ "backbone.qwen_model.model.visual.merger.linear_fc1.weight": "model-00001-of-00003.safetensors",
426
+ "backbone.qwen_model.model.visual.merger.linear_fc2.bias": "model-00001-of-00003.safetensors",
427
+ "backbone.qwen_model.model.visual.merger.linear_fc2.weight": "model-00001-of-00003.safetensors",
428
+ "backbone.qwen_model.model.visual.merger.norm.bias": "model-00001-of-00003.safetensors",
429
+ "backbone.qwen_model.model.visual.merger.norm.weight": "model-00001-of-00003.safetensors",
430
+ "backbone.qwen_model.model.visual.patch_embed.proj.bias": "model-00001-of-00003.safetensors",
431
+ "backbone.qwen_model.model.visual.patch_embed.proj.weight": "model-00001-of-00003.safetensors",
432
+ "backbone.qwen_model.model.visual.pos_embed.weight": "model-00001-of-00003.safetensors",
433
+ "backbone.qwen_model.model.language_model.layers.10.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
434
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
435
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
436
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
437
+ "backbone.qwen_model.model.language_model.layers.10.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
438
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
439
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
440
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
441
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
442
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
443
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
444
+ "backbone.qwen_model.model.language_model.layers.11.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
445
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
446
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
447
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
448
+ "backbone.qwen_model.model.language_model.layers.11.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
449
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
450
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
451
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
452
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
453
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
454
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
455
+ "backbone.qwen_model.model.language_model.layers.12.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
456
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
457
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
458
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
459
+ "backbone.qwen_model.model.language_model.layers.12.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
460
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
461
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
462
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
463
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
464
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
465
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
466
+ "backbone.qwen_model.model.language_model.layers.13.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
467
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
468
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
469
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
470
+ "backbone.qwen_model.model.language_model.layers.13.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
471
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
472
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
473
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
474
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
475
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
476
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
477
+ "backbone.qwen_model.model.language_model.layers.14.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
478
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
479
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
480
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
481
+ "backbone.qwen_model.model.language_model.layers.14.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
482
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
483
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
484
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
485
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
486
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
487
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
488
+ "backbone.qwen_model.model.language_model.layers.15.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
489
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
490
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
491
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
492
+ "backbone.qwen_model.model.language_model.layers.15.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
493
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
494
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
495
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
496
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
497
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
498
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
499
+ "backbone.qwen_model.model.language_model.layers.16.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
500
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
501
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
502
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
503
+ "backbone.qwen_model.model.language_model.layers.16.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
504
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
505
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
506
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
507
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
508
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
509
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
510
+ "backbone.qwen_model.model.language_model.layers.17.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
511
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
512
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
513
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
514
+ "backbone.qwen_model.model.language_model.layers.17.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
515
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
516
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
517
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
518
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
519
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
520
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
521
+ "backbone.qwen_model.model.language_model.layers.6.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
522
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
523
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
524
+ "backbone.qwen_model.model.language_model.layers.6.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
525
+ "backbone.qwen_model.model.language_model.layers.7.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
526
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
527
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
528
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
529
+ "backbone.qwen_model.model.language_model.layers.7.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
530
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
531
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
532
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
533
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
534
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
535
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
536
+ "backbone.qwen_model.model.language_model.layers.8.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
537
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
538
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
539
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
540
+ "backbone.qwen_model.model.language_model.layers.8.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
541
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
542
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
543
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
544
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
545
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
546
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
547
+ "backbone.qwen_model.model.language_model.layers.9.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
548
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
549
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
550
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
551
+ "backbone.qwen_model.model.language_model.layers.9.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
552
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
553
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
554
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
555
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
556
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
557
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
558
+ "backbone.qwen_model.model.language_model.norm.weight": "model-00002-of-00003.safetensors",
559
+ "action_model.action_decoder.layer1.W": "model-00003-of-00003.safetensors",
560
+ "action_model.action_decoder.layer1.b": "model-00003-of-00003.safetensors",
561
+ "action_model.action_decoder.layer2.W": "model-00003-of-00003.safetensors",
562
+ "action_model.action_decoder.layer2.b": "model-00003-of-00003.safetensors",
563
+ "action_model.action_encoder.W1.W": "model-00003-of-00003.safetensors",
564
+ "action_model.action_encoder.W1.b": "model-00003-of-00003.safetensors",
565
+ "action_model.action_encoder.W2.W": "model-00003-of-00003.safetensors",
566
+ "action_model.action_encoder.W2.b": "model-00003-of-00003.safetensors",
567
+ "action_model.action_encoder.W3.W": "model-00003-of-00003.safetensors",
568
+ "action_model.action_encoder.W3.b": "model-00003-of-00003.safetensors",
569
+ "action_model.model.double_blocks.0.k_norm_sa.weight": "model-00003-of-00003.safetensors",
570
+ "action_model.model.double_blocks.0.k_norm_vl.weight": "model-00003-of-00003.safetensors",
571
+ "action_model.model.double_blocks.0.q_norm_sa.weight": "model-00003-of-00003.safetensors",
572
+ "action_model.model.double_blocks.0.q_norm_vl.weight": "model-00003-of-00003.safetensors",
573
+ "action_model.model.double_blocks.0.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
574
+ "action_model.model.double_blocks.0.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
575
+ "action_model.model.double_blocks.0.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
576
+ "action_model.model.double_blocks.0.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
577
+ "action_model.model.double_blocks.0.sa_proj.bias": "model-00003-of-00003.safetensors",
578
+ "action_model.model.double_blocks.0.sa_proj.weight": "model-00003-of-00003.safetensors",
579
+ "action_model.model.double_blocks.0.sa_qkv.bias": "model-00003-of-00003.safetensors",
580
+ "action_model.model.double_blocks.0.sa_qkv.weight": "model-00003-of-00003.safetensors",
581
+ "action_model.model.double_blocks.0.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
582
+ "action_model.model.double_blocks.0.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
583
+ "action_model.model.double_blocks.0.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
584
+ "action_model.model.double_blocks.0.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
585
+ "action_model.model.double_blocks.0.vl_proj.bias": "model-00003-of-00003.safetensors",
586
+ "action_model.model.double_blocks.0.vl_proj.weight": "model-00003-of-00003.safetensors",
587
+ "action_model.model.double_blocks.0.vl_qkv.bias": "model-00003-of-00003.safetensors",
588
+ "action_model.model.double_blocks.0.vl_qkv.weight": "model-00003-of-00003.safetensors",
589
+ "action_model.model.double_blocks.1.k_norm_sa.weight": "model-00003-of-00003.safetensors",
590
+ "action_model.model.double_blocks.1.k_norm_vl.weight": "model-00003-of-00003.safetensors",
591
+ "action_model.model.double_blocks.1.q_norm_sa.weight": "model-00003-of-00003.safetensors",
592
+ "action_model.model.double_blocks.1.q_norm_vl.weight": "model-00003-of-00003.safetensors",
593
+ "action_model.model.double_blocks.1.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
594
+ "action_model.model.double_blocks.1.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
595
+ "action_model.model.double_blocks.1.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
596
+ "action_model.model.double_blocks.1.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
597
+ "action_model.model.double_blocks.1.sa_proj.bias": "model-00003-of-00003.safetensors",
598
+ "action_model.model.double_blocks.1.sa_proj.weight": "model-00003-of-00003.safetensors",
599
+ "action_model.model.double_blocks.1.sa_qkv.bias": "model-00003-of-00003.safetensors",
600
+ "action_model.model.double_blocks.1.sa_qkv.weight": "model-00003-of-00003.safetensors",
601
+ "action_model.model.double_blocks.1.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
602
+ "action_model.model.double_blocks.1.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
603
+ "action_model.model.double_blocks.1.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
604
+ "action_model.model.double_blocks.1.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
605
+ "action_model.model.double_blocks.1.vl_proj.bias": "model-00003-of-00003.safetensors",
606
+ "action_model.model.double_blocks.1.vl_proj.weight": "model-00003-of-00003.safetensors",
607
+ "action_model.model.double_blocks.1.vl_qkv.bias": "model-00003-of-00003.safetensors",
608
+ "action_model.model.double_blocks.1.vl_qkv.weight": "model-00003-of-00003.safetensors",
609
+ "action_model.model.double_blocks.2.k_norm_sa.weight": "model-00003-of-00003.safetensors",
610
+ "action_model.model.double_blocks.2.k_norm_vl.weight": "model-00003-of-00003.safetensors",
611
+ "action_model.model.double_blocks.2.q_norm_sa.weight": "model-00003-of-00003.safetensors",
612
+ "action_model.model.double_blocks.2.q_norm_vl.weight": "model-00003-of-00003.safetensors",
613
+ "action_model.model.double_blocks.2.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
614
+ "action_model.model.double_blocks.2.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
615
+ "action_model.model.double_blocks.2.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
616
+ "action_model.model.double_blocks.2.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
617
+ "action_model.model.double_blocks.2.sa_proj.bias": "model-00003-of-00003.safetensors",
618
+ "action_model.model.double_blocks.2.sa_proj.weight": "model-00003-of-00003.safetensors",
619
+ "action_model.model.double_blocks.2.sa_qkv.bias": "model-00003-of-00003.safetensors",
620
+ "action_model.model.double_blocks.2.sa_qkv.weight": "model-00003-of-00003.safetensors",
621
+ "action_model.model.double_blocks.2.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
622
+ "action_model.model.double_blocks.2.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
623
+ "action_model.model.double_blocks.2.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
624
+ "action_model.model.double_blocks.2.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
625
+ "action_model.model.double_blocks.2.vl_proj.bias": "model-00003-of-00003.safetensors",
626
+ "action_model.model.double_blocks.2.vl_proj.weight": "model-00003-of-00003.safetensors",
627
+ "action_model.model.double_blocks.2.vl_qkv.bias": "model-00003-of-00003.safetensors",
628
+ "action_model.model.double_blocks.2.vl_qkv.weight": "model-00003-of-00003.safetensors",
629
+ "action_model.model.double_blocks.3.k_norm_sa.weight": "model-00003-of-00003.safetensors",
630
+ "action_model.model.double_blocks.3.k_norm_vl.weight": "model-00003-of-00003.safetensors",
631
+ "action_model.model.double_blocks.3.q_norm_sa.weight": "model-00003-of-00003.safetensors",
632
+ "action_model.model.double_blocks.3.q_norm_vl.weight": "model-00003-of-00003.safetensors",
633
+ "action_model.model.double_blocks.3.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
634
+ "action_model.model.double_blocks.3.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
635
+ "action_model.model.double_blocks.3.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
636
+ "action_model.model.double_blocks.3.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
637
+ "action_model.model.double_blocks.3.sa_proj.bias": "model-00003-of-00003.safetensors",
638
+ "action_model.model.double_blocks.3.sa_proj.weight": "model-00003-of-00003.safetensors",
639
+ "action_model.model.double_blocks.3.sa_qkv.bias": "model-00003-of-00003.safetensors",
640
+ "action_model.model.double_blocks.3.sa_qkv.weight": "model-00003-of-00003.safetensors",
641
+ "action_model.model.double_blocks.3.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
642
+ "action_model.model.double_blocks.3.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
643
+ "action_model.model.double_blocks.3.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
644
+ "action_model.model.double_blocks.3.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
645
+ "action_model.model.double_blocks.3.vl_proj.bias": "model-00003-of-00003.safetensors",
646
+ "action_model.model.double_blocks.3.vl_proj.weight": "model-00003-of-00003.safetensors",
647
+ "action_model.model.double_blocks.3.vl_qkv.bias": "model-00003-of-00003.safetensors",
648
+ "action_model.model.double_blocks.3.vl_qkv.weight": "model-00003-of-00003.safetensors",
649
+ "action_model.model.proj_out_1.bias": "model-00003-of-00003.safetensors",
650
+ "action_model.model.proj_out_1.weight": "model-00003-of-00003.safetensors",
651
+ "action_model.model.proj_out_2.bias": "model-00003-of-00003.safetensors",
652
+ "action_model.model.proj_out_2.weight": "model-00003-of-00003.safetensors",
653
+ "action_model.model.single_blocks.0.k_norm.weight": "model-00003-of-00003.safetensors",
654
+ "action_model.model.single_blocks.0.linear1.bias": "model-00003-of-00003.safetensors",
655
+ "action_model.model.single_blocks.0.linear1.weight": "model-00003-of-00003.safetensors",
656
+ "action_model.model.single_blocks.0.linear2.bias": "model-00003-of-00003.safetensors",
657
+ "action_model.model.single_blocks.0.linear2.weight": "model-00003-of-00003.safetensors",
658
+ "action_model.model.single_blocks.0.mlp_proj.bias": "model-00003-of-00003.safetensors",
659
+ "action_model.model.single_blocks.0.mlp_proj.weight": "model-00003-of-00003.safetensors",
660
+ "action_model.model.single_blocks.0.q_norm.weight": "model-00003-of-00003.safetensors",
661
+ "action_model.model.single_blocks.1.k_norm.weight": "model-00003-of-00003.safetensors",
662
+ "action_model.model.single_blocks.1.linear1.bias": "model-00003-of-00003.safetensors",
663
+ "action_model.model.single_blocks.1.linear1.weight": "model-00003-of-00003.safetensors",
664
+ "action_model.model.single_blocks.1.linear2.bias": "model-00003-of-00003.safetensors",
665
+ "action_model.model.single_blocks.1.linear2.weight": "model-00003-of-00003.safetensors",
666
+ "action_model.model.single_blocks.1.mlp_proj.bias": "model-00003-of-00003.safetensors",
667
+ "action_model.model.single_blocks.1.mlp_proj.weight": "model-00003-of-00003.safetensors",
668
+ "action_model.model.single_blocks.1.q_norm.weight": "model-00003-of-00003.safetensors",
669
+ "action_model.model.single_blocks.2.k_norm.weight": "model-00003-of-00003.safetensors",
670
+ "action_model.model.single_blocks.2.linear1.bias": "model-00003-of-00003.safetensors",
671
+ "action_model.model.single_blocks.2.linear1.weight": "model-00003-of-00003.safetensors",
672
+ "action_model.model.single_blocks.2.linear2.bias": "model-00003-of-00003.safetensors",
673
+ "action_model.model.single_blocks.2.linear2.weight": "model-00003-of-00003.safetensors",
674
+ "action_model.model.single_blocks.2.mlp_proj.bias": "model-00003-of-00003.safetensors",
675
+ "action_model.model.single_blocks.2.mlp_proj.weight": "model-00003-of-00003.safetensors",
676
+ "action_model.model.single_blocks.2.q_norm.weight": "model-00003-of-00003.safetensors",
677
+ "action_model.model.single_blocks.3.k_norm.weight": "model-00003-of-00003.safetensors",
678
+ "action_model.model.single_blocks.3.linear1.bias": "model-00003-of-00003.safetensors",
679
+ "action_model.model.single_blocks.3.linear1.weight": "model-00003-of-00003.safetensors",
680
+ "action_model.model.single_blocks.3.linear2.bias": "model-00003-of-00003.safetensors",
681
+ "action_model.model.single_blocks.3.linear2.weight": "model-00003-of-00003.safetensors",
682
+ "action_model.model.single_blocks.3.mlp_proj.bias": "model-00003-of-00003.safetensors",
683
+ "action_model.model.single_blocks.3.mlp_proj.weight": "model-00003-of-00003.safetensors",
684
+ "action_model.model.single_blocks.3.q_norm.weight": "model-00003-of-00003.safetensors",
685
+ "action_model.model.single_blocks.4.k_norm.weight": "model-00003-of-00003.safetensors",
686
+ "action_model.model.single_blocks.4.linear1.bias": "model-00003-of-00003.safetensors",
687
+ "action_model.model.single_blocks.4.linear1.weight": "model-00003-of-00003.safetensors",
688
+ "action_model.model.single_blocks.4.linear2.bias": "model-00003-of-00003.safetensors",
689
+ "action_model.model.single_blocks.4.linear2.weight": "model-00003-of-00003.safetensors",
690
+ "action_model.model.single_blocks.4.mlp_proj.bias": "model-00003-of-00003.safetensors",
691
+ "action_model.model.single_blocks.4.mlp_proj.weight": "model-00003-of-00003.safetensors",
692
+ "action_model.model.single_blocks.4.q_norm.weight": "model-00003-of-00003.safetensors",
693
+ "action_model.model.single_blocks.5.k_norm.weight": "model-00003-of-00003.safetensors",
694
+ "action_model.model.single_blocks.5.linear1.bias": "model-00003-of-00003.safetensors",
695
+ "action_model.model.single_blocks.5.linear1.weight": "model-00003-of-00003.safetensors",
696
+ "action_model.model.single_blocks.5.linear2.bias": "model-00003-of-00003.safetensors",
697
+ "action_model.model.single_blocks.5.linear2.weight": "model-00003-of-00003.safetensors",
698
+ "action_model.model.single_blocks.5.mlp_proj.bias": "model-00003-of-00003.safetensors",
699
+ "action_model.model.single_blocks.5.mlp_proj.weight": "model-00003-of-00003.safetensors",
700
+ "action_model.model.single_blocks.5.q_norm.weight": "model-00003-of-00003.safetensors",
701
+ "action_model.model.single_blocks.6.k_norm.weight": "model-00003-of-00003.safetensors",
702
+ "action_model.model.single_blocks.6.linear1.bias": "model-00003-of-00003.safetensors",
703
+ "action_model.model.single_blocks.6.linear1.weight": "model-00003-of-00003.safetensors",
704
+ "action_model.model.single_blocks.6.linear2.bias": "model-00003-of-00003.safetensors",
705
+ "action_model.model.single_blocks.6.linear2.weight": "model-00003-of-00003.safetensors",
706
+ "action_model.model.single_blocks.6.mlp_proj.bias": "model-00003-of-00003.safetensors",
707
+ "action_model.model.single_blocks.6.mlp_proj.weight": "model-00003-of-00003.safetensors",
708
+ "action_model.model.single_blocks.6.q_norm.weight": "model-00003-of-00003.safetensors",
709
+ "action_model.model.single_blocks.7.k_norm.weight": "model-00003-of-00003.safetensors",
710
+ "action_model.model.single_blocks.7.linear1.bias": "model-00003-of-00003.safetensors",
711
+ "action_model.model.single_blocks.7.linear1.weight": "model-00003-of-00003.safetensors",
712
+ "action_model.model.single_blocks.7.linear2.bias": "model-00003-of-00003.safetensors",
713
+ "action_model.model.single_blocks.7.linear2.weight": "model-00003-of-00003.safetensors",
714
+ "action_model.model.single_blocks.7.mlp_proj.bias": "model-00003-of-00003.safetensors",
715
+ "action_model.model.single_blocks.7.mlp_proj.weight": "model-00003-of-00003.safetensors",
716
+ "action_model.model.single_blocks.7.q_norm.weight": "model-00003-of-00003.safetensors",
717
+ "action_model.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00003-of-00003.safetensors",
718
+ "action_model.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00003-of-00003.safetensors",
719
+ "action_model.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00003-of-00003.safetensors",
720
+ "action_model.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00003-of-00003.safetensors",
721
+ "action_model.model.vl_proj_to_sa.bias": "model-00003-of-00003.safetensors",
722
+ "action_model.model.vl_proj_to_sa.weight": "model-00003-of-00003.safetensors",
723
+ "action_model.position_embedding.weight": "model-00003-of-00003.safetensors",
724
+ "action_model.state_encoder.layer1.W": "model-00003-of-00003.safetensors",
725
+ "action_model.state_encoder.layer1.b": "model-00003-of-00003.safetensors",
726
+ "action_model.state_encoder.layer2.W": "model-00003-of-00003.safetensors",
727
+ "action_model.state_encoder.layer2.b": "model-00003-of-00003.safetensors",
728
+ "backbone.qwen_model.lm_head.weight": "model-00003-of-00003.safetensors"
729
+ }
730
+ }
processor/embodiment_id.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "general_embodiment": 0,
3
+ "fractal20220817_data": 1,
4
+ "kuka": 2,
5
+ "bridge_orig": 3,
6
+ "taco_play": 4,
7
+ "jaco_play": 5,
8
+ "berkeley_cable_routing": 6,
9
+ "roboturk": 7,
10
+ "viola": 8,
11
+ "berkeley_autolab_ur5": 9,
12
+ "toto": 10,
13
+ "language_table": 11,
14
+ "stanford_hydra_dataset_converted_externally_to_rlds": 12,
15
+ "austin_buds_dataset_converted_externally_to_rlds": 13,
16
+ "nyu_franka_play_dataset_converted_externally_to_rlds": 14,
17
+ "furniture_bench_dataset_converted_externally_to_rlds": 15,
18
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": 16,
19
+ "austin_sailor_dataset_converted_externally_to_rlds": 17,
20
+ "austin_sirius_dataset_converted_externally_to_rlds": 18,
21
+ "dlr_edan_shared_control_converted_externally_to_rlds": 19,
22
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
23
+ "utaustin_mutex": 21,
24
+ "berkeley_fanuc_manipulation": 22,
25
+ "cmu_stretch": 23,
26
+ "bc_z": 24,
27
+ "fmb_dataset": 25,
28
+ "dobbe": 26,
29
+ "droid": 27,
30
+ "agibot_dexhand": 28,
31
+ "agibot_gripper": 29,
32
+ "galaxea": 30,
33
+ "humanoid_everyday_g1": 31,
34
+ "humanoid_everyday_h1": 32,
35
+ "action_net": 33,
36
+ "neural_gr1": 34,
37
+ "new_embodiment": 35
38
+ }
processor/processor_config.json ADDED
@@ -0,0 +1,3081 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "RLDXProcessor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "kuka": {
6
+ "video": {
7
+ "delta_indices": [
8
+ -6,
9
+ -4,
10
+ -2,
11
+ 0
12
+ ],
13
+ "modality_keys": [
14
+ "primary"
15
+ ],
16
+ "sin_cos_embedding_keys": null,
17
+ "mean_std_embedding_keys": null,
18
+ "action_configs": null
19
+ },
20
+ "state": {
21
+ "delta_indices": [
22
+ 0
23
+ ],
24
+ "modality_keys": [
25
+ "end_effector_position",
26
+ "end_effector_rotation",
27
+ "gripper_position"
28
+ ],
29
+ "sin_cos_embedding_keys": null,
30
+ "mean_std_embedding_keys": null,
31
+ "action_configs": null
32
+ },
33
+ "action": {
34
+ "delta_indices": [
35
+ 0,
36
+ 1,
37
+ 2,
38
+ 3,
39
+ 4,
40
+ 5,
41
+ 6,
42
+ 7,
43
+ 8,
44
+ 9,
45
+ 10,
46
+ 11,
47
+ 12,
48
+ 13,
49
+ 14,
50
+ 15
51
+ ],
52
+ "modality_keys": [
53
+ "end_effector_position",
54
+ "end_effector_rotation",
55
+ "gripper_close"
56
+ ],
57
+ "sin_cos_embedding_keys": null,
58
+ "mean_std_embedding_keys": null,
59
+ "action_configs": [
60
+ {
61
+ "rep": "DELTA",
62
+ "type": "EEF",
63
+ "format": "DEFAULT",
64
+ "state_key": null
65
+ },
66
+ {
67
+ "rep": "DELTA",
68
+ "type": "EEF",
69
+ "format": "DEFAULT",
70
+ "state_key": null
71
+ },
72
+ {
73
+ "rep": "ABSOLUTE",
74
+ "type": "NON_EEF",
75
+ "format": "DEFAULT",
76
+ "state_key": null
77
+ }
78
+ ]
79
+ },
80
+ "language": {
81
+ "delta_indices": [
82
+ 0
83
+ ],
84
+ "modality_keys": [
85
+ "annotation.human.action.task_description"
86
+ ],
87
+ "sin_cos_embedding_keys": null,
88
+ "mean_std_embedding_keys": null,
89
+ "action_configs": null
90
+ }
91
+ },
92
+ "bc_z": {
93
+ "video": {
94
+ "delta_indices": [
95
+ -6,
96
+ -4,
97
+ -2,
98
+ 0
99
+ ],
100
+ "modality_keys": [
101
+ "primary"
102
+ ],
103
+ "sin_cos_embedding_keys": null,
104
+ "mean_std_embedding_keys": null,
105
+ "action_configs": null
106
+ },
107
+ "state": {
108
+ "delta_indices": [
109
+ 0
110
+ ],
111
+ "modality_keys": [
112
+ "end_effector_position",
113
+ "end_effector_rotation",
114
+ "gripper_position"
115
+ ],
116
+ "sin_cos_embedding_keys": null,
117
+ "mean_std_embedding_keys": null,
118
+ "action_configs": null
119
+ },
120
+ "action": {
121
+ "delta_indices": [
122
+ 0,
123
+ 1,
124
+ 2,
125
+ 3,
126
+ 4,
127
+ 5,
128
+ 6,
129
+ 7,
130
+ 8,
131
+ 9,
132
+ 10,
133
+ 11,
134
+ 12,
135
+ 13,
136
+ 14,
137
+ 15
138
+ ],
139
+ "modality_keys": [
140
+ "end_effector_position",
141
+ "end_effector_rotation",
142
+ "gripper_close"
143
+ ],
144
+ "sin_cos_embedding_keys": null,
145
+ "mean_std_embedding_keys": null,
146
+ "action_configs": [
147
+ {
148
+ "rep": "DELTA",
149
+ "type": "EEF",
150
+ "format": "DEFAULT",
151
+ "state_key": null
152
+ },
153
+ {
154
+ "rep": "DELTA",
155
+ "type": "EEF",
156
+ "format": "DEFAULT",
157
+ "state_key": null
158
+ },
159
+ {
160
+ "rep": "ABSOLUTE",
161
+ "type": "NON_EEF",
162
+ "format": "DEFAULT",
163
+ "state_key": null
164
+ }
165
+ ]
166
+ },
167
+ "language": {
168
+ "delta_indices": [
169
+ 0
170
+ ],
171
+ "modality_keys": [
172
+ "annotation.human.action.task_description"
173
+ ],
174
+ "sin_cos_embedding_keys": null,
175
+ "mean_std_embedding_keys": null,
176
+ "action_configs": null
177
+ }
178
+ },
179
+ "language_table": {
180
+ "video": {
181
+ "delta_indices": [
182
+ -6,
183
+ -4,
184
+ -2,
185
+ 0
186
+ ],
187
+ "modality_keys": [
188
+ "primary"
189
+ ],
190
+ "sin_cos_embedding_keys": null,
191
+ "mean_std_embedding_keys": null,
192
+ "action_configs": null
193
+ },
194
+ "state": {
195
+ "delta_indices": [
196
+ 0
197
+ ],
198
+ "modality_keys": [
199
+ "end_effector_position"
200
+ ],
201
+ "sin_cos_embedding_keys": null,
202
+ "mean_std_embedding_keys": null,
203
+ "action_configs": null
204
+ },
205
+ "action": {
206
+ "delta_indices": [
207
+ 0,
208
+ 1,
209
+ 2,
210
+ 3,
211
+ 4,
212
+ 5,
213
+ 6,
214
+ 7,
215
+ 8,
216
+ 9,
217
+ 10,
218
+ 11,
219
+ 12,
220
+ 13,
221
+ 14,
222
+ 15
223
+ ],
224
+ "modality_keys": [
225
+ "end_effector_position"
226
+ ],
227
+ "sin_cos_embedding_keys": null,
228
+ "mean_std_embedding_keys": null,
229
+ "action_configs": [
230
+ {
231
+ "rep": "DELTA",
232
+ "type": "EEF",
233
+ "format": "DEFAULT",
234
+ "state_key": null
235
+ }
236
+ ]
237
+ },
238
+ "language": {
239
+ "delta_indices": [
240
+ 0
241
+ ],
242
+ "modality_keys": [
243
+ "annotation.human.action.task_description"
244
+ ],
245
+ "sin_cos_embedding_keys": null,
246
+ "mean_std_embedding_keys": null,
247
+ "action_configs": null
248
+ }
249
+ },
250
+ "action_net": {
251
+ "video": {
252
+ "delta_indices": [
253
+ -6,
254
+ -4,
255
+ -2,
256
+ 0
257
+ ],
258
+ "modality_keys": [
259
+ "primary"
260
+ ],
261
+ "sin_cos_embedding_keys": null,
262
+ "mean_std_embedding_keys": null,
263
+ "action_configs": null
264
+ },
265
+ "state": {
266
+ "delta_indices": [
267
+ 0
268
+ ],
269
+ "modality_keys": [
270
+ "state"
271
+ ],
272
+ "sin_cos_embedding_keys": null,
273
+ "mean_std_embedding_keys": null,
274
+ "action_configs": null
275
+ },
276
+ "action": {
277
+ "delta_indices": [
278
+ 0,
279
+ 1,
280
+ 2,
281
+ 3,
282
+ 4,
283
+ 5,
284
+ 6,
285
+ 7,
286
+ 8,
287
+ 9,
288
+ 10,
289
+ 11,
290
+ 12,
291
+ 13,
292
+ 14,
293
+ 15
294
+ ],
295
+ "modality_keys": [
296
+ "action"
297
+ ],
298
+ "sin_cos_embedding_keys": null,
299
+ "mean_std_embedding_keys": null,
300
+ "action_configs": [
301
+ {
302
+ "rep": "ABSOLUTE",
303
+ "type": "NON_EEF",
304
+ "format": "DEFAULT",
305
+ "state_key": null
306
+ }
307
+ ]
308
+ },
309
+ "language": {
310
+ "delta_indices": [
311
+ 0
312
+ ],
313
+ "modality_keys": [
314
+ "annotation.human.action.task_description"
315
+ ],
316
+ "sin_cos_embedding_keys": null,
317
+ "mean_std_embedding_keys": null,
318
+ "action_configs": null
319
+ }
320
+ },
321
+ "bridge_orig": {
322
+ "video": {
323
+ "delta_indices": [
324
+ -6,
325
+ -4,
326
+ -2,
327
+ 0
328
+ ],
329
+ "modality_keys": [
330
+ "primary",
331
+ "secondary"
332
+ ],
333
+ "sin_cos_embedding_keys": null,
334
+ "mean_std_embedding_keys": null,
335
+ "action_configs": null
336
+ },
337
+ "state": {
338
+ "delta_indices": [
339
+ 0
340
+ ],
341
+ "modality_keys": [
342
+ "end_effector_position",
343
+ "end_effector_rotation",
344
+ "gripper_position"
345
+ ],
346
+ "sin_cos_embedding_keys": null,
347
+ "mean_std_embedding_keys": null,
348
+ "action_configs": null
349
+ },
350
+ "action": {
351
+ "delta_indices": [
352
+ 0,
353
+ 1,
354
+ 2,
355
+ 3,
356
+ 4,
357
+ 5,
358
+ 6,
359
+ 7,
360
+ 8,
361
+ 9,
362
+ 10,
363
+ 11,
364
+ 12,
365
+ 13,
366
+ 14,
367
+ 15
368
+ ],
369
+ "modality_keys": [
370
+ "end_effector_position",
371
+ "end_effector_rotation",
372
+ "gripper_close"
373
+ ],
374
+ "sin_cos_embedding_keys": null,
375
+ "mean_std_embedding_keys": null,
376
+ "action_configs": [
377
+ {
378
+ "rep": "DELTA",
379
+ "type": "EEF",
380
+ "format": "DEFAULT",
381
+ "state_key": null
382
+ },
383
+ {
384
+ "rep": "DELTA",
385
+ "type": "EEF",
386
+ "format": "DEFAULT",
387
+ "state_key": null
388
+ },
389
+ {
390
+ "rep": "ABSOLUTE",
391
+ "type": "NON_EEF",
392
+ "format": "DEFAULT",
393
+ "state_key": null
394
+ }
395
+ ]
396
+ },
397
+ "language": {
398
+ "delta_indices": [
399
+ 0
400
+ ],
401
+ "modality_keys": [
402
+ "annotation.human.action.task_description"
403
+ ],
404
+ "sin_cos_embedding_keys": null,
405
+ "mean_std_embedding_keys": null,
406
+ "action_configs": null
407
+ }
408
+ },
409
+ "neural_gr1": {
410
+ "video": {
411
+ "delta_indices": [
412
+ -6,
413
+ -4,
414
+ -2,
415
+ 0
416
+ ],
417
+ "modality_keys": [
418
+ "ego_view"
419
+ ],
420
+ "sin_cos_embedding_keys": null,
421
+ "mean_std_embedding_keys": null,
422
+ "action_configs": null
423
+ },
424
+ "state": {
425
+ "delta_indices": [
426
+ 0
427
+ ],
428
+ "modality_keys": [
429
+ "left_arm",
430
+ "left_hand",
431
+ "left_leg",
432
+ "neck",
433
+ "right_arm",
434
+ "right_hand",
435
+ "right_leg",
436
+ "waist"
437
+ ],
438
+ "sin_cos_embedding_keys": null,
439
+ "mean_std_embedding_keys": null,
440
+ "action_configs": null
441
+ },
442
+ "action": {
443
+ "delta_indices": [
444
+ 0,
445
+ 1,
446
+ 2,
447
+ 3,
448
+ 4,
449
+ 5,
450
+ 6,
451
+ 7,
452
+ 8,
453
+ 9,
454
+ 10,
455
+ 11,
456
+ 12,
457
+ 13,
458
+ 14,
459
+ 15
460
+ ],
461
+ "modality_keys": [
462
+ "left_arm",
463
+ "left_hand",
464
+ "left_leg",
465
+ "neck",
466
+ "right_arm",
467
+ "right_hand",
468
+ "right_leg",
469
+ "waist"
470
+ ],
471
+ "sin_cos_embedding_keys": null,
472
+ "mean_std_embedding_keys": null,
473
+ "action_configs": [
474
+ {
475
+ "rep": "ABSOLUTE",
476
+ "type": "NON_EEF",
477
+ "format": "DEFAULT",
478
+ "state_key": null
479
+ },
480
+ {
481
+ "rep": "ABSOLUTE",
482
+ "type": "NON_EEF",
483
+ "format": "DEFAULT",
484
+ "state_key": null
485
+ },
486
+ {
487
+ "rep": "ABSOLUTE",
488
+ "type": "NON_EEF",
489
+ "format": "DEFAULT",
490
+ "state_key": null
491
+ },
492
+ {
493
+ "rep": "ABSOLUTE",
494
+ "type": "NON_EEF",
495
+ "format": "DEFAULT",
496
+ "state_key": null
497
+ },
498
+ {
499
+ "rep": "ABSOLUTE",
500
+ "type": "NON_EEF",
501
+ "format": "DEFAULT",
502
+ "state_key": null
503
+ },
504
+ {
505
+ "rep": "ABSOLUTE",
506
+ "type": "NON_EEF",
507
+ "format": "DEFAULT",
508
+ "state_key": null
509
+ },
510
+ {
511
+ "rep": "ABSOLUTE",
512
+ "type": "NON_EEF",
513
+ "format": "DEFAULT",
514
+ "state_key": null
515
+ },
516
+ {
517
+ "rep": "ABSOLUTE",
518
+ "type": "NON_EEF",
519
+ "format": "DEFAULT",
520
+ "state_key": null
521
+ }
522
+ ]
523
+ },
524
+ "language": {
525
+ "delta_indices": [
526
+ 0
527
+ ],
528
+ "modality_keys": [
529
+ "annotation.human.action.task_description"
530
+ ],
531
+ "sin_cos_embedding_keys": null,
532
+ "mean_std_embedding_keys": null,
533
+ "action_configs": null
534
+ }
535
+ },
536
+ "austin_buds_dataset_converted_externally_to_rlds": {
537
+ "video": {
538
+ "delta_indices": [
539
+ -6,
540
+ -4,
541
+ -2,
542
+ 0
543
+ ],
544
+ "modality_keys": [
545
+ "primary",
546
+ "wrist"
547
+ ],
548
+ "sin_cos_embedding_keys": null,
549
+ "mean_std_embedding_keys": null,
550
+ "action_configs": null
551
+ },
552
+ "state": {
553
+ "delta_indices": [
554
+ 0
555
+ ],
556
+ "modality_keys": [
557
+ "joint_position",
558
+ "gripper_position"
559
+ ],
560
+ "sin_cos_embedding_keys": null,
561
+ "mean_std_embedding_keys": null,
562
+ "action_configs": null
563
+ },
564
+ "action": {
565
+ "delta_indices": [
566
+ 0,
567
+ 1,
568
+ 2,
569
+ 3,
570
+ 4,
571
+ 5,
572
+ 6,
573
+ 7,
574
+ 8,
575
+ 9,
576
+ 10,
577
+ 11,
578
+ 12,
579
+ 13,
580
+ 14,
581
+ 15
582
+ ],
583
+ "modality_keys": [
584
+ "end_effector_position",
585
+ "end_effector_rotation",
586
+ "gripper_close"
587
+ ],
588
+ "sin_cos_embedding_keys": null,
589
+ "mean_std_embedding_keys": null,
590
+ "action_configs": [
591
+ {
592
+ "rep": "DELTA",
593
+ "type": "EEF",
594
+ "format": "DEFAULT",
595
+ "state_key": null
596
+ },
597
+ {
598
+ "rep": "DELTA",
599
+ "type": "EEF",
600
+ "format": "DEFAULT",
601
+ "state_key": null
602
+ },
603
+ {
604
+ "rep": "ABSOLUTE",
605
+ "type": "NON_EEF",
606
+ "format": "DEFAULT",
607
+ "state_key": null
608
+ }
609
+ ]
610
+ },
611
+ "language": {
612
+ "delta_indices": [
613
+ 0
614
+ ],
615
+ "modality_keys": [
616
+ "annotation.human.action.task_description"
617
+ ],
618
+ "sin_cos_embedding_keys": null,
619
+ "mean_std_embedding_keys": null,
620
+ "action_configs": null
621
+ }
622
+ },
623
+ "taco_play": {
624
+ "video": {
625
+ "delta_indices": [
626
+ -6,
627
+ -4,
628
+ -2,
629
+ 0
630
+ ],
631
+ "modality_keys": [
632
+ "primary",
633
+ "wrist"
634
+ ],
635
+ "sin_cos_embedding_keys": null,
636
+ "mean_std_embedding_keys": null,
637
+ "action_configs": null
638
+ },
639
+ "state": {
640
+ "delta_indices": [
641
+ 0
642
+ ],
643
+ "modality_keys": [
644
+ "end_effector_position",
645
+ "end_effector_rotation",
646
+ "gripper_position"
647
+ ],
648
+ "sin_cos_embedding_keys": null,
649
+ "mean_std_embedding_keys": null,
650
+ "action_configs": null
651
+ },
652
+ "action": {
653
+ "delta_indices": [
654
+ 0,
655
+ 1,
656
+ 2,
657
+ 3,
658
+ 4,
659
+ 5,
660
+ 6,
661
+ 7,
662
+ 8,
663
+ 9,
664
+ 10,
665
+ 11,
666
+ 12,
667
+ 13,
668
+ 14,
669
+ 15
670
+ ],
671
+ "modality_keys": [
672
+ "end_effector_position",
673
+ "end_effector_rotation",
674
+ "gripper_close"
675
+ ],
676
+ "sin_cos_embedding_keys": null,
677
+ "mean_std_embedding_keys": null,
678
+ "action_configs": [
679
+ {
680
+ "rep": "DELTA",
681
+ "type": "EEF",
682
+ "format": "DEFAULT",
683
+ "state_key": null
684
+ },
685
+ {
686
+ "rep": "DELTA",
687
+ "type": "EEF",
688
+ "format": "DEFAULT",
689
+ "state_key": null
690
+ },
691
+ {
692
+ "rep": "ABSOLUTE",
693
+ "type": "NON_EEF",
694
+ "format": "DEFAULT",
695
+ "state_key": null
696
+ }
697
+ ]
698
+ },
699
+ "language": {
700
+ "delta_indices": [
701
+ 0
702
+ ],
703
+ "modality_keys": [
704
+ "annotation.human.action.task_description"
705
+ ],
706
+ "sin_cos_embedding_keys": null,
707
+ "mean_std_embedding_keys": null,
708
+ "action_configs": null
709
+ }
710
+ },
711
+ "roboturk": {
712
+ "video": {
713
+ "delta_indices": [
714
+ -6,
715
+ -4,
716
+ -2,
717
+ 0
718
+ ],
719
+ "modality_keys": [
720
+ "primary"
721
+ ],
722
+ "sin_cos_embedding_keys": null,
723
+ "mean_std_embedding_keys": null,
724
+ "action_configs": null
725
+ },
726
+ "state": {
727
+ "delta_indices": [
728
+ 0
729
+ ],
730
+ "modality_keys": [
731
+ "none"
732
+ ],
733
+ "sin_cos_embedding_keys": null,
734
+ "mean_std_embedding_keys": null,
735
+ "action_configs": null
736
+ },
737
+ "action": {
738
+ "delta_indices": [
739
+ 0,
740
+ 1,
741
+ 2,
742
+ 3,
743
+ 4,
744
+ 5,
745
+ 6,
746
+ 7,
747
+ 8,
748
+ 9,
749
+ 10,
750
+ 11,
751
+ 12,
752
+ 13,
753
+ 14,
754
+ 15
755
+ ],
756
+ "modality_keys": [
757
+ "end_effector_position",
758
+ "end_effector_rotation",
759
+ "gripper_close"
760
+ ],
761
+ "sin_cos_embedding_keys": null,
762
+ "mean_std_embedding_keys": null,
763
+ "action_configs": [
764
+ {
765
+ "rep": "DELTA",
766
+ "type": "EEF",
767
+ "format": "DEFAULT",
768
+ "state_key": null
769
+ },
770
+ {
771
+ "rep": "DELTA",
772
+ "type": "EEF",
773
+ "format": "DEFAULT",
774
+ "state_key": null
775
+ },
776
+ {
777
+ "rep": "ABSOLUTE",
778
+ "type": "NON_EEF",
779
+ "format": "DEFAULT",
780
+ "state_key": null
781
+ }
782
+ ]
783
+ },
784
+ "language": {
785
+ "delta_indices": [
786
+ 0
787
+ ],
788
+ "modality_keys": [
789
+ "annotation.human.action.task_description"
790
+ ],
791
+ "sin_cos_embedding_keys": null,
792
+ "mean_std_embedding_keys": null,
793
+ "action_configs": null
794
+ }
795
+ },
796
+ "galaxea": {
797
+ "video": {
798
+ "delta_indices": [
799
+ -6,
800
+ -4,
801
+ -2,
802
+ 0
803
+ ],
804
+ "modality_keys": [
805
+ "primary",
806
+ "wrist_left",
807
+ "wrist_right"
808
+ ],
809
+ "sin_cos_embedding_keys": null,
810
+ "mean_std_embedding_keys": null,
811
+ "action_configs": null
812
+ },
813
+ "state": {
814
+ "delta_indices": [
815
+ 0
816
+ ],
817
+ "modality_keys": [
818
+ "state"
819
+ ],
820
+ "sin_cos_embedding_keys": null,
821
+ "mean_std_embedding_keys": null,
822
+ "action_configs": null
823
+ },
824
+ "action": {
825
+ "delta_indices": [
826
+ 0,
827
+ 1,
828
+ 2,
829
+ 3,
830
+ 4,
831
+ 5,
832
+ 6,
833
+ 7,
834
+ 8,
835
+ 9,
836
+ 10,
837
+ 11,
838
+ 12,
839
+ 13,
840
+ 14,
841
+ 15
842
+ ],
843
+ "modality_keys": [
844
+ "action"
845
+ ],
846
+ "sin_cos_embedding_keys": null,
847
+ "mean_std_embedding_keys": null,
848
+ "action_configs": [
849
+ {
850
+ "rep": "ABSOLUTE",
851
+ "type": "NON_EEF",
852
+ "format": "DEFAULT",
853
+ "state_key": null
854
+ }
855
+ ]
856
+ },
857
+ "language": {
858
+ "delta_indices": [
859
+ 0
860
+ ],
861
+ "modality_keys": [
862
+ "annotation.human.action.task_description"
863
+ ],
864
+ "sin_cos_embedding_keys": null,
865
+ "mean_std_embedding_keys": null,
866
+ "action_configs": null
867
+ }
868
+ },
869
+ "austin_sailor_dataset_converted_externally_to_rlds": {
870
+ "video": {
871
+ "delta_indices": [
872
+ -6,
873
+ -4,
874
+ -2,
875
+ 0
876
+ ],
877
+ "modality_keys": [
878
+ "primary",
879
+ "wrist"
880
+ ],
881
+ "sin_cos_embedding_keys": null,
882
+ "mean_std_embedding_keys": null,
883
+ "action_configs": null
884
+ },
885
+ "state": {
886
+ "delta_indices": [
887
+ 0
888
+ ],
889
+ "modality_keys": [
890
+ "end_effector_position",
891
+ "end_effector_rotation",
892
+ "gripper_position"
893
+ ],
894
+ "sin_cos_embedding_keys": null,
895
+ "mean_std_embedding_keys": null,
896
+ "action_configs": null
897
+ },
898
+ "action": {
899
+ "delta_indices": [
900
+ 0,
901
+ 1,
902
+ 2,
903
+ 3,
904
+ 4,
905
+ 5,
906
+ 6,
907
+ 7,
908
+ 8,
909
+ 9,
910
+ 10,
911
+ 11,
912
+ 12,
913
+ 13,
914
+ 14,
915
+ 15
916
+ ],
917
+ "modality_keys": [
918
+ "end_effector_position",
919
+ "end_effector_rotation",
920
+ "gripper_close"
921
+ ],
922
+ "sin_cos_embedding_keys": null,
923
+ "mean_std_embedding_keys": null,
924
+ "action_configs": [
925
+ {
926
+ "rep": "DELTA",
927
+ "type": "EEF",
928
+ "format": "DEFAULT",
929
+ "state_key": null
930
+ },
931
+ {
932
+ "rep": "DELTA",
933
+ "type": "EEF",
934
+ "format": "DEFAULT",
935
+ "state_key": null
936
+ },
937
+ {
938
+ "rep": "ABSOLUTE",
939
+ "type": "NON_EEF",
940
+ "format": "DEFAULT",
941
+ "state_key": null
942
+ }
943
+ ]
944
+ },
945
+ "language": {
946
+ "delta_indices": [
947
+ 0
948
+ ],
949
+ "modality_keys": [
950
+ "annotation.human.action.task_description"
951
+ ],
952
+ "sin_cos_embedding_keys": null,
953
+ "mean_std_embedding_keys": null,
954
+ "action_configs": null
955
+ }
956
+ },
957
+ "viola": {
958
+ "video": {
959
+ "delta_indices": [
960
+ -6,
961
+ -4,
962
+ -2,
963
+ 0
964
+ ],
965
+ "modality_keys": [
966
+ "primary",
967
+ "wrist"
968
+ ],
969
+ "sin_cos_embedding_keys": null,
970
+ "mean_std_embedding_keys": null,
971
+ "action_configs": null
972
+ },
973
+ "state": {
974
+ "delta_indices": [
975
+ 0
976
+ ],
977
+ "modality_keys": [
978
+ "joint_position",
979
+ "gripper_position"
980
+ ],
981
+ "sin_cos_embedding_keys": null,
982
+ "mean_std_embedding_keys": null,
983
+ "action_configs": null
984
+ },
985
+ "action": {
986
+ "delta_indices": [
987
+ 0,
988
+ 1,
989
+ 2,
990
+ 3,
991
+ 4,
992
+ 5,
993
+ 6,
994
+ 7,
995
+ 8,
996
+ 9,
997
+ 10,
998
+ 11,
999
+ 12,
1000
+ 13,
1001
+ 14,
1002
+ 15
1003
+ ],
1004
+ "modality_keys": [
1005
+ "end_effector_position",
1006
+ "end_effector_rotation",
1007
+ "gripper_close"
1008
+ ],
1009
+ "sin_cos_embedding_keys": null,
1010
+ "mean_std_embedding_keys": null,
1011
+ "action_configs": [
1012
+ {
1013
+ "rep": "DELTA",
1014
+ "type": "EEF",
1015
+ "format": "DEFAULT",
1016
+ "state_key": null
1017
+ },
1018
+ {
1019
+ "rep": "DELTA",
1020
+ "type": "EEF",
1021
+ "format": "DEFAULT",
1022
+ "state_key": null
1023
+ },
1024
+ {
1025
+ "rep": "ABSOLUTE",
1026
+ "type": "NON_EEF",
1027
+ "format": "DEFAULT",
1028
+ "state_key": null
1029
+ }
1030
+ ]
1031
+ },
1032
+ "language": {
1033
+ "delta_indices": [
1034
+ 0
1035
+ ],
1036
+ "modality_keys": [
1037
+ "annotation.human.action.task_description"
1038
+ ],
1039
+ "sin_cos_embedding_keys": null,
1040
+ "mean_std_embedding_keys": null,
1041
+ "action_configs": null
1042
+ }
1043
+ },
1044
+ "agibot_gripper": {
1045
+ "video": {
1046
+ "delta_indices": [
1047
+ -6,
1048
+ -4,
1049
+ -2,
1050
+ 0
1051
+ ],
1052
+ "modality_keys": [
1053
+ "primary",
1054
+ "wrist_left",
1055
+ "wrist_right"
1056
+ ],
1057
+ "sin_cos_embedding_keys": null,
1058
+ "mean_std_embedding_keys": null,
1059
+ "action_configs": null
1060
+ },
1061
+ "state": {
1062
+ "delta_indices": [
1063
+ 0
1064
+ ],
1065
+ "modality_keys": [
1066
+ "state"
1067
+ ],
1068
+ "sin_cos_embedding_keys": null,
1069
+ "mean_std_embedding_keys": null,
1070
+ "action_configs": null
1071
+ },
1072
+ "action": {
1073
+ "delta_indices": [
1074
+ 0,
1075
+ 1,
1076
+ 2,
1077
+ 3,
1078
+ 4,
1079
+ 5,
1080
+ 6,
1081
+ 7,
1082
+ 8,
1083
+ 9,
1084
+ 10,
1085
+ 11,
1086
+ 12,
1087
+ 13,
1088
+ 14,
1089
+ 15
1090
+ ],
1091
+ "modality_keys": [
1092
+ "action"
1093
+ ],
1094
+ "sin_cos_embedding_keys": null,
1095
+ "mean_std_embedding_keys": null,
1096
+ "action_configs": [
1097
+ {
1098
+ "rep": "ABSOLUTE",
1099
+ "type": "NON_EEF",
1100
+ "format": "DEFAULT",
1101
+ "state_key": null
1102
+ }
1103
+ ]
1104
+ },
1105
+ "language": {
1106
+ "delta_indices": [
1107
+ 0
1108
+ ],
1109
+ "modality_keys": [
1110
+ "annotation.human.action.task_description"
1111
+ ],
1112
+ "sin_cos_embedding_keys": null,
1113
+ "mean_std_embedding_keys": null,
1114
+ "action_configs": null
1115
+ }
1116
+ },
1117
+ "agibot_dexhand": {
1118
+ "video": {
1119
+ "delta_indices": [
1120
+ -6,
1121
+ -4,
1122
+ -2,
1123
+ 0
1124
+ ],
1125
+ "modality_keys": [
1126
+ "primary"
1127
+ ],
1128
+ "sin_cos_embedding_keys": null,
1129
+ "mean_std_embedding_keys": null,
1130
+ "action_configs": null
1131
+ },
1132
+ "state": {
1133
+ "delta_indices": [
1134
+ 0
1135
+ ],
1136
+ "modality_keys": [
1137
+ "state"
1138
+ ],
1139
+ "sin_cos_embedding_keys": null,
1140
+ "mean_std_embedding_keys": null,
1141
+ "action_configs": null
1142
+ },
1143
+ "action": {
1144
+ "delta_indices": [
1145
+ 0,
1146
+ 1,
1147
+ 2,
1148
+ 3,
1149
+ 4,
1150
+ 5,
1151
+ 6,
1152
+ 7,
1153
+ 8,
1154
+ 9,
1155
+ 10,
1156
+ 11,
1157
+ 12,
1158
+ 13,
1159
+ 14,
1160
+ 15
1161
+ ],
1162
+ "modality_keys": [
1163
+ "action"
1164
+ ],
1165
+ "sin_cos_embedding_keys": null,
1166
+ "mean_std_embedding_keys": null,
1167
+ "action_configs": [
1168
+ {
1169
+ "rep": "ABSOLUTE",
1170
+ "type": "NON_EEF",
1171
+ "format": "DEFAULT",
1172
+ "state_key": null
1173
+ }
1174
+ ]
1175
+ },
1176
+ "language": {
1177
+ "delta_indices": [
1178
+ 0
1179
+ ],
1180
+ "modality_keys": [
1181
+ "annotation.human.action.task_description"
1182
+ ],
1183
+ "sin_cos_embedding_keys": null,
1184
+ "mean_std_embedding_keys": null,
1185
+ "action_configs": null
1186
+ }
1187
+ },
1188
+ "fmb_dataset": {
1189
+ "video": {
1190
+ "delta_indices": [
1191
+ -6,
1192
+ -4,
1193
+ -2,
1194
+ 0
1195
+ ],
1196
+ "modality_keys": [
1197
+ "primary",
1198
+ "secondary",
1199
+ "wrist"
1200
+ ],
1201
+ "sin_cos_embedding_keys": null,
1202
+ "mean_std_embedding_keys": null,
1203
+ "action_configs": null
1204
+ },
1205
+ "state": {
1206
+ "delta_indices": [
1207
+ 0
1208
+ ],
1209
+ "modality_keys": [
1210
+ "end_effector_position",
1211
+ "end_effector_rotation",
1212
+ "gripper_position"
1213
+ ],
1214
+ "sin_cos_embedding_keys": null,
1215
+ "mean_std_embedding_keys": null,
1216
+ "action_configs": null
1217
+ },
1218
+ "action": {
1219
+ "delta_indices": [
1220
+ 0,
1221
+ 1,
1222
+ 2,
1223
+ 3,
1224
+ 4,
1225
+ 5,
1226
+ 6,
1227
+ 7,
1228
+ 8,
1229
+ 9,
1230
+ 10,
1231
+ 11,
1232
+ 12,
1233
+ 13,
1234
+ 14,
1235
+ 15
1236
+ ],
1237
+ "modality_keys": [
1238
+ "end_effector_position",
1239
+ "end_effector_rotation",
1240
+ "gripper_close"
1241
+ ],
1242
+ "sin_cos_embedding_keys": null,
1243
+ "mean_std_embedding_keys": null,
1244
+ "action_configs": [
1245
+ {
1246
+ "rep": "DELTA",
1247
+ "type": "EEF",
1248
+ "format": "DEFAULT",
1249
+ "state_key": null
1250
+ },
1251
+ {
1252
+ "rep": "DELTA",
1253
+ "type": "EEF",
1254
+ "format": "DEFAULT",
1255
+ "state_key": null
1256
+ },
1257
+ {
1258
+ "rep": "ABSOLUTE",
1259
+ "type": "NON_EEF",
1260
+ "format": "DEFAULT",
1261
+ "state_key": null
1262
+ }
1263
+ ]
1264
+ },
1265
+ "language": {
1266
+ "delta_indices": [
1267
+ 0
1268
+ ],
1269
+ "modality_keys": [
1270
+ "annotation.human.action.task_description"
1271
+ ],
1272
+ "sin_cos_embedding_keys": null,
1273
+ "mean_std_embedding_keys": null,
1274
+ "action_configs": null
1275
+ }
1276
+ },
1277
+ "berkeley_cable_routing": {
1278
+ "video": {
1279
+ "delta_indices": [
1280
+ -6,
1281
+ -4,
1282
+ -2,
1283
+ 0
1284
+ ],
1285
+ "modality_keys": [
1286
+ "primary",
1287
+ "secondary",
1288
+ "wrist"
1289
+ ],
1290
+ "sin_cos_embedding_keys": null,
1291
+ "mean_std_embedding_keys": null,
1292
+ "action_configs": null
1293
+ },
1294
+ "state": {
1295
+ "delta_indices": [
1296
+ 0
1297
+ ],
1298
+ "modality_keys": [
1299
+ "joint_position"
1300
+ ],
1301
+ "sin_cos_embedding_keys": null,
1302
+ "mean_std_embedding_keys": null,
1303
+ "action_configs": null
1304
+ },
1305
+ "action": {
1306
+ "delta_indices": [
1307
+ 0,
1308
+ 1,
1309
+ 2,
1310
+ 3,
1311
+ 4,
1312
+ 5,
1313
+ 6,
1314
+ 7,
1315
+ 8,
1316
+ 9,
1317
+ 10,
1318
+ 11,
1319
+ 12,
1320
+ 13,
1321
+ 14,
1322
+ 15
1323
+ ],
1324
+ "modality_keys": [
1325
+ "end_effector_position",
1326
+ "end_effector_rotation",
1327
+ "gripper_close"
1328
+ ],
1329
+ "sin_cos_embedding_keys": null,
1330
+ "mean_std_embedding_keys": null,
1331
+ "action_configs": [
1332
+ {
1333
+ "rep": "DELTA",
1334
+ "type": "EEF",
1335
+ "format": "DEFAULT",
1336
+ "state_key": null
1337
+ },
1338
+ {
1339
+ "rep": "DELTA",
1340
+ "type": "EEF",
1341
+ "format": "DEFAULT",
1342
+ "state_key": null
1343
+ },
1344
+ {
1345
+ "rep": "ABSOLUTE",
1346
+ "type": "NON_EEF",
1347
+ "format": "DEFAULT",
1348
+ "state_key": null
1349
+ }
1350
+ ]
1351
+ },
1352
+ "language": {
1353
+ "delta_indices": [
1354
+ 0
1355
+ ],
1356
+ "modality_keys": [
1357
+ "annotation.human.action.task_description"
1358
+ ],
1359
+ "sin_cos_embedding_keys": null,
1360
+ "mean_std_embedding_keys": null,
1361
+ "action_configs": null
1362
+ }
1363
+ },
1364
+ "fractal20220817_data": {
1365
+ "video": {
1366
+ "delta_indices": [
1367
+ -6,
1368
+ -4,
1369
+ -2,
1370
+ 0
1371
+ ],
1372
+ "modality_keys": [
1373
+ "primary"
1374
+ ],
1375
+ "sin_cos_embedding_keys": null,
1376
+ "mean_std_embedding_keys": null,
1377
+ "action_configs": null
1378
+ },
1379
+ "state": {
1380
+ "delta_indices": [
1381
+ 0
1382
+ ],
1383
+ "modality_keys": [
1384
+ "end_effector_position",
1385
+ "end_effector_rotation",
1386
+ "gripper_position"
1387
+ ],
1388
+ "sin_cos_embedding_keys": null,
1389
+ "mean_std_embedding_keys": null,
1390
+ "action_configs": null
1391
+ },
1392
+ "action": {
1393
+ "delta_indices": [
1394
+ 0,
1395
+ 1,
1396
+ 2,
1397
+ 3,
1398
+ 4,
1399
+ 5,
1400
+ 6,
1401
+ 7,
1402
+ 8,
1403
+ 9,
1404
+ 10,
1405
+ 11,
1406
+ 12,
1407
+ 13,
1408
+ 14,
1409
+ 15
1410
+ ],
1411
+ "modality_keys": [
1412
+ "end_effector_position",
1413
+ "end_effector_rotation",
1414
+ "gripper_close"
1415
+ ],
1416
+ "sin_cos_embedding_keys": null,
1417
+ "mean_std_embedding_keys": null,
1418
+ "action_configs": [
1419
+ {
1420
+ "rep": "DELTA",
1421
+ "type": "EEF",
1422
+ "format": "DEFAULT",
1423
+ "state_key": null
1424
+ },
1425
+ {
1426
+ "rep": "DELTA",
1427
+ "type": "EEF",
1428
+ "format": "DEFAULT",
1429
+ "state_key": null
1430
+ },
1431
+ {
1432
+ "rep": "ABSOLUTE",
1433
+ "type": "NON_EEF",
1434
+ "format": "DEFAULT",
1435
+ "state_key": null
1436
+ }
1437
+ ]
1438
+ },
1439
+ "language": {
1440
+ "delta_indices": [
1441
+ 0
1442
+ ],
1443
+ "modality_keys": [
1444
+ "annotation.human.action.task_description"
1445
+ ],
1446
+ "sin_cos_embedding_keys": null,
1447
+ "mean_std_embedding_keys": null,
1448
+ "action_configs": null
1449
+ }
1450
+ },
1451
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": {
1452
+ "video": {
1453
+ "delta_indices": [
1454
+ -6,
1455
+ -4,
1456
+ -2,
1457
+ 0
1458
+ ],
1459
+ "modality_keys": [
1460
+ "primary"
1461
+ ],
1462
+ "sin_cos_embedding_keys": null,
1463
+ "mean_std_embedding_keys": null,
1464
+ "action_configs": null
1465
+ },
1466
+ "state": {
1467
+ "delta_indices": [
1468
+ 0
1469
+ ],
1470
+ "modality_keys": [
1471
+ "joint_position"
1472
+ ],
1473
+ "sin_cos_embedding_keys": null,
1474
+ "mean_std_embedding_keys": null,
1475
+ "action_configs": null
1476
+ },
1477
+ "action": {
1478
+ "delta_indices": [
1479
+ 0,
1480
+ 1,
1481
+ 2,
1482
+ 3,
1483
+ 4,
1484
+ 5,
1485
+ 6,
1486
+ 7,
1487
+ 8,
1488
+ 9,
1489
+ 10,
1490
+ 11,
1491
+ 12,
1492
+ 13,
1493
+ 14,
1494
+ 15
1495
+ ],
1496
+ "modality_keys": [
1497
+ "end_effector_position",
1498
+ "end_effector_rotation",
1499
+ "gripper_close"
1500
+ ],
1501
+ "sin_cos_embedding_keys": null,
1502
+ "mean_std_embedding_keys": null,
1503
+ "action_configs": [
1504
+ {
1505
+ "rep": "DELTA",
1506
+ "type": "EEF",
1507
+ "format": "DEFAULT",
1508
+ "state_key": null
1509
+ },
1510
+ {
1511
+ "rep": "DELTA",
1512
+ "type": "EEF",
1513
+ "format": "DEFAULT",
1514
+ "state_key": null
1515
+ },
1516
+ {
1517
+ "rep": "ABSOLUTE",
1518
+ "type": "NON_EEF",
1519
+ "format": "DEFAULT",
1520
+ "state_key": null
1521
+ }
1522
+ ]
1523
+ },
1524
+ "language": {
1525
+ "delta_indices": [
1526
+ 0
1527
+ ],
1528
+ "modality_keys": [
1529
+ "annotation.human.action.task_description"
1530
+ ],
1531
+ "sin_cos_embedding_keys": null,
1532
+ "mean_std_embedding_keys": null,
1533
+ "action_configs": null
1534
+ }
1535
+ },
1536
+ "dobbe": {
1537
+ "video": {
1538
+ "delta_indices": [
1539
+ -6,
1540
+ -4,
1541
+ -2,
1542
+ 0
1543
+ ],
1544
+ "modality_keys": [
1545
+ "wrist"
1546
+ ],
1547
+ "sin_cos_embedding_keys": null,
1548
+ "mean_std_embedding_keys": null,
1549
+ "action_configs": null
1550
+ },
1551
+ "state": {
1552
+ "delta_indices": [
1553
+ 0
1554
+ ],
1555
+ "modality_keys": [
1556
+ "end_effector_position",
1557
+ "end_effector_rotation",
1558
+ "gripper_position"
1559
+ ],
1560
+ "sin_cos_embedding_keys": null,
1561
+ "mean_std_embedding_keys": null,
1562
+ "action_configs": null
1563
+ },
1564
+ "action": {
1565
+ "delta_indices": [
1566
+ 0,
1567
+ 1,
1568
+ 2,
1569
+ 3,
1570
+ 4,
1571
+ 5,
1572
+ 6,
1573
+ 7,
1574
+ 8,
1575
+ 9,
1576
+ 10,
1577
+ 11,
1578
+ 12,
1579
+ 13,
1580
+ 14,
1581
+ 15
1582
+ ],
1583
+ "modality_keys": [
1584
+ "end_effector_position",
1585
+ "end_effector_rotation",
1586
+ "gripper_close"
1587
+ ],
1588
+ "sin_cos_embedding_keys": null,
1589
+ "mean_std_embedding_keys": null,
1590
+ "action_configs": [
1591
+ {
1592
+ "rep": "DELTA",
1593
+ "type": "EEF",
1594
+ "format": "DEFAULT",
1595
+ "state_key": null
1596
+ },
1597
+ {
1598
+ "rep": "DELTA",
1599
+ "type": "EEF",
1600
+ "format": "DEFAULT",
1601
+ "state_key": null
1602
+ },
1603
+ {
1604
+ "rep": "ABSOLUTE",
1605
+ "type": "NON_EEF",
1606
+ "format": "DEFAULT",
1607
+ "state_key": null
1608
+ }
1609
+ ]
1610
+ },
1611
+ "language": {
1612
+ "delta_indices": [
1613
+ 0
1614
+ ],
1615
+ "modality_keys": [
1616
+ "annotation.human.action.task_description"
1617
+ ],
1618
+ "sin_cos_embedding_keys": null,
1619
+ "mean_std_embedding_keys": null,
1620
+ "action_configs": null
1621
+ }
1622
+ },
1623
+ "humanoid_everyday_h1": {
1624
+ "video": {
1625
+ "delta_indices": [
1626
+ -6,
1627
+ -4,
1628
+ -2,
1629
+ 0
1630
+ ],
1631
+ "modality_keys": [
1632
+ "egocentric_resized"
1633
+ ],
1634
+ "sin_cos_embedding_keys": null,
1635
+ "mean_std_embedding_keys": null,
1636
+ "action_configs": null
1637
+ },
1638
+ "state": {
1639
+ "delta_indices": [
1640
+ 0
1641
+ ],
1642
+ "modality_keys": [
1643
+ "left_arm",
1644
+ "left_hand",
1645
+ "right_arm",
1646
+ "right_hand"
1647
+ ],
1648
+ "sin_cos_embedding_keys": null,
1649
+ "mean_std_embedding_keys": null,
1650
+ "action_configs": null
1651
+ },
1652
+ "action": {
1653
+ "delta_indices": [
1654
+ 0,
1655
+ 1,
1656
+ 2,
1657
+ 3,
1658
+ 4,
1659
+ 5,
1660
+ 6,
1661
+ 7,
1662
+ 8,
1663
+ 9,
1664
+ 10,
1665
+ 11,
1666
+ 12,
1667
+ 13,
1668
+ 14,
1669
+ 15
1670
+ ],
1671
+ "modality_keys": [
1672
+ "left_arm",
1673
+ "left_hand",
1674
+ "right_arm",
1675
+ "right_hand"
1676
+ ],
1677
+ "sin_cos_embedding_keys": null,
1678
+ "mean_std_embedding_keys": null,
1679
+ "action_configs": [
1680
+ {
1681
+ "rep": "ABSOLUTE",
1682
+ "type": "NON_EEF",
1683
+ "format": "DEFAULT",
1684
+ "state_key": null
1685
+ },
1686
+ {
1687
+ "rep": "ABSOLUTE",
1688
+ "type": "NON_EEF",
1689
+ "format": "DEFAULT",
1690
+ "state_key": null
1691
+ },
1692
+ {
1693
+ "rep": "ABSOLUTE",
1694
+ "type": "NON_EEF",
1695
+ "format": "DEFAULT",
1696
+ "state_key": null
1697
+ },
1698
+ {
1699
+ "rep": "ABSOLUTE",
1700
+ "type": "NON_EEF",
1701
+ "format": "DEFAULT",
1702
+ "state_key": null
1703
+ }
1704
+ ]
1705
+ },
1706
+ "language": {
1707
+ "delta_indices": [
1708
+ 0
1709
+ ],
1710
+ "modality_keys": [
1711
+ "annotation.human.action.task_description"
1712
+ ],
1713
+ "sin_cos_embedding_keys": null,
1714
+ "mean_std_embedding_keys": null,
1715
+ "action_configs": null
1716
+ }
1717
+ },
1718
+ "stanford_hydra_dataset_converted_externally_to_rlds": {
1719
+ "video": {
1720
+ "delta_indices": [
1721
+ -6,
1722
+ -4,
1723
+ -2,
1724
+ 0
1725
+ ],
1726
+ "modality_keys": [
1727
+ "primary",
1728
+ "wrist"
1729
+ ],
1730
+ "sin_cos_embedding_keys": null,
1731
+ "mean_std_embedding_keys": null,
1732
+ "action_configs": null
1733
+ },
1734
+ "state": {
1735
+ "delta_indices": [
1736
+ 0
1737
+ ],
1738
+ "modality_keys": [
1739
+ "end_effector_position",
1740
+ "end_effector_rotation",
1741
+ "gripper_position"
1742
+ ],
1743
+ "sin_cos_embedding_keys": null,
1744
+ "mean_std_embedding_keys": null,
1745
+ "action_configs": null
1746
+ },
1747
+ "action": {
1748
+ "delta_indices": [
1749
+ 0,
1750
+ 1,
1751
+ 2,
1752
+ 3,
1753
+ 4,
1754
+ 5,
1755
+ 6,
1756
+ 7,
1757
+ 8,
1758
+ 9,
1759
+ 10,
1760
+ 11,
1761
+ 12,
1762
+ 13,
1763
+ 14,
1764
+ 15
1765
+ ],
1766
+ "modality_keys": [
1767
+ "end_effector_position",
1768
+ "end_effector_rotation",
1769
+ "gripper_close"
1770
+ ],
1771
+ "sin_cos_embedding_keys": null,
1772
+ "mean_std_embedding_keys": null,
1773
+ "action_configs": [
1774
+ {
1775
+ "rep": "DELTA",
1776
+ "type": "EEF",
1777
+ "format": "DEFAULT",
1778
+ "state_key": null
1779
+ },
1780
+ {
1781
+ "rep": "DELTA",
1782
+ "type": "EEF",
1783
+ "format": "DEFAULT",
1784
+ "state_key": null
1785
+ },
1786
+ {
1787
+ "rep": "ABSOLUTE",
1788
+ "type": "NON_EEF",
1789
+ "format": "DEFAULT",
1790
+ "state_key": null
1791
+ }
1792
+ ]
1793
+ },
1794
+ "language": {
1795
+ "delta_indices": [
1796
+ 0
1797
+ ],
1798
+ "modality_keys": [
1799
+ "annotation.human.action.task_description"
1800
+ ],
1801
+ "sin_cos_embedding_keys": null,
1802
+ "mean_std_embedding_keys": null,
1803
+ "action_configs": null
1804
+ }
1805
+ },
1806
+ "berkeley_autolab_ur5": {
1807
+ "video": {
1808
+ "delta_indices": [
1809
+ -6,
1810
+ -4,
1811
+ -2,
1812
+ 0
1813
+ ],
1814
+ "modality_keys": [
1815
+ "primary",
1816
+ "wrist"
1817
+ ],
1818
+ "sin_cos_embedding_keys": null,
1819
+ "mean_std_embedding_keys": null,
1820
+ "action_configs": null
1821
+ },
1822
+ "state": {
1823
+ "delta_indices": [
1824
+ 0
1825
+ ],
1826
+ "modality_keys": [
1827
+ "end_effector_position",
1828
+ "end_effector_rotation",
1829
+ "gripper_position"
1830
+ ],
1831
+ "sin_cos_embedding_keys": null,
1832
+ "mean_std_embedding_keys": null,
1833
+ "action_configs": null
1834
+ },
1835
+ "action": {
1836
+ "delta_indices": [
1837
+ 0,
1838
+ 1,
1839
+ 2,
1840
+ 3,
1841
+ 4,
1842
+ 5,
1843
+ 6,
1844
+ 7,
1845
+ 8,
1846
+ 9,
1847
+ 10,
1848
+ 11,
1849
+ 12,
1850
+ 13,
1851
+ 14,
1852
+ 15
1853
+ ],
1854
+ "modality_keys": [
1855
+ "end_effector_position",
1856
+ "end_effector_rotation",
1857
+ "gripper_close"
1858
+ ],
1859
+ "sin_cos_embedding_keys": null,
1860
+ "mean_std_embedding_keys": null,
1861
+ "action_configs": [
1862
+ {
1863
+ "rep": "DELTA",
1864
+ "type": "EEF",
1865
+ "format": "DEFAULT",
1866
+ "state_key": null
1867
+ },
1868
+ {
1869
+ "rep": "DELTA",
1870
+ "type": "EEF",
1871
+ "format": "DEFAULT",
1872
+ "state_key": null
1873
+ },
1874
+ {
1875
+ "rep": "ABSOLUTE",
1876
+ "type": "NON_EEF",
1877
+ "format": "DEFAULT",
1878
+ "state_key": null
1879
+ }
1880
+ ]
1881
+ },
1882
+ "language": {
1883
+ "delta_indices": [
1884
+ 0
1885
+ ],
1886
+ "modality_keys": [
1887
+ "annotation.human.action.task_description"
1888
+ ],
1889
+ "sin_cos_embedding_keys": null,
1890
+ "mean_std_embedding_keys": null,
1891
+ "action_configs": null
1892
+ }
1893
+ },
1894
+ "austin_sirius_dataset_converted_externally_to_rlds": {
1895
+ "video": {
1896
+ "delta_indices": [
1897
+ -6,
1898
+ -4,
1899
+ -2,
1900
+ 0
1901
+ ],
1902
+ "modality_keys": [
1903
+ "primary",
1904
+ "wrist"
1905
+ ],
1906
+ "sin_cos_embedding_keys": null,
1907
+ "mean_std_embedding_keys": null,
1908
+ "action_configs": null
1909
+ },
1910
+ "state": {
1911
+ "delta_indices": [
1912
+ 0
1913
+ ],
1914
+ "modality_keys": [
1915
+ "end_effector_position",
1916
+ "end_effector_rotation",
1917
+ "gripper_position"
1918
+ ],
1919
+ "sin_cos_embedding_keys": null,
1920
+ "mean_std_embedding_keys": null,
1921
+ "action_configs": null
1922
+ },
1923
+ "action": {
1924
+ "delta_indices": [
1925
+ 0,
1926
+ 1,
1927
+ 2,
1928
+ 3,
1929
+ 4,
1930
+ 5,
1931
+ 6,
1932
+ 7,
1933
+ 8,
1934
+ 9,
1935
+ 10,
1936
+ 11,
1937
+ 12,
1938
+ 13,
1939
+ 14,
1940
+ 15
1941
+ ],
1942
+ "modality_keys": [
1943
+ "end_effector_position",
1944
+ "end_effector_rotation",
1945
+ "gripper_close"
1946
+ ],
1947
+ "sin_cos_embedding_keys": null,
1948
+ "mean_std_embedding_keys": null,
1949
+ "action_configs": [
1950
+ {
1951
+ "rep": "DELTA",
1952
+ "type": "EEF",
1953
+ "format": "DEFAULT",
1954
+ "state_key": null
1955
+ },
1956
+ {
1957
+ "rep": "DELTA",
1958
+ "type": "EEF",
1959
+ "format": "DEFAULT",
1960
+ "state_key": null
1961
+ },
1962
+ {
1963
+ "rep": "ABSOLUTE",
1964
+ "type": "NON_EEF",
1965
+ "format": "DEFAULT",
1966
+ "state_key": null
1967
+ }
1968
+ ]
1969
+ },
1970
+ "language": {
1971
+ "delta_indices": [
1972
+ 0
1973
+ ],
1974
+ "modality_keys": [
1975
+ "annotation.human.action.task_description"
1976
+ ],
1977
+ "sin_cos_embedding_keys": null,
1978
+ "mean_std_embedding_keys": null,
1979
+ "action_configs": null
1980
+ }
1981
+ },
1982
+ "nyu_franka_play_dataset_converted_externally_to_rlds": {
1983
+ "video": {
1984
+ "delta_indices": [
1985
+ -6,
1986
+ -4,
1987
+ -2,
1988
+ 0
1989
+ ],
1990
+ "modality_keys": [
1991
+ "primary",
1992
+ "secondary"
1993
+ ],
1994
+ "sin_cos_embedding_keys": null,
1995
+ "mean_std_embedding_keys": null,
1996
+ "action_configs": null
1997
+ },
1998
+ "state": {
1999
+ "delta_indices": [
2000
+ 0
2001
+ ],
2002
+ "modality_keys": [
2003
+ "end_effector_position",
2004
+ "end_effector_rotation",
2005
+ "gripper_position"
2006
+ ],
2007
+ "sin_cos_embedding_keys": null,
2008
+ "mean_std_embedding_keys": null,
2009
+ "action_configs": null
2010
+ },
2011
+ "action": {
2012
+ "delta_indices": [
2013
+ 0,
2014
+ 1,
2015
+ 2,
2016
+ 3,
2017
+ 4,
2018
+ 5,
2019
+ 6,
2020
+ 7,
2021
+ 8,
2022
+ 9,
2023
+ 10,
2024
+ 11,
2025
+ 12,
2026
+ 13,
2027
+ 14,
2028
+ 15
2029
+ ],
2030
+ "modality_keys": [
2031
+ "end_effector_position",
2032
+ "end_effector_rotation",
2033
+ "gripper_close"
2034
+ ],
2035
+ "sin_cos_embedding_keys": null,
2036
+ "mean_std_embedding_keys": null,
2037
+ "action_configs": [
2038
+ {
2039
+ "rep": "DELTA",
2040
+ "type": "EEF",
2041
+ "format": "DEFAULT",
2042
+ "state_key": null
2043
+ },
2044
+ {
2045
+ "rep": "DELTA",
2046
+ "type": "EEF",
2047
+ "format": "DEFAULT",
2048
+ "state_key": null
2049
+ },
2050
+ {
2051
+ "rep": "ABSOLUTE",
2052
+ "type": "NON_EEF",
2053
+ "format": "DEFAULT",
2054
+ "state_key": null
2055
+ }
2056
+ ]
2057
+ },
2058
+ "language": {
2059
+ "delta_indices": [
2060
+ 0
2061
+ ],
2062
+ "modality_keys": [
2063
+ "annotation.human.action.task_description"
2064
+ ],
2065
+ "sin_cos_embedding_keys": null,
2066
+ "mean_std_embedding_keys": null,
2067
+ "action_configs": null
2068
+ }
2069
+ },
2070
+ "toto": {
2071
+ "video": {
2072
+ "delta_indices": [
2073
+ -6,
2074
+ -4,
2075
+ -2,
2076
+ 0
2077
+ ],
2078
+ "modality_keys": [
2079
+ "primary"
2080
+ ],
2081
+ "sin_cos_embedding_keys": null,
2082
+ "mean_std_embedding_keys": null,
2083
+ "action_configs": null
2084
+ },
2085
+ "state": {
2086
+ "delta_indices": [
2087
+ 0
2088
+ ],
2089
+ "modality_keys": [
2090
+ "joint_position",
2091
+ "gripper_position"
2092
+ ],
2093
+ "sin_cos_embedding_keys": null,
2094
+ "mean_std_embedding_keys": null,
2095
+ "action_configs": null
2096
+ },
2097
+ "action": {
2098
+ "delta_indices": [
2099
+ 0,
2100
+ 1,
2101
+ 2,
2102
+ 3,
2103
+ 4,
2104
+ 5,
2105
+ 6,
2106
+ 7,
2107
+ 8,
2108
+ 9,
2109
+ 10,
2110
+ 11,
2111
+ 12,
2112
+ 13,
2113
+ 14,
2114
+ 15
2115
+ ],
2116
+ "modality_keys": [
2117
+ "end_effector_position",
2118
+ "end_effector_rotation",
2119
+ "gripper_close"
2120
+ ],
2121
+ "sin_cos_embedding_keys": null,
2122
+ "mean_std_embedding_keys": null,
2123
+ "action_configs": [
2124
+ {
2125
+ "rep": "DELTA",
2126
+ "type": "EEF",
2127
+ "format": "DEFAULT",
2128
+ "state_key": null
2129
+ },
2130
+ {
2131
+ "rep": "DELTA",
2132
+ "type": "EEF",
2133
+ "format": "DEFAULT",
2134
+ "state_key": null
2135
+ },
2136
+ {
2137
+ "rep": "ABSOLUTE",
2138
+ "type": "NON_EEF",
2139
+ "format": "DEFAULT",
2140
+ "state_key": null
2141
+ }
2142
+ ]
2143
+ },
2144
+ "language": {
2145
+ "delta_indices": [
2146
+ 0
2147
+ ],
2148
+ "modality_keys": [
2149
+ "annotation.human.action.task_description"
2150
+ ],
2151
+ "sin_cos_embedding_keys": null,
2152
+ "mean_std_embedding_keys": null,
2153
+ "action_configs": null
2154
+ }
2155
+ },
2156
+ "dlr_edan_shared_control_converted_externally_to_rlds": {
2157
+ "video": {
2158
+ "delta_indices": [
2159
+ -6,
2160
+ -4,
2161
+ -2,
2162
+ 0
2163
+ ],
2164
+ "modality_keys": [
2165
+ "primary"
2166
+ ],
2167
+ "sin_cos_embedding_keys": null,
2168
+ "mean_std_embedding_keys": null,
2169
+ "action_configs": null
2170
+ },
2171
+ "state": {
2172
+ "delta_indices": [
2173
+ 0
2174
+ ],
2175
+ "modality_keys": [
2176
+ "end_effector_position",
2177
+ "end_effector_rotation",
2178
+ "gripper_position"
2179
+ ],
2180
+ "sin_cos_embedding_keys": null,
2181
+ "mean_std_embedding_keys": null,
2182
+ "action_configs": null
2183
+ },
2184
+ "action": {
2185
+ "delta_indices": [
2186
+ 0,
2187
+ 1,
2188
+ 2,
2189
+ 3,
2190
+ 4,
2191
+ 5,
2192
+ 6,
2193
+ 7,
2194
+ 8,
2195
+ 9,
2196
+ 10,
2197
+ 11,
2198
+ 12,
2199
+ 13,
2200
+ 14,
2201
+ 15
2202
+ ],
2203
+ "modality_keys": [
2204
+ "end_effector_position",
2205
+ "end_effector_rotation",
2206
+ "gripper_close"
2207
+ ],
2208
+ "sin_cos_embedding_keys": null,
2209
+ "mean_std_embedding_keys": null,
2210
+ "action_configs": [
2211
+ {
2212
+ "rep": "DELTA",
2213
+ "type": "EEF",
2214
+ "format": "DEFAULT",
2215
+ "state_key": null
2216
+ },
2217
+ {
2218
+ "rep": "DELTA",
2219
+ "type": "EEF",
2220
+ "format": "DEFAULT",
2221
+ "state_key": null
2222
+ },
2223
+ {
2224
+ "rep": "ABSOLUTE",
2225
+ "type": "NON_EEF",
2226
+ "format": "DEFAULT",
2227
+ "state_key": null
2228
+ }
2229
+ ]
2230
+ },
2231
+ "language": {
2232
+ "delta_indices": [
2233
+ 0
2234
+ ],
2235
+ "modality_keys": [
2236
+ "annotation.human.action.task_description"
2237
+ ],
2238
+ "sin_cos_embedding_keys": null,
2239
+ "mean_std_embedding_keys": null,
2240
+ "action_configs": null
2241
+ }
2242
+ },
2243
+ "droid": {
2244
+ "video": {
2245
+ "delta_indices": [
2246
+ -6,
2247
+ -4,
2248
+ -2,
2249
+ 0
2250
+ ],
2251
+ "modality_keys": [
2252
+ "primary",
2253
+ "secondary",
2254
+ "wrist"
2255
+ ],
2256
+ "sin_cos_embedding_keys": null,
2257
+ "mean_std_embedding_keys": null,
2258
+ "action_configs": null
2259
+ },
2260
+ "state": {
2261
+ "delta_indices": [
2262
+ 0
2263
+ ],
2264
+ "modality_keys": [
2265
+ "end_effector_position",
2266
+ "end_effector_rotation",
2267
+ "gripper_position"
2268
+ ],
2269
+ "sin_cos_embedding_keys": null,
2270
+ "mean_std_embedding_keys": null,
2271
+ "action_configs": null
2272
+ },
2273
+ "action": {
2274
+ "delta_indices": [
2275
+ 0,
2276
+ 1,
2277
+ 2,
2278
+ 3,
2279
+ 4,
2280
+ 5,
2281
+ 6,
2282
+ 7,
2283
+ 8,
2284
+ 9,
2285
+ 10,
2286
+ 11,
2287
+ 12,
2288
+ 13,
2289
+ 14,
2290
+ 15
2291
+ ],
2292
+ "modality_keys": [
2293
+ "end_effector_position",
2294
+ "end_effector_rotation",
2295
+ "gripper_close"
2296
+ ],
2297
+ "sin_cos_embedding_keys": null,
2298
+ "mean_std_embedding_keys": null,
2299
+ "action_configs": [
2300
+ {
2301
+ "rep": "DELTA",
2302
+ "type": "EEF",
2303
+ "format": "DEFAULT",
2304
+ "state_key": null
2305
+ },
2306
+ {
2307
+ "rep": "DELTA",
2308
+ "type": "EEF",
2309
+ "format": "DEFAULT",
2310
+ "state_key": null
2311
+ },
2312
+ {
2313
+ "rep": "ABSOLUTE",
2314
+ "type": "NON_EEF",
2315
+ "format": "DEFAULT",
2316
+ "state_key": null
2317
+ }
2318
+ ]
2319
+ },
2320
+ "language": {
2321
+ "delta_indices": [
2322
+ 0
2323
+ ],
2324
+ "modality_keys": [
2325
+ "annotation.human.action.task_description"
2326
+ ],
2327
+ "sin_cos_embedding_keys": null,
2328
+ "mean_std_embedding_keys": null,
2329
+ "action_configs": null
2330
+ }
2331
+ },
2332
+ "cmu_stretch": {
2333
+ "video": {
2334
+ "delta_indices": [
2335
+ -6,
2336
+ -4,
2337
+ -2,
2338
+ 0
2339
+ ],
2340
+ "modality_keys": [
2341
+ "primary"
2342
+ ],
2343
+ "sin_cos_embedding_keys": null,
2344
+ "mean_std_embedding_keys": null,
2345
+ "action_configs": null
2346
+ },
2347
+ "state": {
2348
+ "delta_indices": [
2349
+ 0
2350
+ ],
2351
+ "modality_keys": [
2352
+ "end_effector_position",
2353
+ "end_effector_rotation",
2354
+ "gripper_position"
2355
+ ],
2356
+ "sin_cos_embedding_keys": null,
2357
+ "mean_std_embedding_keys": null,
2358
+ "action_configs": null
2359
+ },
2360
+ "action": {
2361
+ "delta_indices": [
2362
+ 0,
2363
+ 1,
2364
+ 2,
2365
+ 3,
2366
+ 4,
2367
+ 5,
2368
+ 6,
2369
+ 7,
2370
+ 8,
2371
+ 9,
2372
+ 10,
2373
+ 11,
2374
+ 12,
2375
+ 13,
2376
+ 14,
2377
+ 15
2378
+ ],
2379
+ "modality_keys": [
2380
+ "end_effector_position",
2381
+ "end_effector_rotation",
2382
+ "gripper_close"
2383
+ ],
2384
+ "sin_cos_embedding_keys": null,
2385
+ "mean_std_embedding_keys": null,
2386
+ "action_configs": [
2387
+ {
2388
+ "rep": "DELTA",
2389
+ "type": "EEF",
2390
+ "format": "DEFAULT",
2391
+ "state_key": null
2392
+ },
2393
+ {
2394
+ "rep": "DELTA",
2395
+ "type": "EEF",
2396
+ "format": "DEFAULT",
2397
+ "state_key": null
2398
+ },
2399
+ {
2400
+ "rep": "ABSOLUTE",
2401
+ "type": "NON_EEF",
2402
+ "format": "DEFAULT",
2403
+ "state_key": null
2404
+ }
2405
+ ]
2406
+ },
2407
+ "language": {
2408
+ "delta_indices": [
2409
+ 0
2410
+ ],
2411
+ "modality_keys": [
2412
+ "annotation.human.action.task_description"
2413
+ ],
2414
+ "sin_cos_embedding_keys": null,
2415
+ "mean_std_embedding_keys": null,
2416
+ "action_configs": null
2417
+ }
2418
+ },
2419
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
2420
+ "video": {
2421
+ "delta_indices": [
2422
+ -6,
2423
+ -4,
2424
+ -2,
2425
+ 0
2426
+ ],
2427
+ "modality_keys": [
2428
+ "primary",
2429
+ "wrist"
2430
+ ],
2431
+ "sin_cos_embedding_keys": null,
2432
+ "mean_std_embedding_keys": null,
2433
+ "action_configs": null
2434
+ },
2435
+ "state": {
2436
+ "delta_indices": [
2437
+ 0
2438
+ ],
2439
+ "modality_keys": [
2440
+ "end_effector_position",
2441
+ "end_effector_rotation",
2442
+ "gripper_position"
2443
+ ],
2444
+ "sin_cos_embedding_keys": null,
2445
+ "mean_std_embedding_keys": null,
2446
+ "action_configs": null
2447
+ },
2448
+ "action": {
2449
+ "delta_indices": [
2450
+ 0,
2451
+ 1,
2452
+ 2,
2453
+ 3,
2454
+ 4,
2455
+ 5,
2456
+ 6,
2457
+ 7,
2458
+ 8,
2459
+ 9,
2460
+ 10,
2461
+ 11,
2462
+ 12,
2463
+ 13,
2464
+ 14,
2465
+ 15
2466
+ ],
2467
+ "modality_keys": [
2468
+ "end_effector_position",
2469
+ "end_effector_rotation",
2470
+ "gripper_close"
2471
+ ],
2472
+ "sin_cos_embedding_keys": null,
2473
+ "mean_std_embedding_keys": null,
2474
+ "action_configs": [
2475
+ {
2476
+ "rep": "DELTA",
2477
+ "type": "EEF",
2478
+ "format": "DEFAULT",
2479
+ "state_key": null
2480
+ },
2481
+ {
2482
+ "rep": "DELTA",
2483
+ "type": "EEF",
2484
+ "format": "DEFAULT",
2485
+ "state_key": null
2486
+ },
2487
+ {
2488
+ "rep": "ABSOLUTE",
2489
+ "type": "NON_EEF",
2490
+ "format": "DEFAULT",
2491
+ "state_key": null
2492
+ }
2493
+ ]
2494
+ },
2495
+ "language": {
2496
+ "delta_indices": [
2497
+ 0
2498
+ ],
2499
+ "modality_keys": [
2500
+ "annotation.human.action.task_description"
2501
+ ],
2502
+ "sin_cos_embedding_keys": null,
2503
+ "mean_std_embedding_keys": null,
2504
+ "action_configs": null
2505
+ }
2506
+ },
2507
+ "jaco_play": {
2508
+ "video": {
2509
+ "delta_indices": [
2510
+ -6,
2511
+ -4,
2512
+ -2,
2513
+ 0
2514
+ ],
2515
+ "modality_keys": [
2516
+ "primary",
2517
+ "wrist"
2518
+ ],
2519
+ "sin_cos_embedding_keys": null,
2520
+ "mean_std_embedding_keys": null,
2521
+ "action_configs": null
2522
+ },
2523
+ "state": {
2524
+ "delta_indices": [
2525
+ 0
2526
+ ],
2527
+ "modality_keys": [
2528
+ "end_effector_position",
2529
+ "end_effector_rotation",
2530
+ "gripper_position"
2531
+ ],
2532
+ "sin_cos_embedding_keys": null,
2533
+ "mean_std_embedding_keys": null,
2534
+ "action_configs": null
2535
+ },
2536
+ "action": {
2537
+ "delta_indices": [
2538
+ 0,
2539
+ 1,
2540
+ 2,
2541
+ 3,
2542
+ 4,
2543
+ 5,
2544
+ 6,
2545
+ 7,
2546
+ 8,
2547
+ 9,
2548
+ 10,
2549
+ 11,
2550
+ 12,
2551
+ 13,
2552
+ 14,
2553
+ 15
2554
+ ],
2555
+ "modality_keys": [
2556
+ "end_effector_position",
2557
+ "end_effector_rotation",
2558
+ "gripper_close"
2559
+ ],
2560
+ "sin_cos_embedding_keys": null,
2561
+ "mean_std_embedding_keys": null,
2562
+ "action_configs": [
2563
+ {
2564
+ "rep": "DELTA",
2565
+ "type": "EEF",
2566
+ "format": "DEFAULT",
2567
+ "state_key": null
2568
+ },
2569
+ {
2570
+ "rep": "DELTA",
2571
+ "type": "EEF",
2572
+ "format": "DEFAULT",
2573
+ "state_key": null
2574
+ },
2575
+ {
2576
+ "rep": "ABSOLUTE",
2577
+ "type": "NON_EEF",
2578
+ "format": "DEFAULT",
2579
+ "state_key": null
2580
+ }
2581
+ ]
2582
+ },
2583
+ "language": {
2584
+ "delta_indices": [
2585
+ 0
2586
+ ],
2587
+ "modality_keys": [
2588
+ "annotation.human.action.task_description"
2589
+ ],
2590
+ "sin_cos_embedding_keys": null,
2591
+ "mean_std_embedding_keys": null,
2592
+ "action_configs": null
2593
+ }
2594
+ },
2595
+ "furniture_bench_dataset_converted_externally_to_rlds": {
2596
+ "video": {
2597
+ "delta_indices": [
2598
+ -6,
2599
+ -4,
2600
+ -2,
2601
+ 0
2602
+ ],
2603
+ "modality_keys": [
2604
+ "primary",
2605
+ "wrist"
2606
+ ],
2607
+ "sin_cos_embedding_keys": null,
2608
+ "mean_std_embedding_keys": null,
2609
+ "action_configs": null
2610
+ },
2611
+ "state": {
2612
+ "delta_indices": [
2613
+ 0
2614
+ ],
2615
+ "modality_keys": [
2616
+ "end_effector_position",
2617
+ "end_effector_rotation",
2618
+ "gripper_position"
2619
+ ],
2620
+ "sin_cos_embedding_keys": null,
2621
+ "mean_std_embedding_keys": null,
2622
+ "action_configs": null
2623
+ },
2624
+ "action": {
2625
+ "delta_indices": [
2626
+ 0,
2627
+ 1,
2628
+ 2,
2629
+ 3,
2630
+ 4,
2631
+ 5,
2632
+ 6,
2633
+ 7,
2634
+ 8,
2635
+ 9,
2636
+ 10,
2637
+ 11,
2638
+ 12,
2639
+ 13,
2640
+ 14,
2641
+ 15
2642
+ ],
2643
+ "modality_keys": [
2644
+ "end_effector_position",
2645
+ "end_effector_rotation",
2646
+ "gripper_close"
2647
+ ],
2648
+ "sin_cos_embedding_keys": null,
2649
+ "mean_std_embedding_keys": null,
2650
+ "action_configs": [
2651
+ {
2652
+ "rep": "DELTA",
2653
+ "type": "EEF",
2654
+ "format": "DEFAULT",
2655
+ "state_key": null
2656
+ },
2657
+ {
2658
+ "rep": "DELTA",
2659
+ "type": "EEF",
2660
+ "format": "DEFAULT",
2661
+ "state_key": null
2662
+ },
2663
+ {
2664
+ "rep": "ABSOLUTE",
2665
+ "type": "NON_EEF",
2666
+ "format": "DEFAULT",
2667
+ "state_key": null
2668
+ }
2669
+ ]
2670
+ },
2671
+ "language": {
2672
+ "delta_indices": [
2673
+ 0
2674
+ ],
2675
+ "modality_keys": [
2676
+ "annotation.human.action.task_description"
2677
+ ],
2678
+ "sin_cos_embedding_keys": null,
2679
+ "mean_std_embedding_keys": null,
2680
+ "action_configs": null
2681
+ }
2682
+ },
2683
+ "humanoid_everyday_g1": {
2684
+ "video": {
2685
+ "delta_indices": [
2686
+ -6,
2687
+ -4,
2688
+ -2,
2689
+ 0
2690
+ ],
2691
+ "modality_keys": [
2692
+ "egocentric_resized"
2693
+ ],
2694
+ "sin_cos_embedding_keys": null,
2695
+ "mean_std_embedding_keys": null,
2696
+ "action_configs": null
2697
+ },
2698
+ "state": {
2699
+ "delta_indices": [
2700
+ 0
2701
+ ],
2702
+ "modality_keys": [
2703
+ "left_arm",
2704
+ "left_hand",
2705
+ "right_arm",
2706
+ "right_hand"
2707
+ ],
2708
+ "sin_cos_embedding_keys": null,
2709
+ "mean_std_embedding_keys": null,
2710
+ "action_configs": null
2711
+ },
2712
+ "action": {
2713
+ "delta_indices": [
2714
+ 0,
2715
+ 1,
2716
+ 2,
2717
+ 3,
2718
+ 4,
2719
+ 5,
2720
+ 6,
2721
+ 7,
2722
+ 8,
2723
+ 9,
2724
+ 10,
2725
+ 11,
2726
+ 12,
2727
+ 13,
2728
+ 14,
2729
+ 15
2730
+ ],
2731
+ "modality_keys": [
2732
+ "left_arm",
2733
+ "left_hand",
2734
+ "right_arm",
2735
+ "right_hand"
2736
+ ],
2737
+ "sin_cos_embedding_keys": null,
2738
+ "mean_std_embedding_keys": null,
2739
+ "action_configs": [
2740
+ {
2741
+ "rep": "ABSOLUTE",
2742
+ "type": "NON_EEF",
2743
+ "format": "DEFAULT",
2744
+ "state_key": null
2745
+ },
2746
+ {
2747
+ "rep": "ABSOLUTE",
2748
+ "type": "NON_EEF",
2749
+ "format": "DEFAULT",
2750
+ "state_key": null
2751
+ },
2752
+ {
2753
+ "rep": "ABSOLUTE",
2754
+ "type": "NON_EEF",
2755
+ "format": "DEFAULT",
2756
+ "state_key": null
2757
+ },
2758
+ {
2759
+ "rep": "ABSOLUTE",
2760
+ "type": "NON_EEF",
2761
+ "format": "DEFAULT",
2762
+ "state_key": null
2763
+ }
2764
+ ]
2765
+ },
2766
+ "language": {
2767
+ "delta_indices": [
2768
+ 0
2769
+ ],
2770
+ "modality_keys": [
2771
+ "annotation.human.action.task_description"
2772
+ ],
2773
+ "sin_cos_embedding_keys": null,
2774
+ "mean_std_embedding_keys": null,
2775
+ "action_configs": null
2776
+ }
2777
+ },
2778
+ "berkeley_fanuc_manipulation": {
2779
+ "video": {
2780
+ "delta_indices": [
2781
+ -6,
2782
+ -4,
2783
+ -2,
2784
+ 0
2785
+ ],
2786
+ "modality_keys": [
2787
+ "primary",
2788
+ "wrist"
2789
+ ],
2790
+ "sin_cos_embedding_keys": null,
2791
+ "mean_std_embedding_keys": null,
2792
+ "action_configs": null
2793
+ },
2794
+ "state": {
2795
+ "delta_indices": [
2796
+ 0
2797
+ ],
2798
+ "modality_keys": [
2799
+ "joint_position",
2800
+ "gripper_position"
2801
+ ],
2802
+ "sin_cos_embedding_keys": null,
2803
+ "mean_std_embedding_keys": null,
2804
+ "action_configs": null
2805
+ },
2806
+ "action": {
2807
+ "delta_indices": [
2808
+ 0,
2809
+ 1,
2810
+ 2,
2811
+ 3,
2812
+ 4,
2813
+ 5,
2814
+ 6,
2815
+ 7,
2816
+ 8,
2817
+ 9,
2818
+ 10,
2819
+ 11,
2820
+ 12,
2821
+ 13,
2822
+ 14,
2823
+ 15
2824
+ ],
2825
+ "modality_keys": [
2826
+ "end_effector_position",
2827
+ "end_effector_rotation",
2828
+ "gripper_close"
2829
+ ],
2830
+ "sin_cos_embedding_keys": null,
2831
+ "mean_std_embedding_keys": null,
2832
+ "action_configs": [
2833
+ {
2834
+ "rep": "DELTA",
2835
+ "type": "EEF",
2836
+ "format": "DEFAULT",
2837
+ "state_key": null
2838
+ },
2839
+ {
2840
+ "rep": "DELTA",
2841
+ "type": "EEF",
2842
+ "format": "DEFAULT",
2843
+ "state_key": null
2844
+ },
2845
+ {
2846
+ "rep": "ABSOLUTE",
2847
+ "type": "NON_EEF",
2848
+ "format": "DEFAULT",
2849
+ "state_key": null
2850
+ }
2851
+ ]
2852
+ },
2853
+ "language": {
2854
+ "delta_indices": [
2855
+ 0
2856
+ ],
2857
+ "modality_keys": [
2858
+ "annotation.human.action.task_description"
2859
+ ],
2860
+ "sin_cos_embedding_keys": null,
2861
+ "mean_std_embedding_keys": null,
2862
+ "action_configs": null
2863
+ }
2864
+ },
2865
+ "utaustin_mutex": {
2866
+ "video": {
2867
+ "delta_indices": [
2868
+ -6,
2869
+ -4,
2870
+ -2,
2871
+ 0
2872
+ ],
2873
+ "modality_keys": [
2874
+ "primary",
2875
+ "wrist"
2876
+ ],
2877
+ "sin_cos_embedding_keys": null,
2878
+ "mean_std_embedding_keys": null,
2879
+ "action_configs": null
2880
+ },
2881
+ "state": {
2882
+ "delta_indices": [
2883
+ 0
2884
+ ],
2885
+ "modality_keys": [
2886
+ "joint_position",
2887
+ "gripper_position"
2888
+ ],
2889
+ "sin_cos_embedding_keys": null,
2890
+ "mean_std_embedding_keys": null,
2891
+ "action_configs": null
2892
+ },
2893
+ "action": {
2894
+ "delta_indices": [
2895
+ 0,
2896
+ 1,
2897
+ 2,
2898
+ 3,
2899
+ 4,
2900
+ 5,
2901
+ 6,
2902
+ 7,
2903
+ 8,
2904
+ 9,
2905
+ 10,
2906
+ 11,
2907
+ 12,
2908
+ 13,
2909
+ 14,
2910
+ 15
2911
+ ],
2912
+ "modality_keys": [
2913
+ "end_effector_position",
2914
+ "end_effector_rotation",
2915
+ "gripper_close"
2916
+ ],
2917
+ "sin_cos_embedding_keys": null,
2918
+ "mean_std_embedding_keys": null,
2919
+ "action_configs": [
2920
+ {
2921
+ "rep": "DELTA",
2922
+ "type": "EEF",
2923
+ "format": "DEFAULT",
2924
+ "state_key": null
2925
+ },
2926
+ {
2927
+ "rep": "DELTA",
2928
+ "type": "EEF",
2929
+ "format": "DEFAULT",
2930
+ "state_key": null
2931
+ },
2932
+ {
2933
+ "rep": "ABSOLUTE",
2934
+ "type": "NON_EEF",
2935
+ "format": "DEFAULT",
2936
+ "state_key": null
2937
+ }
2938
+ ]
2939
+ },
2940
+ "language": {
2941
+ "delta_indices": [
2942
+ 0
2943
+ ],
2944
+ "modality_keys": [
2945
+ "annotation.human.action.task_description"
2946
+ ],
2947
+ "sin_cos_embedding_keys": null,
2948
+ "mean_std_embedding_keys": null,
2949
+ "action_configs": null
2950
+ }
2951
+ },
2952
+ "general_embodiment": {
2953
+ "video": {
2954
+ "delta_indices": [
2955
+ -6,
2956
+ -4,
2957
+ -2,
2958
+ 0
2959
+ ],
2960
+ "modality_keys": [
2961
+ "left_view",
2962
+ "right_view",
2963
+ "wrist_view"
2964
+ ],
2965
+ "sin_cos_embedding_keys": null,
2966
+ "mean_std_embedding_keys": null,
2967
+ "action_configs": null
2968
+ },
2969
+ "state": {
2970
+ "delta_indices": [
2971
+ 0
2972
+ ],
2973
+ "modality_keys": [
2974
+ "end_effector_position_relative",
2975
+ "end_effector_rotation_relative",
2976
+ "gripper_qpos",
2977
+ "base_position",
2978
+ "base_rotation"
2979
+ ],
2980
+ "sin_cos_embedding_keys": null,
2981
+ "mean_std_embedding_keys": null,
2982
+ "action_configs": null
2983
+ },
2984
+ "action": {
2985
+ "delta_indices": [
2986
+ 0,
2987
+ 1,
2988
+ 2,
2989
+ 3,
2990
+ 4,
2991
+ 5,
2992
+ 6,
2993
+ 7,
2994
+ 8,
2995
+ 9,
2996
+ 10,
2997
+ 11,
2998
+ 12,
2999
+ 13,
3000
+ 14,
3001
+ 15
3002
+ ],
3003
+ "modality_keys": [
3004
+ "end_effector_position",
3005
+ "end_effector_rotation",
3006
+ "gripper_close",
3007
+ "base_motion",
3008
+ "control_mode"
3009
+ ],
3010
+ "sin_cos_embedding_keys": null,
3011
+ "mean_std_embedding_keys": null,
3012
+ "action_configs": [
3013
+ {
3014
+ "rep": "DELTA",
3015
+ "type": "EEF",
3016
+ "format": "DEFAULT",
3017
+ "state_key": null
3018
+ },
3019
+ {
3020
+ "rep": "DELTA",
3021
+ "type": "EEF",
3022
+ "format": "DEFAULT",
3023
+ "state_key": null
3024
+ },
3025
+ {
3026
+ "rep": "ABSOLUTE",
3027
+ "type": "NON_EEF",
3028
+ "format": "DEFAULT",
3029
+ "state_key": null
3030
+ },
3031
+ {
3032
+ "rep": "DELTA",
3033
+ "type": "NON_EEF",
3034
+ "format": "DEFAULT",
3035
+ "state_key": null
3036
+ },
3037
+ {
3038
+ "rep": "ABSOLUTE",
3039
+ "type": "NON_EEF",
3040
+ "format": "DEFAULT",
3041
+ "state_key": null
3042
+ }
3043
+ ]
3044
+ },
3045
+ "language": {
3046
+ "delta_indices": [
3047
+ 0
3048
+ ],
3049
+ "modality_keys": [
3050
+ "annotation.human.action.task_description"
3051
+ ],
3052
+ "sin_cos_embedding_keys": null,
3053
+ "mean_std_embedding_keys": null,
3054
+ "action_configs": null
3055
+ }
3056
+ }
3057
+ },
3058
+ "random_rotation_angle": null,
3059
+ "color_jitter_params": {
3060
+ "brightness": 0.3,
3061
+ "contrast": 0.4,
3062
+ "saturation": 0.5,
3063
+ "hue": 0.08
3064
+ },
3065
+ "model_name": "RLWRLD/RLDX-1-VLM",
3066
+ "model_type": "vtc_qwen3_vl",
3067
+ "formalize_language": true,
3068
+ "max_state_dim": 64,
3069
+ "max_action_dim": 64,
3070
+ "max_action_horizon": 16,
3071
+ "use_percentiles": true,
3072
+ "clip_outliers": true,
3073
+ "apply_sincos_state_encoding": false,
3074
+ "use_relative_action": true,
3075
+ "memory_length": 1,
3076
+ "general_embodiment_train_ratio": 0,
3077
+ "image_max_area": 65536,
3078
+ "image_resize_m": 32,
3079
+ "random_crop_fraction": null
3080
+ }
3081
+ }
processor/statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
teaser.png ADDED

Git LFS Details

  • SHA256: 6b34b11f6c8e2699766e26aa210be9e4b3e5f3f8f45ed009ae5c7ef07c7c7cd7
  • Pointer size: 133 Bytes
  • Size of remote file: 10.4 MB