jaehyunkang committed on
Commit
d6bfa6d
·
0 Parent(s):

RLDX-1 Release

Browse files
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ teaser.png filter=lfs diff=lfs merge=lfs -text
37
+ architecture.png filter=lfs diff=lfs merge=lfs -text
LICENSE.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RLWRLD Model License v1.0
2
+
3
+ ## 1. Definitions
4
+
5
+ "Licensor" means RLWRLD, INC. and its affiliates.
6
+
7
+ "Model" means the machine learning model, including learnt weights, parameters, configuration files, and documentation made available under this license.
8
+
9
+ "Derivative Model" means all (a) modifications to the Model, (b) works based on the Model, and (c) any other derivative works of the Model, including models fine-tuned from the Model.
10
+
11
+ "You" means an individual or legal entity exercising permissions granted by this license.
12
+
13
+ ## 2. License Grant
14
+
15
+ Subject to the terms and conditions of this license, Licensor grants to You a perpetual, worldwide, non-exclusive, royalty-free license to use, reproduce, prepare derivative works of, publicly display, publicly perform, and distribute the Model and any Derivative Models.
16
+
17
+ ## 3. Conditions and Limitations
18
+
19
+ **3.1 Non-Commercial Use.** The Model and any Derivative Models may only be used for non-commercial purposes. "Non-commercial" means for academic research, educational, personal, or evaluation purposes only, and does not include any use primarily intended for or directed toward commercial advantage or monetary compensation.
20
+
21
+ **3.2 Attribution.** You must give appropriate credit to Licensor, provide a link to this license, and indicate if changes were made. You must include the following attribution notice with any distribution of the Model or Derivative Model:
22
+
23
+ > "Licensed under the RLWRLD Model License v1.0"
24
+
25
+ **3.3 Share-Alike.** If You distribute a Derivative Model, You must do so under this same license, or another license that includes at minimum (a) a non-commercial use limitation no less restrictive than Section 3.1 and (b) a share-alike requirement no less restrictive than this Section 3.3.
26
+
27
+ **3.4 Redistribution.** You may distribute copies of the Model or Derivative Models provided that You (a) include a complete copy of this license, (b) retain all copyright, trademark, and attribution notices, and (c) comply with all conditions in this Section 3.
28
+
29
+ **3.5 Use Restrictions.** The Model and any Derivative Models shall not be used for: (a) military, weapons development, or defense applications; (b) surveillance or monitoring of individuals without their consent; or (c) any use that violates applicable laws or regulations.
30
+
31
+ **3.6 Trademarks.** This license does not grant any rights to use Licensor's names, logos, or trademarks, except as required for reasonable and customary use in describing the origin of the Model and reproducing the notices described in this license.
32
+
33
+ **3.7 Patent Claims.** If You or Your affiliate(s) bring or threaten to bring any claim or litigation (including any claim, cross-claim, or counterclaim in a lawsuit) against any entity to enforce any patents that You allege are infringed by the Model, then any rights granted to You under this license will terminate immediately.
34
+
35
+ **3.8 Termination.** If You violate any term of this license, Your rights under this license will terminate immediately.
36
+
37
+ ## 4. Third-Party Components
38
+
39
+ The Model may include or be distributed with third-party components that are subject to separate license terms and notices. Such components are subject to their respective licenses, including any notices and disclaimers contained therein. Licensor does not grant any rights with respect to third-party components beyond those provided under the applicable third-party licenses.
40
+
41
+ ## 5. Disclaimer of Warranty
42
+
43
+ THE MODEL IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NONINFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE MODEL, DERIVATIVE MODELS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MODEL, DERIVATIVE MODELS AND ANY OUTPUT AND RESULTS.
44
+
45
+ ## 6. Limitation of Liability
46
+
47
+ IN NO EVENT SHALL LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE OR THE USE OR INABILITY TO USE THE MODEL, DERIVATIVE MODELS, OR ANY OUTPUTS THEREOF, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48
+
49
+ ## 7. Indemnity
50
+
51
+ You shall indemnify and hold harmless Licensor from and against any claim by any third party arising out of or related to Your use or distribution of the Model, Derivative Models, or any outputs thereof.
52
+
53
+ ## 8. Feedback
54
+
55
+ If You provide feedback, suggestions, or improvements regarding the Model, Licensor may use such feedback without restriction or compensation to You.
56
+
57
+ ## 9. General Provisions
58
+
59
+ **9.1 Governing Law.** This license will be governed by and construed in accordance with the laws of the State of Delaware, United States, without regard to its conflict of laws rules. The UN Convention on Contracts for International Sale of Goods does not apply to this license.
60
+
61
+ **9.2 License Updates.** Licensor may update this license to comply with legal and regulatory requirements at any time. You agree to either comply with any updated license or cease Your use and distribution of the Model and any Derivative Model.
README.md ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: rlwrld-model-license-v1.0
4
+ license_link: LICENSE.md
5
+ library_name: transformers
6
+ pipeline_tag: robotics
7
+ tags:
8
+ - robotics
9
+ - vla
10
+ - vision-language-action
11
+ - manipulation
12
+ - flow-matching
13
+ - rldx
14
+ - simpler
15
+ - widowx
16
+ base_model: RLWRLD/RLDX-1-PT
17
+ ---
18
+
19
+ # RLDX-1-FT-SIMPLER-WIDOWX
20
+
21
+ [Paper](https://arxiv.org/abs/2605.03269)  ·  [Project page](https://rlwrld.ai/rldx-1)  ·  [Code](https://github.com/RLWRLD/RLDX-1)  ·  [Models](https://huggingface.co/collections/RLWRLD/rldx-1)
22
+
23
+ <p align="center">
24
+ <img src="teaser.png" width="100%" alt="RLDX-1 teaser">
25
+ </p>
26
+
27
+ **RLDX-1** is a general-purpose Robot Foundation Model designed for dexterous
28
+ manipulation. Powered by a **Multi-Stream Action Transformer (MSAT)**, it
29
+ seamlessly unifies multimodal perception (visual + tactile), high-DoF
30
+ actuation, and memory-aware decision-making in a single architecture.
31
+
32
+ This repository hosts **`RLDX-1-FT-SIMPLER-WIDOWX`** — RLDX-1 finetuned for
33
+ the **SimplerEnv WidowX** benchmark (BridgeData-style WidowX 250 tasks).
34
+ It achieves **71.9%** average success.
35
+
36
+ ## Highlights
37
+
38
+ - **Multi-Stream Action Transformer (MSAT).** Cognition, physics, and
39
+ action each get a dedicated stream coupled by joint self-attention —
40
+ an extension of MM-DiT to action modeling.
41
+ - **Motion awareness.** Multi-frame observations + a motion module
42
+ capture temporal dynamics; intermediate VLM layers compress video
43
+ tokens to keep the policy efficient.
44
+ - **Long-term memory.** A memory module fuses past cognition features
45
+ with the current ones for history-grounded decisions beyond a short
46
+ multi-frame window.
47
+ - **Physical sensing.** Tactile and torque enter as a dedicated physics
48
+ stream; the decoder is jointly trained to predict future physical
49
+ signals.
50
+ - **Three-stage training.** Pre-training (generalization) → mid-training
51
+ (functionality) → post-training (task adaptation), with synthetic data
52
+ augmenting rare manipulation scenarios.
53
+ - **Real-time inference.** Static graph capture + custom fused kernels
54
+ bring the all-modality model to **43.7 ms / step on RTX 5090
55
+ (1.63× speedup, >22 Hz)**.
56
+
57
+ ## Performance
58
+
59
+ | Benchmark | Success Rate |
60
+ |---|---|
61
+ | SIMPLER WidowX | **71.9%** |
62
+
63
+ ## Quick start
64
+
65
+ ### Installation
66
+
67
+ ```bash
68
+ git clone https://github.com/RLWRLD/RLDX-1.git
69
+ cd RLDX-1
70
+ uv sync --python 3.10
71
+ uv pip install -e .
72
+ ```
73
+
74
+ ### Inference
75
+
76
+ ```python
77
+ from rldx.policy.rldx_policy import RLDXPolicy
78
+ from rldx.data.embodiment_tags import EmbodimentTag
79
+
80
+ policy = RLDXPolicy(
81
+ model_path="RLWRLD/RLDX-1-FT-SIMPLER-WIDOWX",
82
+ embodiment_tag=EmbodimentTag.OXE_BRIDGE_ORIG,
83
+ device="cuda:0",
84
+ )
85
+
86
+ action = policy.get_action(observation)
87
+ ```
88
+
89
+ ### Real-time serving (ZeroMQ)
90
+
91
+ ```bash
92
+ uv run python rldx/eval/run_rldx_server.py \
93
+ --model-path RLWRLD/RLDX-1-FT-SIMPLER-WIDOWX \
94
+ --embodiment-tag OXE_BRIDGE_ORIG \
95
+ --host 0.0.0.0 --port 20000
96
+ ```
97
+
98
+ To reproduce the benchmark numbers end-to-end, see
99
+ [`run_scripts/eval/simpler/README.md`](https://github.com/RLWRLD/RLDX-1/blob/main/run_scripts/eval/simpler/README.md).
100
+
101
+ ## Model details
102
+
103
+ - **Architecture:** Multi-Stream Action Transformer (MSAT) policy on a
104
+ Qwen3-VL backbone with cognition-token perceptual summary. Trained with
105
+ flow matching.
106
+ - **Inputs:** RGB video (default 4 frames), proprioceptive state, language
107
+ instruction.
108
+ - **Outputs:** Action chunks of length 16.
109
+ - **Embodiment tag:** `OXE_BRIDGE_ORIG`.
110
+ - **Base model:** [`RLWRLD/RLDX-1-PT`](https://huggingface.co/RLWRLD/RLDX-1-PT).
111
+ - **Backbone:** [`Qwen/Qwen3-VL-8B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct).
112
+ - **Finetune data:** SimplerEnv WidowX training set (BridgeData subset of OXE).
113
+ - **Params:** 6.9B.
114
+
115
+ For the full architectural walkthrough see
116
+ [`docs/architecture.md`](https://github.com/RLWRLD/RLDX-1/blob/main/docs/architecture.md).
117
+
118
+ ## RLDX-1 model family
119
+
120
+ | Checkpoint | Description |
121
+ |---|---|
122
+ | [`RLDX-1-PT`](https://huggingface.co/RLWRLD/RLDX-1-PT) | Multi-source pretrained foundation |
123
+ | [`RLDX-1-VLM`](https://huggingface.co/RLWRLD/RLDX-1-VLM) | Qwen3-VL-8B vision-language backbone |
124
+ | [`RLDX-1-FT-ROBOCASA`](https://huggingface.co/RLWRLD/RLDX-1-FT-ROBOCASA) | RoboCasa Kitchen 24-task finetune |
125
+ | [`RLDX-1-FT-RC365`](https://huggingface.co/RLWRLD/RLDX-1-FT-RC365) | RoboCasa-365 cross-task finetune |
126
+ | [`RLDX-1-FT-LIBERO`](https://huggingface.co/RLWRLD/RLDX-1-FT-LIBERO) | LIBERO 4-task suite (goal, object, spatial, long) finetune |
127
+ | [`RLDX-1-FT-SIMPLER-GOOGLE`](https://huggingface.co/RLWRLD/RLDX-1-FT-SIMPLER-GOOGLE) | SIMPLER Google VM/VA finetune |
128
+ | [`RLDX-1-FT-SIMPLER-WIDOWX`](https://huggingface.co/RLWRLD/RLDX-1-FT-SIMPLER-WIDOWX) | SIMPLER WidowX finetune (this repo) |
129
+ | [`RLDX-1-FT-GR1`](https://huggingface.co/RLWRLD/RLDX-1-FT-GR1) | GR-1 Tabletop finetune |
130
+ | [`RLDX-1-MT-DROID`](https://huggingface.co/RLWRLD/RLDX-1-MT-DROID) | DROID mid-train |
131
+ | [`RLDX-1-MT-ALLEX`](https://huggingface.co/RLWRLD/RLDX-1-MT-ALLEX) | All add-ons (memory + motion + physics + video) |
132
+
133
+ ## Intended use & limitations
134
+
135
+ **Intended use.** Research on robotic manipulation, simulation benchmarking
136
+ on SimplerEnv WidowX, and non-commercial real-robot deployment under the
137
+ conditions of the RLWRLD Model License v1.0.
138
+
139
+ **Out of scope.** Commercial deployment, military or weapons applications,
140
+ non-consensual surveillance, and any use that violates applicable laws or
141
+ regulations. See [`LICENSE.md`](LICENSE.md) §3.1 and §3.5 for the full terms.
142
+
143
+ **Limitations.** Conditioned on the WidowX 250 BridgeData embodiment. For
144
+ Google-Robot evaluation use
145
+ [`RLDX-1-FT-SIMPLER-GOOGLE`](https://huggingface.co/RLWRLD/RLDX-1-FT-SIMPLER-GOOGLE);
146
+ for other embodiments, finetune from
147
+ [`RLDX-1-PT`](https://huggingface.co/RLWRLD/RLDX-1-PT) instead.
148
+
149
+ ## Citation
150
+
151
+ ```bibtex
152
+ @misc{rldx2026,
153
+ title={RLDX-1 Technical Report},
154
+ author={Kim, Dongyoung and Jang, Huiwon and Koo, Myungkyu and Jang, Suhyeok and Kim, Taeyoung and others},
155
+ year={2026},
156
+ note={RLWRLD},
157
+ eprint={2605.03269},
158
+ archivePrefix={arXiv},
159
+ url={https://arxiv.org/abs/2605.03269}
160
+ }
161
+ ```
162
+
163
+ ## License
164
+
165
+ Released under the **RLWRLD Model License v1.0** — a non-commercial license
166
+ with attribution and share-alike requirements. See [`LICENSE.md`](LICENSE.md) for
167
+ the full text. By using this model you agree to those terms, including the
168
+ use restrictions in §3.5.
architecture.png ADDED

Git LFS Details

  • SHA256: 8d0e305139502965d4289446add15e9e11c34dcc8106ad526fa8c957c12595d3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RLDX"
4
+ ],
5
+ "attn_implementation": null,
6
+ "backbone_model_type": "vtc_qwen3_vl",
7
+ "backbone_trainable_params_fp32": true,
8
+ "color_jitter_params": {
9
+ "brightness": 0.3,
10
+ "contrast": 0.4,
11
+ "hue": 0.08,
12
+ "saturation": 0.5
13
+ },
14
+ "conversation_image_first": false,
15
+ "diffusion_model_cfg": {
16
+ "action_model_max_seq_len": 512,
17
+ "attention_head_dim": 64,
18
+ "depth_multi_stream": 4,
19
+ "depth_single_stream": 8,
20
+ "dropout": 0.2,
21
+ "final_dropout": true,
22
+ "num_attention_heads": 24,
23
+ "output_dim": 1024,
24
+ "positional_embeddings": "rope_sa_only",
25
+ "pre_norm": "layer_norm",
26
+ "qk_norm": "rms_norm",
27
+ "rope_theta": 10000.0,
28
+ "sa_dim": 1536,
29
+ "set_triple_stream_for_mq": false,
30
+ "set_triple_stream_for_state": false,
31
+ "temb_type": "input_token",
32
+ "use_swiglu": true,
33
+ "vl_dim": 4096
34
+ },
35
+ "dtype": "bfloat16",
36
+ "load_bf16": true,
37
+ "memory_cfg": {
38
+ "hidden_size": 4096,
39
+ "intermediate_size": 16384,
40
+ "max_position_embeddings": 32,
41
+ "num_attention_heads": 16,
42
+ "num_hidden_layers": 2,
43
+ "num_key_value_heads": 16,
44
+ "rms_norm_eps": 1e-05,
45
+ "use_causal_attn": true,
46
+ "use_rope": true
47
+ },
48
+ "memory_video_delta_indices": [
49
+ -48,
50
+ -32,
51
+ -16,
52
+ 0
53
+ ],
54
+ "model_name": "RLWRLD/RLDX-1-VLM",
55
+ "model_type": "RLDX-1",
56
+ "n_cog_tokens": 64,
57
+ "general_embodiment_train_ratio": 0,
58
+ "qwen3_collator": true,
59
+ "random_rotation_angle": null,
60
+ "reproject_vision": false,
61
+ "state_dropout_prob": 0.0,
62
+ "transformers_version": "4.57.0",
63
+ "tune_diffusion_model": true,
64
+ "tune_llm": false,
65
+ "tune_projector": true,
66
+ "tune_top_llm_layers": 4,
67
+ "tune_visual": false,
68
+ "tune_vlln": true,
69
+ "use_relative_action": true,
70
+ "use_video": true,
71
+ "video_length": 4
72
+ }
embodiment_id.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "general_embodiment": 0,
3
+ "fractal20220817_data": 1,
4
+ "kuka": 2,
5
+ "bridge_orig": 3,
6
+ "taco_play": 4,
7
+ "jaco_play": 5,
8
+ "berkeley_cable_routing": 6,
9
+ "roboturk": 7,
10
+ "viola": 8,
11
+ "berkeley_autolab_ur5": 9,
12
+ "toto": 10,
13
+ "language_table": 11,
14
+ "stanford_hydra_dataset_converted_externally_to_rlds": 12,
15
+ "austin_buds_dataset_converted_externally_to_rlds": 13,
16
+ "nyu_franka_play_dataset_converted_externally_to_rlds": 14,
17
+ "furniture_bench_dataset_converted_externally_to_rlds": 15,
18
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": 16,
19
+ "austin_sailor_dataset_converted_externally_to_rlds": 17,
20
+ "austin_sirius_dataset_converted_externally_to_rlds": 18,
21
+ "dlr_edan_shared_control_converted_externally_to_rlds": 19,
22
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": 20,
23
+ "utaustin_mutex": 21,
24
+ "berkeley_fanuc_manipulation": 22,
25
+ "cmu_stretch": 23,
26
+ "bc_z": 24,
27
+ "fmb_dataset": 25,
28
+ "dobbe": 26,
29
+ "droid": 27,
30
+ "agibot_dexhand": 28,
31
+ "agibot_gripper": 29,
32
+ "galaxea": 30,
33
+ "humanoid_everyday_g1": 31,
34
+ "humanoid_everyday_h1": 32,
35
+ "action_net": 33,
36
+ "neural_gr1": 34,
37
+ "new_embodiment": 35
38
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67bd5d3f9d3471052ae5fe24036c20a2831f96209fa4f9adc75a9dde9bf6e44d
3
+ size 4912540968
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fadfc08bd5467a851a6610c7c3433d6d1f124dc5f0de7fa7a126f0cec535307f
3
+ size 4446192352
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa3d31f98edc428a219a573733c03ad00d1248fbc305e1ce9a083c328aa61290
3
+ size 4467155576
model.safetensors.index.json ADDED
@@ -0,0 +1,730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 6912894784,
4
+ "total_size": 13825888896
5
+ },
6
+ "weight_map": {
7
+ "backbone.cog_emb": "model-00001-of-00003.safetensors",
8
+ "backbone.qwen_model.model.language_model.embed_tokens.weight": "model-00001-of-00003.safetensors",
9
+ "backbone.qwen_model.model.language_model.layers.0.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
10
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
11
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
12
+ "backbone.qwen_model.model.language_model.layers.0.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "backbone.qwen_model.model.language_model.layers.0.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
14
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
15
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
16
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
17
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
18
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
19
+ "backbone.qwen_model.model.language_model.layers.0.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
20
+ "backbone.qwen_model.model.language_model.layers.1.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
21
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
22
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
23
+ "backbone.qwen_model.model.language_model.layers.1.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
24
+ "backbone.qwen_model.model.language_model.layers.1.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
25
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
26
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
27
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
28
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
29
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
30
+ "backbone.qwen_model.model.language_model.layers.1.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
31
+ "backbone.qwen_model.model.language_model.layers.2.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
32
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
33
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
34
+ "backbone.qwen_model.model.language_model.layers.2.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
35
+ "backbone.qwen_model.model.language_model.layers.2.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
36
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
37
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
38
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
39
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
40
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
41
+ "backbone.qwen_model.model.language_model.layers.2.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
42
+ "backbone.qwen_model.model.language_model.layers.3.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
43
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
44
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
45
+ "backbone.qwen_model.model.language_model.layers.3.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
46
+ "backbone.qwen_model.model.language_model.layers.3.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
47
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
48
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
49
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
50
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
51
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
52
+ "backbone.qwen_model.model.language_model.layers.3.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
53
+ "backbone.qwen_model.model.language_model.layers.4.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
54
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
55
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
56
+ "backbone.qwen_model.model.language_model.layers.4.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
57
+ "backbone.qwen_model.model.language_model.layers.4.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
58
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
59
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
60
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
61
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
62
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
63
+ "backbone.qwen_model.model.language_model.layers.4.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
64
+ "backbone.qwen_model.model.language_model.layers.5.layer.input_layernorm.weight": "model-00001-of-00003.safetensors",
65
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
66
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
67
+ "backbone.qwen_model.model.language_model.layers.5.layer.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
68
+ "backbone.qwen_model.model.language_model.layers.5.layer.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
69
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
70
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
71
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
72
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
73
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
74
+ "backbone.qwen_model.model.language_model.layers.5.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
75
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
76
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
77
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
78
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
79
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
80
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
81
+ "backbone.qwen_model.model.language_model.layers.6.layer.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
82
+ "backbone.qwen_model.model.visual.blocks.0.attn.proj.bias": "model-00001-of-00003.safetensors",
83
+ "backbone.qwen_model.model.visual.blocks.0.attn.proj.weight": "model-00001-of-00003.safetensors",
84
+ "backbone.qwen_model.model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00003.safetensors",
85
+ "backbone.qwen_model.model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00003.safetensors",
86
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
87
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
88
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
89
+ "backbone.qwen_model.model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
90
+ "backbone.qwen_model.model.visual.blocks.0.norm1.bias": "model-00001-of-00003.safetensors",
91
+ "backbone.qwen_model.model.visual.blocks.0.norm1.weight": "model-00001-of-00003.safetensors",
92
+ "backbone.qwen_model.model.visual.blocks.0.norm2.bias": "model-00001-of-00003.safetensors",
93
+ "backbone.qwen_model.model.visual.blocks.0.norm2.weight": "model-00001-of-00003.safetensors",
94
+ "backbone.qwen_model.model.visual.blocks.1.attn.proj.bias": "model-00001-of-00003.safetensors",
95
+ "backbone.qwen_model.model.visual.blocks.1.attn.proj.weight": "model-00001-of-00003.safetensors",
96
+ "backbone.qwen_model.model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00003.safetensors",
97
+ "backbone.qwen_model.model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00003.safetensors",
98
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
99
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
100
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
101
+ "backbone.qwen_model.model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
102
+ "backbone.qwen_model.model.visual.blocks.1.norm1.bias": "model-00001-of-00003.safetensors",
103
+ "backbone.qwen_model.model.visual.blocks.1.norm1.weight": "model-00001-of-00003.safetensors",
104
+ "backbone.qwen_model.model.visual.blocks.1.norm2.bias": "model-00001-of-00003.safetensors",
105
+ "backbone.qwen_model.model.visual.blocks.1.norm2.weight": "model-00001-of-00003.safetensors",
106
+ "backbone.qwen_model.model.visual.blocks.10.attn.proj.bias": "model-00001-of-00003.safetensors",
107
+ "backbone.qwen_model.model.visual.blocks.10.attn.proj.weight": "model-00001-of-00003.safetensors",
108
+ "backbone.qwen_model.model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00003.safetensors",
109
+ "backbone.qwen_model.model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00003.safetensors",
110
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
111
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
112
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
113
+ "backbone.qwen_model.model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
114
+ "backbone.qwen_model.model.visual.blocks.10.norm1.bias": "model-00001-of-00003.safetensors",
115
+ "backbone.qwen_model.model.visual.blocks.10.norm1.weight": "model-00001-of-00003.safetensors",
116
+ "backbone.qwen_model.model.visual.blocks.10.norm2.bias": "model-00001-of-00003.safetensors",
117
+ "backbone.qwen_model.model.visual.blocks.10.norm2.weight": "model-00001-of-00003.safetensors",
118
+ "backbone.qwen_model.model.visual.blocks.11.attn.proj.bias": "model-00001-of-00003.safetensors",
119
+ "backbone.qwen_model.model.visual.blocks.11.attn.proj.weight": "model-00001-of-00003.safetensors",
120
+ "backbone.qwen_model.model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00003.safetensors",
121
+ "backbone.qwen_model.model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00003.safetensors",
122
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
123
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
124
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
125
+ "backbone.qwen_model.model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
126
+ "backbone.qwen_model.model.visual.blocks.11.norm1.bias": "model-00001-of-00003.safetensors",
127
+ "backbone.qwen_model.model.visual.blocks.11.norm1.weight": "model-00001-of-00003.safetensors",
128
+ "backbone.qwen_model.model.visual.blocks.11.norm2.bias": "model-00001-of-00003.safetensors",
129
+ "backbone.qwen_model.model.visual.blocks.11.norm2.weight": "model-00001-of-00003.safetensors",
130
+ "backbone.qwen_model.model.visual.blocks.12.attn.proj.bias": "model-00001-of-00003.safetensors",
131
+ "backbone.qwen_model.model.visual.blocks.12.attn.proj.weight": "model-00001-of-00003.safetensors",
132
+ "backbone.qwen_model.model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00003.safetensors",
133
+ "backbone.qwen_model.model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00003.safetensors",
134
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
135
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
136
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
137
+ "backbone.qwen_model.model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
138
+ "backbone.qwen_model.model.visual.blocks.12.norm1.bias": "model-00001-of-00003.safetensors",
139
+ "backbone.qwen_model.model.visual.blocks.12.norm1.weight": "model-00001-of-00003.safetensors",
140
+ "backbone.qwen_model.model.visual.blocks.12.norm2.bias": "model-00001-of-00003.safetensors",
141
+ "backbone.qwen_model.model.visual.blocks.12.norm2.weight": "model-00001-of-00003.safetensors",
142
+ "backbone.qwen_model.model.visual.blocks.13.attn.proj.bias": "model-00001-of-00003.safetensors",
143
+ "backbone.qwen_model.model.visual.blocks.13.attn.proj.weight": "model-00001-of-00003.safetensors",
144
+ "backbone.qwen_model.model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00003.safetensors",
145
+ "backbone.qwen_model.model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00003.safetensors",
146
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
147
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
148
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
149
+ "backbone.qwen_model.model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
150
+ "backbone.qwen_model.model.visual.blocks.13.norm1.bias": "model-00001-of-00003.safetensors",
151
+ "backbone.qwen_model.model.visual.blocks.13.norm1.weight": "model-00001-of-00003.safetensors",
152
+ "backbone.qwen_model.model.visual.blocks.13.norm2.bias": "model-00001-of-00003.safetensors",
153
+ "backbone.qwen_model.model.visual.blocks.13.norm2.weight": "model-00001-of-00003.safetensors",
154
+ "backbone.qwen_model.model.visual.blocks.14.attn.proj.bias": "model-00001-of-00003.safetensors",
155
+ "backbone.qwen_model.model.visual.blocks.14.attn.proj.weight": "model-00001-of-00003.safetensors",
156
+ "backbone.qwen_model.model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00003.safetensors",
157
+ "backbone.qwen_model.model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00003.safetensors",
158
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
159
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
160
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
161
+ "backbone.qwen_model.model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
162
+ "backbone.qwen_model.model.visual.blocks.14.norm1.bias": "model-00001-of-00003.safetensors",
163
+ "backbone.qwen_model.model.visual.blocks.14.norm1.weight": "model-00001-of-00003.safetensors",
164
+ "backbone.qwen_model.model.visual.blocks.14.norm2.bias": "model-00001-of-00003.safetensors",
165
+ "backbone.qwen_model.model.visual.blocks.14.norm2.weight": "model-00001-of-00003.safetensors",
166
+ "backbone.qwen_model.model.visual.blocks.15.attn.proj.bias": "model-00001-of-00003.safetensors",
167
+ "backbone.qwen_model.model.visual.blocks.15.attn.proj.weight": "model-00001-of-00003.safetensors",
168
+ "backbone.qwen_model.model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00003.safetensors",
169
+ "backbone.qwen_model.model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00003.safetensors",
170
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
171
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
172
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
173
+ "backbone.qwen_model.model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
174
+ "backbone.qwen_model.model.visual.blocks.15.norm1.bias": "model-00001-of-00003.safetensors",
175
+ "backbone.qwen_model.model.visual.blocks.15.norm1.weight": "model-00001-of-00003.safetensors",
176
+ "backbone.qwen_model.model.visual.blocks.15.norm2.bias": "model-00001-of-00003.safetensors",
177
+ "backbone.qwen_model.model.visual.blocks.15.norm2.weight": "model-00001-of-00003.safetensors",
178
+ "backbone.qwen_model.model.visual.blocks.16.attn.proj.bias": "model-00001-of-00003.safetensors",
179
+ "backbone.qwen_model.model.visual.blocks.16.attn.proj.weight": "model-00001-of-00003.safetensors",
180
+ "backbone.qwen_model.model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00003.safetensors",
181
+ "backbone.qwen_model.model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00003.safetensors",
182
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
183
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
184
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
185
+ "backbone.qwen_model.model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
186
+ "backbone.qwen_model.model.visual.blocks.16.norm1.bias": "model-00001-of-00003.safetensors",
187
+ "backbone.qwen_model.model.visual.blocks.16.norm1.weight": "model-00001-of-00003.safetensors",
188
+ "backbone.qwen_model.model.visual.blocks.16.norm2.bias": "model-00001-of-00003.safetensors",
189
+ "backbone.qwen_model.model.visual.blocks.16.norm2.weight": "model-00001-of-00003.safetensors",
190
+ "backbone.qwen_model.model.visual.blocks.17.attn.proj.bias": "model-00001-of-00003.safetensors",
191
+ "backbone.qwen_model.model.visual.blocks.17.attn.proj.weight": "model-00001-of-00003.safetensors",
192
+ "backbone.qwen_model.model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00003.safetensors",
193
+ "backbone.qwen_model.model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00003.safetensors",
194
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
195
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
196
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
197
+ "backbone.qwen_model.model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
198
+ "backbone.qwen_model.model.visual.blocks.17.norm1.bias": "model-00001-of-00003.safetensors",
199
+ "backbone.qwen_model.model.visual.blocks.17.norm1.weight": "model-00001-of-00003.safetensors",
200
+ "backbone.qwen_model.model.visual.blocks.17.norm2.bias": "model-00001-of-00003.safetensors",
201
+ "backbone.qwen_model.model.visual.blocks.17.norm2.weight": "model-00001-of-00003.safetensors",
202
+ "backbone.qwen_model.model.visual.blocks.18.attn.proj.bias": "model-00001-of-00003.safetensors",
203
+ "backbone.qwen_model.model.visual.blocks.18.attn.proj.weight": "model-00001-of-00003.safetensors",
204
+ "backbone.qwen_model.model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00003.safetensors",
205
+ "backbone.qwen_model.model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00003.safetensors",
206
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
207
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
208
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
209
+ "backbone.qwen_model.model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
210
+ "backbone.qwen_model.model.visual.blocks.18.norm1.bias": "model-00001-of-00003.safetensors",
211
+ "backbone.qwen_model.model.visual.blocks.18.norm1.weight": "model-00001-of-00003.safetensors",
212
+ "backbone.qwen_model.model.visual.blocks.18.norm2.bias": "model-00001-of-00003.safetensors",
213
+ "backbone.qwen_model.model.visual.blocks.18.norm2.weight": "model-00001-of-00003.safetensors",
214
+ "backbone.qwen_model.model.visual.blocks.19.attn.proj.bias": "model-00001-of-00003.safetensors",
215
+ "backbone.qwen_model.model.visual.blocks.19.attn.proj.weight": "model-00001-of-00003.safetensors",
216
+ "backbone.qwen_model.model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00003.safetensors",
217
+ "backbone.qwen_model.model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00003.safetensors",
218
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
219
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
220
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
221
+ "backbone.qwen_model.model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
222
+ "backbone.qwen_model.model.visual.blocks.19.norm1.bias": "model-00001-of-00003.safetensors",
223
+ "backbone.qwen_model.model.visual.blocks.19.norm1.weight": "model-00001-of-00003.safetensors",
224
+ "backbone.qwen_model.model.visual.blocks.19.norm2.bias": "model-00001-of-00003.safetensors",
225
+ "backbone.qwen_model.model.visual.blocks.19.norm2.weight": "model-00001-of-00003.safetensors",
226
+ "backbone.qwen_model.model.visual.blocks.2.attn.proj.bias": "model-00001-of-00003.safetensors",
227
+ "backbone.qwen_model.model.visual.blocks.2.attn.proj.weight": "model-00001-of-00003.safetensors",
228
+ "backbone.qwen_model.model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00003.safetensors",
229
+ "backbone.qwen_model.model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00003.safetensors",
230
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
231
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
232
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
233
+ "backbone.qwen_model.model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
234
+ "backbone.qwen_model.model.visual.blocks.2.norm1.bias": "model-00001-of-00003.safetensors",
235
+ "backbone.qwen_model.model.visual.blocks.2.norm1.weight": "model-00001-of-00003.safetensors",
236
+ "backbone.qwen_model.model.visual.blocks.2.norm2.bias": "model-00001-of-00003.safetensors",
237
+ "backbone.qwen_model.model.visual.blocks.2.norm2.weight": "model-00001-of-00003.safetensors",
238
+ "backbone.qwen_model.model.visual.blocks.20.attn.proj.bias": "model-00001-of-00003.safetensors",
239
+ "backbone.qwen_model.model.visual.blocks.20.attn.proj.weight": "model-00001-of-00003.safetensors",
240
+ "backbone.qwen_model.model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00003.safetensors",
241
+ "backbone.qwen_model.model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00003.safetensors",
242
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
243
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
244
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
245
+ "backbone.qwen_model.model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
246
+ "backbone.qwen_model.model.visual.blocks.20.norm1.bias": "model-00001-of-00003.safetensors",
247
+ "backbone.qwen_model.model.visual.blocks.20.norm1.weight": "model-00001-of-00003.safetensors",
248
+ "backbone.qwen_model.model.visual.blocks.20.norm2.bias": "model-00001-of-00003.safetensors",
249
+ "backbone.qwen_model.model.visual.blocks.20.norm2.weight": "model-00001-of-00003.safetensors",
250
+ "backbone.qwen_model.model.visual.blocks.21.attn.proj.bias": "model-00001-of-00003.safetensors",
251
+ "backbone.qwen_model.model.visual.blocks.21.attn.proj.weight": "model-00001-of-00003.safetensors",
252
+ "backbone.qwen_model.model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00003.safetensors",
253
+ "backbone.qwen_model.model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00003.safetensors",
254
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
255
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
256
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
257
+ "backbone.qwen_model.model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
258
+ "backbone.qwen_model.model.visual.blocks.21.norm1.bias": "model-00001-of-00003.safetensors",
259
+ "backbone.qwen_model.model.visual.blocks.21.norm1.weight": "model-00001-of-00003.safetensors",
260
+ "backbone.qwen_model.model.visual.blocks.21.norm2.bias": "model-00001-of-00003.safetensors",
261
+ "backbone.qwen_model.model.visual.blocks.21.norm2.weight": "model-00001-of-00003.safetensors",
262
+ "backbone.qwen_model.model.visual.blocks.22.attn.proj.bias": "model-00001-of-00003.safetensors",
263
+ "backbone.qwen_model.model.visual.blocks.22.attn.proj.weight": "model-00001-of-00003.safetensors",
264
+ "backbone.qwen_model.model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00003.safetensors",
265
+ "backbone.qwen_model.model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00003.safetensors",
266
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
267
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
268
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
269
+ "backbone.qwen_model.model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
270
+ "backbone.qwen_model.model.visual.blocks.22.norm1.bias": "model-00001-of-00003.safetensors",
271
+ "backbone.qwen_model.model.visual.blocks.22.norm1.weight": "model-00001-of-00003.safetensors",
272
+ "backbone.qwen_model.model.visual.blocks.22.norm2.bias": "model-00001-of-00003.safetensors",
273
+ "backbone.qwen_model.model.visual.blocks.22.norm2.weight": "model-00001-of-00003.safetensors",
274
+ "backbone.qwen_model.model.visual.blocks.23.attn.proj.bias": "model-00001-of-00003.safetensors",
275
+ "backbone.qwen_model.model.visual.blocks.23.attn.proj.weight": "model-00001-of-00003.safetensors",
276
+ "backbone.qwen_model.model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00003.safetensors",
277
+ "backbone.qwen_model.model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00003.safetensors",
278
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
279
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
280
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
281
+ "backbone.qwen_model.model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
282
+ "backbone.qwen_model.model.visual.blocks.23.norm1.bias": "model-00001-of-00003.safetensors",
283
+ "backbone.qwen_model.model.visual.blocks.23.norm1.weight": "model-00001-of-00003.safetensors",
284
+ "backbone.qwen_model.model.visual.blocks.23.norm2.bias": "model-00001-of-00003.safetensors",
285
+ "backbone.qwen_model.model.visual.blocks.23.norm2.weight": "model-00001-of-00003.safetensors",
286
+ "backbone.qwen_model.model.visual.blocks.24.attn.proj.bias": "model-00001-of-00003.safetensors",
287
+ "backbone.qwen_model.model.visual.blocks.24.attn.proj.weight": "model-00001-of-00003.safetensors",
288
+ "backbone.qwen_model.model.visual.blocks.24.attn.qkv.bias": "model-00001-of-00003.safetensors",
289
+ "backbone.qwen_model.model.visual.blocks.24.attn.qkv.weight": "model-00001-of-00003.safetensors",
290
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
291
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
292
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
293
+ "backbone.qwen_model.model.visual.blocks.24.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
294
+ "backbone.qwen_model.model.visual.blocks.24.norm1.bias": "model-00001-of-00003.safetensors",
295
+ "backbone.qwen_model.model.visual.blocks.24.norm1.weight": "model-00001-of-00003.safetensors",
296
+ "backbone.qwen_model.model.visual.blocks.24.norm2.bias": "model-00001-of-00003.safetensors",
297
+ "backbone.qwen_model.model.visual.blocks.24.norm2.weight": "model-00001-of-00003.safetensors",
298
+ "backbone.qwen_model.model.visual.blocks.25.attn.proj.bias": "model-00001-of-00003.safetensors",
299
+ "backbone.qwen_model.model.visual.blocks.25.attn.proj.weight": "model-00001-of-00003.safetensors",
300
+ "backbone.qwen_model.model.visual.blocks.25.attn.qkv.bias": "model-00001-of-00003.safetensors",
301
+ "backbone.qwen_model.model.visual.blocks.25.attn.qkv.weight": "model-00001-of-00003.safetensors",
302
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
303
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
304
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
305
+ "backbone.qwen_model.model.visual.blocks.25.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
306
+ "backbone.qwen_model.model.visual.blocks.25.norm1.bias": "model-00001-of-00003.safetensors",
307
+ "backbone.qwen_model.model.visual.blocks.25.norm1.weight": "model-00001-of-00003.safetensors",
308
+ "backbone.qwen_model.model.visual.blocks.25.norm2.bias": "model-00001-of-00003.safetensors",
309
+ "backbone.qwen_model.model.visual.blocks.25.norm2.weight": "model-00001-of-00003.safetensors",
310
+ "backbone.qwen_model.model.visual.blocks.26.attn.proj.bias": "model-00001-of-00003.safetensors",
311
+ "backbone.qwen_model.model.visual.blocks.26.attn.proj.weight": "model-00001-of-00003.safetensors",
312
+ "backbone.qwen_model.model.visual.blocks.26.attn.qkv.bias": "model-00001-of-00003.safetensors",
313
+ "backbone.qwen_model.model.visual.blocks.26.attn.qkv.weight": "model-00001-of-00003.safetensors",
314
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
315
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
316
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
317
+ "backbone.qwen_model.model.visual.blocks.26.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
318
+ "backbone.qwen_model.model.visual.blocks.26.norm1.bias": "model-00001-of-00003.safetensors",
319
+ "backbone.qwen_model.model.visual.blocks.26.norm1.weight": "model-00001-of-00003.safetensors",
320
+ "backbone.qwen_model.model.visual.blocks.26.norm2.bias": "model-00001-of-00003.safetensors",
321
+ "backbone.qwen_model.model.visual.blocks.26.norm2.weight": "model-00001-of-00003.safetensors",
322
+ "backbone.qwen_model.model.visual.blocks.3.attn.proj.bias": "model-00001-of-00003.safetensors",
323
+ "backbone.qwen_model.model.visual.blocks.3.attn.proj.weight": "model-00001-of-00003.safetensors",
324
+ "backbone.qwen_model.model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00003.safetensors",
325
+ "backbone.qwen_model.model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00003.safetensors",
326
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
327
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
328
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
329
+ "backbone.qwen_model.model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
330
+ "backbone.qwen_model.model.visual.blocks.3.norm1.bias": "model-00001-of-00003.safetensors",
331
+ "backbone.qwen_model.model.visual.blocks.3.norm1.weight": "model-00001-of-00003.safetensors",
332
+ "backbone.qwen_model.model.visual.blocks.3.norm2.bias": "model-00001-of-00003.safetensors",
333
+ "backbone.qwen_model.model.visual.blocks.3.norm2.weight": "model-00001-of-00003.safetensors",
334
+ "backbone.qwen_model.model.visual.blocks.4.attn.proj.bias": "model-00001-of-00003.safetensors",
335
+ "backbone.qwen_model.model.visual.blocks.4.attn.proj.weight": "model-00001-of-00003.safetensors",
336
+ "backbone.qwen_model.model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00003.safetensors",
337
+ "backbone.qwen_model.model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00003.safetensors",
338
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
339
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
340
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
341
+ "backbone.qwen_model.model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
342
+ "backbone.qwen_model.model.visual.blocks.4.norm1.bias": "model-00001-of-00003.safetensors",
343
+ "backbone.qwen_model.model.visual.blocks.4.norm1.weight": "model-00001-of-00003.safetensors",
344
+ "backbone.qwen_model.model.visual.blocks.4.norm2.bias": "model-00001-of-00003.safetensors",
345
+ "backbone.qwen_model.model.visual.blocks.4.norm2.weight": "model-00001-of-00003.safetensors",
346
+ "backbone.qwen_model.model.visual.blocks.5.attn.proj.bias": "model-00001-of-00003.safetensors",
347
+ "backbone.qwen_model.model.visual.blocks.5.attn.proj.weight": "model-00001-of-00003.safetensors",
348
+ "backbone.qwen_model.model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00003.safetensors",
349
+ "backbone.qwen_model.model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00003.safetensors",
350
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
351
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
352
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
353
+ "backbone.qwen_model.model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
354
+ "backbone.qwen_model.model.visual.blocks.5.norm1.bias": "model-00001-of-00003.safetensors",
355
+ "backbone.qwen_model.model.visual.blocks.5.norm1.weight": "model-00001-of-00003.safetensors",
356
+ "backbone.qwen_model.model.visual.blocks.5.norm2.bias": "model-00001-of-00003.safetensors",
357
+ "backbone.qwen_model.model.visual.blocks.5.norm2.weight": "model-00001-of-00003.safetensors",
358
+ "backbone.qwen_model.model.visual.blocks.6.attn.proj.bias": "model-00001-of-00003.safetensors",
359
+ "backbone.qwen_model.model.visual.blocks.6.attn.proj.weight": "model-00001-of-00003.safetensors",
360
+ "backbone.qwen_model.model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00003.safetensors",
361
+ "backbone.qwen_model.model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00003.safetensors",
362
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
363
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
364
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
365
+ "backbone.qwen_model.model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
366
+ "backbone.qwen_model.model.visual.blocks.6.norm1.bias": "model-00001-of-00003.safetensors",
367
+ "backbone.qwen_model.model.visual.blocks.6.norm1.weight": "model-00001-of-00003.safetensors",
368
+ "backbone.qwen_model.model.visual.blocks.6.norm2.bias": "model-00001-of-00003.safetensors",
369
+ "backbone.qwen_model.model.visual.blocks.6.norm2.weight": "model-00001-of-00003.safetensors",
370
+ "backbone.qwen_model.model.visual.blocks.7.attn.proj.bias": "model-00001-of-00003.safetensors",
371
+ "backbone.qwen_model.model.visual.blocks.7.attn.proj.weight": "model-00001-of-00003.safetensors",
372
+ "backbone.qwen_model.model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00003.safetensors",
373
+ "backbone.qwen_model.model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00003.safetensors",
374
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
375
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
376
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
377
+ "backbone.qwen_model.model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
378
+ "backbone.qwen_model.model.visual.blocks.7.norm1.bias": "model-00001-of-00003.safetensors",
379
+ "backbone.qwen_model.model.visual.blocks.7.norm1.weight": "model-00001-of-00003.safetensors",
380
+ "backbone.qwen_model.model.visual.blocks.7.norm2.bias": "model-00001-of-00003.safetensors",
381
+ "backbone.qwen_model.model.visual.blocks.7.norm2.weight": "model-00001-of-00003.safetensors",
382
+ "backbone.qwen_model.model.visual.blocks.8.attn.proj.bias": "model-00001-of-00003.safetensors",
383
+ "backbone.qwen_model.model.visual.blocks.8.attn.proj.weight": "model-00001-of-00003.safetensors",
384
+ "backbone.qwen_model.model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00003.safetensors",
385
+ "backbone.qwen_model.model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00003.safetensors",
386
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
387
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
388
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
389
+ "backbone.qwen_model.model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
390
+ "backbone.qwen_model.model.visual.blocks.8.norm1.bias": "model-00001-of-00003.safetensors",
391
+ "backbone.qwen_model.model.visual.blocks.8.norm1.weight": "model-00001-of-00003.safetensors",
392
+ "backbone.qwen_model.model.visual.blocks.8.norm2.bias": "model-00001-of-00003.safetensors",
393
+ "backbone.qwen_model.model.visual.blocks.8.norm2.weight": "model-00001-of-00003.safetensors",
394
+ "backbone.qwen_model.model.visual.blocks.9.attn.proj.bias": "model-00001-of-00003.safetensors",
395
+ "backbone.qwen_model.model.visual.blocks.9.attn.proj.weight": "model-00001-of-00003.safetensors",
396
+ "backbone.qwen_model.model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00003.safetensors",
397
+ "backbone.qwen_model.model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00003.safetensors",
398
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00003.safetensors",
399
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00003.safetensors",
400
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00003.safetensors",
401
+ "backbone.qwen_model.model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00003.safetensors",
402
+ "backbone.qwen_model.model.visual.blocks.9.norm1.bias": "model-00001-of-00003.safetensors",
403
+ "backbone.qwen_model.model.visual.blocks.9.norm1.weight": "model-00001-of-00003.safetensors",
404
+ "backbone.qwen_model.model.visual.blocks.9.norm2.bias": "model-00001-of-00003.safetensors",
405
+ "backbone.qwen_model.model.visual.blocks.9.norm2.weight": "model-00001-of-00003.safetensors",
406
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00003.safetensors",
407
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00003.safetensors",
408
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00003.safetensors",
409
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00003.safetensors",
410
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00003.safetensors",
411
+ "backbone.qwen_model.model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00003.safetensors",
412
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00003.safetensors",
413
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00003.safetensors",
414
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00003.safetensors",
415
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00003.safetensors",
416
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00003.safetensors",
417
+ "backbone.qwen_model.model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00003.safetensors",
418
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00003.safetensors",
419
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00003.safetensors",
420
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00003.safetensors",
421
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00003.safetensors",
422
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00003.safetensors",
423
+ "backbone.qwen_model.model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00003.safetensors",
424
+ "backbone.qwen_model.model.visual.merger.linear_fc1.bias": "model-00001-of-00003.safetensors",
425
+ "backbone.qwen_model.model.visual.merger.linear_fc1.weight": "model-00001-of-00003.safetensors",
426
+ "backbone.qwen_model.model.visual.merger.linear_fc2.bias": "model-00001-of-00003.safetensors",
427
+ "backbone.qwen_model.model.visual.merger.linear_fc2.weight": "model-00001-of-00003.safetensors",
428
+ "backbone.qwen_model.model.visual.merger.norm.bias": "model-00001-of-00003.safetensors",
429
+ "backbone.qwen_model.model.visual.merger.norm.weight": "model-00001-of-00003.safetensors",
430
+ "backbone.qwen_model.model.visual.patch_embed.proj.bias": "model-00001-of-00003.safetensors",
431
+ "backbone.qwen_model.model.visual.patch_embed.proj.weight": "model-00001-of-00003.safetensors",
432
+ "backbone.qwen_model.model.visual.pos_embed.weight": "model-00001-of-00003.safetensors",
433
+ "backbone.qwen_model.model.language_model.layers.10.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
434
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
435
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
436
+ "backbone.qwen_model.model.language_model.layers.10.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
437
+ "backbone.qwen_model.model.language_model.layers.10.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
438
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
439
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
440
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
441
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
442
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
443
+ "backbone.qwen_model.model.language_model.layers.10.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
444
+ "backbone.qwen_model.model.language_model.layers.11.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
445
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
446
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
447
+ "backbone.qwen_model.model.language_model.layers.11.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
448
+ "backbone.qwen_model.model.language_model.layers.11.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
449
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
450
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
451
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
452
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
453
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
454
+ "backbone.qwen_model.model.language_model.layers.11.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
455
+ "backbone.qwen_model.model.language_model.layers.12.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
456
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
457
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
458
+ "backbone.qwen_model.model.language_model.layers.12.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
459
+ "backbone.qwen_model.model.language_model.layers.12.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
460
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
461
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
462
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
463
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
464
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
465
+ "backbone.qwen_model.model.language_model.layers.12.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
466
+ "backbone.qwen_model.model.language_model.layers.13.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
467
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
468
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
469
+ "backbone.qwen_model.model.language_model.layers.13.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
470
+ "backbone.qwen_model.model.language_model.layers.13.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
471
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
472
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
473
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
474
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
475
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
476
+ "backbone.qwen_model.model.language_model.layers.13.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
477
+ "backbone.qwen_model.model.language_model.layers.14.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
478
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
479
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
480
+ "backbone.qwen_model.model.language_model.layers.14.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
481
+ "backbone.qwen_model.model.language_model.layers.14.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
482
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
483
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
484
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
485
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
486
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
487
+ "backbone.qwen_model.model.language_model.layers.14.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
488
+ "backbone.qwen_model.model.language_model.layers.15.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
489
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
490
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
491
+ "backbone.qwen_model.model.language_model.layers.15.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
492
+ "backbone.qwen_model.model.language_model.layers.15.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
493
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
494
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
495
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
496
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
497
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
498
+ "backbone.qwen_model.model.language_model.layers.15.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
499
+ "backbone.qwen_model.model.language_model.layers.16.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
500
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
501
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
502
+ "backbone.qwen_model.model.language_model.layers.16.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
503
+ "backbone.qwen_model.model.language_model.layers.16.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
504
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
505
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
506
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
507
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
508
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
509
+ "backbone.qwen_model.model.language_model.layers.16.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
510
+ "backbone.qwen_model.model.language_model.layers.17.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
511
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
512
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
513
+ "backbone.qwen_model.model.language_model.layers.17.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
514
+ "backbone.qwen_model.model.language_model.layers.17.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
515
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
516
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
517
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
518
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
519
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
520
+ "backbone.qwen_model.model.language_model.layers.17.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
521
+ "backbone.qwen_model.model.language_model.layers.6.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
522
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
523
+ "backbone.qwen_model.model.language_model.layers.6.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
524
+ "backbone.qwen_model.model.language_model.layers.6.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
525
+ "backbone.qwen_model.model.language_model.layers.7.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
526
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
527
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
528
+ "backbone.qwen_model.model.language_model.layers.7.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
529
+ "backbone.qwen_model.model.language_model.layers.7.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
530
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
531
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
532
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
533
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
534
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
535
+ "backbone.qwen_model.model.language_model.layers.7.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
536
+ "backbone.qwen_model.model.language_model.layers.8.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
537
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
538
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
539
+ "backbone.qwen_model.model.language_model.layers.8.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
540
+ "backbone.qwen_model.model.language_model.layers.8.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
541
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
542
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
543
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
544
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
545
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
546
+ "backbone.qwen_model.model.language_model.layers.8.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
547
+ "backbone.qwen_model.model.language_model.layers.9.layer.input_layernorm.weight": "model-00002-of-00003.safetensors",
548
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
549
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
550
+ "backbone.qwen_model.model.language_model.layers.9.layer.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
551
+ "backbone.qwen_model.model.language_model.layers.9.layer.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
552
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
553
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
554
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
555
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
556
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
557
+ "backbone.qwen_model.model.language_model.layers.9.layer.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
558
+ "backbone.qwen_model.model.language_model.norm.weight": "model-00002-of-00003.safetensors",
559
+ "action_model.action_decoder.layer1.W": "model-00003-of-00003.safetensors",
560
+ "action_model.action_decoder.layer1.b": "model-00003-of-00003.safetensors",
561
+ "action_model.action_decoder.layer2.W": "model-00003-of-00003.safetensors",
562
+ "action_model.action_decoder.layer2.b": "model-00003-of-00003.safetensors",
563
+ "action_model.action_encoder.W1.W": "model-00003-of-00003.safetensors",
564
+ "action_model.action_encoder.W1.b": "model-00003-of-00003.safetensors",
565
+ "action_model.action_encoder.W2.W": "model-00003-of-00003.safetensors",
566
+ "action_model.action_encoder.W2.b": "model-00003-of-00003.safetensors",
567
+ "action_model.action_encoder.W3.W": "model-00003-of-00003.safetensors",
568
+ "action_model.action_encoder.W3.b": "model-00003-of-00003.safetensors",
569
+ "action_model.model.double_blocks.0.k_norm_sa.weight": "model-00003-of-00003.safetensors",
570
+ "action_model.model.double_blocks.0.k_norm_vl.weight": "model-00003-of-00003.safetensors",
571
+ "action_model.model.double_blocks.0.q_norm_sa.weight": "model-00003-of-00003.safetensors",
572
+ "action_model.model.double_blocks.0.q_norm_vl.weight": "model-00003-of-00003.safetensors",
573
+ "action_model.model.double_blocks.0.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
574
+ "action_model.model.double_blocks.0.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
575
+ "action_model.model.double_blocks.0.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
576
+ "action_model.model.double_blocks.0.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
577
+ "action_model.model.double_blocks.0.sa_proj.bias": "model-00003-of-00003.safetensors",
578
+ "action_model.model.double_blocks.0.sa_proj.weight": "model-00003-of-00003.safetensors",
579
+ "action_model.model.double_blocks.0.sa_qkv.bias": "model-00003-of-00003.safetensors",
580
+ "action_model.model.double_blocks.0.sa_qkv.weight": "model-00003-of-00003.safetensors",
581
+ "action_model.model.double_blocks.0.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
582
+ "action_model.model.double_blocks.0.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
583
+ "action_model.model.double_blocks.0.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
584
+ "action_model.model.double_blocks.0.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
585
+ "action_model.model.double_blocks.0.vl_proj.bias": "model-00003-of-00003.safetensors",
586
+ "action_model.model.double_blocks.0.vl_proj.weight": "model-00003-of-00003.safetensors",
587
+ "action_model.model.double_blocks.0.vl_qkv.bias": "model-00003-of-00003.safetensors",
588
+ "action_model.model.double_blocks.0.vl_qkv.weight": "model-00003-of-00003.safetensors",
589
+ "action_model.model.double_blocks.1.k_norm_sa.weight": "model-00003-of-00003.safetensors",
590
+ "action_model.model.double_blocks.1.k_norm_vl.weight": "model-00003-of-00003.safetensors",
591
+ "action_model.model.double_blocks.1.q_norm_sa.weight": "model-00003-of-00003.safetensors",
592
+ "action_model.model.double_blocks.1.q_norm_vl.weight": "model-00003-of-00003.safetensors",
593
+ "action_model.model.double_blocks.1.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
594
+ "action_model.model.double_blocks.1.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
595
+ "action_model.model.double_blocks.1.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
596
+ "action_model.model.double_blocks.1.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
597
+ "action_model.model.double_blocks.1.sa_proj.bias": "model-00003-of-00003.safetensors",
598
+ "action_model.model.double_blocks.1.sa_proj.weight": "model-00003-of-00003.safetensors",
599
+ "action_model.model.double_blocks.1.sa_qkv.bias": "model-00003-of-00003.safetensors",
600
+ "action_model.model.double_blocks.1.sa_qkv.weight": "model-00003-of-00003.safetensors",
601
+ "action_model.model.double_blocks.1.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
602
+ "action_model.model.double_blocks.1.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
603
+ "action_model.model.double_blocks.1.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
604
+ "action_model.model.double_blocks.1.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
605
+ "action_model.model.double_blocks.1.vl_proj.bias": "model-00003-of-00003.safetensors",
606
+ "action_model.model.double_blocks.1.vl_proj.weight": "model-00003-of-00003.safetensors",
607
+ "action_model.model.double_blocks.1.vl_qkv.bias": "model-00003-of-00003.safetensors",
608
+ "action_model.model.double_blocks.1.vl_qkv.weight": "model-00003-of-00003.safetensors",
609
+ "action_model.model.double_blocks.2.k_norm_sa.weight": "model-00003-of-00003.safetensors",
610
+ "action_model.model.double_blocks.2.k_norm_vl.weight": "model-00003-of-00003.safetensors",
611
+ "action_model.model.double_blocks.2.q_norm_sa.weight": "model-00003-of-00003.safetensors",
612
+ "action_model.model.double_blocks.2.q_norm_vl.weight": "model-00003-of-00003.safetensors",
613
+ "action_model.model.double_blocks.2.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
614
+ "action_model.model.double_blocks.2.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
615
+ "action_model.model.double_blocks.2.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
616
+ "action_model.model.double_blocks.2.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
617
+ "action_model.model.double_blocks.2.sa_proj.bias": "model-00003-of-00003.safetensors",
618
+ "action_model.model.double_blocks.2.sa_proj.weight": "model-00003-of-00003.safetensors",
619
+ "action_model.model.double_blocks.2.sa_qkv.bias": "model-00003-of-00003.safetensors",
620
+ "action_model.model.double_blocks.2.sa_qkv.weight": "model-00003-of-00003.safetensors",
621
+ "action_model.model.double_blocks.2.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
622
+ "action_model.model.double_blocks.2.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
623
+ "action_model.model.double_blocks.2.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
624
+ "action_model.model.double_blocks.2.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
625
+ "action_model.model.double_blocks.2.vl_proj.bias": "model-00003-of-00003.safetensors",
626
+ "action_model.model.double_blocks.2.vl_proj.weight": "model-00003-of-00003.safetensors",
627
+ "action_model.model.double_blocks.2.vl_qkv.bias": "model-00003-of-00003.safetensors",
628
+ "action_model.model.double_blocks.2.vl_qkv.weight": "model-00003-of-00003.safetensors",
629
+ "action_model.model.double_blocks.3.k_norm_sa.weight": "model-00003-of-00003.safetensors",
630
+ "action_model.model.double_blocks.3.k_norm_vl.weight": "model-00003-of-00003.safetensors",
631
+ "action_model.model.double_blocks.3.q_norm_sa.weight": "model-00003-of-00003.safetensors",
632
+ "action_model.model.double_blocks.3.q_norm_vl.weight": "model-00003-of-00003.safetensors",
633
+ "action_model.model.double_blocks.3.sa_mlp.w12.bias": "model-00003-of-00003.safetensors",
634
+ "action_model.model.double_blocks.3.sa_mlp.w12.weight": "model-00003-of-00003.safetensors",
635
+ "action_model.model.double_blocks.3.sa_mlp.w3.bias": "model-00003-of-00003.safetensors",
636
+ "action_model.model.double_blocks.3.sa_mlp.w3.weight": "model-00003-of-00003.safetensors",
637
+ "action_model.model.double_blocks.3.sa_proj.bias": "model-00003-of-00003.safetensors",
638
+ "action_model.model.double_blocks.3.sa_proj.weight": "model-00003-of-00003.safetensors",
639
+ "action_model.model.double_blocks.3.sa_qkv.bias": "model-00003-of-00003.safetensors",
640
+ "action_model.model.double_blocks.3.sa_qkv.weight": "model-00003-of-00003.safetensors",
641
+ "action_model.model.double_blocks.3.vl_mlp.w12.bias": "model-00003-of-00003.safetensors",
642
+ "action_model.model.double_blocks.3.vl_mlp.w12.weight": "model-00003-of-00003.safetensors",
643
+ "action_model.model.double_blocks.3.vl_mlp.w3.bias": "model-00003-of-00003.safetensors",
644
+ "action_model.model.double_blocks.3.vl_mlp.w3.weight": "model-00003-of-00003.safetensors",
645
+ "action_model.model.double_blocks.3.vl_proj.bias": "model-00003-of-00003.safetensors",
646
+ "action_model.model.double_blocks.3.vl_proj.weight": "model-00003-of-00003.safetensors",
647
+ "action_model.model.double_blocks.3.vl_qkv.bias": "model-00003-of-00003.safetensors",
648
+ "action_model.model.double_blocks.3.vl_qkv.weight": "model-00003-of-00003.safetensors",
649
+ "action_model.model.proj_out_1.bias": "model-00003-of-00003.safetensors",
650
+ "action_model.model.proj_out_1.weight": "model-00003-of-00003.safetensors",
651
+ "action_model.model.proj_out_2.bias": "model-00003-of-00003.safetensors",
652
+ "action_model.model.proj_out_2.weight": "model-00003-of-00003.safetensors",
653
+ "action_model.model.single_blocks.0.k_norm.weight": "model-00003-of-00003.safetensors",
654
+ "action_model.model.single_blocks.0.linear1.bias": "model-00003-of-00003.safetensors",
655
+ "action_model.model.single_blocks.0.linear1.weight": "model-00003-of-00003.safetensors",
656
+ "action_model.model.single_blocks.0.linear2.bias": "model-00003-of-00003.safetensors",
657
+ "action_model.model.single_blocks.0.linear2.weight": "model-00003-of-00003.safetensors",
658
+ "action_model.model.single_blocks.0.mlp_proj.bias": "model-00003-of-00003.safetensors",
659
+ "action_model.model.single_blocks.0.mlp_proj.weight": "model-00003-of-00003.safetensors",
660
+ "action_model.model.single_blocks.0.q_norm.weight": "model-00003-of-00003.safetensors",
661
+ "action_model.model.single_blocks.1.k_norm.weight": "model-00003-of-00003.safetensors",
662
+ "action_model.model.single_blocks.1.linear1.bias": "model-00003-of-00003.safetensors",
663
+ "action_model.model.single_blocks.1.linear1.weight": "model-00003-of-00003.safetensors",
664
+ "action_model.model.single_blocks.1.linear2.bias": "model-00003-of-00003.safetensors",
665
+ "action_model.model.single_blocks.1.linear2.weight": "model-00003-of-00003.safetensors",
666
+ "action_model.model.single_blocks.1.mlp_proj.bias": "model-00003-of-00003.safetensors",
667
+ "action_model.model.single_blocks.1.mlp_proj.weight": "model-00003-of-00003.safetensors",
668
+ "action_model.model.single_blocks.1.q_norm.weight": "model-00003-of-00003.safetensors",
669
+ "action_model.model.single_blocks.2.k_norm.weight": "model-00003-of-00003.safetensors",
670
+ "action_model.model.single_blocks.2.linear1.bias": "model-00003-of-00003.safetensors",
671
+ "action_model.model.single_blocks.2.linear1.weight": "model-00003-of-00003.safetensors",
672
+ "action_model.model.single_blocks.2.linear2.bias": "model-00003-of-00003.safetensors",
673
+ "action_model.model.single_blocks.2.linear2.weight": "model-00003-of-00003.safetensors",
674
+ "action_model.model.single_blocks.2.mlp_proj.bias": "model-00003-of-00003.safetensors",
675
+ "action_model.model.single_blocks.2.mlp_proj.weight": "model-00003-of-00003.safetensors",
676
+ "action_model.model.single_blocks.2.q_norm.weight": "model-00003-of-00003.safetensors",
677
+ "action_model.model.single_blocks.3.k_norm.weight": "model-00003-of-00003.safetensors",
678
+ "action_model.model.single_blocks.3.linear1.bias": "model-00003-of-00003.safetensors",
679
+ "action_model.model.single_blocks.3.linear1.weight": "model-00003-of-00003.safetensors",
680
+ "action_model.model.single_blocks.3.linear2.bias": "model-00003-of-00003.safetensors",
681
+ "action_model.model.single_blocks.3.linear2.weight": "model-00003-of-00003.safetensors",
682
+ "action_model.model.single_blocks.3.mlp_proj.bias": "model-00003-of-00003.safetensors",
683
+ "action_model.model.single_blocks.3.mlp_proj.weight": "model-00003-of-00003.safetensors",
684
+ "action_model.model.single_blocks.3.q_norm.weight": "model-00003-of-00003.safetensors",
685
+ "action_model.model.single_blocks.4.k_norm.weight": "model-00003-of-00003.safetensors",
686
+ "action_model.model.single_blocks.4.linear1.bias": "model-00003-of-00003.safetensors",
687
+ "action_model.model.single_blocks.4.linear1.weight": "model-00003-of-00003.safetensors",
688
+ "action_model.model.single_blocks.4.linear2.bias": "model-00003-of-00003.safetensors",
689
+ "action_model.model.single_blocks.4.linear2.weight": "model-00003-of-00003.safetensors",
690
+ "action_model.model.single_blocks.4.mlp_proj.bias": "model-00003-of-00003.safetensors",
691
+ "action_model.model.single_blocks.4.mlp_proj.weight": "model-00003-of-00003.safetensors",
692
+ "action_model.model.single_blocks.4.q_norm.weight": "model-00003-of-00003.safetensors",
693
+ "action_model.model.single_blocks.5.k_norm.weight": "model-00003-of-00003.safetensors",
694
+ "action_model.model.single_blocks.5.linear1.bias": "model-00003-of-00003.safetensors",
695
+ "action_model.model.single_blocks.5.linear1.weight": "model-00003-of-00003.safetensors",
696
+ "action_model.model.single_blocks.5.linear2.bias": "model-00003-of-00003.safetensors",
697
+ "action_model.model.single_blocks.5.linear2.weight": "model-00003-of-00003.safetensors",
698
+ "action_model.model.single_blocks.5.mlp_proj.bias": "model-00003-of-00003.safetensors",
699
+ "action_model.model.single_blocks.5.mlp_proj.weight": "model-00003-of-00003.safetensors",
700
+ "action_model.model.single_blocks.5.q_norm.weight": "model-00003-of-00003.safetensors",
701
+ "action_model.model.single_blocks.6.k_norm.weight": "model-00003-of-00003.safetensors",
702
+ "action_model.model.single_blocks.6.linear1.bias": "model-00003-of-00003.safetensors",
703
+ "action_model.model.single_blocks.6.linear1.weight": "model-00003-of-00003.safetensors",
704
+ "action_model.model.single_blocks.6.linear2.bias": "model-00003-of-00003.safetensors",
705
+ "action_model.model.single_blocks.6.linear2.weight": "model-00003-of-00003.safetensors",
706
+ "action_model.model.single_blocks.6.mlp_proj.bias": "model-00003-of-00003.safetensors",
707
+ "action_model.model.single_blocks.6.mlp_proj.weight": "model-00003-of-00003.safetensors",
708
+ "action_model.model.single_blocks.6.q_norm.weight": "model-00003-of-00003.safetensors",
709
+ "action_model.model.single_blocks.7.k_norm.weight": "model-00003-of-00003.safetensors",
710
+ "action_model.model.single_blocks.7.linear1.bias": "model-00003-of-00003.safetensors",
711
+ "action_model.model.single_blocks.7.linear1.weight": "model-00003-of-00003.safetensors",
712
+ "action_model.model.single_blocks.7.linear2.bias": "model-00003-of-00003.safetensors",
713
+ "action_model.model.single_blocks.7.linear2.weight": "model-00003-of-00003.safetensors",
714
+ "action_model.model.single_blocks.7.mlp_proj.bias": "model-00003-of-00003.safetensors",
715
+ "action_model.model.single_blocks.7.mlp_proj.weight": "model-00003-of-00003.safetensors",
716
+ "action_model.model.single_blocks.7.q_norm.weight": "model-00003-of-00003.safetensors",
717
+ "action_model.model.timestep_encoder.timestep_embedder.linear_1.bias": "model-00003-of-00003.safetensors",
718
+ "action_model.model.timestep_encoder.timestep_embedder.linear_1.weight": "model-00003-of-00003.safetensors",
719
+ "action_model.model.timestep_encoder.timestep_embedder.linear_2.bias": "model-00003-of-00003.safetensors",
720
+ "action_model.model.timestep_encoder.timestep_embedder.linear_2.weight": "model-00003-of-00003.safetensors",
721
+ "action_model.model.vl_proj_to_sa.bias": "model-00003-of-00003.safetensors",
722
+ "action_model.model.vl_proj_to_sa.weight": "model-00003-of-00003.safetensors",
723
+ "action_model.position_embedding.weight": "model-00003-of-00003.safetensors",
724
+ "action_model.state_encoder.layer1.W": "model-00003-of-00003.safetensors",
725
+ "action_model.state_encoder.layer1.b": "model-00003-of-00003.safetensors",
726
+ "action_model.state_encoder.layer2.W": "model-00003-of-00003.safetensors",
727
+ "action_model.state_encoder.layer2.b": "model-00003-of-00003.safetensors",
728
+ "backbone.qwen_model.lm_head.weight": "model-00003-of-00003.safetensors"
729
+ }
730
+ }
processor_config.json ADDED
@@ -0,0 +1,2975 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "RLDXProcessor",
3
+ "processor_kwargs": {
4
+ "modality_configs": {
5
+ "bridge_orig": {
6
+ "video": {
7
+ "delta_indices": [
8
+ -6,
9
+ -4,
10
+ -2,
11
+ 0
12
+ ],
13
+ "modality_keys": [
14
+ "image_0"
15
+ ],
16
+ "sin_cos_embedding_keys": null,
17
+ "mean_std_embedding_keys": null,
18
+ "action_configs": null
19
+ },
20
+ "state": {
21
+ "delta_indices": [
22
+ 0
23
+ ],
24
+ "modality_keys": [
25
+ "end_effector_position",
26
+ "end_effector_rotation",
27
+ "gripper_position"
28
+ ],
29
+ "sin_cos_embedding_keys": null,
30
+ "mean_std_embedding_keys": null,
31
+ "action_configs": null
32
+ },
33
+ "action": {
34
+ "delta_indices": [
35
+ 0,
36
+ 1,
37
+ 2,
38
+ 3,
39
+ 4,
40
+ 5,
41
+ 6,
42
+ 7,
43
+ 8,
44
+ 9,
45
+ 10,
46
+ 11,
47
+ 12,
48
+ 13,
49
+ 14,
50
+ 15
51
+ ],
52
+ "modality_keys": [
53
+ "end_effector_position",
54
+ "end_effector_rotation",
55
+ "gripper_close"
56
+ ],
57
+ "sin_cos_embedding_keys": null,
58
+ "mean_std_embedding_keys": null,
59
+ "action_configs": [
60
+ {
61
+ "rep": "DELTA",
62
+ "type": "EEF",
63
+ "format": "DEFAULT",
64
+ "state_key": null
65
+ },
66
+ {
67
+ "rep": "DELTA",
68
+ "type": "EEF",
69
+ "format": "DEFAULT",
70
+ "state_key": null
71
+ },
72
+ {
73
+ "rep": "ABSOLUTE",
74
+ "type": "NON_EEF",
75
+ "format": "DEFAULT",
76
+ "state_key": null
77
+ }
78
+ ]
79
+ },
80
+ "language": {
81
+ "delta_indices": [
82
+ 0
83
+ ],
84
+ "modality_keys": [
85
+ "annotation.human.action.task_description"
86
+ ],
87
+ "sin_cos_embedding_keys": null,
88
+ "mean_std_embedding_keys": null,
89
+ "action_configs": null
90
+ }
91
+ },
92
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
93
+ "video": {
94
+ "delta_indices": [
95
+ -6,
96
+ -4,
97
+ -2,
98
+ 0
99
+ ],
100
+ "modality_keys": [
101
+ "primary",
102
+ "wrist"
103
+ ],
104
+ "sin_cos_embedding_keys": null,
105
+ "mean_std_embedding_keys": null,
106
+ "action_configs": null
107
+ },
108
+ "state": {
109
+ "delta_indices": [
110
+ 0
111
+ ],
112
+ "modality_keys": [
113
+ "end_effector_position",
114
+ "end_effector_rotation",
115
+ "gripper_position"
116
+ ],
117
+ "sin_cos_embedding_keys": null,
118
+ "mean_std_embedding_keys": null,
119
+ "action_configs": null
120
+ },
121
+ "action": {
122
+ "delta_indices": [
123
+ 0,
124
+ 1,
125
+ 2,
126
+ 3,
127
+ 4,
128
+ 5,
129
+ 6,
130
+ 7,
131
+ 8,
132
+ 9,
133
+ 10,
134
+ 11,
135
+ 12,
136
+ 13,
137
+ 14,
138
+ 15
139
+ ],
140
+ "modality_keys": [
141
+ "end_effector_position",
142
+ "end_effector_rotation",
143
+ "gripper_close"
144
+ ],
145
+ "sin_cos_embedding_keys": null,
146
+ "mean_std_embedding_keys": null,
147
+ "action_configs": [
148
+ {
149
+ "rep": "DELTA",
150
+ "type": "EEF",
151
+ "format": "DEFAULT",
152
+ "state_key": null
153
+ },
154
+ {
155
+ "rep": "DELTA",
156
+ "type": "EEF",
157
+ "format": "DEFAULT",
158
+ "state_key": null
159
+ },
160
+ {
161
+ "rep": "ABSOLUTE",
162
+ "type": "NON_EEF",
163
+ "format": "DEFAULT",
164
+ "state_key": null
165
+ }
166
+ ]
167
+ },
168
+ "language": {
169
+ "delta_indices": [
170
+ 0
171
+ ],
172
+ "modality_keys": [
173
+ "annotation.human.action.task_description"
174
+ ],
175
+ "sin_cos_embedding_keys": null,
176
+ "mean_std_embedding_keys": null,
177
+ "action_configs": null
178
+ }
179
+ },
180
+ "humanoid_everyday_g1": {
181
+ "video": {
182
+ "delta_indices": [
183
+ -6,
184
+ -4,
185
+ -2,
186
+ 0
187
+ ],
188
+ "modality_keys": [
189
+ "egocentric_resized"
190
+ ],
191
+ "sin_cos_embedding_keys": null,
192
+ "mean_std_embedding_keys": null,
193
+ "action_configs": null
194
+ },
195
+ "state": {
196
+ "delta_indices": [
197
+ 0
198
+ ],
199
+ "modality_keys": [
200
+ "left_arm",
201
+ "left_hand",
202
+ "right_arm",
203
+ "right_hand"
204
+ ],
205
+ "sin_cos_embedding_keys": null,
206
+ "mean_std_embedding_keys": null,
207
+ "action_configs": null
208
+ },
209
+ "action": {
210
+ "delta_indices": [
211
+ 0,
212
+ 1,
213
+ 2,
214
+ 3,
215
+ 4,
216
+ 5,
217
+ 6,
218
+ 7,
219
+ 8,
220
+ 9,
221
+ 10,
222
+ 11,
223
+ 12,
224
+ 13,
225
+ 14,
226
+ 15
227
+ ],
228
+ "modality_keys": [
229
+ "left_arm",
230
+ "left_hand",
231
+ "right_arm",
232
+ "right_hand"
233
+ ],
234
+ "sin_cos_embedding_keys": null,
235
+ "mean_std_embedding_keys": null,
236
+ "action_configs": [
237
+ {
238
+ "rep": "ABSOLUTE",
239
+ "type": "NON_EEF",
240
+ "format": "DEFAULT",
241
+ "state_key": null
242
+ },
243
+ {
244
+ "rep": "ABSOLUTE",
245
+ "type": "NON_EEF",
246
+ "format": "DEFAULT",
247
+ "state_key": null
248
+ },
249
+ {
250
+ "rep": "ABSOLUTE",
251
+ "type": "NON_EEF",
252
+ "format": "DEFAULT",
253
+ "state_key": null
254
+ },
255
+ {
256
+ "rep": "ABSOLUTE",
257
+ "type": "NON_EEF",
258
+ "format": "DEFAULT",
259
+ "state_key": null
260
+ }
261
+ ]
262
+ },
263
+ "language": {
264
+ "delta_indices": [
265
+ 0
266
+ ],
267
+ "modality_keys": [
268
+ "annotation.human.action.task_description"
269
+ ],
270
+ "sin_cos_embedding_keys": null,
271
+ "mean_std_embedding_keys": null,
272
+ "action_configs": null
273
+ }
274
+ },
275
+ "dlr_edan_shared_control_converted_externally_to_rlds": {
276
+ "video": {
277
+ "delta_indices": [
278
+ -6,
279
+ -4,
280
+ -2,
281
+ 0
282
+ ],
283
+ "modality_keys": [
284
+ "primary"
285
+ ],
286
+ "sin_cos_embedding_keys": null,
287
+ "mean_std_embedding_keys": null,
288
+ "action_configs": null
289
+ },
290
+ "state": {
291
+ "delta_indices": [
292
+ 0
293
+ ],
294
+ "modality_keys": [
295
+ "end_effector_position",
296
+ "end_effector_rotation",
297
+ "gripper_position"
298
+ ],
299
+ "sin_cos_embedding_keys": null,
300
+ "mean_std_embedding_keys": null,
301
+ "action_configs": null
302
+ },
303
+ "action": {
304
+ "delta_indices": [
305
+ 0,
306
+ 1,
307
+ 2,
308
+ 3,
309
+ 4,
310
+ 5,
311
+ 6,
312
+ 7,
313
+ 8,
314
+ 9,
315
+ 10,
316
+ 11,
317
+ 12,
318
+ 13,
319
+ 14,
320
+ 15
321
+ ],
322
+ "modality_keys": [
323
+ "end_effector_position",
324
+ "end_effector_rotation",
325
+ "gripper_close"
326
+ ],
327
+ "sin_cos_embedding_keys": null,
328
+ "mean_std_embedding_keys": null,
329
+ "action_configs": [
330
+ {
331
+ "rep": "DELTA",
332
+ "type": "EEF",
333
+ "format": "DEFAULT",
334
+ "state_key": null
335
+ },
336
+ {
337
+ "rep": "DELTA",
338
+ "type": "EEF",
339
+ "format": "DEFAULT",
340
+ "state_key": null
341
+ },
342
+ {
343
+ "rep": "ABSOLUTE",
344
+ "type": "NON_EEF",
345
+ "format": "DEFAULT",
346
+ "state_key": null
347
+ }
348
+ ]
349
+ },
350
+ "language": {
351
+ "delta_indices": [
352
+ 0
353
+ ],
354
+ "modality_keys": [
355
+ "annotation.human.action.task_description"
356
+ ],
357
+ "sin_cos_embedding_keys": null,
358
+ "mean_std_embedding_keys": null,
359
+ "action_configs": null
360
+ }
361
+ },
362
+ "austin_sailor_dataset_converted_externally_to_rlds": {
363
+ "video": {
364
+ "delta_indices": [
365
+ -6,
366
+ -4,
367
+ -2,
368
+ 0
369
+ ],
370
+ "modality_keys": [
371
+ "primary",
372
+ "wrist"
373
+ ],
374
+ "sin_cos_embedding_keys": null,
375
+ "mean_std_embedding_keys": null,
376
+ "action_configs": null
377
+ },
378
+ "state": {
379
+ "delta_indices": [
380
+ 0
381
+ ],
382
+ "modality_keys": [
383
+ "end_effector_position",
384
+ "end_effector_rotation",
385
+ "gripper_position"
386
+ ],
387
+ "sin_cos_embedding_keys": null,
388
+ "mean_std_embedding_keys": null,
389
+ "action_configs": null
390
+ },
391
+ "action": {
392
+ "delta_indices": [
393
+ 0,
394
+ 1,
395
+ 2,
396
+ 3,
397
+ 4,
398
+ 5,
399
+ 6,
400
+ 7,
401
+ 8,
402
+ 9,
403
+ 10,
404
+ 11,
405
+ 12,
406
+ 13,
407
+ 14,
408
+ 15
409
+ ],
410
+ "modality_keys": [
411
+ "end_effector_position",
412
+ "end_effector_rotation",
413
+ "gripper_close"
414
+ ],
415
+ "sin_cos_embedding_keys": null,
416
+ "mean_std_embedding_keys": null,
417
+ "action_configs": [
418
+ {
419
+ "rep": "DELTA",
420
+ "type": "EEF",
421
+ "format": "DEFAULT",
422
+ "state_key": null
423
+ },
424
+ {
425
+ "rep": "DELTA",
426
+ "type": "EEF",
427
+ "format": "DEFAULT",
428
+ "state_key": null
429
+ },
430
+ {
431
+ "rep": "ABSOLUTE",
432
+ "type": "NON_EEF",
433
+ "format": "DEFAULT",
434
+ "state_key": null
435
+ }
436
+ ]
437
+ },
438
+ "language": {
439
+ "delta_indices": [
440
+ 0
441
+ ],
442
+ "modality_keys": [
443
+ "annotation.human.action.task_description"
444
+ ],
445
+ "sin_cos_embedding_keys": null,
446
+ "mean_std_embedding_keys": null,
447
+ "action_configs": null
448
+ }
449
+ },
450
+ "berkeley_autolab_ur5": {
451
+ "video": {
452
+ "delta_indices": [
453
+ -6,
454
+ -4,
455
+ -2,
456
+ 0
457
+ ],
458
+ "modality_keys": [
459
+ "primary",
460
+ "wrist"
461
+ ],
462
+ "sin_cos_embedding_keys": null,
463
+ "mean_std_embedding_keys": null,
464
+ "action_configs": null
465
+ },
466
+ "state": {
467
+ "delta_indices": [
468
+ 0
469
+ ],
470
+ "modality_keys": [
471
+ "end_effector_position",
472
+ "end_effector_rotation",
473
+ "gripper_position"
474
+ ],
475
+ "sin_cos_embedding_keys": null,
476
+ "mean_std_embedding_keys": null,
477
+ "action_configs": null
478
+ },
479
+ "action": {
480
+ "delta_indices": [
481
+ 0,
482
+ 1,
483
+ 2,
484
+ 3,
485
+ 4,
486
+ 5,
487
+ 6,
488
+ 7,
489
+ 8,
490
+ 9,
491
+ 10,
492
+ 11,
493
+ 12,
494
+ 13,
495
+ 14,
496
+ 15
497
+ ],
498
+ "modality_keys": [
499
+ "end_effector_position",
500
+ "end_effector_rotation",
501
+ "gripper_close"
502
+ ],
503
+ "sin_cos_embedding_keys": null,
504
+ "mean_std_embedding_keys": null,
505
+ "action_configs": [
506
+ {
507
+ "rep": "DELTA",
508
+ "type": "EEF",
509
+ "format": "DEFAULT",
510
+ "state_key": null
511
+ },
512
+ {
513
+ "rep": "DELTA",
514
+ "type": "EEF",
515
+ "format": "DEFAULT",
516
+ "state_key": null
517
+ },
518
+ {
519
+ "rep": "ABSOLUTE",
520
+ "type": "NON_EEF",
521
+ "format": "DEFAULT",
522
+ "state_key": null
523
+ }
524
+ ]
525
+ },
526
+ "language": {
527
+ "delta_indices": [
528
+ 0
529
+ ],
530
+ "modality_keys": [
531
+ "annotation.human.action.task_description"
532
+ ],
533
+ "sin_cos_embedding_keys": null,
534
+ "mean_std_embedding_keys": null,
535
+ "action_configs": null
536
+ }
537
+ },
538
+ "fractal20220817_data": {
539
+ "video": {
540
+ "delta_indices": [
541
+ -6,
542
+ -4,
543
+ -2,
544
+ 0
545
+ ],
546
+ "modality_keys": [
547
+ "primary"
548
+ ],
549
+ "sin_cos_embedding_keys": null,
550
+ "mean_std_embedding_keys": null,
551
+ "action_configs": null
552
+ },
553
+ "state": {
554
+ "delta_indices": [
555
+ 0
556
+ ],
557
+ "modality_keys": [
558
+ "end_effector_position",
559
+ "end_effector_rotation",
560
+ "gripper_position"
561
+ ],
562
+ "sin_cos_embedding_keys": null,
563
+ "mean_std_embedding_keys": null,
564
+ "action_configs": null
565
+ },
566
+ "action": {
567
+ "delta_indices": [
568
+ 0,
569
+ 1,
570
+ 2,
571
+ 3,
572
+ 4,
573
+ 5,
574
+ 6,
575
+ 7,
576
+ 8,
577
+ 9,
578
+ 10,
579
+ 11,
580
+ 12,
581
+ 13,
582
+ 14,
583
+ 15
584
+ ],
585
+ "modality_keys": [
586
+ "end_effector_position",
587
+ "end_effector_rotation",
588
+ "gripper_close"
589
+ ],
590
+ "sin_cos_embedding_keys": null,
591
+ "mean_std_embedding_keys": null,
592
+ "action_configs": [
593
+ {
594
+ "rep": "DELTA",
595
+ "type": "EEF",
596
+ "format": "DEFAULT",
597
+ "state_key": null
598
+ },
599
+ {
600
+ "rep": "DELTA",
601
+ "type": "EEF",
602
+ "format": "DEFAULT",
603
+ "state_key": null
604
+ },
605
+ {
606
+ "rep": "ABSOLUTE",
607
+ "type": "NON_EEF",
608
+ "format": "DEFAULT",
609
+ "state_key": null
610
+ }
611
+ ]
612
+ },
613
+ "language": {
614
+ "delta_indices": [
615
+ 0
616
+ ],
617
+ "modality_keys": [
618
+ "annotation.human.action.task_description"
619
+ ],
620
+ "sin_cos_embedding_keys": null,
621
+ "mean_std_embedding_keys": null,
622
+ "action_configs": null
623
+ }
624
+ },
625
+ "cmu_stretch": {
626
+ "video": {
627
+ "delta_indices": [
628
+ -6,
629
+ -4,
630
+ -2,
631
+ 0
632
+ ],
633
+ "modality_keys": [
634
+ "primary"
635
+ ],
636
+ "sin_cos_embedding_keys": null,
637
+ "mean_std_embedding_keys": null,
638
+ "action_configs": null
639
+ },
640
+ "state": {
641
+ "delta_indices": [
642
+ 0
643
+ ],
644
+ "modality_keys": [
645
+ "end_effector_position",
646
+ "end_effector_rotation",
647
+ "gripper_position"
648
+ ],
649
+ "sin_cos_embedding_keys": null,
650
+ "mean_std_embedding_keys": null,
651
+ "action_configs": null
652
+ },
653
+ "action": {
654
+ "delta_indices": [
655
+ 0,
656
+ 1,
657
+ 2,
658
+ 3,
659
+ 4,
660
+ 5,
661
+ 6,
662
+ 7,
663
+ 8,
664
+ 9,
665
+ 10,
666
+ 11,
667
+ 12,
668
+ 13,
669
+ 14,
670
+ 15
671
+ ],
672
+ "modality_keys": [
673
+ "end_effector_position",
674
+ "end_effector_rotation",
675
+ "gripper_close"
676
+ ],
677
+ "sin_cos_embedding_keys": null,
678
+ "mean_std_embedding_keys": null,
679
+ "action_configs": [
680
+ {
681
+ "rep": "DELTA",
682
+ "type": "EEF",
683
+ "format": "DEFAULT",
684
+ "state_key": null
685
+ },
686
+ {
687
+ "rep": "DELTA",
688
+ "type": "EEF",
689
+ "format": "DEFAULT",
690
+ "state_key": null
691
+ },
692
+ {
693
+ "rep": "ABSOLUTE",
694
+ "type": "NON_EEF",
695
+ "format": "DEFAULT",
696
+ "state_key": null
697
+ }
698
+ ]
699
+ },
700
+ "language": {
701
+ "delta_indices": [
702
+ 0
703
+ ],
704
+ "modality_keys": [
705
+ "annotation.human.action.task_description"
706
+ ],
707
+ "sin_cos_embedding_keys": null,
708
+ "mean_std_embedding_keys": null,
709
+ "action_configs": null
710
+ }
711
+ },
712
+ "berkeley_cable_routing": {
713
+ "video": {
714
+ "delta_indices": [
715
+ -6,
716
+ -4,
717
+ -2,
718
+ 0
719
+ ],
720
+ "modality_keys": [
721
+ "primary",
722
+ "secondary",
723
+ "wrist"
724
+ ],
725
+ "sin_cos_embedding_keys": null,
726
+ "mean_std_embedding_keys": null,
727
+ "action_configs": null
728
+ },
729
+ "state": {
730
+ "delta_indices": [
731
+ 0
732
+ ],
733
+ "modality_keys": [
734
+ "joint_position"
735
+ ],
736
+ "sin_cos_embedding_keys": null,
737
+ "mean_std_embedding_keys": null,
738
+ "action_configs": null
739
+ },
740
+ "action": {
741
+ "delta_indices": [
742
+ 0,
743
+ 1,
744
+ 2,
745
+ 3,
746
+ 4,
747
+ 5,
748
+ 6,
749
+ 7,
750
+ 8,
751
+ 9,
752
+ 10,
753
+ 11,
754
+ 12,
755
+ 13,
756
+ 14,
757
+ 15
758
+ ],
759
+ "modality_keys": [
760
+ "end_effector_position",
761
+ "end_effector_rotation",
762
+ "gripper_close"
763
+ ],
764
+ "sin_cos_embedding_keys": null,
765
+ "mean_std_embedding_keys": null,
766
+ "action_configs": [
767
+ {
768
+ "rep": "DELTA",
769
+ "type": "EEF",
770
+ "format": "DEFAULT",
771
+ "state_key": null
772
+ },
773
+ {
774
+ "rep": "DELTA",
775
+ "type": "EEF",
776
+ "format": "DEFAULT",
777
+ "state_key": null
778
+ },
779
+ {
780
+ "rep": "ABSOLUTE",
781
+ "type": "NON_EEF",
782
+ "format": "DEFAULT",
783
+ "state_key": null
784
+ }
785
+ ]
786
+ },
787
+ "language": {
788
+ "delta_indices": [
789
+ 0
790
+ ],
791
+ "modality_keys": [
792
+ "annotation.human.action.task_description"
793
+ ],
794
+ "sin_cos_embedding_keys": null,
795
+ "mean_std_embedding_keys": null,
796
+ "action_configs": null
797
+ }
798
+ },
799
+ "stanford_hydra_dataset_converted_externally_to_rlds": {
800
+ "video": {
801
+ "delta_indices": [
802
+ -6,
803
+ -4,
804
+ -2,
805
+ 0
806
+ ],
807
+ "modality_keys": [
808
+ "primary",
809
+ "wrist"
810
+ ],
811
+ "sin_cos_embedding_keys": null,
812
+ "mean_std_embedding_keys": null,
813
+ "action_configs": null
814
+ },
815
+ "state": {
816
+ "delta_indices": [
817
+ 0
818
+ ],
819
+ "modality_keys": [
820
+ "end_effector_position",
821
+ "end_effector_rotation",
822
+ "gripper_position"
823
+ ],
824
+ "sin_cos_embedding_keys": null,
825
+ "mean_std_embedding_keys": null,
826
+ "action_configs": null
827
+ },
828
+ "action": {
829
+ "delta_indices": [
830
+ 0,
831
+ 1,
832
+ 2,
833
+ 3,
834
+ 4,
835
+ 5,
836
+ 6,
837
+ 7,
838
+ 8,
839
+ 9,
840
+ 10,
841
+ 11,
842
+ 12,
843
+ 13,
844
+ 14,
845
+ 15
846
+ ],
847
+ "modality_keys": [
848
+ "end_effector_position",
849
+ "end_effector_rotation",
850
+ "gripper_close"
851
+ ],
852
+ "sin_cos_embedding_keys": null,
853
+ "mean_std_embedding_keys": null,
854
+ "action_configs": [
855
+ {
856
+ "rep": "DELTA",
857
+ "type": "EEF",
858
+ "format": "DEFAULT",
859
+ "state_key": null
860
+ },
861
+ {
862
+ "rep": "DELTA",
863
+ "type": "EEF",
864
+ "format": "DEFAULT",
865
+ "state_key": null
866
+ },
867
+ {
868
+ "rep": "ABSOLUTE",
869
+ "type": "NON_EEF",
870
+ "format": "DEFAULT",
871
+ "state_key": null
872
+ }
873
+ ]
874
+ },
875
+ "language": {
876
+ "delta_indices": [
877
+ 0
878
+ ],
879
+ "modality_keys": [
880
+ "annotation.human.action.task_description"
881
+ ],
882
+ "sin_cos_embedding_keys": null,
883
+ "mean_std_embedding_keys": null,
884
+ "action_configs": null
885
+ }
886
+ },
887
+ "utaustin_mutex": {
888
+ "video": {
889
+ "delta_indices": [
890
+ -6,
891
+ -4,
892
+ -2,
893
+ 0
894
+ ],
895
+ "modality_keys": [
896
+ "primary",
897
+ "wrist"
898
+ ],
899
+ "sin_cos_embedding_keys": null,
900
+ "mean_std_embedding_keys": null,
901
+ "action_configs": null
902
+ },
903
+ "state": {
904
+ "delta_indices": [
905
+ 0
906
+ ],
907
+ "modality_keys": [
908
+ "joint_position",
909
+ "gripper_position"
910
+ ],
911
+ "sin_cos_embedding_keys": null,
912
+ "mean_std_embedding_keys": null,
913
+ "action_configs": null
914
+ },
915
+ "action": {
916
+ "delta_indices": [
917
+ 0,
918
+ 1,
919
+ 2,
920
+ 3,
921
+ 4,
922
+ 5,
923
+ 6,
924
+ 7,
925
+ 8,
926
+ 9,
927
+ 10,
928
+ 11,
929
+ 12,
930
+ 13,
931
+ 14,
932
+ 15
933
+ ],
934
+ "modality_keys": [
935
+ "end_effector_position",
936
+ "end_effector_rotation",
937
+ "gripper_close"
938
+ ],
939
+ "sin_cos_embedding_keys": null,
940
+ "mean_std_embedding_keys": null,
941
+ "action_configs": [
942
+ {
943
+ "rep": "DELTA",
944
+ "type": "EEF",
945
+ "format": "DEFAULT",
946
+ "state_key": null
947
+ },
948
+ {
949
+ "rep": "DELTA",
950
+ "type": "EEF",
951
+ "format": "DEFAULT",
952
+ "state_key": null
953
+ },
954
+ {
955
+ "rep": "ABSOLUTE",
956
+ "type": "NON_EEF",
957
+ "format": "DEFAULT",
958
+ "state_key": null
959
+ }
960
+ ]
961
+ },
962
+ "language": {
963
+ "delta_indices": [
964
+ 0
965
+ ],
966
+ "modality_keys": [
967
+ "annotation.human.action.task_description"
968
+ ],
969
+ "sin_cos_embedding_keys": null,
970
+ "mean_std_embedding_keys": null,
971
+ "action_configs": null
972
+ }
973
+ },
974
+ "furniture_bench_dataset_converted_externally_to_rlds": {
975
+ "video": {
976
+ "delta_indices": [
977
+ -6,
978
+ -4,
979
+ -2,
980
+ 0
981
+ ],
982
+ "modality_keys": [
983
+ "primary",
984
+ "wrist"
985
+ ],
986
+ "sin_cos_embedding_keys": null,
987
+ "mean_std_embedding_keys": null,
988
+ "action_configs": null
989
+ },
990
+ "state": {
991
+ "delta_indices": [
992
+ 0
993
+ ],
994
+ "modality_keys": [
995
+ "end_effector_position",
996
+ "end_effector_rotation",
997
+ "gripper_position"
998
+ ],
999
+ "sin_cos_embedding_keys": null,
1000
+ "mean_std_embedding_keys": null,
1001
+ "action_configs": null
1002
+ },
1003
+ "action": {
1004
+ "delta_indices": [
1005
+ 0,
1006
+ 1,
1007
+ 2,
1008
+ 3,
1009
+ 4,
1010
+ 5,
1011
+ 6,
1012
+ 7,
1013
+ 8,
1014
+ 9,
1015
+ 10,
1016
+ 11,
1017
+ 12,
1018
+ 13,
1019
+ 14,
1020
+ 15
1021
+ ],
1022
+ "modality_keys": [
1023
+ "end_effector_position",
1024
+ "end_effector_rotation",
1025
+ "gripper_close"
1026
+ ],
1027
+ "sin_cos_embedding_keys": null,
1028
+ "mean_std_embedding_keys": null,
1029
+ "action_configs": [
1030
+ {
1031
+ "rep": "DELTA",
1032
+ "type": "EEF",
1033
+ "format": "DEFAULT",
1034
+ "state_key": null
1035
+ },
1036
+ {
1037
+ "rep": "DELTA",
1038
+ "type": "EEF",
1039
+ "format": "DEFAULT",
1040
+ "state_key": null
1041
+ },
1042
+ {
1043
+ "rep": "ABSOLUTE",
1044
+ "type": "NON_EEF",
1045
+ "format": "DEFAULT",
1046
+ "state_key": null
1047
+ }
1048
+ ]
1049
+ },
1050
+ "language": {
1051
+ "delta_indices": [
1052
+ 0
1053
+ ],
1054
+ "modality_keys": [
1055
+ "annotation.human.action.task_description"
1056
+ ],
1057
+ "sin_cos_embedding_keys": null,
1058
+ "mean_std_embedding_keys": null,
1059
+ "action_configs": null
1060
+ }
1061
+ },
1062
+ "neural_gr1": {
1063
+ "video": {
1064
+ "delta_indices": [
1065
+ -6,
1066
+ -4,
1067
+ -2,
1068
+ 0
1069
+ ],
1070
+ "modality_keys": [
1071
+ "ego_view"
1072
+ ],
1073
+ "sin_cos_embedding_keys": null,
1074
+ "mean_std_embedding_keys": null,
1075
+ "action_configs": null
1076
+ },
1077
+ "state": {
1078
+ "delta_indices": [
1079
+ 0
1080
+ ],
1081
+ "modality_keys": [
1082
+ "left_arm",
1083
+ "left_hand",
1084
+ "left_leg",
1085
+ "neck",
1086
+ "right_arm",
1087
+ "right_hand",
1088
+ "right_leg",
1089
+ "waist"
1090
+ ],
1091
+ "sin_cos_embedding_keys": null,
1092
+ "mean_std_embedding_keys": null,
1093
+ "action_configs": null
1094
+ },
1095
+ "action": {
1096
+ "delta_indices": [
1097
+ 0,
1098
+ 1,
1099
+ 2,
1100
+ 3,
1101
+ 4,
1102
+ 5,
1103
+ 6,
1104
+ 7,
1105
+ 8,
1106
+ 9,
1107
+ 10,
1108
+ 11,
1109
+ 12,
1110
+ 13,
1111
+ 14,
1112
+ 15
1113
+ ],
1114
+ "modality_keys": [
1115
+ "left_arm",
1116
+ "left_hand",
1117
+ "left_leg",
1118
+ "neck",
1119
+ "right_arm",
1120
+ "right_hand",
1121
+ "right_leg",
1122
+ "waist"
1123
+ ],
1124
+ "sin_cos_embedding_keys": null,
1125
+ "mean_std_embedding_keys": null,
1126
+ "action_configs": [
1127
+ {
1128
+ "rep": "ABSOLUTE",
1129
+ "type": "NON_EEF",
1130
+ "format": "DEFAULT",
1131
+ "state_key": null
1132
+ },
1133
+ {
1134
+ "rep": "ABSOLUTE",
1135
+ "type": "NON_EEF",
1136
+ "format": "DEFAULT",
1137
+ "state_key": null
1138
+ },
1139
+ {
1140
+ "rep": "ABSOLUTE",
1141
+ "type": "NON_EEF",
1142
+ "format": "DEFAULT",
1143
+ "state_key": null
1144
+ },
1145
+ {
1146
+ "rep": "ABSOLUTE",
1147
+ "type": "NON_EEF",
1148
+ "format": "DEFAULT",
1149
+ "state_key": null
1150
+ },
1151
+ {
1152
+ "rep": "ABSOLUTE",
1153
+ "type": "NON_EEF",
1154
+ "format": "DEFAULT",
1155
+ "state_key": null
1156
+ },
1157
+ {
1158
+ "rep": "ABSOLUTE",
1159
+ "type": "NON_EEF",
1160
+ "format": "DEFAULT",
1161
+ "state_key": null
1162
+ },
1163
+ {
1164
+ "rep": "ABSOLUTE",
1165
+ "type": "NON_EEF",
1166
+ "format": "DEFAULT",
1167
+ "state_key": null
1168
+ },
1169
+ {
1170
+ "rep": "ABSOLUTE",
1171
+ "type": "NON_EEF",
1172
+ "format": "DEFAULT",
1173
+ "state_key": null
1174
+ }
1175
+ ]
1176
+ },
1177
+ "language": {
1178
+ "delta_indices": [
1179
+ 0
1180
+ ],
1181
+ "modality_keys": [
1182
+ "annotation.human.action.task_description"
1183
+ ],
1184
+ "sin_cos_embedding_keys": null,
1185
+ "mean_std_embedding_keys": null,
1186
+ "action_configs": null
1187
+ }
1188
+ },
1189
+ "agibot_gripper": {
1190
+ "video": {
1191
+ "delta_indices": [
1192
+ -6,
1193
+ -4,
1194
+ -2,
1195
+ 0
1196
+ ],
1197
+ "modality_keys": [
1198
+ "primary",
1199
+ "wrist_left",
1200
+ "wrist_right"
1201
+ ],
1202
+ "sin_cos_embedding_keys": null,
1203
+ "mean_std_embedding_keys": null,
1204
+ "action_configs": null
1205
+ },
1206
+ "state": {
1207
+ "delta_indices": [
1208
+ 0
1209
+ ],
1210
+ "modality_keys": [
1211
+ "state"
1212
+ ],
1213
+ "sin_cos_embedding_keys": null,
1214
+ "mean_std_embedding_keys": null,
1215
+ "action_configs": null
1216
+ },
1217
+ "action": {
1218
+ "delta_indices": [
1219
+ 0,
1220
+ 1,
1221
+ 2,
1222
+ 3,
1223
+ 4,
1224
+ 5,
1225
+ 6,
1226
+ 7,
1227
+ 8,
1228
+ 9,
1229
+ 10,
1230
+ 11,
1231
+ 12,
1232
+ 13,
1233
+ 14,
1234
+ 15
1235
+ ],
1236
+ "modality_keys": [
1237
+ "action"
1238
+ ],
1239
+ "sin_cos_embedding_keys": null,
1240
+ "mean_std_embedding_keys": null,
1241
+ "action_configs": [
1242
+ {
1243
+ "rep": "ABSOLUTE",
1244
+ "type": "NON_EEF",
1245
+ "format": "DEFAULT",
1246
+ "state_key": null
1247
+ }
1248
+ ]
1249
+ },
1250
+ "language": {
1251
+ "delta_indices": [
1252
+ 0
1253
+ ],
1254
+ "modality_keys": [
1255
+ "annotation.human.action.task_description"
1256
+ ],
1257
+ "sin_cos_embedding_keys": null,
1258
+ "mean_std_embedding_keys": null,
1259
+ "action_configs": null
1260
+ }
1261
+ },
1262
+ "fmb_dataset": {
1263
+ "video": {
1264
+ "delta_indices": [
1265
+ -6,
1266
+ -4,
1267
+ -2,
1268
+ 0
1269
+ ],
1270
+ "modality_keys": [
1271
+ "primary",
1272
+ "secondary",
1273
+ "wrist"
1274
+ ],
1275
+ "sin_cos_embedding_keys": null,
1276
+ "mean_std_embedding_keys": null,
1277
+ "action_configs": null
1278
+ },
1279
+ "state": {
1280
+ "delta_indices": [
1281
+ 0
1282
+ ],
1283
+ "modality_keys": [
1284
+ "end_effector_position",
1285
+ "end_effector_rotation",
1286
+ "gripper_position"
1287
+ ],
1288
+ "sin_cos_embedding_keys": null,
1289
+ "mean_std_embedding_keys": null,
1290
+ "action_configs": null
1291
+ },
1292
+ "action": {
1293
+ "delta_indices": [
1294
+ 0,
1295
+ 1,
1296
+ 2,
1297
+ 3,
1298
+ 4,
1299
+ 5,
1300
+ 6,
1301
+ 7,
1302
+ 8,
1303
+ 9,
1304
+ 10,
1305
+ 11,
1306
+ 12,
1307
+ 13,
1308
+ 14,
1309
+ 15
1310
+ ],
1311
+ "modality_keys": [
1312
+ "end_effector_position",
1313
+ "end_effector_rotation",
1314
+ "gripper_close"
1315
+ ],
1316
+ "sin_cos_embedding_keys": null,
1317
+ "mean_std_embedding_keys": null,
1318
+ "action_configs": [
1319
+ {
1320
+ "rep": "DELTA",
1321
+ "type": "EEF",
1322
+ "format": "DEFAULT",
1323
+ "state_key": null
1324
+ },
1325
+ {
1326
+ "rep": "DELTA",
1327
+ "type": "EEF",
1328
+ "format": "DEFAULT",
1329
+ "state_key": null
1330
+ },
1331
+ {
1332
+ "rep": "ABSOLUTE",
1333
+ "type": "NON_EEF",
1334
+ "format": "DEFAULT",
1335
+ "state_key": null
1336
+ }
1337
+ ]
1338
+ },
1339
+ "language": {
1340
+ "delta_indices": [
1341
+ 0
1342
+ ],
1343
+ "modality_keys": [
1344
+ "annotation.human.action.task_description"
1345
+ ],
1346
+ "sin_cos_embedding_keys": null,
1347
+ "mean_std_embedding_keys": null,
1348
+ "action_configs": null
1349
+ }
1350
+ },
1351
+ "dobbe": {
1352
+ "video": {
1353
+ "delta_indices": [
1354
+ -6,
1355
+ -4,
1356
+ -2,
1357
+ 0
1358
+ ],
1359
+ "modality_keys": [
1360
+ "wrist"
1361
+ ],
1362
+ "sin_cos_embedding_keys": null,
1363
+ "mean_std_embedding_keys": null,
1364
+ "action_configs": null
1365
+ },
1366
+ "state": {
1367
+ "delta_indices": [
1368
+ 0
1369
+ ],
1370
+ "modality_keys": [
1371
+ "end_effector_position",
1372
+ "end_effector_rotation",
1373
+ "gripper_position"
1374
+ ],
1375
+ "sin_cos_embedding_keys": null,
1376
+ "mean_std_embedding_keys": null,
1377
+ "action_configs": null
1378
+ },
1379
+ "action": {
1380
+ "delta_indices": [
1381
+ 0,
1382
+ 1,
1383
+ 2,
1384
+ 3,
1385
+ 4,
1386
+ 5,
1387
+ 6,
1388
+ 7,
1389
+ 8,
1390
+ 9,
1391
+ 10,
1392
+ 11,
1393
+ 12,
1394
+ 13,
1395
+ 14,
1396
+ 15
1397
+ ],
1398
+ "modality_keys": [
1399
+ "end_effector_position",
1400
+ "end_effector_rotation",
1401
+ "gripper_close"
1402
+ ],
1403
+ "sin_cos_embedding_keys": null,
1404
+ "mean_std_embedding_keys": null,
1405
+ "action_configs": [
1406
+ {
1407
+ "rep": "DELTA",
1408
+ "type": "EEF",
1409
+ "format": "DEFAULT",
1410
+ "state_key": null
1411
+ },
1412
+ {
1413
+ "rep": "DELTA",
1414
+ "type": "EEF",
1415
+ "format": "DEFAULT",
1416
+ "state_key": null
1417
+ },
1418
+ {
1419
+ "rep": "ABSOLUTE",
1420
+ "type": "NON_EEF",
1421
+ "format": "DEFAULT",
1422
+ "state_key": null
1423
+ }
1424
+ ]
1425
+ },
1426
+ "language": {
1427
+ "delta_indices": [
1428
+ 0
1429
+ ],
1430
+ "modality_keys": [
1431
+ "annotation.human.action.task_description"
1432
+ ],
1433
+ "sin_cos_embedding_keys": null,
1434
+ "mean_std_embedding_keys": null,
1435
+ "action_configs": null
1436
+ }
1437
+ },
1438
+ "viola": {
1439
+ "video": {
1440
+ "delta_indices": [
1441
+ -6,
1442
+ -4,
1443
+ -2,
1444
+ 0
1445
+ ],
1446
+ "modality_keys": [
1447
+ "primary",
1448
+ "wrist"
1449
+ ],
1450
+ "sin_cos_embedding_keys": null,
1451
+ "mean_std_embedding_keys": null,
1452
+ "action_configs": null
1453
+ },
1454
+ "state": {
1455
+ "delta_indices": [
1456
+ 0
1457
+ ],
1458
+ "modality_keys": [
1459
+ "joint_position",
1460
+ "gripper_position"
1461
+ ],
1462
+ "sin_cos_embedding_keys": null,
1463
+ "mean_std_embedding_keys": null,
1464
+ "action_configs": null
1465
+ },
1466
+ "action": {
1467
+ "delta_indices": [
1468
+ 0,
1469
+ 1,
1470
+ 2,
1471
+ 3,
1472
+ 4,
1473
+ 5,
1474
+ 6,
1475
+ 7,
1476
+ 8,
1477
+ 9,
1478
+ 10,
1479
+ 11,
1480
+ 12,
1481
+ 13,
1482
+ 14,
1483
+ 15
1484
+ ],
1485
+ "modality_keys": [
1486
+ "end_effector_position",
1487
+ "end_effector_rotation",
1488
+ "gripper_close"
1489
+ ],
1490
+ "sin_cos_embedding_keys": null,
1491
+ "mean_std_embedding_keys": null,
1492
+ "action_configs": [
1493
+ {
1494
+ "rep": "DELTA",
1495
+ "type": "EEF",
1496
+ "format": "DEFAULT",
1497
+ "state_key": null
1498
+ },
1499
+ {
1500
+ "rep": "DELTA",
1501
+ "type": "EEF",
1502
+ "format": "DEFAULT",
1503
+ "state_key": null
1504
+ },
1505
+ {
1506
+ "rep": "ABSOLUTE",
1507
+ "type": "NON_EEF",
1508
+ "format": "DEFAULT",
1509
+ "state_key": null
1510
+ }
1511
+ ]
1512
+ },
1513
+ "language": {
1514
+ "delta_indices": [
1515
+ 0
1516
+ ],
1517
+ "modality_keys": [
1518
+ "annotation.human.action.task_description"
1519
+ ],
1520
+ "sin_cos_embedding_keys": null,
1521
+ "mean_std_embedding_keys": null,
1522
+ "action_configs": null
1523
+ }
1524
+ },
1525
+ "humanoid_everyday_h1": {
1526
+ "video": {
1527
+ "delta_indices": [
1528
+ -6,
1529
+ -4,
1530
+ -2,
1531
+ 0
1532
+ ],
1533
+ "modality_keys": [
1534
+ "egocentric_resized"
1535
+ ],
1536
+ "sin_cos_embedding_keys": null,
1537
+ "mean_std_embedding_keys": null,
1538
+ "action_configs": null
1539
+ },
1540
+ "state": {
1541
+ "delta_indices": [
1542
+ 0
1543
+ ],
1544
+ "modality_keys": [
1545
+ "left_arm",
1546
+ "left_hand",
1547
+ "right_arm",
1548
+ "right_hand"
1549
+ ],
1550
+ "sin_cos_embedding_keys": null,
1551
+ "mean_std_embedding_keys": null,
1552
+ "action_configs": null
1553
+ },
1554
+ "action": {
1555
+ "delta_indices": [
1556
+ 0,
1557
+ 1,
1558
+ 2,
1559
+ 3,
1560
+ 4,
1561
+ 5,
1562
+ 6,
1563
+ 7,
1564
+ 8,
1565
+ 9,
1566
+ 10,
1567
+ 11,
1568
+ 12,
1569
+ 13,
1570
+ 14,
1571
+ 15
1572
+ ],
1573
+ "modality_keys": [
1574
+ "left_arm",
1575
+ "left_hand",
1576
+ "right_arm",
1577
+ "right_hand"
1578
+ ],
1579
+ "sin_cos_embedding_keys": null,
1580
+ "mean_std_embedding_keys": null,
1581
+ "action_configs": [
1582
+ {
1583
+ "rep": "ABSOLUTE",
1584
+ "type": "NON_EEF",
1585
+ "format": "DEFAULT",
1586
+ "state_key": null
1587
+ },
1588
+ {
1589
+ "rep": "ABSOLUTE",
1590
+ "type": "NON_EEF",
1591
+ "format": "DEFAULT",
1592
+ "state_key": null
1593
+ },
1594
+ {
1595
+ "rep": "ABSOLUTE",
1596
+ "type": "NON_EEF",
1597
+ "format": "DEFAULT",
1598
+ "state_key": null
1599
+ },
1600
+ {
1601
+ "rep": "ABSOLUTE",
1602
+ "type": "NON_EEF",
1603
+ "format": "DEFAULT",
1604
+ "state_key": null
1605
+ }
1606
+ ]
1607
+ },
1608
+ "language": {
1609
+ "delta_indices": [
1610
+ 0
1611
+ ],
1612
+ "modality_keys": [
1613
+ "annotation.human.action.task_description"
1614
+ ],
1615
+ "sin_cos_embedding_keys": null,
1616
+ "mean_std_embedding_keys": null,
1617
+ "action_configs": null
1618
+ }
1619
+ },
1620
+ "austin_buds_dataset_converted_externally_to_rlds": {
1621
+ "video": {
1622
+ "delta_indices": [
1623
+ -6,
1624
+ -4,
1625
+ -2,
1626
+ 0
1627
+ ],
1628
+ "modality_keys": [
1629
+ "primary",
1630
+ "wrist"
1631
+ ],
1632
+ "sin_cos_embedding_keys": null,
1633
+ "mean_std_embedding_keys": null,
1634
+ "action_configs": null
1635
+ },
1636
+ "state": {
1637
+ "delta_indices": [
1638
+ 0
1639
+ ],
1640
+ "modality_keys": [
1641
+ "joint_position",
1642
+ "gripper_position"
1643
+ ],
1644
+ "sin_cos_embedding_keys": null,
1645
+ "mean_std_embedding_keys": null,
1646
+ "action_configs": null
1647
+ },
1648
+ "action": {
1649
+ "delta_indices": [
1650
+ 0,
1651
+ 1,
1652
+ 2,
1653
+ 3,
1654
+ 4,
1655
+ 5,
1656
+ 6,
1657
+ 7,
1658
+ 8,
1659
+ 9,
1660
+ 10,
1661
+ 11,
1662
+ 12,
1663
+ 13,
1664
+ 14,
1665
+ 15
1666
+ ],
1667
+ "modality_keys": [
1668
+ "end_effector_position",
1669
+ "end_effector_rotation",
1670
+ "gripper_close"
1671
+ ],
1672
+ "sin_cos_embedding_keys": null,
1673
+ "mean_std_embedding_keys": null,
1674
+ "action_configs": [
1675
+ {
1676
+ "rep": "DELTA",
1677
+ "type": "EEF",
1678
+ "format": "DEFAULT",
1679
+ "state_key": null
1680
+ },
1681
+ {
1682
+ "rep": "DELTA",
1683
+ "type": "EEF",
1684
+ "format": "DEFAULT",
1685
+ "state_key": null
1686
+ },
1687
+ {
1688
+ "rep": "ABSOLUTE",
1689
+ "type": "NON_EEF",
1690
+ "format": "DEFAULT",
1691
+ "state_key": null
1692
+ }
1693
+ ]
1694
+ },
1695
+ "language": {
1696
+ "delta_indices": [
1697
+ 0
1698
+ ],
1699
+ "modality_keys": [
1700
+ "annotation.human.action.task_description"
1701
+ ],
1702
+ "sin_cos_embedding_keys": null,
1703
+ "mean_std_embedding_keys": null,
1704
+ "action_configs": null
1705
+ }
1706
+ },
1707
+ "taco_play": {
1708
+ "video": {
1709
+ "delta_indices": [
1710
+ -6,
1711
+ -4,
1712
+ -2,
1713
+ 0
1714
+ ],
1715
+ "modality_keys": [
1716
+ "primary",
1717
+ "wrist"
1718
+ ],
1719
+ "sin_cos_embedding_keys": null,
1720
+ "mean_std_embedding_keys": null,
1721
+ "action_configs": null
1722
+ },
1723
+ "state": {
1724
+ "delta_indices": [
1725
+ 0
1726
+ ],
1727
+ "modality_keys": [
1728
+ "end_effector_position",
1729
+ "end_effector_rotation",
1730
+ "gripper_position"
1731
+ ],
1732
+ "sin_cos_embedding_keys": null,
1733
+ "mean_std_embedding_keys": null,
1734
+ "action_configs": null
1735
+ },
1736
+ "action": {
1737
+ "delta_indices": [
1738
+ 0,
1739
+ 1,
1740
+ 2,
1741
+ 3,
1742
+ 4,
1743
+ 5,
1744
+ 6,
1745
+ 7,
1746
+ 8,
1747
+ 9,
1748
+ 10,
1749
+ 11,
1750
+ 12,
1751
+ 13,
1752
+ 14,
1753
+ 15
1754
+ ],
1755
+ "modality_keys": [
1756
+ "end_effector_position",
1757
+ "end_effector_rotation",
1758
+ "gripper_close"
1759
+ ],
1760
+ "sin_cos_embedding_keys": null,
1761
+ "mean_std_embedding_keys": null,
1762
+ "action_configs": [
1763
+ {
1764
+ "rep": "DELTA",
1765
+ "type": "EEF",
1766
+ "format": "DEFAULT",
1767
+ "state_key": null
1768
+ },
1769
+ {
1770
+ "rep": "DELTA",
1771
+ "type": "EEF",
1772
+ "format": "DEFAULT",
1773
+ "state_key": null
1774
+ },
1775
+ {
1776
+ "rep": "ABSOLUTE",
1777
+ "type": "NON_EEF",
1778
+ "format": "DEFAULT",
1779
+ "state_key": null
1780
+ }
1781
+ ]
1782
+ },
1783
+ "language": {
1784
+ "delta_indices": [
1785
+ 0
1786
+ ],
1787
+ "modality_keys": [
1788
+ "annotation.human.action.task_description"
1789
+ ],
1790
+ "sin_cos_embedding_keys": null,
1791
+ "mean_std_embedding_keys": null,
1792
+ "action_configs": null
1793
+ }
1794
+ },
1795
+ "toto": {
1796
+ "video": {
1797
+ "delta_indices": [
1798
+ -6,
1799
+ -4,
1800
+ -2,
1801
+ 0
1802
+ ],
1803
+ "modality_keys": [
1804
+ "primary"
1805
+ ],
1806
+ "sin_cos_embedding_keys": null,
1807
+ "mean_std_embedding_keys": null,
1808
+ "action_configs": null
1809
+ },
1810
+ "state": {
1811
+ "delta_indices": [
1812
+ 0
1813
+ ],
1814
+ "modality_keys": [
1815
+ "joint_position",
1816
+ "gripper_position"
1817
+ ],
1818
+ "sin_cos_embedding_keys": null,
1819
+ "mean_std_embedding_keys": null,
1820
+ "action_configs": null
1821
+ },
1822
+ "action": {
1823
+ "delta_indices": [
1824
+ 0,
1825
+ 1,
1826
+ 2,
1827
+ 3,
1828
+ 4,
1829
+ 5,
1830
+ 6,
1831
+ 7,
1832
+ 8,
1833
+ 9,
1834
+ 10,
1835
+ 11,
1836
+ 12,
1837
+ 13,
1838
+ 14,
1839
+ 15
1840
+ ],
1841
+ "modality_keys": [
1842
+ "end_effector_position",
1843
+ "end_effector_rotation",
1844
+ "gripper_close"
1845
+ ],
1846
+ "sin_cos_embedding_keys": null,
1847
+ "mean_std_embedding_keys": null,
1848
+ "action_configs": [
1849
+ {
1850
+ "rep": "DELTA",
1851
+ "type": "EEF",
1852
+ "format": "DEFAULT",
1853
+ "state_key": null
1854
+ },
1855
+ {
1856
+ "rep": "DELTA",
1857
+ "type": "EEF",
1858
+ "format": "DEFAULT",
1859
+ "state_key": null
1860
+ },
1861
+ {
1862
+ "rep": "ABSOLUTE",
1863
+ "type": "NON_EEF",
1864
+ "format": "DEFAULT",
1865
+ "state_key": null
1866
+ }
1867
+ ]
1868
+ },
1869
+ "language": {
1870
+ "delta_indices": [
1871
+ 0
1872
+ ],
1873
+ "modality_keys": [
1874
+ "annotation.human.action.task_description"
1875
+ ],
1876
+ "sin_cos_embedding_keys": null,
1877
+ "mean_std_embedding_keys": null,
1878
+ "action_configs": null
1879
+ }
1880
+ },
1881
+ "language_table": {
1882
+ "video": {
1883
+ "delta_indices": [
1884
+ -6,
1885
+ -4,
1886
+ -2,
1887
+ 0
1888
+ ],
1889
+ "modality_keys": [
1890
+ "primary"
1891
+ ],
1892
+ "sin_cos_embedding_keys": null,
1893
+ "mean_std_embedding_keys": null,
1894
+ "action_configs": null
1895
+ },
1896
+ "state": {
1897
+ "delta_indices": [
1898
+ 0
1899
+ ],
1900
+ "modality_keys": [
1901
+ "end_effector_position"
1902
+ ],
1903
+ "sin_cos_embedding_keys": null,
1904
+ "mean_std_embedding_keys": null,
1905
+ "action_configs": null
1906
+ },
1907
+ "action": {
1908
+ "delta_indices": [
1909
+ 0,
1910
+ 1,
1911
+ 2,
1912
+ 3,
1913
+ 4,
1914
+ 5,
1915
+ 6,
1916
+ 7,
1917
+ 8,
1918
+ 9,
1919
+ 10,
1920
+ 11,
1921
+ 12,
1922
+ 13,
1923
+ 14,
1924
+ 15
1925
+ ],
1926
+ "modality_keys": [
1927
+ "end_effector_position"
1928
+ ],
1929
+ "sin_cos_embedding_keys": null,
1930
+ "mean_std_embedding_keys": null,
1931
+ "action_configs": [
1932
+ {
1933
+ "rep": "DELTA",
1934
+ "type": "EEF",
1935
+ "format": "DEFAULT",
1936
+ "state_key": null
1937
+ }
1938
+ ]
1939
+ },
1940
+ "language": {
1941
+ "delta_indices": [
1942
+ 0
1943
+ ],
1944
+ "modality_keys": [
1945
+ "annotation.human.action.task_description"
1946
+ ],
1947
+ "sin_cos_embedding_keys": null,
1948
+ "mean_std_embedding_keys": null,
1949
+ "action_configs": null
1950
+ }
1951
+ },
1952
+ "nyu_franka_play_dataset_converted_externally_to_rlds": {
1953
+ "video": {
1954
+ "delta_indices": [
1955
+ -6,
1956
+ -4,
1957
+ -2,
1958
+ 0
1959
+ ],
1960
+ "modality_keys": [
1961
+ "primary",
1962
+ "secondary"
1963
+ ],
1964
+ "sin_cos_embedding_keys": null,
1965
+ "mean_std_embedding_keys": null,
1966
+ "action_configs": null
1967
+ },
1968
+ "state": {
1969
+ "delta_indices": [
1970
+ 0
1971
+ ],
1972
+ "modality_keys": [
1973
+ "end_effector_position",
1974
+ "end_effector_rotation",
1975
+ "gripper_position"
1976
+ ],
1977
+ "sin_cos_embedding_keys": null,
1978
+ "mean_std_embedding_keys": null,
1979
+ "action_configs": null
1980
+ },
1981
+ "action": {
1982
+ "delta_indices": [
1983
+ 0,
1984
+ 1,
1985
+ 2,
1986
+ 3,
1987
+ 4,
1988
+ 5,
1989
+ 6,
1990
+ 7,
1991
+ 8,
1992
+ 9,
1993
+ 10,
1994
+ 11,
1995
+ 12,
1996
+ 13,
1997
+ 14,
1998
+ 15
1999
+ ],
2000
+ "modality_keys": [
2001
+ "end_effector_position",
2002
+ "end_effector_rotation",
2003
+ "gripper_close"
2004
+ ],
2005
+ "sin_cos_embedding_keys": null,
2006
+ "mean_std_embedding_keys": null,
2007
+ "action_configs": [
2008
+ {
2009
+ "rep": "DELTA",
2010
+ "type": "EEF",
2011
+ "format": "DEFAULT",
2012
+ "state_key": null
2013
+ },
2014
+ {
2015
+ "rep": "DELTA",
2016
+ "type": "EEF",
2017
+ "format": "DEFAULT",
2018
+ "state_key": null
2019
+ },
2020
+ {
2021
+ "rep": "ABSOLUTE",
2022
+ "type": "NON_EEF",
2023
+ "format": "DEFAULT",
2024
+ "state_key": null
2025
+ }
2026
+ ]
2027
+ },
2028
+ "language": {
2029
+ "delta_indices": [
2030
+ 0
2031
+ ],
2032
+ "modality_keys": [
2033
+ "annotation.human.action.task_description"
2034
+ ],
2035
+ "sin_cos_embedding_keys": null,
2036
+ "mean_std_embedding_keys": null,
2037
+ "action_configs": null
2038
+ }
2039
+ },
2040
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": {
2041
+ "video": {
2042
+ "delta_indices": [
2043
+ -6,
2044
+ -4,
2045
+ -2,
2046
+ 0
2047
+ ],
2048
+ "modality_keys": [
2049
+ "primary"
2050
+ ],
2051
+ "sin_cos_embedding_keys": null,
2052
+ "mean_std_embedding_keys": null,
2053
+ "action_configs": null
2054
+ },
2055
+ "state": {
2056
+ "delta_indices": [
2057
+ 0
2058
+ ],
2059
+ "modality_keys": [
2060
+ "joint_position"
2061
+ ],
2062
+ "sin_cos_embedding_keys": null,
2063
+ "mean_std_embedding_keys": null,
2064
+ "action_configs": null
2065
+ },
2066
+ "action": {
2067
+ "delta_indices": [
2068
+ 0,
2069
+ 1,
2070
+ 2,
2071
+ 3,
2072
+ 4,
2073
+ 5,
2074
+ 6,
2075
+ 7,
2076
+ 8,
2077
+ 9,
2078
+ 10,
2079
+ 11,
2080
+ 12,
2081
+ 13,
2082
+ 14,
2083
+ 15
2084
+ ],
2085
+ "modality_keys": [
2086
+ "end_effector_position",
2087
+ "end_effector_rotation",
2088
+ "gripper_close"
2089
+ ],
2090
+ "sin_cos_embedding_keys": null,
2091
+ "mean_std_embedding_keys": null,
2092
+ "action_configs": [
2093
+ {
2094
+ "rep": "DELTA",
2095
+ "type": "EEF",
2096
+ "format": "DEFAULT",
2097
+ "state_key": null
2098
+ },
2099
+ {
2100
+ "rep": "DELTA",
2101
+ "type": "EEF",
2102
+ "format": "DEFAULT",
2103
+ "state_key": null
2104
+ },
2105
+ {
2106
+ "rep": "ABSOLUTE",
2107
+ "type": "NON_EEF",
2108
+ "format": "DEFAULT",
2109
+ "state_key": null
2110
+ }
2111
+ ]
2112
+ },
2113
+ "language": {
2114
+ "delta_indices": [
2115
+ 0
2116
+ ],
2117
+ "modality_keys": [
2118
+ "annotation.human.action.task_description"
2119
+ ],
2120
+ "sin_cos_embedding_keys": null,
2121
+ "mean_std_embedding_keys": null,
2122
+ "action_configs": null
2123
+ }
2124
+ },
2125
+ "austin_sirius_dataset_converted_externally_to_rlds": {
2126
+ "video": {
2127
+ "delta_indices": [
2128
+ -6,
2129
+ -4,
2130
+ -2,
2131
+ 0
2132
+ ],
2133
+ "modality_keys": [
2134
+ "primary",
2135
+ "wrist"
2136
+ ],
2137
+ "sin_cos_embedding_keys": null,
2138
+ "mean_std_embedding_keys": null,
2139
+ "action_configs": null
2140
+ },
2141
+ "state": {
2142
+ "delta_indices": [
2143
+ 0
2144
+ ],
2145
+ "modality_keys": [
2146
+ "end_effector_position",
2147
+ "end_effector_rotation",
2148
+ "gripper_position"
2149
+ ],
2150
+ "sin_cos_embedding_keys": null,
2151
+ "mean_std_embedding_keys": null,
2152
+ "action_configs": null
2153
+ },
2154
+ "action": {
2155
+ "delta_indices": [
2156
+ 0,
2157
+ 1,
2158
+ 2,
2159
+ 3,
2160
+ 4,
2161
+ 5,
2162
+ 6,
2163
+ 7,
2164
+ 8,
2165
+ 9,
2166
+ 10,
2167
+ 11,
2168
+ 12,
2169
+ 13,
2170
+ 14,
2171
+ 15
2172
+ ],
2173
+ "modality_keys": [
2174
+ "end_effector_position",
2175
+ "end_effector_rotation",
2176
+ "gripper_close"
2177
+ ],
2178
+ "sin_cos_embedding_keys": null,
2179
+ "mean_std_embedding_keys": null,
2180
+ "action_configs": [
2181
+ {
2182
+ "rep": "DELTA",
2183
+ "type": "EEF",
2184
+ "format": "DEFAULT",
2185
+ "state_key": null
2186
+ },
2187
+ {
2188
+ "rep": "DELTA",
2189
+ "type": "EEF",
2190
+ "format": "DEFAULT",
2191
+ "state_key": null
2192
+ },
2193
+ {
2194
+ "rep": "ABSOLUTE",
2195
+ "type": "NON_EEF",
2196
+ "format": "DEFAULT",
2197
+ "state_key": null
2198
+ }
2199
+ ]
2200
+ },
2201
+ "language": {
2202
+ "delta_indices": [
2203
+ 0
2204
+ ],
2205
+ "modality_keys": [
2206
+ "annotation.human.action.task_description"
2207
+ ],
2208
+ "sin_cos_embedding_keys": null,
2209
+ "mean_std_embedding_keys": null,
2210
+ "action_configs": null
2211
+ }
2212
+ },
2213
+ "droid": {
2214
+ "video": {
2215
+ "delta_indices": [
2216
+ -6,
2217
+ -4,
2218
+ -2,
2219
+ 0
2220
+ ],
2221
+ "modality_keys": [
2222
+ "primary",
2223
+ "secondary",
2224
+ "wrist"
2225
+ ],
2226
+ "sin_cos_embedding_keys": null,
2227
+ "mean_std_embedding_keys": null,
2228
+ "action_configs": null
2229
+ },
2230
+ "state": {
2231
+ "delta_indices": [
2232
+ 0
2233
+ ],
2234
+ "modality_keys": [
2235
+ "end_effector_position",
2236
+ "end_effector_rotation",
2237
+ "gripper_position"
2238
+ ],
2239
+ "sin_cos_embedding_keys": null,
2240
+ "mean_std_embedding_keys": null,
2241
+ "action_configs": null
2242
+ },
2243
+ "action": {
2244
+ "delta_indices": [
2245
+ 0,
2246
+ 1,
2247
+ 2,
2248
+ 3,
2249
+ 4,
2250
+ 5,
2251
+ 6,
2252
+ 7,
2253
+ 8,
2254
+ 9,
2255
+ 10,
2256
+ 11,
2257
+ 12,
2258
+ 13,
2259
+ 14,
2260
+ 15
2261
+ ],
2262
+ "modality_keys": [
2263
+ "end_effector_position",
2264
+ "end_effector_rotation",
2265
+ "gripper_close"
2266
+ ],
2267
+ "sin_cos_embedding_keys": null,
2268
+ "mean_std_embedding_keys": null,
2269
+ "action_configs": [
2270
+ {
2271
+ "rep": "DELTA",
2272
+ "type": "EEF",
2273
+ "format": "DEFAULT",
2274
+ "state_key": null
2275
+ },
2276
+ {
2277
+ "rep": "DELTA",
2278
+ "type": "EEF",
2279
+ "format": "DEFAULT",
2280
+ "state_key": null
2281
+ },
2282
+ {
2283
+ "rep": "ABSOLUTE",
2284
+ "type": "NON_EEF",
2285
+ "format": "DEFAULT",
2286
+ "state_key": null
2287
+ }
2288
+ ]
2289
+ },
2290
+ "language": {
2291
+ "delta_indices": [
2292
+ 0
2293
+ ],
2294
+ "modality_keys": [
2295
+ "annotation.human.action.task_description"
2296
+ ],
2297
+ "sin_cos_embedding_keys": null,
2298
+ "mean_std_embedding_keys": null,
2299
+ "action_configs": null
2300
+ }
2301
+ },
2302
+ "bc_z": {
2303
+ "video": {
2304
+ "delta_indices": [
2305
+ -6,
2306
+ -4,
2307
+ -2,
2308
+ 0
2309
+ ],
2310
+ "modality_keys": [
2311
+ "primary"
2312
+ ],
2313
+ "sin_cos_embedding_keys": null,
2314
+ "mean_std_embedding_keys": null,
2315
+ "action_configs": null
2316
+ },
2317
+ "state": {
2318
+ "delta_indices": [
2319
+ 0
2320
+ ],
2321
+ "modality_keys": [
2322
+ "end_effector_position",
2323
+ "end_effector_rotation",
2324
+ "gripper_position"
2325
+ ],
2326
+ "sin_cos_embedding_keys": null,
2327
+ "mean_std_embedding_keys": null,
2328
+ "action_configs": null
2329
+ },
2330
+ "action": {
2331
+ "delta_indices": [
2332
+ 0,
2333
+ 1,
2334
+ 2,
2335
+ 3,
2336
+ 4,
2337
+ 5,
2338
+ 6,
2339
+ 7,
2340
+ 8,
2341
+ 9,
2342
+ 10,
2343
+ 11,
2344
+ 12,
2345
+ 13,
2346
+ 14,
2347
+ 15
2348
+ ],
2349
+ "modality_keys": [
2350
+ "end_effector_position",
2351
+ "end_effector_rotation",
2352
+ "gripper_close"
2353
+ ],
2354
+ "sin_cos_embedding_keys": null,
2355
+ "mean_std_embedding_keys": null,
2356
+ "action_configs": [
2357
+ {
2358
+ "rep": "DELTA",
2359
+ "type": "EEF",
2360
+ "format": "DEFAULT",
2361
+ "state_key": null
2362
+ },
2363
+ {
2364
+ "rep": "DELTA",
2365
+ "type": "EEF",
2366
+ "format": "DEFAULT",
2367
+ "state_key": null
2368
+ },
2369
+ {
2370
+ "rep": "ABSOLUTE",
2371
+ "type": "NON_EEF",
2372
+ "format": "DEFAULT",
2373
+ "state_key": null
2374
+ }
2375
+ ]
2376
+ },
2377
+ "language": {
2378
+ "delta_indices": [
2379
+ 0
2380
+ ],
2381
+ "modality_keys": [
2382
+ "annotation.human.action.task_description"
2383
+ ],
2384
+ "sin_cos_embedding_keys": null,
2385
+ "mean_std_embedding_keys": null,
2386
+ "action_configs": null
2387
+ }
2388
+ },
2389
+ "kuka": {
2390
+ "video": {
2391
+ "delta_indices": [
2392
+ -6,
2393
+ -4,
2394
+ -2,
2395
+ 0
2396
+ ],
2397
+ "modality_keys": [
2398
+ "primary"
2399
+ ],
2400
+ "sin_cos_embedding_keys": null,
2401
+ "mean_std_embedding_keys": null,
2402
+ "action_configs": null
2403
+ },
2404
+ "state": {
2405
+ "delta_indices": [
2406
+ 0
2407
+ ],
2408
+ "modality_keys": [
2409
+ "end_effector_position",
2410
+ "end_effector_rotation",
2411
+ "gripper_position"
2412
+ ],
2413
+ "sin_cos_embedding_keys": null,
2414
+ "mean_std_embedding_keys": null,
2415
+ "action_configs": null
2416
+ },
2417
+ "action": {
2418
+ "delta_indices": [
2419
+ 0,
2420
+ 1,
2421
+ 2,
2422
+ 3,
2423
+ 4,
2424
+ 5,
2425
+ 6,
2426
+ 7,
2427
+ 8,
2428
+ 9,
2429
+ 10,
2430
+ 11,
2431
+ 12,
2432
+ 13,
2433
+ 14,
2434
+ 15
2435
+ ],
2436
+ "modality_keys": [
2437
+ "end_effector_position",
2438
+ "end_effector_rotation",
2439
+ "gripper_close"
2440
+ ],
2441
+ "sin_cos_embedding_keys": null,
2442
+ "mean_std_embedding_keys": null,
2443
+ "action_configs": [
2444
+ {
2445
+ "rep": "DELTA",
2446
+ "type": "EEF",
2447
+ "format": "DEFAULT",
2448
+ "state_key": null
2449
+ },
2450
+ {
2451
+ "rep": "DELTA",
2452
+ "type": "EEF",
2453
+ "format": "DEFAULT",
2454
+ "state_key": null
2455
+ },
2456
+ {
2457
+ "rep": "ABSOLUTE",
2458
+ "type": "NON_EEF",
2459
+ "format": "DEFAULT",
2460
+ "state_key": null
2461
+ }
2462
+ ]
2463
+ },
2464
+ "language": {
2465
+ "delta_indices": [
2466
+ 0
2467
+ ],
2468
+ "modality_keys": [
2469
+ "annotation.human.action.task_description"
2470
+ ],
2471
+ "sin_cos_embedding_keys": null,
2472
+ "mean_std_embedding_keys": null,
2473
+ "action_configs": null
2474
+ }
2475
+ },
2476
+ "agibot_dexhand": {
2477
+ "video": {
2478
+ "delta_indices": [
2479
+ -6,
2480
+ -4,
2481
+ -2,
2482
+ 0
2483
+ ],
2484
+ "modality_keys": [
2485
+ "primary"
2486
+ ],
2487
+ "sin_cos_embedding_keys": null,
2488
+ "mean_std_embedding_keys": null,
2489
+ "action_configs": null
2490
+ },
2491
+ "state": {
2492
+ "delta_indices": [
2493
+ 0
2494
+ ],
2495
+ "modality_keys": [
2496
+ "state"
2497
+ ],
2498
+ "sin_cos_embedding_keys": null,
2499
+ "mean_std_embedding_keys": null,
2500
+ "action_configs": null
2501
+ },
2502
+ "action": {
2503
+ "delta_indices": [
2504
+ 0,
2505
+ 1,
2506
+ 2,
2507
+ 3,
2508
+ 4,
2509
+ 5,
2510
+ 6,
2511
+ 7,
2512
+ 8,
2513
+ 9,
2514
+ 10,
2515
+ 11,
2516
+ 12,
2517
+ 13,
2518
+ 14,
2519
+ 15
2520
+ ],
2521
+ "modality_keys": [
2522
+ "action"
2523
+ ],
2524
+ "sin_cos_embedding_keys": null,
2525
+ "mean_std_embedding_keys": null,
2526
+ "action_configs": [
2527
+ {
2528
+ "rep": "ABSOLUTE",
2529
+ "type": "NON_EEF",
2530
+ "format": "DEFAULT",
2531
+ "state_key": null
2532
+ }
2533
+ ]
2534
+ },
2535
+ "language": {
2536
+ "delta_indices": [
2537
+ 0
2538
+ ],
2539
+ "modality_keys": [
2540
+ "annotation.human.action.task_description"
2541
+ ],
2542
+ "sin_cos_embedding_keys": null,
2543
+ "mean_std_embedding_keys": null,
2544
+ "action_configs": null
2545
+ }
2546
+ },
2547
+ "action_net": {
2548
+ "video": {
2549
+ "delta_indices": [
2550
+ -6,
2551
+ -4,
2552
+ -2,
2553
+ 0
2554
+ ],
2555
+ "modality_keys": [
2556
+ "primary"
2557
+ ],
2558
+ "sin_cos_embedding_keys": null,
2559
+ "mean_std_embedding_keys": null,
2560
+ "action_configs": null
2561
+ },
2562
+ "state": {
2563
+ "delta_indices": [
2564
+ 0
2565
+ ],
2566
+ "modality_keys": [
2567
+ "state"
2568
+ ],
2569
+ "sin_cos_embedding_keys": null,
2570
+ "mean_std_embedding_keys": null,
2571
+ "action_configs": null
2572
+ },
2573
+ "action": {
2574
+ "delta_indices": [
2575
+ 0,
2576
+ 1,
2577
+ 2,
2578
+ 3,
2579
+ 4,
2580
+ 5,
2581
+ 6,
2582
+ 7,
2583
+ 8,
2584
+ 9,
2585
+ 10,
2586
+ 11,
2587
+ 12,
2588
+ 13,
2589
+ 14,
2590
+ 15
2591
+ ],
2592
+ "modality_keys": [
2593
+ "action"
2594
+ ],
2595
+ "sin_cos_embedding_keys": null,
2596
+ "mean_std_embedding_keys": null,
2597
+ "action_configs": [
2598
+ {
2599
+ "rep": "ABSOLUTE",
2600
+ "type": "NON_EEF",
2601
+ "format": "DEFAULT",
2602
+ "state_key": null
2603
+ }
2604
+ ]
2605
+ },
2606
+ "language": {
2607
+ "delta_indices": [
2608
+ 0
2609
+ ],
2610
+ "modality_keys": [
2611
+ "annotation.human.action.task_description"
2612
+ ],
2613
+ "sin_cos_embedding_keys": null,
2614
+ "mean_std_embedding_keys": null,
2615
+ "action_configs": null
2616
+ }
2617
+ },
2618
+ "galaxea": {
2619
+ "video": {
2620
+ "delta_indices": [
2621
+ -6,
2622
+ -4,
2623
+ -2,
2624
+ 0
2625
+ ],
2626
+ "modality_keys": [
2627
+ "primary",
2628
+ "wrist_left",
2629
+ "wrist_right"
2630
+ ],
2631
+ "sin_cos_embedding_keys": null,
2632
+ "mean_std_embedding_keys": null,
2633
+ "action_configs": null
2634
+ },
2635
+ "state": {
2636
+ "delta_indices": [
2637
+ 0
2638
+ ],
2639
+ "modality_keys": [
2640
+ "state"
2641
+ ],
2642
+ "sin_cos_embedding_keys": null,
2643
+ "mean_std_embedding_keys": null,
2644
+ "action_configs": null
2645
+ },
2646
+ "action": {
2647
+ "delta_indices": [
2648
+ 0,
2649
+ 1,
2650
+ 2,
2651
+ 3,
2652
+ 4,
2653
+ 5,
2654
+ 6,
2655
+ 7,
2656
+ 8,
2657
+ 9,
2658
+ 10,
2659
+ 11,
2660
+ 12,
2661
+ 13,
2662
+ 14,
2663
+ 15
2664
+ ],
2665
+ "modality_keys": [
2666
+ "action"
2667
+ ],
2668
+ "sin_cos_embedding_keys": null,
2669
+ "mean_std_embedding_keys": null,
2670
+ "action_configs": [
2671
+ {
2672
+ "rep": "ABSOLUTE",
2673
+ "type": "NON_EEF",
2674
+ "format": "DEFAULT",
2675
+ "state_key": null
2676
+ }
2677
+ ]
2678
+ },
2679
+ "language": {
2680
+ "delta_indices": [
2681
+ 0
2682
+ ],
2683
+ "modality_keys": [
2684
+ "annotation.human.action.task_description"
2685
+ ],
2686
+ "sin_cos_embedding_keys": null,
2687
+ "mean_std_embedding_keys": null,
2688
+ "action_configs": null
2689
+ }
2690
+ },
2691
+ "roboturk": {
2692
+ "video": {
2693
+ "delta_indices": [
2694
+ -6,
2695
+ -4,
2696
+ -2,
2697
+ 0
2698
+ ],
2699
+ "modality_keys": [
2700
+ "primary"
2701
+ ],
2702
+ "sin_cos_embedding_keys": null,
2703
+ "mean_std_embedding_keys": null,
2704
+ "action_configs": null
2705
+ },
2706
+ "state": {
2707
+ "delta_indices": [
2708
+ 0
2709
+ ],
2710
+ "modality_keys": [
2711
+ "none"
2712
+ ],
2713
+ "sin_cos_embedding_keys": null,
2714
+ "mean_std_embedding_keys": null,
2715
+ "action_configs": null
2716
+ },
2717
+ "action": {
2718
+ "delta_indices": [
2719
+ 0,
2720
+ 1,
2721
+ 2,
2722
+ 3,
2723
+ 4,
2724
+ 5,
2725
+ 6,
2726
+ 7,
2727
+ 8,
2728
+ 9,
2729
+ 10,
2730
+ 11,
2731
+ 12,
2732
+ 13,
2733
+ 14,
2734
+ 15
2735
+ ],
2736
+ "modality_keys": [
2737
+ "end_effector_position",
2738
+ "end_effector_rotation",
2739
+ "gripper_close"
2740
+ ],
2741
+ "sin_cos_embedding_keys": null,
2742
+ "mean_std_embedding_keys": null,
2743
+ "action_configs": [
2744
+ {
2745
+ "rep": "DELTA",
2746
+ "type": "EEF",
2747
+ "format": "DEFAULT",
2748
+ "state_key": null
2749
+ },
2750
+ {
2751
+ "rep": "DELTA",
2752
+ "type": "EEF",
2753
+ "format": "DEFAULT",
2754
+ "state_key": null
2755
+ },
2756
+ {
2757
+ "rep": "ABSOLUTE",
2758
+ "type": "NON_EEF",
2759
+ "format": "DEFAULT",
2760
+ "state_key": null
2761
+ }
2762
+ ]
2763
+ },
2764
+ "language": {
2765
+ "delta_indices": [
2766
+ 0
2767
+ ],
2768
+ "modality_keys": [
2769
+ "annotation.human.action.task_description"
2770
+ ],
2771
+ "sin_cos_embedding_keys": null,
2772
+ "mean_std_embedding_keys": null,
2773
+ "action_configs": null
2774
+ }
2775
+ },
2776
+ "berkeley_fanuc_manipulation": {
2777
+ "video": {
2778
+ "delta_indices": [
2779
+ -6,
2780
+ -4,
2781
+ -2,
2782
+ 0
2783
+ ],
2784
+ "modality_keys": [
2785
+ "primary",
2786
+ "wrist"
2787
+ ],
2788
+ "sin_cos_embedding_keys": null,
2789
+ "mean_std_embedding_keys": null,
2790
+ "action_configs": null
2791
+ },
2792
+ "state": {
2793
+ "delta_indices": [
2794
+ 0
2795
+ ],
2796
+ "modality_keys": [
2797
+ "joint_position",
2798
+ "gripper_position"
2799
+ ],
2800
+ "sin_cos_embedding_keys": null,
2801
+ "mean_std_embedding_keys": null,
2802
+ "action_configs": null
2803
+ },
2804
+ "action": {
2805
+ "delta_indices": [
2806
+ 0,
2807
+ 1,
2808
+ 2,
2809
+ 3,
2810
+ 4,
2811
+ 5,
2812
+ 6,
2813
+ 7,
2814
+ 8,
2815
+ 9,
2816
+ 10,
2817
+ 11,
2818
+ 12,
2819
+ 13,
2820
+ 14,
2821
+ 15
2822
+ ],
2823
+ "modality_keys": [
2824
+ "end_effector_position",
2825
+ "end_effector_rotation",
2826
+ "gripper_close"
2827
+ ],
2828
+ "sin_cos_embedding_keys": null,
2829
+ "mean_std_embedding_keys": null,
2830
+ "action_configs": [
2831
+ {
2832
+ "rep": "DELTA",
2833
+ "type": "EEF",
2834
+ "format": "DEFAULT",
2835
+ "state_key": null
2836
+ },
2837
+ {
2838
+ "rep": "DELTA",
2839
+ "type": "EEF",
2840
+ "format": "DEFAULT",
2841
+ "state_key": null
2842
+ },
2843
+ {
2844
+ "rep": "ABSOLUTE",
2845
+ "type": "NON_EEF",
2846
+ "format": "DEFAULT",
2847
+ "state_key": null
2848
+ }
2849
+ ]
2850
+ },
2851
+ "language": {
2852
+ "delta_indices": [
2853
+ 0
2854
+ ],
2855
+ "modality_keys": [
2856
+ "annotation.human.action.task_description"
2857
+ ],
2858
+ "sin_cos_embedding_keys": null,
2859
+ "mean_std_embedding_keys": null,
2860
+ "action_configs": null
2861
+ }
2862
+ },
2863
+ "jaco_play": {
2864
+ "video": {
2865
+ "delta_indices": [
2866
+ -6,
2867
+ -4,
2868
+ -2,
2869
+ 0
2870
+ ],
2871
+ "modality_keys": [
2872
+ "primary",
2873
+ "wrist"
2874
+ ],
2875
+ "sin_cos_embedding_keys": null,
2876
+ "mean_std_embedding_keys": null,
2877
+ "action_configs": null
2878
+ },
2879
+ "state": {
2880
+ "delta_indices": [
2881
+ 0
2882
+ ],
2883
+ "modality_keys": [
2884
+ "end_effector_position",
2885
+ "end_effector_rotation",
2886
+ "gripper_position"
2887
+ ],
2888
+ "sin_cos_embedding_keys": null,
2889
+ "mean_std_embedding_keys": null,
2890
+ "action_configs": null
2891
+ },
2892
+ "action": {
2893
+ "delta_indices": [
2894
+ 0,
2895
+ 1,
2896
+ 2,
2897
+ 3,
2898
+ 4,
2899
+ 5,
2900
+ 6,
2901
+ 7,
2902
+ 8,
2903
+ 9,
2904
+ 10,
2905
+ 11,
2906
+ 12,
2907
+ 13,
2908
+ 14,
2909
+ 15
2910
+ ],
2911
+ "modality_keys": [
2912
+ "end_effector_position",
2913
+ "end_effector_rotation",
2914
+ "gripper_close"
2915
+ ],
2916
+ "sin_cos_embedding_keys": null,
2917
+ "mean_std_embedding_keys": null,
2918
+ "action_configs": [
2919
+ {
2920
+ "rep": "DELTA",
2921
+ "type": "EEF",
2922
+ "format": "DEFAULT",
2923
+ "state_key": null
2924
+ },
2925
+ {
2926
+ "rep": "DELTA",
2927
+ "type": "EEF",
2928
+ "format": "DEFAULT",
2929
+ "state_key": null
2930
+ },
2931
+ {
2932
+ "rep": "ABSOLUTE",
2933
+ "type": "NON_EEF",
2934
+ "format": "DEFAULT",
2935
+ "state_key": null
2936
+ }
2937
+ ]
2938
+ },
2939
+ "language": {
2940
+ "delta_indices": [
2941
+ 0
2942
+ ],
2943
+ "modality_keys": [
2944
+ "annotation.human.action.task_description"
2945
+ ],
2946
+ "sin_cos_embedding_keys": null,
2947
+ "mean_std_embedding_keys": null,
2948
+ "action_configs": null
2949
+ }
2950
+ }
2951
+ },
2952
+ "random_rotation_angle": null,
2953
+ "color_jitter_params": {
2954
+ "brightness": 0.3,
2955
+ "contrast": 0.4,
2956
+ "saturation": 0.5,
2957
+ "hue": 0.08
2958
+ },
2959
+ "model_name": "RLWRLD/RLDX-1-VLM",
2960
+ "model_type": "vtc_qwen3_vl",
2961
+ "formalize_language": true,
2962
+ "max_state_dim": 64,
2963
+ "max_action_dim": 64,
2964
+ "max_action_horizon": 16,
2965
+ "use_percentiles": true,
2966
+ "clip_outliers": true,
2967
+ "apply_sincos_state_encoding": false,
2968
+ "use_relative_action": true,
2969
+ "memory_length": 1,
2970
+ "general_embodiment_train_ratio": 0,
2971
+ "random_crop_fraction": 1.0,
2972
+ "image_max_area": null,
2973
+ "image_resize_m": 32
2974
+ }
2975
+ }
statistics.json ADDED
The diff for this file is too large to render. See raw diff
 
teaser.png ADDED

Git LFS Details

  • SHA256: 6b34b11f6c8e2699766e26aa210be9e4b3e5f3f8f45ed009ae5c7ef07c7c7cd7
  • Pointer size: 133 Bytes
  • Size of remote file: 10.4 MB