xiwenyoumu committed on
Commit
5e9a603
·
verified ·
1 Parent(s): b1bd585

Initial Fast-dDrive 3B release

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,92 @@
1
  ---
2
  license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
+ library_name: transformers
4
+ pipeline_tag: image-text-to-text
5
+ tags:
6
+ - block-diffusion
7
+ - vision-language-action
8
+ - autonomous-driving
9
+ - qwen2.5-vl
10
  ---
11
+
12
+ # Fast-dDrive
13
+
14
+ Fast-dDrive is a block-diffusion Vision-Language-Action (VLA) model for
15
+ end-to-end autonomous driving, built on Qwen2.5-VL-3B. It pairs section-aware
16
+ structured-diffusion training (SASD) with scaffold-aware speculative decoding
17
+ (Scaffold Spec) and an optional shared-prefix multi-trajectory inference
18
+ scaling scheme, and reaches SOTA accuracy on the Waymo Open Dataset
19
+ End-to-End Driving (WOD-E2E) benchmark at over 200 tokens / second on a
20
+ single H100.
21
+
22
+ ## Quick start
23
+
24
+ ```python
25
+ import torch
26
+ from transformers import AutoModelForCausalLM, AutoProcessor
27
+
28
+ MODEL = "Efficient-Large-Model/Fast_dDrive_3B" # or your local clone
29
+
30
+ processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
31
+ model = AutoModelForCausalLM.from_pretrained(
32
+ MODEL,
33
+ trust_remote_code=True,
34
+ dtype=torch.bfloat16,
35
+ ).cuda().eval()
36
+
37
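+ # NOTE: `input_ids`, `attention_mask`, `pixel_values` and `image_grid_thw` are expected to be
+ # the tensors produced by `processor(...)` for your prompt and camera images, moved to the GPU.
+ # Scaffold Spec (paper canonical, threshold = 0.0)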
38
+ output_ids = model.scaffold_speculative_sample(
39
+ input_ids=input_ids,
40
+ attention_mask=attention_mask,
41
+ pixel_values=pixel_values,
42
+ image_grid_thw=image_grid_thw,
43
+ confidence_threshold=0.0,
44
+ block_size=32,
45
+ max_new_tokens=512,
46
+ )
47
+ ```
48
+
49
+ ## Inference paths
50
+
51
+ This release exposes three decoding paths as bound methods on the model:
52
+
53
+ | Method | Description | Threshold |
54
+ |---|---|---|
55
+ | `mdm_sample_deep_scaffold` | **Section Diffusion (SD)** — iterative MDM denoising over a pre-filled JSON scaffold | `0.9` |
56
+ | `scaffold_speculative_sample` | **Scaffold Spec (SS)** — scaffold-aware self-speculative decoding (MDM draft + AR verify per block). Paper canonical. | `0.0` |
57
+ | `scaffold_spec_with_ss_multi_traj` | **SS multi-rollout** — shared-prefix N-rollout inference scaling on the trajectory section | `0.0` |
58
+
59
+ > **Important:** `scaffold_speculative_sample` and its multi-traj variant must
60
+ > be run with `confidence_threshold=0.0` to reproduce the paper numbers.
61
+ > Running at `0.9` silently degrades both ADE and throughput.
62
+
63
+ ## Headline results — WOD-E2E test set (single H100)
64
+
65
+ | Mode | RFS ↑ | ADE@3s ↓ | ADE@5s ↓ | TPS ↑ | Tok/Step ↑ |
66
+ |---|---|---|---|---|---|
67
+ | Scaffold Spec | 7.823 | 1.254 | 2.907 | 210.4 | 4.90 |
68
+ | + Inference scaling (N=4) | 7.827 | 1.240 | 2.821 | 114.7 | 2.76 |
69
+
70
+ On the WOD-E2E val set, Scaffold Spec runs at 1919 ms / sample (4.1× over the
71
+ AR baseline); fused with SGLang the same configuration drops to 665 ms /
72
+ sample at 608.5 TPS — the 11.8× / 12× speedup over AR cited in the paper.
73
+
74
+ ## Files
75
+
76
+ - `modeling.py` — model definition (`Fast_dDriveForConditionalGeneration`)
77
+ - `configuration.py` — config classes
78
+ - `section_utils.py` — scaffold construction + section-aligned block index utilities
79
+ - `generation_utils.py` — the three inference paths, attached to the model class on import
80
+ - `config.json`, `generation_config.json`, `preprocessor_config.json`, `chat_template.jinja`, tokenizer files — standard HF artifacts
81
+ - `model-0000{1..4}-of-00004.safetensors` — model weights (4 shards)
82
+
83
+ ## Citation
84
+
85
+ ```bibtex
86
+ @misc{fastddrive2026,
87
+ title = {Fast-dDrive: Section-Aware Diffusion VLAs for End-to-End Driving},
88
+ author = {Anonymous},
89
+ year = {2026},
90
+ note = {Submitted to NeurIPS 2026},
91
+ }
92
+ ```
__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fast-dDrive HF release package.
2
+
3
+ Mirrors the layout of ``Efficient-Large-Model/Fast_dVLM_3B`` on the Hugging Face
4
+ Hub: a single :mod:`modeling` module that holds the model definition plus
5
+ inference-time decoding paths (Section Diffusion, Scaffold Spec, and
6
+ Scaffold Spec with multi-trajectory rollouts), and a :mod:`configuration`
7
+ module with the config classes.
8
+
9
+ Users normally load the model via::
10
+
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+ model = AutoModelForCausalLM.from_pretrained(
13
+ "Efficient-Large-Model/Fast_dDrive_3B", # or local path
14
+ trust_remote_code=True,
15
+ )
16
+
17
+ with the ``auto_map`` entry in :file:`config.json` pointing back to the classes
18
+ defined here.
19
+ """
20
+
21
+ from .configuration import (
22
+ Fast_dDriveConfig,
23
+ Fast_dDriveTextConfig,
24
+ Fast_dDriveVisionConfig,
25
+ )
26
+ from .modeling import Fast_dDriveForConditionalGeneration
27
+
28
+ __all__ = [
29
+ "Fast_dDriveConfig",
30
+ "Fast_dDriveTextConfig",
31
+ "Fast_dDriveVisionConfig",
32
+ "Fast_dDriveForConditionalGeneration",
33
+ ]
added_tokens.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|NULL|>": 151666,
5
+ "<|box_end|>": 151649,
6
+ "<|box_start|>": 151648,
7
+ "<|endoftext|>": 151643,
8
+ "<|file_sep|>": 151664,
9
+ "<|fim_middle|>": 151660,
10
+ "<|fim_pad|>": 151662,
11
+ "<|fim_prefix|>": 151659,
12
+ "<|fim_suffix|>": 151661,
13
+ "<|im_end|>": 151645,
14
+ "<|im_start|>": 151644,
15
+ "<|image_pad|>": 151655,
16
+ "<|object_ref_end|>": 151647,
17
+ "<|object_ref_start|>": 151646,
18
+ "<|quad_end|>": 151651,
19
+ "<|quad_start|>": 151650,
20
+ "<|repo_name|>": 151663,
21
+ "<|video_pad|>": 151656,
22
+ "<|vision_end|>": 151653,
23
+ "<|vision_pad|>": 151654,
24
+ "<|vision_start|>": 151652,
25
+ "|<MASK>|": 151665
26
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "always_mask_im_end": true,
3
+ "anneal_block_size": false,
4
+ "architectures": [
5
+ "Fast_dDriveForConditionalGeneration"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bd_size": 32,
9
+ "block_causal_no_dynamic": false,
10
+ "complementary_mask": true,
11
+ "deep_json_scaffold": true,
12
+ "dtype": "float32",
13
+ "enable_efficient_vision_embed": false,
14
+ "entropy_loss": false,
15
+ "entropy_loss_weight": 1.0,
16
+ "eos_token_id": 151645,
17
+ "flexible_bd_size": false,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 2048,
20
+ "image_token_id": 151655,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 11008,
23
+ "max_position_embeddings": 128000,
24
+ "max_window_layers": 70,
25
+ "minimum_noise_level": 0.001,
26
+ "model_type": "fast_d_drive",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 2,
30
+ "pad_token_id": 151643,
31
+ "rms_norm_eps": 1e-06,
32
+ "rope_scaling": {
33
+ "mrope_section": [
34
+ 16,
35
+ 24,
36
+ 24
37
+ ],
38
+ "rope_type": "default",
39
+ "type": "default"
40
+ },
41
+ "rope_theta": 1000000.0,
42
+ "section_block_steps": null,
43
+ "section_loss_weights": {
44
+ "critical_objects": 1.5,
45
+ "explanation": 1.0,
46
+ "future_meta_behavior": 2.0,
47
+ "trajectory": 3.0
48
+ },
49
+ "section_noise_schedule": {
50
+ "critical_objects": "1.0,2.0",
51
+ "explanation": "1.0,1.0",
52
+ "future_meta_behavior": "1.0,1.5",
53
+ "trajectory": "2.0,1.0"
54
+ },
55
+ "section_token_budgets": null,
56
+ "sliding_window": 32768,
57
+ "static_json_scaffold": false,
58
+ "text_config": {
59
+ "always_mask_im_end": true,
60
+ "anneal_block_size": false,
61
+ "architectures": [
62
+ "Fast_dDriveForConditionalGeneration"
63
+ ],
64
+ "attention_dropout": 0.0,
65
+ "auto_map": {
66
+ "AutoConfig": "configuration.Fast_dDriveConfig",
67
+ "AutoModel": "modeling.Fast_dDriveForConditionalGeneration",
68
+ "AutoModelForCausalLM": "modeling.Fast_dDriveForConditionalGeneration"
69
+ },
70
+ "bd_size": 32,
71
+ "block_causal_no_dynamic": false,
72
+ "block_length": null,
73
+ "bos_token_id": 151643,
74
+ "complementary_mask": true,
75
+ "deep_json_scaffold": true,
76
+ "dtype": "float32",
77
+ "enable_efficient_vision_embed": false,
78
+ "entropy_loss": false,
79
+ "entropy_loss_weight": 1.0,
80
+ "eos_token_id": 151645,
81
+ "flexible_bd_size": false,
82
+ "hidden_act": "silu",
83
+ "hidden_size": 2048,
84
+ "image_token_id": null,
85
+ "initializer_range": 0.02,
86
+ "intermediate_size": 11008,
87
+ "layer_types": [
88
+ "full_attention",
89
+ "full_attention",
90
+ "full_attention",
91
+ "full_attention",
92
+ "full_attention",
93
+ "full_attention",
94
+ "full_attention",
95
+ "full_attention",
96
+ "full_attention",
97
+ "full_attention",
98
+ "full_attention",
99
+ "full_attention",
100
+ "full_attention",
101
+ "full_attention",
102
+ "full_attention",
103
+ "full_attention",
104
+ "full_attention",
105
+ "full_attention",
106
+ "full_attention",
107
+ "full_attention",
108
+ "full_attention",
109
+ "full_attention",
110
+ "full_attention",
111
+ "full_attention",
112
+ "full_attention",
113
+ "full_attention",
114
+ "full_attention",
115
+ "full_attention",
116
+ "full_attention",
117
+ "full_attention",
118
+ "full_attention",
119
+ "full_attention",
120
+ "full_attention",
121
+ "full_attention",
122
+ "full_attention",
123
+ "full_attention"
124
+ ],
125
+ "max_position_embeddings": 128000,
126
+ "max_window_layers": 70,
127
+ "minimum_noise_level": 0.001,
128
+ "model_type": "fast_d_drive_for_causal_lm",
129
+ "num_attention_heads": 16,
130
+ "num_hidden_layers": 36,
131
+ "num_key_value_heads": 2,
132
+ "rms_norm_eps": 1e-06,
133
+ "rope_scaling": {
134
+ "mrope_section": [
135
+ 16,
136
+ 24,
137
+ 24
138
+ ],
139
+ "rope_type": "default",
140
+ "type": "default"
141
+ },
142
+ "rope_theta": 1000000.0,
143
+ "section_block_steps": null,
144
+ "section_token_budgets": null,
145
+ "self_spec_inference_mode": null,
146
+ "sliding_window": null,
147
+ "static_json_scaffold": false,
148
+ "tie_word_embeddings": true,
149
+ "use_block_causal_mask": true,
150
+ "use_cache": true,
151
+ "use_json_scaffold": true,
152
+ "use_sliding_window": false,
153
+ "video_token_id": null,
154
+ "vision_end_token_id": 151653,
155
+ "vision_start_token_id": 151652,
156
+ "vision_token_id": 151654,
157
+ "vocab_size": 151936
158
+ },
159
+ "transformers_version": "4.57.1",
160
+ "use_block_causal_mask": true,
161
+ "use_cache": true,
162
+ "use_json_scaffold": true,
163
+ "use_sliding_window": false,
164
+ "video_token_id": 151656,
165
+ "vision_config": {
166
+ "depth": 32,
167
+ "dtype": "float32",
168
+ "fullatt_block_indexes": [
169
+ 7,
170
+ 15,
171
+ 23,
172
+ 31
173
+ ],
174
+ "hidden_act": "silu",
175
+ "hidden_size": 1280,
176
+ "in_channels": 3,
177
+ "in_chans": 3,
178
+ "initializer_range": 0.02,
179
+ "intermediate_size": 3420,
180
+ "model_type": "fast_d_drive",
181
+ "num_heads": 16,
182
+ "out_hidden_size": 2048,
183
+ "patch_size": 14,
184
+ "spatial_merge_size": 2,
185
+ "spatial_patch_size": 14,
186
+ "temporal_patch_size": 2,
187
+ "tokens_per_second": 2,
188
+ "window_size": 112
189
+ },
190
+ "vision_end_token_id": 151653,
191
+ "vision_start_token_id": 151652,
192
+ "vision_token_id": 151654,
193
+ "vocab_size": 151936,
194
+ "auto_map": {
195
+ "AutoConfig": "configuration.Fast_dDriveConfig",
196
+ "AutoModel": "modeling.Fast_dDriveForConditionalGeneration",
197
+ "AutoModelForCausalLM": "modeling.Fast_dDriveForConditionalGeneration"
198
+ }
199
+ }
configuration.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # limitations under the License.
3
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
4
+ from transformers.modeling_rope_utils import rope_config_validation
5
+
6
+
7
class Fast_dDriveVisionConfig(PretrainedConfig):
    """Configuration container for the Fast-dDrive vision encoder.

    Every explicit constructor argument is stored verbatim as an attribute of
    the same name; any additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Args:
        fullatt_block_indexes: List of block indexes stored on the config.
            ``None`` (the default) selects ``[7, 15, 23, 31]``.  A new list is
            created for every instance so that default-constructed configs
            never share (and accidentally mutate) the same list object.
        **kwargs: Passed through to ``PretrainedConfig``.
    """

    model_type = "fast_d_drive"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=None,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Avoid the shared-mutable-default pitfall: build the default list per call.
        if fullatt_block_indexes is None:
            fullatt_block_indexes = [7, 15, 23, 31]

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.out_hidden_size = out_hidden_size
        self.initializer_range = initializer_range
45
+
46
+
47
class Fast_dDriveTextConfig(PretrainedConfig):
    """Configuration container for the Fast-dDrive text (language-model) backbone.

    Explicit constructor arguments are stored as attributes of the same name.
    Keyword arguments that are not explicit parameters are forwarded, together
    with ``tie_word_embeddings``, to ``PretrainedConfig.__init__``.

    Derived values:
        * ``sliding_window`` is kept only when ``use_sliding_window`` is truthy,
          otherwise it is stored as ``None``.
        * ``num_key_value_heads`` falls back to ``num_attention_heads`` when ``None``.
        * ``layer_types`` is generated per layer when not supplied.
        * ``rope_scaling`` is normalised IN PLACE (the caller's dict is modified):
          a legacy ``"type"`` entry is copied to ``"rope_type"`` and the value
          ``"mrope"`` is rewritten to ``"default"``.
    """

    model_type = "fast_d_drive_for_causal_lm"
    base_config_key = "text_config"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        layer_types=None,
        attention_dropout=0.0,
        rope_scaling=None,
        image_token_id=None,
        video_token_id=None,
        bd_size=8,
        self_spec_inference_mode=None,
        block_length=None,
        use_block_causal_mask=False,
        complementary_mask=True,
        minimum_noise_level=1e-3,
        entropy_loss=False,
        entropy_loss_weight=1.0,
        block_causal_no_dynamic=False,
        **kwargs,
    ):
        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # The window size is only meaningful when sliding-window attention is on.
        self.use_sliding_window = use_sliding_window
        if use_sliding_window:
            self.sliding_window = sliding_window
        else:
            self.sliding_window = None
        self.max_window_layers = max_window_layers

        # Backward compatibility: a missing KV-head count means "same as attention heads".
        self.num_key_value_heads = (
            num_attention_heads if num_key_value_heads is None else num_key_value_heads
        )
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # Fast-dDrive specific options, stored verbatim.
        self.bd_size = bd_size
        self.layer_types = layer_types
        self.use_block_causal_mask = use_block_causal_mask
        self.complementary_mask = complementary_mask
        self.minimum_noise_level = minimum_noise_level
        self.entropy_loss = entropy_loss
        self.entropy_loss_weight = entropy_loss_weight
        self.block_causal_no_dynamic = block_causal_no_dynamic
        self.self_spec_inference_mode = self_spec_inference_mode
        self.block_length = block_length

        if self.layer_types is None:
            # Layers at or beyond ``max_window_layers`` use sliding attention,
            # but only when a sliding window is actually configured.
            windowed = self.sliding_window is not None
            self.layer_types = [
                "sliding_attention"
                if windowed and layer_idx >= self.max_window_layers
                else "full_attention"
                for layer_idx in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        # Normalise legacy rotary-embedding settings before validating them:
        # a "type" key is mirrored into "rope_type", and "mrope" is treated as
        # "default" because it performs the default RoPE computation.
        scaling = self.rope_scaling
        if scaling is not None and "type" in scaling:
            if scaling["type"] == "mrope":
                scaling["type"] = "default"
            scaling["rope_type"] = scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
155
+
156
+
157
class Fast_dDriveConfig(PretrainedConfig):
    """Top-level Fast-dDrive configuration bundling the vision and text sub-configs.

    Args:
        text_config: ``dict`` of text-config arguments, an already-built config
            object, or ``None``.  With ``None`` the text config is built from
            ``**kwargs`` (backward-compatible flat layout).
        vision_config: ``dict`` of vision-config arguments, an already-built
            config object, or ``None`` (use the vision-config defaults).
        image_token_id, video_token_id, enable_efficient_vision_embed,
        always_mask_im_end, flexible_bd_size, anneal_block_size: Stored
            verbatim as attributes of the same name.
        **kwargs: Forwarded to ``PretrainedConfig.__init__`` (and, when
            ``text_config`` is ``None``, also used to build the text config).
    """

    model_type = "fast_d_drive"
    sub_configs = {"vision_config": Fast_dDriveVisionConfig, "text_config": Fast_dDriveTextConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151655,
        video_token_id=151656,
        enable_efficient_vision_embed=False,
        always_mask_im_end=False,
        flexible_bd_size=False,
        anneal_block_size=False,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-instantiated config object was supplied.  Previously this
            # case left ``self.vision_config`` unset, failing later with AttributeError.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # For BC use all kwargs to init `TextConfig`
            self.text_config = self.sub_configs["text_config"](**kwargs)
        else:
            # Same pass-through for a pre-built text config object.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.enable_efficient_vision_embed = enable_efficient_vision_embed
        self.always_mask_im_end = always_mask_im_end
        self.flexible_bd_size = flexible_bd_size
        self.anneal_block_size = anneal_block_size

        super().__init__(**kwargs)
194
+
195
+ # def to_dict(self):
196
+ # output = super().to_dict()
197
+ # output.pop("auto_map", None)
198
+ # return output
199
+
200
+
201
# Public API of this module.  The vision config is listed as well so the export
# list matches the three config classes defined in this file.
__all__ = ["Fast_dDriveConfig", "Fast_dDriveTextConfig", "Fast_dDriveVisionConfig"]
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.05,
9
+ "temperature": 1e-06,
10
+ "transformers_version": "4.57.1"
11
+ }
generation_utils.py ADDED
@@ -0,0 +1,1192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generation utilities for Fast-dDrive.
2
+
3
+ This module provides the three inference paths exposed by the canonical paper
4
+ release:
5
+
6
+ * ``mdm_sample_deep_scaffold`` — Section Diffusion (SD): iterative MDM
7
+ denoising over a pre-filled JSON scaffold, no AR verification.
8
+ * ``scaffold_speculative_sample`` — Scaffold Spec (SS): scaffold-aware
9
+ self-speculative decoding (MDM draft + AR verify per block).
10
+ * ``scaffold_spec_with_ss_multi_traj`` — SS with shared-prefix multi-trajectory
11
+ rollouts (the test-time inference-scaling path).
12
+
13
+ All three are attached as bound methods on
14
+ :class:`Fast_dDriveForConditionalGeneration` when this module is
15
+ imported (see ``modeling.py`` for the import hook).
16
+ """
17
+
18
+ import os
19
+ import re
20
+ import sys
21
+ import math
22
+ import torch
23
+ import types
24
+ import numpy as np
25
+ from transformers.cache_utils import DynamicCache
26
+
27
+
28
def _crop_cache(past_key_values, max_length: int):
    """Build a new ``DynamicCache`` truncated to the first ``max_length`` tokens.

    Every tensor stored for every layer of ``past_key_values`` is sliced as
    ``[:, :, :max_length, :]`` (i.e. along its third dimension); the input
    cache object itself is not modified.

    Args:
        past_key_values: Cache supporting ``len()`` and per-layer indexing,
            where each layer entry is an indexable group of tensors.
        max_length: Number of leading positions to keep.

    Returns:
        A ``DynamicCache`` constructed from the cropped per-layer tuples.
    """
    cropped_layers = []
    for layer_idx in range(len(past_key_values)):
        layer_entry = past_key_values[layer_idx]
        cropped_layers.append(
            tuple(
                layer_entry[slot][:, :, :max_length, :]
                for slot in range(len(layer_entry))
            )
        )
    return DynamicCache(cropped_layers)
37
+
38
+
39
def _sample_from_logits(logits, temperature=0.0):
    """Pick token ids from ``logits`` along the last dimension.

    Args:
        logits: Tensor whose last dimension indexes the candidate tokens.
        temperature: Softmax temperature.  Values ``<= 0`` select greedy
            (argmax) decoding instead of sampling.

    Returns:
        Tensor of token ids shaped like ``logits`` without its last dimension.
    """
    if temperature <= 0:
        # Greedy decoding.
        return logits.argmax(dim=-1)

    probs = torch.softmax(logits / temperature, dim=-1)
    leading_shape = probs.shape[:-1]
    # torch.multinomial expects a 1-D or 2-D input, so collapse the leading
    # dimensions, draw one id per row, then restore the original layout.
    draws = torch.multinomial(probs.reshape(-1, probs.shape[-1]), num_samples=1)
    return draws.squeeze(-1).reshape(leading_shape)
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # mdm_sample_deep_scaffold — Section Diffusion (SD)
56
+ # ---------------------------------------------------------------------------
57
+
58
+ def mdm_sample_deep_scaffold(
59
+ self,
60
+ input_ids,
61
+ tokenizer,
62
+ max_tokens=512,
63
+ pixel_values=None,
64
+ image_grid_thw=None,
65
+ mask_id=151665,
66
+ null_id=151666,
67
+ threshold=0.9,
68
+ stop_token=151645,
69
+ explanation_block_size=32,
70
+ explanation_max_blocks=6,
71
+ block_size=32,
72
+ return_stats=False,
73
+ use_kv_cache=True,
74
+ temperature=0.0,
75
+ ):
76
+ """
77
+ Deep scaffold MDM generation with train-consistent hybrid block causal mask.
78
+
79
+ Pre-fills the entire JSON scaffold (including sub-keys for critical_objects,
80
+ future_meta_behavior, trajectory) with MASK tokens at value positions only.
81
+ Then denoises each section's value tokens via iterative unmasking.
82
+
83
+ The attention mask matches training: prompt tokens use causal attention,
84
+ response tokens use block-causal attention where each section's denoise
85
+ steps form separate blocks. Block i can see all prompt tokens and blocks
86
+ 0..i, but NOT blocks i+1..N (which still contain MASK tokens).
87
+
88
+ For explanation (variable length), NULL tokens in the output signal that
89
+ the section content is complete — trailing NULLs are stripped.
90
+
91
+ KV-cache path (``use_kv_cache=True``, default):
92
+ Prompt K/V is computed once with vision embedding scatter, then each
93
+ response block, once fully denoised, gets its K/V appended to the
94
+ cache. Subsequent blocks' iterative unmasking only forwards their
95
+ own ~block_size tokens against the cache (plus prior committed
96
+ blocks), avoiding O(seqlen^2) recomputation of the prompt + prior
97
+ blocks every iteration. Correctness is preserved because
98
+ block-causal attention means block k only attends to prompt +
99
+ blocks 0..k, which is exactly what the cache provides.
100
+ """
101
+ import math
102
+ import os as _os
103
+ from .section_utils import (
104
+ build_deep_json_scaffold,
105
+ SECTION_KEYS,
106
+ NULL_TOKEN_ID,
107
+ )
108
+
109
+ # Env override for A/B testing the KV cache path without editing code.
110
+ _kv_env = _os.environ.get("MDM_DS_USE_KV_CACHE")
111
+ if _kv_env is not None:
112
+ use_kv_cache = _kv_env not in ("0", "false", "False", "")
113
+
114
+ scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
115
+ tokenizer,
116
+ mask_id=mask_id,
117
+ null_id=null_id,
118
+ explanation_block_size=explanation_block_size,
119
+ explanation_max_blocks=explanation_max_blocks,
120
+ )
121
+
122
+ tokens_per_step = []
123
+ original_input_length = input_ids.shape[1]
124
+
125
+ # Phase 1: Build sequence with scaffold appended
126
+ scaffold_tensor = torch.tensor(scaffold_tokens, device=self.device, dtype=torch.long).unsqueeze(0)
127
+ x_t = torch.cat([input_ids, scaffold_tensor], dim=1)
128
+ seqlen = x_t.shape[1]
129
+
130
+ # Track scaffold (frozen) vs value (to denoise) positions in scaffold region
131
+ scaffold_frozen = torch.tensor(scaffold_mask_list, device=self.device, dtype=torch.bool)
132
+
133
+ # ── Build response_block_idx matching training's compute_section_block_idx_deep_static ──
134
+ response_block_idx = torch.full((seqlen,), -1, device=self.device, dtype=torch.long)
135
+ current_block = 0
136
+ assigned = set()
137
+
138
+ for section_name in SECTION_KEYS:
139
+ if section_name not in section_ranges:
140
+ continue
141
+ sec_start, sec_end = section_ranges[section_name]
142
+
143
+ # Find value positions (non-scaffold) in this section
144
+ value_positions = []
145
+ for i in range(sec_start, sec_end):
146
+ if not scaffold_mask_list[i]: # 0 = value token
147
+ value_positions.append(original_input_length + i)
148
+
149
+ if not value_positions:
150
+ current_block += 1
151
+ continue
152
+
153
+ # Block assignment MUST match training's
154
+ # compute_section_block_idx_deep_static: n_blocks = ceil(value/block_size)
155
+ # for every section. Previously non-explanation sections were forced to
156
+ # a single block; that broke attention alignment for trajectory
157
+ # (70 value tokens → training 3 blocks vs inference 1 block), causing
158
+ # trajectory over-extrapolation. CO (12) and FMB (6) still resolve
159
+ # to 1 block since their value counts are < block_size.
160
+ tokens_per_step_sec = block_size
161
+ n_steps = max(1, math.ceil(len(value_positions) / tokens_per_step_sec))
162
+
163
+ # Assign block indices to value tokens
164
+ for vi, abs_pos in enumerate(value_positions):
165
+ block_in_section = min(vi // tokens_per_step_sec, n_steps - 1)
166
+ response_block_idx[abs_pos] = current_block + block_in_section
167
+ assigned.add(abs_pos)
168
+
169
+ # Assign scaffold tokens to nearest value token's block
170
+ for i in range(sec_start, sec_end):
171
+ abs_pos = original_input_length + i
172
+ if scaffold_mask_list[i] and abs_pos not in assigned:
173
+ best_block = -1
174
+ for delta in range(1, sec_end - sec_start + 10):
175
+ for cand in [abs_pos + delta, abs_pos - delta]:
176
+ if cand in assigned:
177
+ best_block = response_block_idx[cand].item()
178
+ break
179
+ if best_block >= 0:
180
+ break
181
+ if best_block >= 0:
182
+ response_block_idx[abs_pos] = best_block
183
+ assigned.add(abs_pos)
184
+
185
+ current_block += n_steps
186
+
187
+ # Assign any remaining unassigned scaffold tokens (e.g. top-level separators)
188
+ for i in range(len(scaffold_tokens)):
189
+ abs_pos = original_input_length + i
190
+ if abs_pos not in assigned:
191
+ # Find nearest assigned position
192
+ best_block = -1
193
+ for delta in range(1, seqlen):
194
+ for cand in [abs_pos + delta, abs_pos - delta]:
195
+ if 0 <= cand < seqlen and cand in assigned:
196
+ best_block = response_block_idx[cand].item()
197
+ break
198
+ if best_block >= 0:
199
+ break
200
+ if best_block >= 0:
201
+ response_block_idx[abs_pos] = best_block
202
+ assigned.add(abs_pos)
203
+
204
+ # ── Build hybrid block causal mask (computed once, reused for all forward passes) ──
205
+ attention_mask = self.model.eval_hybrid_mask(seqlen, response_block_idx).to(self.device)
206
+
207
+ # Section-MoE-LoRA: set section_ids before language model forward
208
+ set_section_ids = lambda *a, **kw: None # noqa: E731 (Section-MoE-LoRA disabled in release)
209
+ # Map block indices to section IDs (0=CO, 1=Exp, 2=FMB, 3=Traj, 4=Other/Prompt)
210
+ _sec_ids = torch.full((seqlen,), 4, device=self.device, dtype=torch.long)
211
+ for section_name, (sec_start, sec_end) in section_ranges.items():
212
+ abs_start = original_input_length + sec_start
213
+ abs_end = original_input_length + sec_end
214
+ if section_name == "critical_objects":
215
+ _sec_ids[abs_start:abs_end] = 0
216
+ elif section_name == "explanation":
217
+ _sec_ids[abs_start:abs_end] = 1
218
+ elif section_name == "future_meta_behavior":
219
+ _sec_ids[abs_start:abs_end] = 2
220
+ elif section_name == "trajectory":
221
+ _sec_ids[abs_start:abs_end] = 3
222
+
223
+ # Add batch dimension
224
+ _sec_ids_batch = _sec_ids.unsqueeze(0)
225
+ set_section_ids(_sec_ids_batch)
226
+
227
+ # ── Precompute vision embeddings and position_ids once ──
228
+ # BUG FIX: Previously pixel_values was only passed on the first forward
229
+ # (step==0) but with use_cache=False every forward is independent, so all
230
+ # subsequent forwards lost vision information entirely.
231
+ _embed_fn = self.model.get_input_embeddings()
232
+ _cached_image_embeds = None
233
+ _cached_image_mask = None
234
+
235
+ if pixel_values is not None:
236
+ _cached_image_embeds = self.model.get_image_features(pixel_values, image_grid_thw)
237
+ _cached_image_embeds = torch.cat(_cached_image_embeds, dim=0).to(
238
+ self.device, _embed_fn.weight.dtype
239
+ )
240
+ _tmp_embeds = _embed_fn(x_t)
241
+ _cached_image_mask, _ = self.model.get_placeholder_mask(
242
+ x_t, inputs_embeds=_tmp_embeds, image_features=_cached_image_embeds
243
+ )
244
+
245
+ # Compute position_ids once with correct image_grid_thw (3D RoPE)
246
+ _position_ids, _rope_deltas = self.model.get_rope_index(
247
+ x_t, image_grid_thw, None
248
+ )
249
+ self.model.rope_deltas = _rope_deltas
250
+
251
+ # ── Compute contiguous block ranges in the response region ──
252
+ # Each block's absolute [start, end) range in x_t is the maximal
253
+ # contiguous span of positions sharing the same response_block_idx.
254
+ # Blocks are ordered by block_idx and cover the entire response.
255
+ _block_ranges = [] # list of (block_idx, abs_start, abs_end)
256
+ _cur_bi = None
257
+ _cur_start = None
258
+ for _p in range(seqlen):
259
+ _bi = int(response_block_idx[_p].item())
260
+ if _bi < 0:
261
+ if _cur_bi is not None:
262
+ _block_ranges.append((_cur_bi, _cur_start, _p))
263
+ _cur_bi, _cur_start = None, None
264
+ continue
265
+ if _cur_bi is None:
266
+ _cur_bi, _cur_start = _bi, _p
267
+ elif _bi != _cur_bi:
268
+ _block_ranges.append((_cur_bi, _cur_start, _p))
269
+ _cur_bi, _cur_start = _bi, _p
270
+ if _cur_bi is not None:
271
+ _block_ranges.append((_cur_bi, _cur_start, seqlen))
272
+
273
+ # Map block_idx -> section_name for downstream logic (section-specific
274
+ # behaviors like explanation NULL handling can still be scoped).
275
+ _block_idx_to_section = {}
276
+ for _sname, (_sstart, _send) in section_ranges.items():
277
+ _sabs_start = original_input_length + _sstart
278
+ _sabs_end = original_input_length + _send
279
+ for _bi, _bs, _be in _block_ranges:
280
+ # Assign section by whether the block's range overlaps the section
281
+ if _bs < _sabs_end and _be > _sabs_start:
282
+ _block_idx_to_section.setdefault(_bi, _sname)
283
+
284
+ # ── Phase 2: Denoise block-by-block with optional KV cache ──
285
+ # Without cache (fallback): each forward replays the entire sequence.
286
+ # With cache: prompt K/V computed once; each block's finalized K/V is
287
+ # appended after denoising, so later blocks only forward their own
288
+ # ~block_size tokens against the cache.
289
+ step = 0
290
+
291
+ past_kv = None
292
+ prev_last_logit = None # logit at the position just before the next block
293
+
294
+ if use_kv_cache:
295
+ # Phase 0: prompt prefill. Includes vision scatter; cache becomes
296
+ # the reusable foundation for every scaffold block.
297
+ prompt_tokens = x_t[:, :original_input_length]
298
+ prompt_embeds = _embed_fn(prompt_tokens)
299
+ if _cached_image_embeds is not None:
300
+ prompt_image_mask = _cached_image_mask[:, :original_input_length]
301
+ prompt_embeds = prompt_embeds.masked_scatter(
302
+ prompt_image_mask, _cached_image_embeds
303
+ )
304
+ prompt_position_ids = _position_ids[..., :original_input_length]
305
+
306
+ # Causal over prompt (matches training's prompt-side attention).
307
+ # When attention_mask=None, the model's eval_mask auto-builds causal
308
+ # because use_block_causal_mask=True and update_kv_cache=True.
309
+ prompt_out = self.forward(
310
+ inputs_embeds=prompt_embeds,
311
+ position_ids=prompt_position_ids,
312
+ attention_mask=None,
313
+ past_key_values=None,
314
+ use_cache=True,
315
+ update_kv_cache=True,
316
+ )
317
+ past_kv = prompt_out.past_key_values
318
+ # Logit at position (original_input_length - 1); used to predict
319
+ # the first token of the first response block via causal shift.
320
+ prev_last_logit = prompt_out.logits[:, -1:, :]
321
+
322
+ # ── Iterate blocks in order ──
323
+ for _block_idx, block_abs_start, block_abs_end in _block_ranges:
324
+ B = block_abs_end - block_abs_start
325
+ section_name = _block_idx_to_section.get(_block_idx, None)
326
+
327
+ # Count MASK tokens in this block
328
+ block_slice = x_t[0, block_abs_start:block_abs_end]
329
+ n_masks_in_block = int((block_slice == mask_id).sum().item())
330
+
331
+ # ── Iterative unmasking within this block (if any MASKs) ──
332
+ if n_masks_in_block > 0:
333
+ max_iter = n_masks_in_block + 5 # safety limit
334
+ for _ in range(max_iter):
335
+ current_block_masks = (x_t[:, block_abs_start:block_abs_end] == mask_id)
336
+ if current_block_masks.sum() == 0:
337
+ break
338
+
339
+ if use_kv_cache:
340
+ # Feed only this block; past_kv covers prompt + prior blocks.
341
+ block_tokens = x_t[:, block_abs_start:block_abs_end]
342
+ block_embeds = _embed_fn(block_tokens)
343
+ block_position_ids = _position_ids[..., block_abs_start:block_abs_end]
344
+ L_cached = past_kv.get_seq_length() if past_kv is not None else 0
345
+ # Block-causal + bidirectional-within-block ⇒ this
346
+ # block's queries attend to all cached KV plus all
347
+ # fresh block KV ⇒ all-True mask of shape [B, L+B].
348
+ block_attn = torch.ones(
349
+ B, L_cached + B, device=self.device, dtype=torch.bool
350
+ )
351
+ output = self.forward(
352
+ inputs_embeds=block_embeds,
353
+ attention_mask=block_attn,
354
+ position_ids=block_position_ids,
355
+ past_key_values=past_kv,
356
+ use_cache=True,
357
+ update_kv_cache=False, # read-only during iteration
358
+ )
359
+ logits = output.logits # [1, B, V]
360
+ # Shift: pred for abs_pos uses logit at abs_pos-1.
361
+ # logit at block_abs_start-1 is prev_last_logit; the
362
+ # rest come from this forward's earlier positions.
363
+ sec_logits = torch.cat([prev_last_logit, logits[:, :-1, :]], dim=1)
364
+ else:
365
+ # Full-sequence forward (fallback path, same as before)
366
+ _cur_embeds = _embed_fn(x_t)
367
+ if _cached_image_embeds is not None:
368
+ _cur_embeds = _cur_embeds.masked_scatter(
369
+ _cached_image_mask, _cached_image_embeds
370
+ )
371
+ output = self.forward(
372
+ input_ids=x_t,
373
+ inputs_embeds=_cur_embeds,
374
+ attention_mask=attention_mask,
375
+ position_ids=_position_ids,
376
+ use_cache=False,
377
+ )
378
+ logits = output.logits
379
+ sec_logits = logits[:, block_abs_start:block_abs_end, :]
380
+ sec_logits = torch.cat(
381
+ [logits[:, block_abs_start - 1:block_abs_start, :],
382
+ sec_logits[:, :-1, :]], dim=1
383
+ )
384
+
385
+ if temperature > 0:
386
+ # Temperature sampling for diverse generation (e.g. GRPO rollouts)
387
+ sampling_probs = torch.softmax(sec_logits / temperature, dim=-1)
388
+ x_1 = torch.multinomial(
389
+ sampling_probs.view(-1, sampling_probs.shape[-1]), num_samples=1
390
+ ).view(sampling_probs.shape[:-1])
391
+ else:
392
+ # Greedy (default, backward compatible)
393
+ x_1 = sec_logits.argmax(dim=-1)
394
+ probs = torch.softmax(sec_logits, dim=-1)
395
+ x1_p = torch.gather(probs, dim=-1, index=x_1.unsqueeze(-1)).squeeze(-1)
396
+
397
+ # Only consider currently-masked positions in this block
398
+ x1_p = torch.where(current_block_masks, x1_p, -torch.inf)
399
+ unmask_idx = (x1_p > threshold)
400
+
401
+ if unmask_idx.sum() > 0:
402
+ x_t[:, block_abs_start:block_abs_end][unmask_idx] = x_1[unmask_idx]
403
+ tokens_per_step.append(int(unmask_idx.sum()))
404
+ else:
405
+ # Fallback: unmask highest-confidence token
406
+ pos = x1_p.argmax()
407
+ row = 0
408
+ col = pos.item()
409
+ x_t[:, block_abs_start:block_abs_end][row, col] = x_1[row, col]
410
+ tokens_per_step.append(1)
411
+
412
+ step += 1
413
+ if step > max_tokens:
414
+ break
415
+
416
+ # ── Commit this block's K/V to the cache ──
417
+ # Run one final forward at block's fully-denoised state with
418
+ # update_kv_cache=True so future blocks can attend to it via cache.
419
+ # prev_last_logit is refreshed to the logit at the last position
420
+ # of this block for the NEXT block's first-position prediction.
421
+ if use_kv_cache:
422
+ block_tokens = x_t[:, block_abs_start:block_abs_end]
423
+ block_embeds = _embed_fn(block_tokens)
424
+ block_position_ids = _position_ids[..., block_abs_start:block_abs_end]
425
+ L_cached = past_kv.get_seq_length() if past_kv is not None else 0
426
+ block_attn = torch.ones(
427
+ B, L_cached + B, device=self.device, dtype=torch.bool
428
+ )
429
+ commit_out = self.forward(
430
+ inputs_embeds=block_embeds,
431
+ attention_mask=block_attn,
432
+ position_ids=block_position_ids,
433
+ past_key_values=past_kv,
434
+ use_cache=True,
435
+ update_kv_cache=True,
436
+ )
437
+ past_kv = commit_out.past_key_values
438
+ prev_last_logit = commit_out.logits[:, -1:, :]
439
+
440
+ # NOTE: a previous null_ratio>0.3 early-stopping heuristic was
441
+ # removed. It computed the ratio globally across the whole
442
+ # explanation and, when tripped, force-filled every remaining
443
+ # MASK with NULL — including MASKs in middle positions that
444
+ # should have held real text — which cut short explanations
445
+ # mid-sentence. Training always produces 192 value tokens
446
+ # (real text + <|NULL|> padding at the tail) and the model
447
+ # learned to emit NULL cleanly at the tail, so the final
448
+ # NULL-strip below is sufficient. Cost: every sample now
449
+ # denoises all 6 explanation blocks.
450
+
451
+ # Post-process: strip NULL tokens from the output
452
+ gen_tokens = x_t[0, original_input_length:].tolist()
453
+ cleaned = [t for t in gen_tokens if t != null_id and t != mask_id]
454
+ x_t = torch.cat([
455
+ input_ids,
456
+ torch.tensor([cleaned], device=self.device, dtype=torch.long)
457
+ ], dim=1)
458
+
459
+ gen_length = x_t.shape[1] - original_input_length
460
+
461
+ if return_stats:
462
+ stats = {
463
+ "tokens_per_step": tokens_per_step,
464
+ "total_steps": step,
465
+ "gen_length": gen_length,
466
+ "null_tokens_stripped": len(gen_tokens) - len(cleaned),
467
+ "block_size": block_size,
468
+ }
469
+ return x_t, stats
470
+ return x_t
471
+
472
+ @torch.no_grad()
473
+
474
+
475
+ # ---------------------------------------------------------------------------
476
+ # scaffold_speculative_sample — Scaffold Spec (SS)
477
+ # ---------------------------------------------------------------------------
478
+
479
def scaffold_speculative_sample(
    self,
    input_ids,
    tokenizer,
    block_size=32,
    max_tokens=1024,
    pixel_values=None,
    image_grid_thw=None,
    mask_id=151665,
    null_id=151666,
    threshold=0.9,
    stop_token=151645,
    explanation_block_size=32,
    explanation_max_blocks=6,
    return_stats=False,
    draft_temperature=0.0,
    verify_temperature=0.0,
):
    """
    Scaffold-aware self-speculative decoding.

    Minimal modification of standard self-spec
    (speculative_block_causal_sample_cache): scaffold (structural JSON)
    tokens are pre-filled in the draft block instead of MASK and
    auto-accepted during causal verification.

    Key design: uses *exactly the same* attention patterns as standard
    self-spec (block-diff for draft, **causal** for verify via
    auto eval_mask). Only the draft block content differs — scaffold
    positions carry known tokens instead of MASK, giving the draft
    better context while scaffold tokens are "free" during acceptance.

    Args:
        input_ids: Prompt token ids. Only row 0 is inspected during
            acceptance and post-processing, so a single sequence is
            effectively assumed.
        tokenizer: Forwarded to ``build_deep_json_scaffold``.
        block_size: Draft block length including the seed token; each
            round drafts at most ``block_size - 1`` new positions. Also
            written to ``self.model.bd_size`` and passed as
            ``eval_bd_size`` to every draft / verify forward.
        max_tokens: Decoding stops once more than this many tokens have
            been appended to the prompt.
        pixel_values: Forwarded to the prompt prefill forward only.
        image_grid_thw: Forwarded to the prompt prefill forward only.
        mask_id: Token id that marks a not-yet-decoded value slot.
        null_id: Padding token id; removed from the returned sequence.
        threshold: Not read by this method (draft filling always uses a
            fixed confidence threshold of 0); kept so the signature
            stays compatible with existing callers.
        stop_token: Decoding stops when this id shows up in the tokens
            accepted in the current round; the output is truncated
            right after its first occurrence.
        explanation_block_size: Forwarded to ``build_deep_json_scaffold``.
        explanation_max_blocks: Forwarded to ``build_deep_json_scaffold``.
        return_stats: When true, also return a statistics dict.
        draft_temperature: If > 0, draft tokens are sampled from the
            temperature-scaled distribution instead of taken greedily.
        verify_temperature: If > 0, verification tokens are sampled
            from the temperature-scaled distribution instead of taken
            greedily.

    Returns:
        ``output_ids`` (the prompt followed by the generated tokens with
        ``null_id`` / ``mask_id`` removed), or ``(output_ids, stats)``
        when ``return_stats`` is true.

    Setting the ``SS_PROFILE`` environment variable to a non-empty value
    prints a wall-clock breakdown of the decoding phases.
    """
    from .section_utils import build_deep_json_scaffold

    scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
        tokenizer,
        mask_id=mask_id,
        null_id=null_id,
        explanation_block_size=explanation_block_size,
        explanation_max_blocks=explanation_max_blocks,
    )

    scaffold_len = len(scaffold_tokens)
    original_input_length = input_ids.shape[1]
    tokens_per_step = []
    self.model.bd_size = block_size

    _ss_profile = bool(os.environ.get("SS_PROFILE"))
    _ss_traj_start = section_ranges.get("trajectory", (None, None))[0]
    _ss_marked_traj_start = False
    if _ss_profile:
        import time as _time

        _ss_t = {}

        def _ss_mark(key):
            # Drain queued GPU work before reading the clock. The guard
            # keeps profiling usable on hosts without CUDA, where an
            # unconditional synchronize() raises.
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            _ss_t[key] = _time.perf_counter()

        _ss_mark("start")

    # Pre-convert to tensors for vectorized operations in the loop
    scaffold_tok_t = torch.tensor(
        scaffold_tokens, device=self.device, dtype=torch.long
    )
    scaffold_is_fixed = torch.tensor(
        scaffold_mask_list, device=self.device, dtype=torch.bool
    )

    # ── Phase 1: Prefill prompt (identical to standard self-spec) ──
    output = self.forward(
        input_ids=input_ids,
        pixel_values=pixel_values,
        image_grid_thw=image_grid_thw,
        use_cache=True,
        update_kv_cache=True,
    )
    past_key_values = output.past_key_values
    if _ss_profile:
        _ss_mark("after_prefill")

    # First token — use scaffold token (always '{')
    next_token = torch.tensor(
        [[scaffold_tokens[0]]], device=self.device, dtype=torch.long
    )
    input_ids = torch.cat([input_ids, next_token], dim=1)
    tokens_per_step.append(1)
    scaffold_cursor = 1
    step = 1

    # ── Phase 2: Self-speculative decoding loop ──
    # Follows the exact same structure as
    # speculative_block_causal_sample_cache, with scaffold-aware draft.
    while scaffold_cursor < scaffold_len:
        if (
            _ss_profile
            and not _ss_marked_traj_start
            and _ss_traj_start is not None
            and scaffold_cursor >= _ss_traj_start
        ):
            _ss_mark("enter_traj")
            _ss_marked_traj_start = True

        prompt_length = input_ids.shape[1]
        n_draft = min(block_size - 1, scaffold_len - scaffold_cursor)

        # Build draft block: [seed, scaffold_or_MASK × n_draft]
        sc_end = scaffold_cursor + n_draft
        is_fixed = scaffold_is_fixed[scaffold_cursor:sc_end]
        draft_tensor = torch.where(
            is_fixed,
            scaffold_tok_t[scaffold_cursor:sc_end],
            mask_id,
        ).unsqueeze(0)
        x_t = torch.cat([input_ids[:, -1:], draft_tensor], dim=1)
        mask_idx = (x_t == mask_id)

        # ── Draft (block-diff bidirectional via auto eval_mask) ──
        logits = self.forward(
            input_ids=x_t,
            use_cache=True,
            past_key_values=past_key_values,
            update_kv_cache=False,
            eval_bd_size=block_size,
        ).logits
        # The draft forward counts as a step that commits no tokens.
        tokens_per_step.append(0)
        step += 1

        # Shift logits (same as standard self-spec): the prediction for
        # position i comes from the logit at i-1. Slot 0 is the seed and
        # only receives a placeholder, since it is never a MASK to fill.
        logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
        if draft_temperature > 0:
            # Temperature sampling for draft diversity
            draft_probs = torch.softmax(logits / draft_temperature, dim=-1)
            x_1 = torch.multinomial(
                draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1
            ).view(draft_probs.shape[:-1])
        else:
            x_1 = logits.argmax(dim=-1)
        # Confidence always uses the unscaled distribution.
        probs = torch.softmax(logits, dim=-1)
        x1_p = torch.gather(probs, dim=-1, index=x_1.unsqueeze(-1)).squeeze(-1)

        # Only fill MASK positions; scaffold positions keep their tokens
        x1_p = torch.where(mask_idx, x1_p, -torch.inf)
        unmask_idx = (x1_p > 0)  # threshold=0 for draft filling

        if unmask_idx.sum() > 0:
            x_t[unmask_idx] = x_1[unmask_idx]
        elif x1_p.max() > -torch.inf:
            # Fallback: fill most confident MASK. Non-MASK slots already
            # hold -inf, so argmax can only select a MASK position.
            best = x1_p.argmax()
            x_t.view(-1)[best] = x_1.view(-1)[best]

        # ── Verify (causal via auto eval_mask, commit to cache) ──
        output = self.forward(
            input_ids=x_t,
            use_cache=True,
            past_key_values=past_key_values,
            update_kv_cache=True,
            eval_bd_size=block_size,
        )
        past_key_values = output.past_key_values
        if verify_temperature > 0:
            verify_probs = torch.softmax(
                output.logits / verify_temperature, dim=-1
            )
            ar_block_token = torch.multinomial(
                verify_probs.view(-1, verify_probs.shape[-1]), num_samples=1
            ).view(verify_probs.shape[:-1])
        else:
            ar_block_token = output.logits.argmax(dim=-1)

        # ── AR acceptance (scaffold positions auto-pass) ──
        # Accept the longest prefix of draft positions that are either
        # scaffold or agree with the verifier. The pass/fail flags are
        # moved to the host in one transfer instead of one sync per token.
        ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
        draft_ok = (is_fixed | ar_matches).tolist()
        accepted_token_num = next(
            (i for i, ok in enumerate(draft_ok) if not ok), n_draft
        )
        accepted_token_num += 1  # bonus token

        tokens_per_step.append(accepted_token_num)

        # Force scaffold tokens at scaffold positions, AR predictions elsewhere
        accepted_ids = ar_block_token[:, :accepted_token_num].clone()
        acc_end = min(scaffold_cursor + accepted_token_num, scaffold_len)
        acc_fixed = scaffold_is_fixed[scaffold_cursor:acc_end]
        accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
            scaffold_tok_t[scaffold_cursor:acc_end][acc_fixed]

        input_ids = torch.cat([input_ids, accepted_ids], dim=1)
        scaffold_cursor += accepted_token_num

        # Drop cached entries for draft positions that were not accepted.
        past_key_values = _crop_cache(past_key_values, input_ids.shape[1] - 1)

        step += 1

        # Stop conditions
        if input_ids.shape[1] - original_input_length > max_tokens:
            break
        new_tokens = input_ids[:, prompt_length:]
        stop_hits = (new_tokens == stop_token).nonzero()
        if stop_hits.numel() > 0:
            first_stop = int(stop_hits[0][1])
            if (new_tokens[:, :first_stop] == mask_id).sum() == 0:
                break

    if _ss_profile:
        _ss_mark("end")
        _t_total = _ss_t["end"] - _ss_t["start"]
        _t_pre = _ss_t["after_prefill"] - _ss_t["start"]
        _t_traj_in = _ss_t.get("enter_traj")
        if _t_traj_in is not None:
            _t_prefix = _t_traj_in - _ss_t["after_prefill"]
            _t_traj = _ss_t["end"] - _t_traj_in
        else:
            _t_prefix = _ss_t["end"] - _ss_t["after_prefill"]
            _t_traj = 0.0
        print(
            f"[ss profile] total={_t_total*1000:.0f}ms "
            f"prefill={_t_pre*1000:.0f}ms "
            f"prefix-decode={_t_prefix*1000:.0f}ms "
            f"traj-decode={_t_traj*1000:.0f}ms",
            flush=True,
        )

    # ── Phase 3: Post-process — truncate at stop, strip NULL ──
    stop_hits = (input_ids[:, original_input_length:] == stop_token).nonzero()
    if stop_hits.numel() > 0:
        stop_token_idx = int(stop_hits[0][1])
        input_ids = input_ids[
            :, :stop_token_idx + original_input_length + 1
        ]

    gen_tokens = input_ids[0, original_input_length:].tolist()
    cleaned = [t for t in gen_tokens if t != null_id and t != mask_id]
    output_ids = torch.cat(
        [
            input_ids[:, :original_input_length],
            torch.tensor(
                [cleaned], device=self.device, dtype=torch.long
            ),
        ],
        dim=1,
    )

    gen_length = output_ids.shape[1] - original_input_length

    if return_stats:
        stats = {
            "tokens_per_step": tokens_per_step,
            "total_steps": step,
            "gen_length": gen_length,
            "null_tokens_stripped": len(gen_tokens) - len(cleaned),
            "block_size": block_size,
            "method": "scaffold_speculative_v5",
        }
        return output_ids, stats
    return output_ids
750
+
751
+ @torch.no_grad()
752
+
753
+
754
+ # ---------------------------------------------------------------------------
755
+ # scaffold_spec_with_ss_multi_traj — SS multi-rollout inference scaling
756
+ # ---------------------------------------------------------------------------
757
+
758
+ def scaffold_spec_with_ss_multi_traj(
759
+ self,
760
+ input_ids,
761
+ tokenizer,
762
+ block_size=32,
763
+ max_tokens=1024,
764
+ pixel_values=None,
765
+ image_grid_thw=None,
766
+ mask_id=151665,
767
+ null_id=151666,
768
+ threshold=0.9,
769
+ stop_token=151645,
770
+ explanation_block_size=32,
771
+ explanation_max_blocks=6,
772
+ return_stats=False,
773
+ num_traj_rollouts=4,
774
+ traj_verify_temperature=0.5,
775
+ traj_draft_temperature=0.0,
776
+ merge_weights=None,
777
+ batch_parallel=False,
778
+ ):
779
+ """Scaffold Spec with shared prefix + N SS rollouts on the trajectory section.
780
+
781
+ Decoding pipeline:
782
+ 0) Prompt prefill [shared]
783
+ 1) Scaffold Spec for sections 1-3 (CoT) at verify_temp = 0 [shared, deterministic]
784
+ 2) Fork KV cache N times [O(N) memory]
785
+ 3) For each fork: continue Scaffold Spec on the trajectory
786
+ section with verify_temperature = traj_verify_temperature
787
+ (each rollout draws different samples in the AR-verify step
788
+ because torch.multinomial is invoked with a global RNG).
789
+ 4) Parse all N trajectories and return their weighted mean.
790
+
791
+ Cost: roughly 1 full SS pass (sections 1-3 are ~88%% of decoded tokens
792
+ on our schema) + N x trajectory-only SS passes. For N = 4 this is
793
+ ~1.5x the cost of a single SS, vs ~4x for naive sequential rerolling.
794
+
795
+ If batch_parallel = True, the N trajectory rollouts are executed in a
796
+ batched (batch_size = N) manner: one shared model.forward per
797
+ speculative draft / verify step over an N-replicated trajectory
798
+ suffix, which removes the per-rollout serial overhead at the cost of
799
+ replicating the per-layer KV cache N-fold along the batch dimension.
800
+
801
+ Returns: (output_ids, stats) if return_stats else output_ids.
802
+ """
803
+ from .section_utils import (
804
+ build_deep_json_scaffold,
805
+ SECTION_KEYS,
806
+ )
807
+
808
+ scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
809
+ tokenizer,
810
+ mask_id=mask_id,
811
+ null_id=null_id,
812
+ explanation_block_size=explanation_block_size,
813
+ explanation_max_blocks=explanation_max_blocks,
814
+ )
815
+
816
+ scaffold_len = len(scaffold_tokens)
817
+ original_input_length = input_ids.shape[1]
818
+ tokens_per_step = []
819
+ self.model.bd_size = block_size
820
+
821
+ scaffold_tok_t = torch.tensor(scaffold_tokens, device=self.device, dtype=torch.long)
822
+ scaffold_is_fixed = torch.tensor(scaffold_mask_list, device=self.device, dtype=torch.bool)
823
+ traj_start_in_scaffold = section_ranges["trajectory"][0]
824
+
825
+ _profile = bool(os.environ.get("SS_MT_PROFILE"))
826
+ if _profile:
827
+ import time as _time
828
+ torch.cuda.synchronize()
829
+ _t_phase = {"start": _time.perf_counter()}
830
+ _phase_clone_total = 0.0
831
+ _phase_rollout_each = []
832
+
833
+ # ── Phase 0: Prefill prompt ──
834
+ output = self.forward(
835
+ input_ids=input_ids, pixel_values=pixel_values,
836
+ image_grid_thw=image_grid_thw,
837
+ use_cache=True, update_kv_cache=True,
838
+ )
839
+ logits, past_key_values = output.logits, output.past_key_values
840
+ if _profile:
841
+ torch.cuda.synchronize()
842
+ _t_phase["after_prefill"] = _time.perf_counter()
843
+
844
+ next_token = torch.tensor(
845
+ [[scaffold_tokens[0]]], device=self.device, dtype=torch.long,
846
+ )
847
+ input_ids = torch.cat([input_ids, next_token], dim=1)
848
+ tokens_per_step.append(1)
849
+ scaffold_cursor = 1
850
+ step = 1
851
+
852
+ # ── Phase 1: Scaffold Spec for non-trajectory sections (shared, vt=0) ──
853
+ while scaffold_cursor < scaffold_len and scaffold_cursor < traj_start_in_scaffold:
854
+ remaining_before_traj = traj_start_in_scaffold - scaffold_cursor
855
+ n_draft = min(block_size - 1, remaining_before_traj)
856
+ if n_draft <= 0:
857
+ break
858
+
859
+ sc_end = scaffold_cursor + n_draft
860
+ is_fixed = scaffold_is_fixed[scaffold_cursor:sc_end]
861
+ draft_tensor = torch.where(
862
+ is_fixed, scaffold_tok_t[scaffold_cursor:sc_end], mask_id,
863
+ ).unsqueeze(0)
864
+ x_t = torch.cat([input_ids[:, -1:], draft_tensor], dim=1)
865
+ mask_idx = (x_t == mask_id)
866
+
867
+ # Draft (block-bidirectional)
868
+ logits = self.forward(
869
+ input_ids=x_t, use_cache=True,
870
+ past_key_values=past_key_values,
871
+ update_kv_cache=False, eval_bd_size=block_size,
872
+ ).logits
873
+ tokens_per_step.append(0)
874
+ step += 1
875
+
876
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
877
+ x_1 = logits.argmax(dim=-1)
878
+ probs = torch.softmax(logits, dim=-1)
879
+ x1_p = torch.gather(probs, dim=-1, index=x_1.unsqueeze(-1)).squeeze(-1)
880
+ x1_p = torch.where(mask_idx, x1_p, -torch.inf)
881
+ unmask_idx = (x1_p > 0)
882
+ if unmask_idx.sum() > 0:
883
+ x_t[unmask_idx] = x_1[unmask_idx]
884
+ else:
885
+ mask_only_p = x1_p.clone()
886
+ mask_only_p[~mask_idx] = -torch.inf
887
+ if mask_only_p.max() > -torch.inf:
888
+ best = mask_only_p.argmax()
889
+ x_t.view(-1)[best] = x_1.view(-1)[best]
890
+
891
+ # Verify (causal, greedy)
892
+ output = self.forward(
893
+ input_ids=x_t, use_cache=True,
894
+ past_key_values=past_key_values,
895
+ update_kv_cache=True, eval_bd_size=block_size,
896
+ )
897
+ past_key_values = output.past_key_values
898
+ ar_block_token = output.logits.argmax(dim=-1)
899
+
900
+ ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
901
+ accepted_token_num = 0
902
+ for i in range(n_draft):
903
+ if is_fixed[i] or ar_matches[i]:
904
+ accepted_token_num += 1
905
+ else:
906
+ break
907
+ accepted_token_num += 1
908
+
909
+ max_accept = traj_start_in_scaffold - scaffold_cursor
910
+ if accepted_token_num > max_accept:
911
+ accepted_token_num = max_accept
912
+
913
+ tokens_per_step.append(accepted_token_num)
914
+ accepted_ids = ar_block_token[:, :accepted_token_num].clone()
915
+ acc_end = min(scaffold_cursor + accepted_token_num, scaffold_len)
916
+ acc_fixed = scaffold_is_fixed[scaffold_cursor:acc_end]
917
+ accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
918
+ scaffold_tok_t[scaffold_cursor:acc_end][acc_fixed]
919
+
920
+ input_ids = torch.cat([input_ids, accepted_ids], dim=1)
921
+ scaffold_cursor += accepted_token_num
922
+ past_key_values = _crop_cache(past_key_values, input_ids.shape[1] - 1)
923
+ step += 1
924
+
925
+ if input_ids.shape[1] - original_input_length > max_tokens:
926
+ break
927
+
928
+ if _profile:
929
+ torch.cuda.synchronize()
930
+ _t_phase["after_phase1"] = _time.perf_counter()
931
+
932
+ # ── Phase 2: Fork KV cache N times (one per trajectory rollout) ──
933
+ prefix_input_ids = input_ids.clone()
934
+ prefix_len = prefix_input_ids.shape[1]
935
+
936
+ def _clone_cache(kv):
937
+ if _profile:
938
+ torch.cuda.synchronize()
939
+ _t0 = _time.perf_counter()
940
+ cloned = []
941
+ for layer_num in range(len(kv)):
942
+ cloned.append(tuple(t.clone() for t in kv[layer_num]))
943
+ ret = DynamicCache(cloned)
944
+ if _profile:
945
+ torch.cuda.synchronize()
946
+ nonlocal _phase_clone_total
947
+ _phase_clone_total += _time.perf_counter() - _t0
948
+ return ret
949
+
950
+ # ── Phase 3: N SS rollouts on trajectory section, each with vt > 0 ──
951
+ # All rollouts start from the same prefix; randomness comes from
952
+ # the multinomial calls in draft / verify (RNG is process-global).
953
+ N = max(1, int(num_traj_rollouts))
954
+
955
+ def _run_one_traj_rollout(start_kv, start_input_ids):
956
+ """Continue Scaffold Spec from start_kv / start_input_ids over the
957
+ trajectory section, applying traj_*_temperature. Returns the
958
+ final ss_input_ids (with trajectory tokens appended) and the
959
+ extracted trajectory value tokens."""
960
+ local_kv = start_kv
961
+ local_input = start_input_ids
962
+ local_cursor = scaffold_cursor
963
+
964
+ while local_cursor < scaffold_len:
965
+ n_draft = min(block_size - 1, scaffold_len - local_cursor)
966
+ sc_end = local_cursor + n_draft
967
+ is_fixed = scaffold_is_fixed[local_cursor:sc_end]
968
+ draft_tensor = torch.where(
969
+ is_fixed, scaffold_tok_t[local_cursor:sc_end], mask_id,
970
+ ).unsqueeze(0)
971
+ x_t = torch.cat([local_input[:, -1:], draft_tensor], dim=1)
972
+ mask_idx = (x_t == mask_id)
973
+
974
+ # Draft (block-bidirectional, optionally temp-sampled)
975
+ draft_logits = self.forward(
976
+ input_ids=x_t, use_cache=True, past_key_values=local_kv,
977
+ update_kv_cache=False, eval_bd_size=block_size,
978
+ ).logits
979
+ draft_logits = torch.cat(
980
+ [draft_logits[:, :1, :], draft_logits[:, :-1, :]], dim=1,
981
+ )
982
+ if traj_draft_temperature > 0:
983
+ scaled = draft_logits / traj_draft_temperature
984
+ draft_probs = torch.softmax(scaled, dim=-1)
985
+ x_1 = torch.multinomial(
986
+ draft_probs.view(-1, draft_probs.shape[-1]),
987
+ num_samples=1,
988
+ ).view(draft_probs.shape[:-1])
989
+ else:
990
+ x_1 = draft_logits.argmax(dim=-1)
991
+ probs = torch.softmax(draft_logits, dim=-1)
992
+ x1_p = torch.gather(
993
+ probs, dim=-1, index=x_1.unsqueeze(-1),
994
+ ).squeeze(-1)
995
+ x1_p = torch.where(mask_idx, x1_p, -torch.inf)
996
+ unmask_idx = (x1_p > 0)
997
+ if unmask_idx.sum() > 0:
998
+ x_t[unmask_idx] = x_1[unmask_idx]
999
+ else:
1000
+ mask_only_p = x1_p.clone()
1001
+ mask_only_p[~mask_idx] = -torch.inf
1002
+ if mask_only_p.max() > -torch.inf:
1003
+ x_t.view(-1)[mask_only_p.argmax()] = \
1004
+ x_1.view(-1)[mask_only_p.argmax()]
1005
+
1006
+ # Verify (causal, optionally temp-sampled)
1007
+ v_out = self.forward(
1008
+ input_ids=x_t, use_cache=True, past_key_values=local_kv,
1009
+ update_kv_cache=True, eval_bd_size=block_size,
1010
+ )
1011
+ local_kv = v_out.past_key_values
1012
+ if traj_verify_temperature > 0:
1013
+ v_logits = v_out.logits / traj_verify_temperature
1014
+ v_probs = torch.softmax(v_logits, dim=-1)
1015
+ ar_block_token = torch.multinomial(
1016
+ v_probs.view(-1, v_probs.shape[-1]),
1017
+ num_samples=1,
1018
+ ).view(v_probs.shape[:-1])
1019
+ else:
1020
+ ar_block_token = v_out.logits.argmax(dim=-1)
1021
+
1022
+ ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
1023
+ accepted_token_num = 0
1024
+ for i in range(n_draft):
1025
+ if is_fixed[i] or ar_matches[i]:
1026
+ accepted_token_num += 1
1027
+ else:
1028
+ break
1029
+ accepted_token_num += 1
1030
+
1031
+ accepted_ids = ar_block_token[:, :accepted_token_num].clone()
1032
+ acc_end = min(local_cursor + accepted_token_num, scaffold_len)
1033
+ acc_fixed = scaffold_is_fixed[local_cursor:acc_end]
1034
+ accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
1035
+ scaffold_tok_t[local_cursor:acc_end][acc_fixed]
1036
+
1037
+ local_input = torch.cat([local_input, accepted_ids], dim=1)
1038
+ local_cursor += accepted_token_num
1039
+ local_kv = _crop_cache(local_kv, local_input.shape[1] - 1)
1040
+
1041
+ if local_input.shape[1] - original_input_length > max_tokens:
1042
+ break
1043
+ if stop_token in local_input[:, prefix_len:]:
1044
+ st_idx = (local_input[:, prefix_len:] == stop_token).nonzero()
1045
+ if st_idx.numel() > 0:
1046
+ cand_st = st_idx[0][1].item()
1047
+ if (local_input[:, prefix_len:prefix_len + cand_st] == mask_id).sum() == 0:
1048
+ break
1049
+
1050
+ traj_values = [
1051
+ t for i, t in enumerate(local_input[0, original_input_length:].tolist())
1052
+ if i >= traj_start_in_scaffold and i < scaffold_len
1053
+ and not scaffold_mask_list[i] and t != null_id and t != mask_id
1054
+ ]
1055
+ return local_input, traj_values
1056
+
1057
+ # Sequential N rollouts (Option A; batch_parallel=False).
1058
+ rollout_inputs = []
1059
+ rollout_traj_values = []
1060
+ for _i in range(N):
1061
+ if _profile:
1062
+ torch.cuda.synchronize()
1063
+ _t_r0 = _time.perf_counter()
1064
+ cand_kv = _clone_cache(past_key_values)
1065
+ cand_input = prefix_input_ids.clone()
1066
+ cand_input, traj_vals = _run_one_traj_rollout(cand_kv, cand_input)
1067
+ rollout_inputs.append(cand_input)
1068
+ rollout_traj_values.append(traj_vals)
1069
+ step += 1
1070
+ if _profile:
1071
+ torch.cuda.synchronize()
1072
+ _phase_rollout_each.append(_time.perf_counter() - _t_r0)
1073
+
1074
+ if _profile:
1075
+ torch.cuda.synchronize()
1076
+ _t_phase["after_rollouts"] = _time.perf_counter()
1077
+ _t_total = _t_phase["after_rollouts"] - _t_phase["start"]
1078
+ _t_pre = _t_phase["after_prefill"] - _t_phase["start"]
1079
+ _t_p1 = _t_phase["after_phase1"] - _t_phase["after_prefill"]
1080
+ _t_rolls = _t_phase["after_rollouts"] - _t_phase["after_phase1"]
1081
+ print(
1082
+ f"[ss_mt profile] total={_t_total*1000:.0f}ms "
1083
+ f"prefill(P0)={_t_pre*1000:.0f}ms "
1084
+ f"prefix-decode(P1)={_t_p1*1000:.0f}ms "
1085
+ f"rollouts(P2+P3)={_t_rolls*1000:.0f}ms "
1086
+ f"of which kv-clone={_phase_clone_total*1000:.0f}ms "
1087
+ f"per-rollout={[f'{r*1000:.0f}' for r in _phase_rollout_each]}ms",
1088
+ flush=True,
1089
+ )
1090
+
1091
+ # ── Phase 4: Parse all rollouts, weighted-merge waypoints ──
1092
def _decode_trajectory(traj_tokens):
    """Decode trajectory token ids into a list of ``[x, y]`` waypoints.

    The ids are decoded with the enclosing scope's ``tokenizer`` (special
    tokens are kept), every ``<|NULL|>`` marker is removed, and all signed
    decimal numbers found in the remaining text are paired up in order of
    appearance.  A trailing unpaired number is dropped.

    Args:
        traj_tokens: Token ids covering the trajectory span of one rollout.

    Returns:
        A list of two-element ``[float, float]`` lists, one per waypoint.
    """
    decoded = tokenizer.decode(traj_tokens, skip_special_tokens=False)
    cleaned = decoded.replace("<|NULL|>", "").strip()
    numbers = [float(tok) for tok in re.findall(r"[+-]?\d+\.?\d*", cleaned)]
    # Even positions hold the first coordinate, odd positions the second;
    # zip stops at the shorter slice, which discards a dangling final value.
    return [[first, second] for first, second in zip(numbers[0::2], numbers[1::2])]
1100
+
1101
+ rollout_waypoints = [_decode_trajectory(v) for v in rollout_traj_values]
1102
+
1103
+ if merge_weights is None or len(merge_weights) != N:
1104
+ ws = [1.0 / N] * N
1105
+ else:
1106
+ total = sum(merge_weights)
1107
+ ws = [w / total for w in merge_weights]
1108
+
1109
+ if rollout_waypoints and all(len(w) > 0 for w in rollout_waypoints):
1110
+ n_wp = min(len(w) for w in rollout_waypoints)
1111
+ merged_waypoints = []
1112
+ for i in range(n_wp):
1113
+ mx = sum(ws[c] * rollout_waypoints[c][i][0] for c in range(N))
1114
+ my = sum(ws[c] * rollout_waypoints[c][i][1] for c in range(N))
1115
+ merged_waypoints.append([mx, my])
1116
+ else:
1117
+ merged_waypoints = next(
1118
+ (w for w in rollout_waypoints if w), [],
1119
+ )
1120
+
1121
+ # Output text: take rollout 0's full text but replace its trajectory
1122
+ # with the merged waypoints.
1123
+ base_input = rollout_inputs[0]
1124
+ if stop_token in base_input[:, original_input_length:]:
1125
+ st_idx = (base_input[:, original_input_length:] == stop_token).nonzero()[0][1]
1126
+ base_input = base_input[:, :st_idx + original_input_length + 1]
1127
+ base_raw_tokens = base_input[0, original_input_length:].tolist()
1128
+ base_cleaned = [t for t in base_raw_tokens if t != null_id and t != mask_id]
1129
+ base_null_stripped = len(base_raw_tokens) - len(base_cleaned)
1130
+ base_text = tokenizer.decode(base_cleaned, skip_special_tokens=False)
1131
+
1132
+ traj_parts = [
1133
+ f"[{x:+07.2f},{y:+06.2f}]" for x, y in merged_waypoints
1134
+ ]
1135
+ merged_traj_str = "[" + ", ".join(traj_parts) + "]"
1136
+ replaced_text = re.sub(
1137
+ r'("trajectory"\s*:\s*")(\[\[.*?\]\])',
1138
+ r"\g<1>" + merged_traj_str, base_text,
1139
+ )
1140
+
1141
+ merged_tokens = tokenizer.encode(replaced_text, add_special_tokens=False)
1142
+ output_ids = torch.cat([
1143
+ input_ids[:, :original_input_length],
1144
+ torch.tensor([merged_tokens], device=self.device, dtype=torch.long),
1145
+ ], dim=1)
1146
+
1147
+ gen_length = output_ids.shape[1] - original_input_length
1148
+
1149
+ if return_stats:
1150
+ stats = {
1151
+ "tokens_per_step": tokens_per_step,
1152
+ "total_steps": step,
1153
+ "gen_length": gen_length,
1154
+ "null_tokens_stripped": base_null_stripped,
1155
+ "block_size": block_size,
1156
+ "method": "scaffold_spec_with_ss_multi_traj",
1157
+ "num_traj_rollouts": N,
1158
+ "traj_verify_temperature": traj_verify_temperature,
1159
+ "rollout_waypoints": rollout_waypoints,
1160
+ "merged_waypoints": merged_waypoints,
1161
+ "merge_weights": ws,
1162
+ }
1163
+ return output_ids, stats
1164
+ return output_ids
1165
+
1166
# ---------------------------------------------------------------------------
# Bind decoding methods onto the model class.
#
# ``modeling.py`` imports this module at the bottom of the file, after the
# ``Fast_dDriveForConditionalGeneration`` class has been defined. We
# attach the three decoding paths as ordinary methods so callers can invoke
# them as ``model.mdm_sample_deep_scaffold(...)`` etc. without any extra
# registration step.
# ---------------------------------------------------------------------------

def attach_generation_methods(cls):
    """Attach the three release decoding paths as methods of ``cls``.

    The module-level functions ``mdm_sample_deep_scaffold``,
    ``scaffold_speculative_sample`` and ``scaffold_spec_with_ss_multi_traj``
    are assigned as class attributes, so they become bound methods on
    instances of ``cls``.

    This helper is intentionally left undecorated: it only performs
    attribute assignment and runs no tensor computation, so a
    ``torch.no_grad()`` wrapper has nothing to do here.

    Args:
        cls: The class to receive the decoding methods.

    Returns:
        The same ``cls`` object, mutated in place, so the helper can also
        be used as a class decorator.
    """
    cls.mdm_sample_deep_scaffold = mdm_sample_deep_scaffold
    cls.scaffold_speculative_sample = scaffold_speculative_sample
    cls.scaffold_spec_with_ss_multi_traj = scaffold_spec_with_ss_multi_traj
    return cls
1185
+
1186
+
1187
# Public API of this module: the three decoding paths plus the helper that
# binds them onto a model class.
__all__ = [
    "mdm_sample_deep_scaffold",
    "scaffold_speculative_sample",
    "scaffold_spec_with_ss_multi_traj",
    "attach_generation_methods",
]
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a8945f208e0b7e62c71542d3301d755d95d02bcdb54d7deec28f8a819b4a2d
3
+ size 4972304384
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f399e3ccf37f2016964e531e2cfb5371a3d96d46205cb1b3eba24cd13d0de6aa
3
+ size 4932949248
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c77c01870520022fd0111459ce27eff492ce932a64740dd26d23598289f3ed
3
+ size 4932949336
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c13c8eff2f5e9435ac7692790d1028ff1dbc5f8bd53db6e5a91b22c322162008
3
+ size 1425040040
model.safetensors.index.json ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 234663856,
4
+ "total_size": 16263151616
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
263
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
265
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
266
+ "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
267
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
268
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
270
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
271
+ "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
272
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
273
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
277
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
279
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
282
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
284
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
287
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
299
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
301
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
303
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
306
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
308
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
309
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
311
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
313
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
315
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
317
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
318
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
319
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
320
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
321
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
323
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
325
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
327
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
330
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
331
+ "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
332
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
333
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
335
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
336
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
337
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
339
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
340
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
342
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
343
+ "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
344
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
345
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
346
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
347
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
348
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
349
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
350
+ "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
351
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
352
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
354
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
355
+ "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
356
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
357
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
358
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
359
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
360
+ "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
361
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
362
+ "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
363
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
364
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
365
+ "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
366
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
367
+ "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
368
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
369
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
370
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
371
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
372
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
373
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
374
+ "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
375
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
376
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
377
+ "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
378
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
379
+ "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
380
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
381
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
382
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
383
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
384
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
385
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
386
+ "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
387
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
388
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
389
+ "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
390
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
391
+ "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
392
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
393
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
394
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
395
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
396
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
397
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
398
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
399
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
400
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
401
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
402
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
403
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
404
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
405
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
406
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
407
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
408
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
409
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
410
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
411
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
412
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
413
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
414
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
415
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
416
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
417
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
418
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
419
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
420
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
421
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
422
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
423
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
424
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
425
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
426
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
427
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
428
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
429
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
430
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
431
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
432
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
433
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
434
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
435
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
436
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
437
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
438
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
439
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
440
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
441
+ "model.norm.weight": "model-00004-of-00004.safetensors",
442
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
443
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
444
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
445
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
446
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
447
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
448
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
449
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
450
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
451
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
452
+ "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
453
+ "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
459
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
460
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
461
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
462
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
463
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
464
+ "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
465
+ "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
466
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
469
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
470
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
471
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
472
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
473
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
474
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
475
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
476
+ "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
477
+ "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
479
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
480
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
481
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
482
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
483
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
484
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
485
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
486
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
487
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
488
+ "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
489
+ "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
491
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
492
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
493
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
494
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
495
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
496
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
497
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
498
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
499
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
500
+ "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
501
+ "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
502
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
503
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
506
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
507
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
508
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
509
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
510
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
511
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
512
+ "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
513
+ "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
514
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
515
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
516
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
517
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
518
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
519
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
520
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
521
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
522
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
523
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
524
+ "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
525
+ "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
526
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
527
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
528
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
529
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
530
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
531
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
532
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
533
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
534
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
535
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
536
+ "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
537
+ "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
543
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
544
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
545
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
546
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
547
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
548
+ "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
549
+ "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
550
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
551
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
552
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
554
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
555
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
556
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
557
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
558
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
559
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
560
+ "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
561
+ "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
564
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
565
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
567
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
568
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
569
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
570
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
571
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
572
+ "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
573
+ "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
574
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
575
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
576
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
577
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
578
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
579
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
580
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
581
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
582
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
583
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
584
+ "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
585
+ "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
586
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
587
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
588
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
589
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
591
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
592
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
593
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
594
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
595
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
596
+ "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
597
+ "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
598
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
599
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
600
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
601
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
602
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
603
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
604
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
605
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
606
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
607
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
608
+ "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
609
+ "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
610
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
612
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
613
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
614
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
615
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
616
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
617
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
618
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
619
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
620
+ "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
621
+ "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
627
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
628
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
629
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
630
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
631
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
632
+ "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
633
+ "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
638
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
639
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
640
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
641
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
642
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
643
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
644
+ "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
645
+ "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
646
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
649
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
650
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
651
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
652
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
653
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
654
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
655
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
656
+ "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
657
+ "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
658
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
659
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
660
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
661
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
662
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
663
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
664
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
665
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
666
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
667
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
668
+ "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
669
+ "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
670
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
671
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
672
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
673
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
674
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
675
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
676
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
677
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
678
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
679
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
680
+ "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
681
+ "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
682
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
683
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
684
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
685
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
686
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
687
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
688
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
689
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
690
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
691
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
692
+ "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
693
+ "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
694
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
695
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
696
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
697
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
698
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
699
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
700
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
701
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
702
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
703
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
704
+ "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
705
+ "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
706
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
707
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
708
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
709
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
710
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
711
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
712
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
713
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
714
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
715
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
716
+ "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
717
+ "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
718
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
719
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
720
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
721
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
722
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
723
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
724
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
725
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
726
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
727
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
728
+ "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
729
+ "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
730
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
731
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
732
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
733
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
734
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
735
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
736
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
737
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
738
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
739
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
740
+ "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
741
+ "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
742
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
743
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
744
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
745
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
746
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
747
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
748
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
749
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
750
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
751
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
752
+ "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
753
+ "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
754
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
755
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
756
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
757
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
758
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
759
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
760
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
761
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
762
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
763
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
764
+ "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
765
+ "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
766
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
767
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
768
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
769
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
770
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
771
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
772
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
773
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
774
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
775
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
776
+ "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
777
+ "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
778
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
779
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
780
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
781
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
782
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
783
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
784
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
785
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
786
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
787
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
788
+ "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
789
+ "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
790
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
791
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
792
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
793
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
794
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
795
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
796
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
797
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
798
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
799
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
800
+ "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
801
+ "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
802
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
803
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
804
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
805
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
806
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
807
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
808
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
809
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
810
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
811
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
812
+ "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
813
+ "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
814
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
815
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
816
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
817
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
818
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
819
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
820
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
821
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
822
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
823
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
824
+ "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
825
+ "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
826
+ "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
827
+ "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
828
+ "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
829
+ "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
830
+ "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
831
+ "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
832
+ }
833
+ }
modeling.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 12845056,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor",
18
+ "processor_class": "Qwen2_5_VLProcessor"
19
+ }
section_utils.py ADDED
@@ -0,0 +1,803 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Section-aware block scheduling for JSON structured output.
3
+
4
+ Inspired by the S3 (Self-adaptive Schema Scaffolding) paper (arXiv:2507.04504),
5
+ this module provides utilities to:
6
+ 1. Parse tokenized JSON output into sections (critical_objects, explanation, etc.)
7
+ 2. Assign section-aware block indices for variable block sizes per section
8
+ 3. Build JSON scaffolds for inference (pre-fill structural tokens)
9
+
10
+ The DVLM-AD output schema has 4 sections:
11
+ - critical_objects: ~88 tokens (12 yes/no fields, nearly constant)
12
+ - explanation: ~114 tokens (variable, 72-172)
13
+ - future_meta_behavior: ~40 tokens (nearly constant)
14
+ - trajectory: ~80 tokens (nearly constant)
15
+ """
16
+
17
+ import torch
18
+ from typing import Dict, List, Optional, Tuple
19
+ import math
20
+
21
+
22
+ # Ordered list of section keys as they appear in the JSON output
23
+ SECTION_KEYS = [
24
+ "critical_objects",
25
+ "explanation",
26
+ "future_meta_behavior",
27
+ "trajectory",
28
+ ]
29
+
30
+ # Default token budgets per section (based on training data analysis)
31
+ DEFAULT_TOKEN_BUDGETS = {
32
+ "critical_objects": 88,
33
+ "explanation": 128,
34
+ "future_meta_behavior": 40,
35
+ "trajectory": 80,
36
+ }
37
+
38
+ # Default steps per section
39
+ DEFAULT_SECTION_STEPS = {
40
+ "critical_objects": 1,
41
+ "explanation": 3,
42
+ "future_meta_behavior": 1,
43
+ "trajectory": 1,
44
+ }
45
+
46
+
47
+
48
+
49
+ def _v1_removed_parse_json_sections(*args, **kwargs):
50
+ raise NotImplementedError("DS v1 parse_json_sections has been removed. Use deep scaffold v2.")
51
+
52
+
53
+ def _v1_removed_compute_section_block_idx(*args, **kwargs):
54
+ raise NotImplementedError("DS v1 compute_section_block_idx has been removed. Use compute_section_block_idx_deep_static.")
55
+
56
+
57
+ def _v1_removed_build_json_scaffold(*args, **kwargs):
58
+ raise NotImplementedError("DS v1 build_json_scaffold has been removed. Use build_deep_json_scaffold.")
59
+
60
+
61
+ def _v1_removed_compute_section_block_sizes(*args, **kwargs):
62
+ raise NotImplementedError("DS v1 compute_section_block_sizes has been removed.")
63
+
64
+
65
def build_static_scaffold_sequences(tokenizer) -> Dict[str, List[int]]:
    """Tokenize the four top-level JSON key markers used as section boundaries.

    Each marker is encoded on its own with ``add_special_tokens=False``. The
    result is consumed by :func:`build_deep_scaffold_sequences`, which stores
    it under its ``"top"`` entry.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        Mapping from boundary name (``prefix``, ``between_co_exp``,
        ``between_exp_fmb``, ``between_fmb_traj``) to the value returned by
        ``tokenizer.encode`` for that marker.
    """
    # (boundary name, literal text that introduces the section)
    boundary_texts = (
        ("prefix", '{"critical_objects":'),
        ("between_co_exp", ' "explanation":'),
        ("between_exp_fmb", ' "future_meta_behavior":'),
        ("between_fmb_traj", ' "trajectory":'),
    )
    sequences: Dict[str, List[int]] = {}
    for name, text in boundary_texts:
        sequences[name] = tokenizer.encode(text, add_special_tokens=False)
    return sequences
76
+
77
+
78
+ def _v1_removed_compute_section_block_idx_static(*args, **kwargs):
79
+ raise NotImplementedError("DS v1 compute_section_block_idx_static has been removed. Use compute_section_block_idx_deep_static.")
80
+
81
+
82
+ # Backward-compatible aliases so stale imports produce clear errors
83
+ parse_json_sections = _v1_removed_parse_json_sections
84
+ compute_section_block_idx = _v1_removed_compute_section_block_idx
85
+ build_json_scaffold = _v1_removed_build_json_scaffold
86
+ compute_section_block_sizes = _v1_removed_compute_section_block_sizes
87
+ compute_section_block_idx_static = _v1_removed_compute_section_block_idx_static
88
+
89
+
90
+ # ═══════════════════════════════════════════════════════════════
91
+ # Deep scaffold v2: constants and utilities
92
+ # ═══════════════════════════════════════════════════════════════
93
+
94
+ NULL_TOKEN_ID = 151666
95
+
96
+ # critical_objects: 12 sub-keys, each value is exactly 1 token (yes=9693 / no=2152)
97
+ CRITICAL_OBJECTS_SUBKEYS = [
98
+ "nearby_vehicle", "pedestrian", "cyclist", "construction",
99
+ "traffic_element", "weather_condition", "road_hazard",
100
+ "emergency_vehicle", "animal", "special_vehicle",
101
+ "conflicting_vehicle", "door_opening_vehicle",
102
+ ]
103
+
104
+ # future_meta_behavior: each sub-key value is exactly 3 tokens
105
+ # (e.g., "keep speed" → [4867, 4732, 151667] or "go straight" → [2849, 7833, 151667])
106
+ FMB_VALUE_BUDGET = 3
107
+
108
+
109
def build_deep_json_scaffold(
    tokenizer,
    section_token_budgets: Optional[Dict[str, int]] = None,
    mask_id: Optional[int] = None,
    null_id: Optional[int] = None,
    explanation_block_size: int = 32,
    explanation_max_blocks: int = 6,
) -> Tuple[List[int], Dict[str, Tuple[int, int]], List[int]]:
    """Build a deep JSON scaffold for inference (v2).

    A placeholder response is assembled as a Python dict, serialized with
    ``json.dumps(obj, ensure_ascii=False)`` and tokenized as one string, so
    that the token boundaries come from the same serialization the original
    author describes for the training dataloader (``multi_modal_dataset.py``;
    that file is not visible from here). The steps are:

    1. Build a dict with placeholder values (all critical objects ``"no"``,
       a filler explanation, fixed meta-behaviour strings, and a fixed
       trajectory normalized to zero-padded ``+XXX.XX`` coordinates with a
       space after ``[`` and after the coordinate comma).
    2. Append ``<|NULL|>`` markers to the explanation so that its value
       token count reaches ``explanation_block_size * explanation_max_blocks``.
       The count is measured once and corrected at most once.
    3. Run :func:`compute_section_block_idx_deep_static` on the tokenized
       template (behind a dummy 10-token prompt) to obtain the scaffold mask.
    4. Replace every non-scaffold position with ``mask_id``.
    5. Locate the top-level section boundaries to report section ranges.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
        section_token_budgets: accepted for interface compatibility; not
            read by this function.
        mask_id: token id written at value positions. When ``None`` it is
            the single token produced by encoding ``"|<MASK>|"``, or
            ``151665`` if that string does not encode to exactly one token.
        null_id: accepted for interface compatibility; not read by this
            function (padding is inserted as the literal text ``<|NULL|>``).
        explanation_block_size: block size used for the explanation budget.
        explanation_max_blocks: number of explanation blocks in the budget.

    Returns:
        scaffold_tokens: token ids with ``mask_id`` at value positions.
        section_ranges: section name -> ``(start, end)`` indices into
            ``scaffold_tokens``; ``start`` is the first token after the
            section's boundary pattern and ``end`` is the start of the next
            boundary pattern (or the sequence length for the last section).
            Sections whose boundary pattern is not found are omitted.
        scaffold_mask: ``0`` = value (to denoise), ``1`` = scaffold (frozen).
    """
    # json/re are not imported at module level, so keep them local.
    import json as _json
    import re as _re

    if mask_id is None:
        mask_tok = tokenizer.encode("|<MASK>|", add_special_tokens=False)
        mask_id = mask_tok[0] if len(mask_tok) == 1 else 151665

    exp_budget = explanation_block_size * explanation_max_blocks  # 192 with defaults

    # ── Step 1: placeholder content matching the training data structure ──
    # The explanation text itself is irrelevant: every value token is
    # replaced with MASK below. Only its token count matters.
    filler_explanation = (
        "The ego vehicle is driving forward on the road. "
        "There are nearby vehicles ahead that may affect the path. "
        "No pedestrians or cyclists are detected in the immediate area. "
        "The road conditions appear normal with no hazards present. "
        "Speed adjustment may be needed based on the traffic ahead. "
        "No lateral maneuvering is required at this time."
    )

    def _fmt_coord(match):
        """Re-emit one signed coordinate as sign + zero-padded ``XXX.XX``."""
        raw = match.group(0)
        return f"{raw[0]}{float(raw[1:]):06.2f}"

    # Trajectory normalization does not depend on the NULL count, so it is
    # done once here instead of on every template build.
    trajectory = "[[+14.70,-00.04], [+29.55,-00.21], [+44.51,-00.56], [+59.50,-01.06], [+74.39,-01.69]]"
    trajectory = _re.sub(r'[+-]\d+\.\d+', _fmt_coord, trajectory)
    trajectory = _re.sub(r',([+-])', r', \1', trajectory)
    trajectory = _re.sub(r'\[([+-])', r'[ \1', trajectory)

    def _build_template(n_exp_nulls: int) -> str:
        """Serialize the placeholder response with *n_exp_nulls* NULL pads."""
        data_obj = {
            # Derived from the shared constant so key order always matches
            # the sub-key patterns built by build_deep_scaffold_sequences.
            "critical_objects": {key: "no" for key in CRITICAL_OBJECTS_SUBKEYS},
            "explanation": filler_explanation + "<|NULL|>" * n_exp_nulls,
            "future_meta_behavior": {
                "longitudinal": "come to stop",
                "lateral": "go straight<|NULL|>",
            },
            "trajectory": trajectory,
        }
        return _json.dumps(data_obj, ensure_ascii=False)

    # ── Step 2: size the explanation to exp_budget (single correction) ──
    deep_seqs = build_deep_scaffold_sequences(tokenizer)
    top_seqs = deep_seqs["top"]

    def _count_exp_value_tokens(tok_list):
        """Count explanation value tokens, or return None if unbounded."""
        co_exp_pat = top_seqs["between_co_exp"]
        exp_fmb_pat = top_seqs["between_exp_fmb"]
        co_exp_pos = _find_subseq(tok_list, co_exp_pat, 0)
        if co_exp_pos < 0:
            return None
        exp_start = co_exp_pos + len(co_exp_pat)
        exp_fmb_pos = _find_subseq(tok_list, exp_fmb_pat, exp_start)
        if exp_fmb_pos < 0:
            return None
        # The span includes the opening and closing quote tokens, which are
        # scaffold rather than value: subtract them.
        return (exp_fmb_pos - exp_start) - 2

    # Measure the explanation with no padding to estimate the NULL count.
    toks_0 = tokenizer.encode(_build_template(0), add_special_tokens=False)
    base_exp = _count_exp_value_tokens(toks_0)
    if base_exp is not None:
        needed_nulls = max(0, exp_budget - base_exp)
    else:
        needed_nulls = exp_budget // 2  # fallback when boundaries are not found

    # Build with the estimate, re-measure, and correct once if it is off.
    template = _build_template(needed_nulls)
    template_tokens = tokenizer.encode(template, add_special_tokens=False)
    actual_exp = _count_exp_value_tokens(template_tokens)
    if actual_exp is not None and actual_exp != exp_budget:
        needed_nulls = max(0, needed_nulls + (exp_budget - actual_exp))
        template = _build_template(needed_nulls)
        template_tokens = tokenizer.encode(template, add_special_tokens=False)

    # ── Step 3: run the training-time scaffold detection ──
    # A dummy prompt (labels == -100) precedes the response so the detector
    # sees a prompt/response split.
    prompt_len = 10
    all_tokens = [1] * prompt_len + template_tokens
    labels_list = [-100] * prompt_len + template_tokens

    labels = torch.tensor([labels_list])
    token_ids = torch.tensor([all_tokens])

    _, _, _, scaffold_mask_tensor, _ = compute_section_block_idx_deep_static(
        labels, token_ids, deep_seqs, fallback_block_size=32,
    )

    # ── Step 4: keep scaffold tokens, replace value tokens with MASK ──
    # One tolist() call instead of a per-position .item() round trip.
    response_flags = scaffold_mask_tensor[prompt_len:prompt_len + len(template_tokens)].tolist()
    scaffold_mask_list: List[int] = [1 if flag else 0 for flag in response_flags]
    scaffold_tokens = [
        tok if frozen else mask_id
        for tok, frozen in zip(template_tokens, scaffold_mask_list)
    ]

    # ── Step 5: section ranges from the top-level boundary patterns ──
    section_ranges: Dict[str, Tuple[int, int]] = {}
    boundary_order = [
        ("prefix", "critical_objects"),
        ("between_co_exp", "explanation"),
        ("between_exp_fmb", "future_meta_behavior"),
        ("between_fmb_traj", "trajectory"),
    ]

    search_from = 0
    prev_section_name = None
    prev_value_start = None

    for boundary_key, section_name in boundary_order:
        pattern = top_seqs.get(boundary_key)
        if pattern is None:
            continue
        pos = _find_subseq(template_tokens, pattern, search_from)
        if pos < 0:
            continue
        # Finding a boundary closes the previously opened section.
        if prev_section_name is not None and prev_value_start is not None:
            section_ranges[prev_section_name] = (prev_value_start, pos)
        value_start = pos + len(pattern)
        prev_section_name = section_name
        prev_value_start = value_start
        search_from = value_start

    # The last opened section runs to the end of the template.
    if prev_section_name is not None and prev_value_start is not None:
        section_ranges[prev_section_name] = (prev_value_start, len(template_tokens))

    return scaffold_tokens, section_ranges, scaffold_mask_list
294
+
295
+
296
+ def _find_subseq(seq: List[int], pattern: List[int], start: int = 0) -> int:
297
+ """Find first occurrence of *pattern* in *seq* starting at *start*. Returns -1 if not found."""
298
+ n = len(pattern)
299
+ for i in range(start, len(seq) - n + 1):
300
+ if seq[i : i + n] == pattern:
301
+ return i
302
+ return -1
303
+
304
+
305
def build_deep_scaffold_sequences(tokenizer) -> Dict[str, object]:
    """Pre-compute the token patterns used for deep scaffold matching.

    Every pattern is the result of ``tokenizer.encode(text,
    add_special_tokens=False)`` on a short literal fragment of the expected
    JSON response.

    Returns a dict with:
    - ``"top"``: top-level boundary patterns from
      :func:`build_static_scaffold_sequences`.
    - ``"co_subkeys"``: one ``{"key", "pattern", "index"}`` entry per name in
      ``CRITICAL_OBJECTS_SUBKEYS``, plus ``"co_closing"`` /
      ``"co_closing_comma"``.
    - ``"fmb_*"``: patterns around the two future_meta_behavior values.
    - ``"traj_*"``: trajectory punctuation patterns, and
      ``"traj_only_boundaries"`` for responses that contain only a
      trajectory key.
    """
    def _encode(text: str) -> List[int]:
        return tokenizer.encode(text, add_special_tokens=False)

    # Top-level boundaries are shared with the static helper.
    patterns: Dict[str, object] = {"top": build_static_scaffold_sequences(tokenizer)}

    # critical_objects sub-keys. The first key follows the opening brace
    # (' {"key": "'); later keys follow the previous value's closing quote
    # ('", "key": "'). The original author notes that these lead-ins merge
    # into single tokens in context (ids 5212 and 497); those ids are
    # tokenizer-specific and are not checked here.
    patterns["co_subkeys"] = [
        {
            "key": key,
            "pattern": _encode((' {"' if index == 0 else '", "') + key + '": "'),
            "index": index,
        }
        for index, key in enumerate(CRITICAL_OBJECTS_SUBKEYS)
    ]

    # Fixed fragments, encoded in this order. Each entry gets its own
    # encode call, so equal texts still yield independent lists.
    fixed_fragments = (
        # critical_objects closing; json.dumps emits '"},' which may merge
        # into a different token than '"}' alone.
        ("co_closing", '"}'),
        ("co_closing_comma", '"},'),
        # future_meta_behavior, e.g.
        #   ' {"longitudinal": "keep speed", "lateral": "go straight"}'
        # Scaffold is everything except the value content between quotes.
        ("fmb_prefix", ' {"longitudinal": "'),
        ("fmb_closing", '"}'),
        ("fmb_closing_comma", '"},'),
        ("fmb_between", '", "lateral": "'),
        # trajectory punctuation; the author's notes give ids 1125, 508 and
        # 11 for '],', ' [' and ',' (tokenizer-specific, not checked here).
        ("traj_open", ' "[['),
        ("traj_wp_sep", '],'),
        ("traj_wp_open", ' ['),
        ("traj_coord_comma", ','),
        ("traj_close", ']]"}'),
        ("traj_close_split", ']]"'),
        ("traj_close_split2", ']]'),
    )
    for name, text in fixed_fragments:
        patterns[name] = _encode(text)

    # Trajectory-only output support, e.g. {"trajectory": "..."}.
    trajectory_only_texts = (
        '{"trajectory":',
        ' {"trajectory":',
        '"trajectory":',
        ' "trajectory":',
    )
    patterns["traj_only_boundaries"] = [_encode(text) for text in trajectory_only_texts]

    return patterns
366
+
367
+
368
+ def _mark_scaffold_range(scaffold_positions: List[int], start: int, length: int):
369
+ """Add positions [start, start+length) to scaffold_positions."""
370
+ for i in range(length):
371
+ scaffold_positions.append(start + i)
372
+
373
+
374
def compute_section_block_idx_deep_static(
    labels: torch.Tensor,
    token_ids: torch.Tensor,
    deep_scaffold_sequences: Dict[str, object],
    fallback_block_size: int = 32,
) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor, Dict[int, str]]:
    """
    Deep-scaffold v2 block index computation.

    Splits the response into top-level sections, marks the fixed JSON
    structure inside each section as scaffold, and assigns every remaining
    (value) token to a denoising block.

    Freezes sub-keys within sections:
    - critical_objects: only yes/no values are denoised
    - future_meta_behavior: only value tokens are denoised
    - trajectory: only coordinate digits are denoised
    - explanation: all content is denoised

    Block count per section is computed dynamically:
    ``n_blocks = ceil(num_value_tokens / fallback_block_size)``.

    Only the first batch element (``labels[0]`` / ``token_ids[0]``) is
    inspected; all returned tensors are 1-D of length ``seq_len``.

    Args:
        labels: [B, seq_len]; positions equal to -100 are treated as
            non-response tokens.
        token_ids: [B, seq_len]
        deep_scaffold_sequences: output of ``build_deep_scaffold_sequences``
        fallback_block_size: block size (bd_size), default 32

    Returns:
        A 5-tuple ``(response_block_idx, turn_idx, n_blocks, scaffold_mask,
        block_to_section)``:

        - response_block_idx: int64 block index per position, -1 where no
          block was assigned.
        - turn_idx: int64 counter that increases by one each time
          ``response_block_idx`` changes between adjacent positions.
        - n_blocks: total number of blocks allocated.
        - scaffold_mask: bool mask, True at scaffold positions.
        - block_to_section: maps a section block index to its section name
          (empty when the sample has no response tokens).
    """
    labels_single = labels[0]
    token_list = token_ids[0].tolist()
    seq_len = labels_single.shape[0]
    device = labels.device

    response_mask = (labels_single != -100)
    response_block_idx = torch.full((seq_len,), -1, device=device, dtype=torch.int64)
    turn_idx = torch.zeros((seq_len,), device=device, dtype=torch.int64)
    scaffold_mask = torch.zeros((seq_len,), device=device, dtype=torch.bool)

    response_positions = response_mask.nonzero(as_tuple=True)[0]
    if len(response_positions) == 0:
        # Keep the same 5-tuple shape as the normal path so callers can
        # unpack the result unconditionally.
        return response_block_idx, turn_idx, 0, scaffold_mask, {}

    resp_start = response_positions[0].item()
    resp_end = response_positions[-1].item() + 1
    effective_resp_end = resp_end
    resp_tokens = token_list[resp_start:resp_end]

    top_seqs = deep_scaffold_sequences["top"]

    # ── Step 1: Find top-level section boundaries (same as static version) ──
    boundary_order = [
        ("prefix", "critical_objects"),
        ("between_co_exp", "explanation"),
        ("between_exp_fmb", "future_meta_behavior"),
        ("between_fmb_traj", "trajectory"),
    ]

    # Section name -> (start, end) range, relative to resp_start.
    sections: Dict[str, Tuple[int, int]] = {}
    # Scaffold positions, relative to resp_start. NOTE(review):
    # _mark_scaffold_range presumably appends `length` consecutive positions
    # starting at the given offset — confirm against its definition.
    scaffold_positions: List[int] = []
    # Top-level boundary scaffold tokens should belong to the *following*
    # section's first block (e.g. `"explanation":` -> explanation block 0).
    boundary_scaffold_to_section: Dict[str, List[int]] = {}

    search_from = 0
    prev_section_name: Optional[str] = None
    prev_value_start: Optional[int] = None

    for boundary_key, section_name in boundary_order:
        pattern = top_seqs.get(boundary_key)
        if pattern is None:
            continue
        pos = _find_subseq(resp_tokens, pattern, search_from)
        if pos < 0:
            continue

        # The previous section ends where this boundary begins.
        if prev_section_name is not None and prev_value_start is not None:
            sections[prev_section_name] = (prev_value_start, pos)

        _mark_scaffold_range(scaffold_positions, pos, len(pattern))
        boundary_scaffold_to_section.setdefault(section_name, []).extend(
            range(pos, pos + len(pattern))
        )

        value_start = pos + len(pattern)
        prev_section_name = section_name
        prev_value_start = value_start
        search_from = value_start

    # The last section found runs to the end of the response.
    if prev_section_name is not None and prev_value_start is not None:
        sections[prev_section_name] = (prev_value_start, len(resp_tokens))

    # New dataset compatibility: response may contain only trajectory.
    # If the 4-section boundaries are not found, try direct trajectory key match.
    if "trajectory" not in sections:
        traj_only_patterns = deep_scaffold_sequences.get("traj_only_boundaries", [])
        # Reuse legacy boundary pattern as additional fallback (contains
        # `"trajectory":` in old-format responses).
        between_fmb_traj = top_seqs.get("between_fmb_traj")
        if between_fmb_traj:
            traj_only_patterns = list(traj_only_patterns) + [between_fmb_traj]

        traj_pos = -1
        traj_pat: Optional[List[int]] = None
        for pat in traj_only_patterns:
            if not pat:
                continue
            pos = _find_subseq(resp_tokens, pat, 0)
            if pos >= 0:
                traj_pos = pos
                traj_pat = pat
                break

        if traj_pos >= 0 and traj_pat is not None:
            _mark_scaffold_range(scaffold_positions, traj_pos, len(traj_pat))
            boundary_scaffold_to_section.setdefault("trajectory", []).extend(
                range(traj_pos, traj_pos + len(traj_pat))
            )
            sections["trajectory"] = (traj_pos + len(traj_pat), len(resp_tokens))

    # ── Step 2: Deep scaffold within critical_objects ──
    if "critical_objects" in sections:
        co_start, co_end = sections["critical_objects"]
        co_tokens = resp_tokens[co_start:co_end]

        co_search = 0
        for entry in deep_scaffold_sequences["co_subkeys"]:
            pattern = entry["pattern"]
            pos = _find_subseq(co_tokens, pattern, co_search)
            if pos < 0:
                continue
            _mark_scaffold_range(scaffold_positions, co_start + pos, len(pattern))
            # The single value token is right after the pattern — skip it
            co_search = pos + len(pattern) + 1

        # Mark closing '"}' or "}," as scaffold; only the tail of the section
        # is searched.
        co_close = deep_scaffold_sequences["co_closing"]
        close_pos = _find_subseq(
            co_tokens, co_close, max(0, len(co_tokens) - len(co_close) - 2)
        )
        if close_pos >= 0:
            _mark_scaffold_range(scaffold_positions, co_start + close_pos, len(co_close))
        else:
            # json.dumps may produce "}," as a single token
            co_close_comma = deep_scaffold_sequences.get("co_closing_comma")
            if co_close_comma:
                close_pos = _find_subseq(
                    co_tokens,
                    co_close_comma,
                    max(0, len(co_tokens) - len(co_close_comma) - 2),
                )
                if close_pos >= 0:
                    _mark_scaffold_range(
                        scaffold_positions, co_start + close_pos, len(co_close_comma)
                    )

    # ── Step 2b: Explanation opening/closing quotes as scaffold ──
    # Explanation content is all VALUE, but the surrounding quotes must be
    # SCAFFOLD so that VALUE tokens are exactly block-aligned (multiple of bd_size).
    if "explanation" in sections:
        exp_start, exp_end = sections["explanation"]
        if exp_start < exp_end:
            # Opening quote: first token of explanation section (e.g. ' "')
            scaffold_positions.append(exp_start)
            # Closing quote+comma: last token (e.g. '",')
            scaffold_positions.append(exp_end - 1)

    # ── Step 3: Deep scaffold within future_meta_behavior ──
    # After dataloader processing, FMB has no <|mdm_start|>/<|mdm_end|> markers.
    # Format: ' {"longitudinal": "keep speed", "lateral": "go straight"}'
    # Strategy: use fmb_prefix to find start, fmb_between to split long/lat values,
    # and fmb_closing to find end. Everything except value content is scaffold.
    if "future_meta_behavior" in sections:
        fmb_start, fmb_end = sections["future_meta_behavior"]
        fmb_tokens = resp_tokens[fmb_start:fmb_end]

        # Positions relative to fmb_start; a set de-duplicates overlaps.
        fmb_scaffold_positions = set()

        # 1. Mark fmb_prefix as scaffold: ' {"longitudinal": "'
        fmb_prefix = deep_scaffold_sequences["fmb_prefix"]
        prefix_pos = _find_subseq(fmb_tokens, fmb_prefix, 0)
        if prefix_pos >= 0:
            fmb_scaffold_positions.update(range(prefix_pos, prefix_pos + len(fmb_prefix)))

        # Start looking for the lateral key right after the longitudinal
        # prefix. If the prefix was not found, scan the section from its
        # start instead of deriving an offset from the -1 sentinel.
        long_value_start = prefix_pos + len(fmb_prefix) if prefix_pos >= 0 else 0

        # 2. Mark fmb_between as scaffold: '", "lateral": "'
        fmb_between = deep_scaffold_sequences.get("fmb_between")
        if fmb_between:
            between_pos = _find_subseq(fmb_tokens, fmb_between, long_value_start)
            if between_pos >= 0:
                fmb_scaffold_positions.update(
                    range(between_pos, between_pos + len(fmb_between))
                )

        # 3. Mark closing '"}' or "}," as scaffold
        fmb_close = deep_scaffold_sequences["fmb_closing"]
        close_pos = _find_subseq(
            fmb_tokens, fmb_close, max(0, len(fmb_tokens) - len(fmb_close) - 2)
        )
        if close_pos < 0:
            fmb_close_comma = deep_scaffold_sequences.get("fmb_closing_comma")
            if fmb_close_comma:
                close_pos = _find_subseq(
                    fmb_tokens,
                    fmb_close_comma,
                    max(0, len(fmb_tokens) - len(fmb_close_comma) - 2),
                )
                if close_pos >= 0:
                    fmb_close = fmb_close_comma
        if close_pos >= 0:
            fmb_scaffold_positions.update(range(close_pos, close_pos + len(fmb_close)))

        scaffold_positions.extend(fmb_start + i for i in fmb_scaffold_positions)

    # ── Step 4: Deep scaffold within trajectory ──
    # After dataloader processing (no mdm markers), trajectory is:
    # ' "[[+14.70,-00.04], [+29.55,-00.21], ...]"'
    if "trajectory" in sections:
        traj_start, traj_end = sections["trajectory"]
        traj_tokens = resp_tokens[traj_start:traj_end]

        # Opening "[[
        traj_open = deep_scaffold_sequences["traj_open"]
        open_pos = _find_subseq(traj_tokens, traj_open, 0)
        if open_pos >= 0:
            _mark_scaffold_range(scaffold_positions, traj_start + open_pos, len(traj_open))

        # Waypoint separators ], (4 of them between 5 waypoints)
        traj_wp_sep = deep_scaffold_sequences["traj_wp_sep"]
        sep_search = 0
        for _ in range(4):
            sep_pos = _find_subseq(traj_tokens, traj_wp_sep, sep_search)
            if sep_pos < 0:
                break
            _mark_scaffold_range(scaffold_positions, traj_start + sep_pos, len(traj_wp_sep))
            sep_search = sep_pos + len(traj_wp_sep)

        # Intermediate waypoint opening ' [' (4 of them, between 5 waypoints)
        traj_wp_open = deep_scaffold_sequences.get("traj_wp_open")
        if traj_wp_open:
            wo_search = 0
            for _ in range(4):
                wo_pos = _find_subseq(traj_tokens, traj_wp_open, wo_search)
                if wo_pos < 0:
                    break
                _mark_scaffold_range(
                    scaffold_positions, traj_start + wo_pos, len(traj_wp_open)
                )
                wo_search = wo_pos + len(traj_wp_open)

        # Coordinate comma ',' between x and y within each waypoint (5 of them)
        traj_coord_comma = deep_scaffold_sequences.get("traj_coord_comma")
        if traj_coord_comma:
            cc_search = 0
            for _ in range(5):
                cc_pos = _find_subseq(traj_tokens, traj_coord_comma, cc_search)
                if cc_pos < 0:
                    break
                _mark_scaffold_range(
                    scaffold_positions, traj_start + cc_pos, len(traj_coord_comma)
                )
                cc_search = cc_pos + len(traj_coord_comma)

        # Closing ]]" or just ]]
        traj_close = deep_scaffold_sequences["traj_close"]
        close_pos = _find_subseq(
            traj_tokens, traj_close, max(0, len(traj_tokens) - len(traj_close) - 6)
        )
        if close_pos < 0:
            # Try the alternative tokenizations of the closing sequence.
            for split_key in ["traj_close_split", "traj_close_split2"]:
                tcs = deep_scaffold_sequences.get(split_key)
                if tcs:
                    close_pos = _find_subseq(
                        traj_tokens, tcs, max(0, len(traj_tokens) - len(tcs) - 6)
                    )
                    if close_pos >= 0:
                        traj_close = tcs
                        break
        if close_pos >= 0:
            _mark_scaffold_range(scaffold_positions, traj_start + close_pos, len(traj_close))
            # Align training with inference scaffold: exclude trailing tokens
            # after the JSON closing of trajectory (e.g. "<|im_end|>\n") from
            # section/block scheduling.
            effective_resp_end = min(
                effective_resp_end,
                resp_start + traj_start + close_pos + len(traj_close),
            )

        # Opening quote " (first token of traj value)
        if len(traj_tokens) > 0:
            scaffold_positions.append(traj_start)

    # ── Mark scaffold mask (absolute positions) ──
    scaffold_positions_set = set(scaffold_positions)
    for sp in scaffold_positions_set:
        abs_pos = resp_start + sp
        if abs_pos < seq_len:
            scaffold_mask[abs_pos] = True

    # ── Assign block indices per section ──
    current_block = 0
    assigned = set()  # absolute positions that already carry a block index
    block_to_section: Dict[int, str] = {}  # block_idx -> section_name (for SASD compatibility)
    section_first_block: Dict[str, int] = {}

    for section_name in SECTION_KEYS:
        if section_name not in sections:
            continue

        rel_start, rel_end = sections[section_name]
        abs_start = max(resp_start + rel_start, resp_start)
        abs_end = min(resp_start + rel_end, effective_resp_end)

        if abs_end - abs_start <= 0:
            continue

        # Count only non-scaffold tokens for block sizing
        value_positions = [
            p for p in range(abs_start, abs_end)
            if response_mask[p] and (p - resp_start) not in scaffold_positions_set
        ]
        num_value_tokens = len(value_positions)

        if num_value_tokens <= 0:
            # A section made only of scaffold still reserves one block so its
            # scaffold tokens have a block to attach to.
            section_first_block[section_name] = current_block
            block_to_section[current_block] = section_name
            current_block += 1
            continue

        # Use fixed block size (bd_size) and compute number of blocks dynamically
        tokens_per_step = fallback_block_size
        n_steps = max(1, math.ceil(num_value_tokens / tokens_per_step))

        for b in range(n_steps):
            block_to_section[current_block + b] = section_name

        section_first_block[section_name] = current_block
        for vi, pos in enumerate(value_positions):
            block_in_section = min(vi // tokens_per_step, n_steps - 1)
            response_block_idx[pos] = current_block + block_in_section
            assigned.add(pos)

        current_block += n_steps

    # Assign scaffold tokens within each section to the nearest value token
    # in the SAME section. This keeps section-closing tokens such as `"},`
    # with their section instead of drifting to the next section.
    for section_name in SECTION_KEYS:
        if section_name not in sections:
            continue
        rel_start, rel_end = sections[section_name]
        abs_start = max(resp_start + rel_start, resp_start)
        abs_end = min(resp_start + rel_end, resp_end)
        if abs_end <= abs_start:
            continue

        # Widest distance worth probing inside this section.
        max_delta = max(1, abs_end - abs_start)

        for abs_pos in range(abs_start, abs_end):
            rel_pos = abs_pos - resp_start
            if (
                abs_pos >= seq_len
                or not response_mask[abs_pos]
                or abs_pos in assigned
                or rel_pos not in scaffold_positions_set
            ):
                continue

            best_block = -1
            for delta in range(1, max_delta + 1):
                # Prefer left first so closing punctuation tends to stay with
                # the preceding content in the same section.
                for cand in [abs_pos - delta, abs_pos + delta]:
                    if abs_start <= cand < abs_end and cand in assigned:
                        best_block = response_block_idx[cand].item()
                        break
                if best_block >= 0:
                    break

            if best_block < 0:
                best_block = section_first_block.get(section_name, -1)

            if best_block >= 0:
                response_block_idx[abs_pos] = best_block
                assigned.add(abs_pos)

    # Top-level boundary tokens are explicitly attached to the following
    # section's first block, instead of nearest-neighbor assignment.
    for section_name, rel_positions in boundary_scaffold_to_section.items():
        first_block = section_first_block.get(section_name)
        if first_block is None:
            continue
        for rel_pos in rel_positions:
            abs_pos = resp_start + rel_pos
            if abs_pos >= seq_len or not response_mask[abs_pos]:
                continue
            response_block_idx[abs_pos] = first_block
            assigned.add(abs_pos)

    # Scaffold tokens → block index of nearest assigned neighbour
    # (right neighbour is tried before left at each distance).
    for sp in scaffold_positions_set:
        abs_pos = resp_start + sp
        if (
            abs_pos >= seq_len
            or abs_pos >= effective_resp_end
            or not response_mask[abs_pos]
            or abs_pos in assigned
        ):
            continue
        best_block = -1
        for delta in range(1, seq_len):
            for cand in [abs_pos + delta, abs_pos - delta]:
                if 0 <= cand < seq_len and cand in assigned:
                    best_block = response_block_idx[cand].item()
                    break
            if best_block >= 0:
                break
        if best_block >= 0:
            response_block_idx[abs_pos] = best_block
            assigned.add(abs_pos)

    # Fallback for unassigned response tokens: fixed-size blocks numbered
    # after the section blocks, based on the offset within the response.
    for pos in range(resp_start, effective_resp_end):
        if response_mask[pos] and pos not in assigned:
            offset = pos - resp_start
            response_block_idx[pos] = current_block + offset // fallback_block_size
            assigned.add(pos)

    fallback_positions = [
        p for p in range(resp_start, effective_resp_end)
        if response_mask[p] and response_block_idx[p].item() >= current_block
    ]
    if fallback_positions:
        current_block = max(response_block_idx[p].item() for p in fallback_positions) + 1

    n_blocks = current_block

    # Turn index: running count of block-index changes between neighbours.
    # Equivalent to walking the sequence and bumping a counter on each change.
    if seq_len > 1:
        block_changes = response_block_idx[1:] != response_block_idx[:-1]
        turn_idx[1:] = torch.cumsum(block_changes.to(torch.int64), dim=0)

    return response_block_idx, turn_idx, n_blocks, scaffold_mask, block_to_section
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "|<MASK>|",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "eos_token": {
12
+ "content": "<|im_end|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "pad_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ }
25
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a25ec1183126b2a0a76961dba7680d62b2209776fc31d39b85be8833b9386ae9
3
+ size 11422266
tokenizer_config.json ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "|<MASK>|",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|NULL|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ }
197
+ },
198
+ "additional_special_tokens": [
199
+ "|<MASK>|"
200
+ ],
201
+ "bos_token": null,
202
+ "clean_up_tokenization_spaces": false,
203
+ "eos_token": "<|im_end|>",
204
+ "errors": "replace",
205
+ "extra_special_tokens": {},
206
+ "model_max_length": 131072,
207
+ "pad_token": "<|endoftext|>",
208
+ "padding_side": "right",
209
+ "split_special_tokens": false,
210
+ "tokenizer_class": "Qwen2Tokenizer",
211
+ "unk_token": null
212
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff