Instructions to use xiwenyoumu/Fast-dDrive with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use xiwenyoumu/Fast-dDrive with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="xiwenyoumu/Fast-dDrive", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("xiwenyoumu/Fast-dDrive", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use xiwenyoumu/Fast-dDrive with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "xiwenyoumu/Fast-dDrive"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xiwenyoumu/Fast-dDrive",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/xiwenyoumu/Fast-dDrive

SGLang

How to use xiwenyoumu/Fast-dDrive with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "xiwenyoumu/Fast-dDrive" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xiwenyoumu/Fast-dDrive",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "xiwenyoumu/Fast-dDrive" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xiwenyoumu/Fast-dDrive",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use xiwenyoumu/Fast-dDrive with Docker Model Runner:
```
docker model run hf.co/xiwenyoumu/Fast-dDrive
```

xiwenyoumu committed on about 17 hours ago

Commit

5e9a603

verified ·

1 Parent(s): b1bd585

Initial Fast-dDrive 3B release

Browse files

Files changed (22) hide show

.gitattributes +1 -0
README.md +89 -0
__init__.py +33 -0
added_tokens.json +26 -0
chat_template.jinja +7 -0
config.json +199 -0
configuration.py +201 -0
generation_config.json +11 -0
generation_utils.py +1192 -0
merges.txt +0 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +833 -0
modeling.py +0 -0
preprocessor_config.json +19 -0
section_utils.py +803 -0
special_tokens_map.json +25 -0
tokenizer.json +3 -0
tokenizer_config.json +212 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,92 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
+library_name: transformers
+pipeline_tag: image-text-to-text
+tags:
+- block-diffusion
+- vision-language-action
+- autonomous-driving
+- qwen2.5-vl
 ---
+# Fast-dDrive
+Fast-dDrive is a block-diffusion Vision-Language-Action (VLA) model for
+end-to-end autonomous driving, built on Qwen2.5-VL-3B. It pairs section-aware
+structured-diffusion training (SASD) with scaffold-aware speculative decoding
+(Scaffold Spec) and an optional shared-prefix multi-trajectory inference
+scaling scheme, and reaches SOTA accuracy on the Waymo Open Dataset
+End-to-End Driving (WOD-E2E) benchmark at over 200 tokens / second on a
+single H100.
+## Quick start
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+MODEL = "Efficient-Large-Model/Fast_dDrive_3B"   # or your local clone
+processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL,
+    trust_remote_code=True,
+    dtype=torch.bfloat16,
+).cuda().eval()
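+# NOTE: `input_ids`, `attention_mask`, `pixel_values` and `image_grid_thw` are not
+# defined in this snippet; they are assumed to be the tensors returned by
+# `processor` for your prompt and images, moved to the model's device.
+# Scaffold Spec (paper canonical, confidence_threshold = 0.0)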
+output_ids = model.scaffold_speculative_sample(
+    input_ids=input_ids,
+    attention_mask=attention_mask,
+    pixel_values=pixel_values,
+    image_grid_thw=image_grid_thw,
+    confidence_threshold=0.0,
+    block_size=32,
+    max_new_tokens=512,
+)
+```
+## Inference paths
+This release exposes three decoding paths as bound methods on the model:
+| Method | Description | Threshold |
+|---|---|---|
+| `mdm_sample_deep_scaffold` | **Section Diffusion (SD)** — iterative MDM denoising over a pre-filled JSON scaffold | `0.9` |
+| `scaffold_speculative_sample` | **Scaffold Spec (SS)** — scaffold-aware self-speculative decoding (MDM draft + AR verify per block). Paper canonical. | `0.0` |
+| `scaffold_spec_with_ss_multi_traj` | **SS multi-rollout** — shared-prefix N-rollout inference scaling on the trajectory section | `0.0` |
+> **Important:** `scaffold_speculative_sample` and its multi-traj variant must
+> be run with `confidence_threshold=0.0` to reproduce the paper numbers.
+> Running at `0.9` silently degrades both ADE and throughput.
+## Headline results — WOD-E2E test set (single H100)
+| Mode | RFS ↑ | ADE@3s ↓ | ADE@5s ↓ | TPS ↑ | Tok/Step ↑ |
+|---|---|---|---|---|---|
+| Scaffold Spec | 7.823 | 1.254 | 2.907 | 210.4 | 4.90 |
+| + Inference scaling (N=4) | 7.827 | 1.240 | 2.821 | 114.7 | 2.76 |
+On the WOD-E2E val set, Scaffold Spec runs at 1919 ms / sample (4.1× faster
+than the AR baseline); fused with SGLang, the same configuration drops to
+665 ms / sample at 608.5 TPS — the 11.8× (≈12×) speedup over AR cited in the
+paper.
+## Files
+- `modeling.py` — model definition (`Fast_dDriveForConditionalGeneration`)
+- `configuration.py` — config classes
+- `section_utils.py` — scaffold construction + section-aligned block index utilities
+- `generation_utils.py` — the three inference paths, attached to the model class on import
+- `config.json`, `generation_config.json`, `preprocessor_config.json`, `chat_template.jinja`, tokenizer files — standard HF artifacts
+- `model-0000{1..4}-of-00004.safetensors` — model weights (4 shards)
+## Citation
+```bibtex
+@misc{fastddrive2026,
+  title  = {Fast-dDrive: Section-Aware Diffusion VLAs for End-to-End Driving},
+  author = {Anonymous},
+  year   = {2026},
+  note   = {Submitted to NeurIPS 2026},
+}
+```

__init__.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""Fast-dDrive HF release package.
+Mirrors the layout of ``Efficient-Large-Model/Fast_dVLM_3B`` on the Hugging Face
+Hub: a :mod:`modeling` module that holds the model definition
+(``Fast_dDriveForConditionalGeneration``) and a :mod:`configuration` module
+with the config classes. The inference-time decoding paths (Section
+Diffusion, Scaffold Spec, and Scaffold Spec with multi-trajectory rollouts)
+are implemented in :mod:`generation_utils`, which attaches them to the model
+class as bound methods when it is imported.
+Users normally load the model via::
+    from transformers import AutoModelForCausalLM
+    model = AutoModelForCausalLM.from_pretrained(
+        "Efficient-Large-Model/Fast_dDrive",  # or local path
+        trust_remote_code=True,
+    )
+with the ``auto_map`` entry in :file:`config.json` pointing back to the
+:mod:`configuration` and :mod:`modeling` classes re-exported here.
+"""
+from .configuration import (
+    Fast_dDriveConfig,
+    Fast_dDriveTextConfig,
+    Fast_dDriveVisionConfig,
+)
+from .modeling import Fast_dDriveForConditionalGeneration
+__all__ = [
+    "Fast_dDriveConfig",
+    "Fast_dDriveTextConfig",
+    "Fast_dDriveVisionConfig",
+    "Fast_dDriveForConditionalGeneration",
+]

added_tokens.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|NULL|>": 151666,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652,
+  "|<MASK>|": 151665
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,199 @@

+{
+  "always_mask_im_end": true,
+  "anneal_block_size": false,
+  "architectures": [
+    "Fast_dDriveForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bd_size": 32,
+  "block_causal_no_dynamic": false,
+  "complementary_mask": true,
+  "deep_json_scaffold": true,
+  "dtype": "float32",
+  "enable_efficient_vision_embed": false,
+  "entropy_loss": false,
+  "entropy_loss_weight": 1.0,
+  "eos_token_id": 151645,
+  "flexible_bd_size": false,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 70,
+  "minimum_noise_level": 0.001,
+  "model_type": "fast_d_drive",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "section_block_steps": null,
+  "section_loss_weights": {
+    "critical_objects": 1.5,
+    "explanation": 1.0,
+    "future_meta_behavior": 2.0,
+    "trajectory": 3.0
+  },
+  "section_noise_schedule": {
+    "critical_objects": "1.0,2.0",
+    "explanation": "1.0,1.0",
+    "future_meta_behavior": "1.0,1.5",
+    "trajectory": "2.0,1.0"
+  },
+  "section_token_budgets": null,
+  "sliding_window": 32768,
+  "static_json_scaffold": false,
+  "text_config": {
+    "always_mask_im_end": true,
+    "anneal_block_size": false,
+    "architectures": [
+      "Fast_dDriveForConditionalGeneration"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration.Fast_dDriveConfig",
+      "AutoModel": "modeling.Fast_dDriveForConditionalGeneration",
+      "AutoModelForCausalLM": "modeling.Fast_dDriveForConditionalGeneration"
+    },
+    "bd_size": 32,
+    "block_causal_no_dynamic": false,
+    "block_length": null,
+    "bos_token_id": 151643,
+    "complementary_mask": true,
+    "deep_json_scaffold": true,
+    "dtype": "float32",
+    "enable_efficient_vision_embed": false,
+    "entropy_loss": false,
+    "entropy_loss_weight": 1.0,
+    "eos_token_id": 151645,
+    "flexible_bd_size": false,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "image_token_id": null,
+    "initializer_range": 0.02,
+    "intermediate_size": 11008,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 128000,
+    "max_window_layers": 70,
+    "minimum_noise_level": 0.001,
+    "model_type": "fast_d_drive_for_causal_lm",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_section": [
+        16,
+        24,
+        24
+      ],
+      "rope_type": "default",
+      "type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "section_block_steps": null,
+    "section_token_budgets": null,
+    "self_spec_inference_mode": null,
+    "sliding_window": null,
+    "static_json_scaffold": false,
+    "tie_word_embeddings": true,
+    "use_block_causal_mask": true,
+    "use_cache": true,
+    "use_json_scaffold": true,
+    "use_sliding_window": false,
+    "video_token_id": null,
+    "vision_end_token_id": 151653,
+    "vision_start_token_id": 151652,
+    "vision_token_id": 151654,
+    "vocab_size": 151936
+  },
+  "transformers_version": "4.57.1",
+  "use_block_causal_mask": true,
+  "use_cache": true,
+  "use_json_scaffold": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "depth": 32,
+    "dtype": "float32",
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "in_channels": 3,
+    "in_chans": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 3420,
+    "model_type": "fast_d_drive",
+    "num_heads": 16,
+    "out_hidden_size": 2048,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "temporal_patch_size": 2,
+    "tokens_per_second": 2,
+    "window_size": 112
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 151936,
+  "auto_map": {
+    "AutoConfig": "configuration.Fast_dDriveConfig",
+    "AutoModel": "modeling.Fast_dDriveForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling.Fast_dDriveForConditionalGeneration"
+  }
+}

configuration.py ADDED Viewed

	@@ -0,0 +1,201 @@

+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+class Fast_dDriveVisionConfig(PretrainedConfig):
+    model_type = "fast_d_drive"
+    base_config_key = "vision_config"
+    def __init__(
+        self,
+        depth=32,
+        hidden_size=3584,
+        hidden_act="silu",
+        intermediate_size=3420,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        tokens_per_second=4,
+        window_size=112,
+        out_hidden_size=3584,
+        fullatt_block_indexes=[7, 15, 23, 31],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.tokens_per_second = tokens_per_second
+        self.window_size = window_size
+        self.fullatt_block_indexes = fullatt_block_indexes
+        self.out_hidden_size = out_hidden_size
+        self.initializer_range = initializer_range
+class Fast_dDriveTextConfig(PretrainedConfig):
+    """Configuration of the Fast-dDrive language-model sub-config.
+    Named arguments are stored as attributes of the same name, with these
+    exceptions visible in ``__init__``:
+    * ``sliding_window`` is kept only when ``use_sliding_window`` is true,
+      otherwise it is stored as ``None``.
+    * ``num_key_value_heads=None`` falls back to ``num_attention_heads``.
+    * ``layer_types=None`` is replaced by a per-layer list derived from
+      ``sliding_window`` / ``max_window_layers`` and then validated.
+    * a legacy ``rope_scaling["type"]`` entry is copied to ``"rope_type"``
+      (``"mrope"`` is rewritten to ``"default"`` first).
+    ``tie_word_embeddings`` and every keyword argument that is not a named
+    parameter (the shipped ``config.json`` carries several, e.g.
+    ``deep_json_scaffold`` and ``use_json_scaffold``) are forwarded to
+    ``PretrainedConfig.__init__``, which is called last.
+    The block-diffusion specific options (``bd_size``, ``block_length``,
+    ``use_block_causal_mask``, ``complementary_mask``,
+    ``minimum_noise_level``, ``entropy_loss``, ``entropy_loss_weight``,
+    ``block_causal_no_dynamic``, ``self_spec_inference_mode``) are only stored
+    here; their semantics are defined by the modeling / generation code and
+    are not visible in this module.
+    """
+    model_type = "fast_d_drive_for_causal_lm"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Sharding style per module-name pattern (presumably consumed by the
+    # transformers tensor-parallel support; confirm against the library).
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    # For each top-level sub-module: (input names, output names).
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=8192,
+        intermediate_size=29568,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=80,
+        layer_types=None,
+        attention_dropout=0.0,
+        rope_scaling=None,
+        image_token_id=None,
+        video_token_id=None,
+        bd_size=8,
+        self_spec_inference_mode=None,
+        block_length=None,
+        use_block_causal_mask=False,
+        complementary_mask=True,
+        minimum_noise_level=1e-3,
+        entropy_loss=False,
+        entropy_loss_weight=1.0,
+        block_causal_no_dynamic=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        # The window size is discarded (None) unless sliding-window attention is enabled.
+        self.sliding_window = sliding_window if self.use_sliding_window else None
+        self.max_window_layers = max_window_layers
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        self.bd_size = bd_size
+        self.layer_types = layer_types
+        self.use_block_causal_mask = use_block_causal_mask
+        self.complementary_mask = complementary_mask
+        self.minimum_noise_level = minimum_noise_level
+        self.entropy_loss = entropy_loss
+        self.entropy_loss_weight = entropy_loss_weight
+        self.block_causal_no_dynamic = block_causal_no_dynamic
+        self.self_spec_inference_mode = self_spec_inference_mode
+        self.block_length = block_length
+        # Default layout: layers at index >= max_window_layers use sliding
+        # attention, but only when a sliding window is active; all others
+        # use full attention.
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
+        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
+        # TODO: @raushan update config in the hub
+        # NOTE: the dict passed as ``rope_scaling`` is modified in place here.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            if self.rope_scaling["type"] == "mrope":
+                self.rope_scaling["type"] = "default"
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        # Base-class init runs last and receives every kwarg not consumed above.
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+class Fast_dDriveConfig(PretrainedConfig):
+    model_type = "fast_d_drive"
+    sub_configs = {"vision_config": Fast_dDriveVisionConfig, "text_config": Fast_dDriveTextConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        enable_efficient_vision_embed=False,
+        always_mask_im_end=False,
+        flexible_bd_size=False,
+        anneal_block_size=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init `TextConfig`
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.enable_efficient_vision_embed = enable_efficient_vision_embed
+        self.always_mask_im_end = always_mask_im_end
+        self.flexible_bd_size = flexible_bd_size
+        self.anneal_block_size = anneal_block_size
+        super().__init__(**kwargs)
+    # def to_dict(self):
+    #     output = super().to_dict()
+    #     output.pop("auto_map", None)
+    #     return output
+__all__ = ["Fast_dDriveConfig", "Fast_dDriveTextConfig"]

generation_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 1e-06,
+  "transformers_version": "4.57.1"
+}

generation_utils.py ADDED Viewed

	@@ -0,0 +1,1192 @@

+"""Generation utilities for Fast-dDrive.
+This module provides the three inference paths exposed by the canonical paper
+release:
+* ``mdm_sample_deep_scaffold`` — Section Diffusion (SD): iterative MDM
+  denoising over a pre-filled JSON scaffold, no AR verification.
+* ``scaffold_speculative_sample`` — Scaffold Spec (SS): scaffold-aware
+  self-speculative decoding (MDM draft + AR verify per block).
+* ``scaffold_spec_with_ss_multi_traj`` — SS with shared-prefix multi-trajectory
+  rollouts (the test-time inference-scaling path).
+All three are attached as bound methods on
+:class:`Fast_dDriveForConditionalGeneration` when this module is
+imported (see ``modeling.py`` for the import hook).
+"""
+import os
+import re
+import sys
+import math
+import torch
+import types
+import numpy as np
+from transformers.cache_utils import DynamicCache
+def _crop_cache(past_key_values, max_length: int):
+    """Crop a DynamicCache to max_length tokens, compatible with Qwen cache layout."""
+    new_past_key_values = []
+    for layer_num in range(len(past_key_values)):
+        layer_past_key_values = ()
+        for kv_idx in range(len(past_key_values[layer_num])):
+            layer_past_key_values += (past_key_values[layer_num][kv_idx][:, :, :max_length, :],)
+        new_past_key_values.append(layer_past_key_values)
+    return DynamicCache(new_past_key_values)
+def _sample_from_logits(logits, temperature=0.0):
+    """Sample token ids from logits with optional temperature scaling.
+    When temperature <= 0, falls back to argmax (greedy).
+    """
+    if temperature <= 0:
+        return logits.argmax(dim=-1)
+    scaled = logits / temperature
+    probs = torch.softmax(scaled, dim=-1)
+    original_shape = probs.shape[:-1]
+    flat_probs = probs.reshape(-1, probs.shape[-1])
+    sampled = torch.multinomial(flat_probs, num_samples=1).squeeze(-1)
+    return sampled.reshape(original_shape)
+# ---------------------------------------------------------------------------
+# mdm_sample_deep_scaffold — Section Diffusion (SD)
+# ---------------------------------------------------------------------------
+def mdm_sample_deep_scaffold(
+    self,
+    input_ids,
+    tokenizer,
+    max_tokens=512,
+    pixel_values=None,
+    image_grid_thw=None,
+    mask_id=151665,
+    null_id=151666,
+    threshold=0.9,
+    stop_token=151645,
+    explanation_block_size=32,
+    explanation_max_blocks=6,
+    block_size=32,
+    return_stats=False,
+    use_kv_cache=True,
+    temperature=0.0,
+):
+    """
+    Deep scaffold MDM generation with train-consistent hybrid block causal mask.
+    Pre-fills the entire JSON scaffold (including sub-keys for critical_objects,
+    future_meta_behavior, trajectory) with MASK tokens at value positions only.
+    Then denoises each section's value tokens via iterative unmasking.
+    The attention mask matches training: prompt tokens use causal attention,
+    response tokens use block-causal attention where each section's denoise
+    steps form separate blocks. Block i can see all prompt tokens and blocks
+    0..i, but NOT blocks i+1..N (which still contain MASK tokens).
+    For explanation (variable length), NULL tokens in the output signal that
+    the section content is complete — trailing NULLs are stripped.
+    KV-cache path (``use_kv_cache=True``, default):
+        Prompt K/V is computed once with vision embedding scatter, then each
+        response block, once fully denoised, gets its K/V appended to the
+        cache. Subsequent blocks' iterative unmasking only forwards their
+        own ~block_size tokens against the cache (plus prior committed
+        blocks), avoiding O(seqlen^2) recomputation of the prompt + prior
+        blocks every iteration. Correctness is preserved because
+        block-causal attention means block k only attends to prompt +
+        blocks 0..k, which is exactly what the cache provides.
+    """
+    import math
+    import os as _os
+    from .section_utils import (
+        build_deep_json_scaffold,
+        SECTION_KEYS,
+        NULL_TOKEN_ID,
+    )
+    # Env override for A/B testing the KV cache path without editing code.
+    _kv_env = _os.environ.get("MDM_DS_USE_KV_CACHE")
+    if _kv_env is not None:
+        use_kv_cache = _kv_env not in ("0", "false", "False", "")
+    scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
+        tokenizer,
+        mask_id=mask_id,
+        null_id=null_id,
+        explanation_block_size=explanation_block_size,
+        explanation_max_blocks=explanation_max_blocks,
+    )
+    tokens_per_step = []
+    original_input_length = input_ids.shape[1]
+    # Phase 1: Build sequence with scaffold appended
+    scaffold_tensor = torch.tensor(scaffold_tokens, device=self.device, dtype=torch.long).unsqueeze(0)
+    x_t = torch.cat([input_ids, scaffold_tensor], dim=1)
+    seqlen = x_t.shape[1]
+    # Track scaffold (frozen) vs value (to denoise) positions in scaffold region
+    scaffold_frozen = torch.tensor(scaffold_mask_list, device=self.device, dtype=torch.bool)
+    # ── Build response_block_idx matching training's compute_section_block_idx_deep_static ──
+    response_block_idx = torch.full((seqlen,), -1, device=self.device, dtype=torch.long)
+    current_block = 0
+    assigned = set()
+    for section_name in SECTION_KEYS:
+        if section_name not in section_ranges:
+            continue
+        sec_start, sec_end = section_ranges[section_name]
+        # Find value positions (non-scaffold) in this section
+        value_positions = []
+        for i in range(sec_start, sec_end):
+            if not scaffold_mask_list[i]:  # 0 = value token
+                value_positions.append(original_input_length + i)
+        if not value_positions:
+            current_block += 1
+            continue
+        # Block assignment MUST match training's
+        # compute_section_block_idx_deep_static: n_blocks = ceil(value/block_size)
+        # for every section. Previously non-explanation sections were forced to
+        # a single block; that broke attention alignment for trajectory
+        # (70 value tokens → training 3 blocks vs inference 1 block), causing
+        # trajectory over-extrapolation. CO (12) and FMB (6) still resolve
+        # to 1 block since their value counts are < block_size.
+        tokens_per_step_sec = block_size
+        n_steps = max(1, math.ceil(len(value_positions) / tokens_per_step_sec))
+        # Assign block indices to value tokens
+        for vi, abs_pos in enumerate(value_positions):
+            block_in_section = min(vi // tokens_per_step_sec, n_steps - 1)
+            response_block_idx[abs_pos] = current_block + block_in_section
+            assigned.add(abs_pos)
+        # Assign scaffold tokens to nearest value token's block
+        for i in range(sec_start, sec_end):
+            abs_pos = original_input_length + i
+            if scaffold_mask_list[i] and abs_pos not in assigned:
+                best_block = -1
+                for delta in range(1, sec_end - sec_start + 10):
+                    for cand in [abs_pos + delta, abs_pos - delta]:
+                        if cand in assigned:
+                            best_block = response_block_idx[cand].item()
+                            break
+                    if best_block >= 0:
+                        break
+                if best_block >= 0:
+                    response_block_idx[abs_pos] = best_block
+                    assigned.add(abs_pos)
+        current_block += n_steps
+    # Assign any remaining unassigned scaffold tokens (e.g. top-level separators)
+    for i in range(len(scaffold_tokens)):
+        abs_pos = original_input_length + i
+        if abs_pos not in assigned:
+            # Find nearest assigned position
+            best_block = -1
+            for delta in range(1, seqlen):
+                for cand in [abs_pos + delta, abs_pos - delta]:
+                    if 0 <= cand < seqlen and cand in assigned:
+                        best_block = response_block_idx[cand].item()
+                        break
+                if best_block >= 0:
+                    break
+            if best_block >= 0:
+                response_block_idx[abs_pos] = best_block
+                assigned.add(abs_pos)
+    # ── Build hybrid block causal mask (computed once, reused for all forward passes) ──
+    attention_mask = self.model.eval_hybrid_mask(seqlen, response_block_idx).to(self.device)
+    # Section-MoE-LoRA: set section_ids before language model forward
+    set_section_ids = lambda *a, **kw: None  # noqa: E731  (Section-MoE-LoRA disabled in release)
+    # Map block indices to section IDs (0=CO, 1=Exp, 2=FMB, 3=Traj, 4=Other/Prompt)
+    _sec_ids = torch.full((seqlen,), 4, device=self.device, dtype=torch.long)
+    for section_name, (sec_start, sec_end) in section_ranges.items():
+        abs_start = original_input_length + sec_start
+        abs_end = original_input_length + sec_end
+        if section_name == "critical_objects":
+            _sec_ids[abs_start:abs_end] = 0
+        elif section_name == "explanation":
+            _sec_ids[abs_start:abs_end] = 1
+        elif section_name == "future_meta_behavior":
+            _sec_ids[abs_start:abs_end] = 2
+        elif section_name == "trajectory":
+            _sec_ids[abs_start:abs_end] = 3
+    # Add batch dimension
+    _sec_ids_batch = _sec_ids.unsqueeze(0)
+    set_section_ids(_sec_ids_batch)
+    # ── Precompute vision embeddings and position_ids once ──
+    # BUG FIX: Previously pixel_values was only passed on the first forward
+    # (step==0) but with use_cache=False every forward is independent, so all
+    # subsequent forwards lost vision information entirely.
+    _embed_fn = self.model.get_input_embeddings()
+    _cached_image_embeds = None
+    _cached_image_mask = None
+    if pixel_values is not None:
+        _cached_image_embeds = self.model.get_image_features(pixel_values, image_grid_thw)
+        _cached_image_embeds = torch.cat(_cached_image_embeds, dim=0).to(
+            self.device, _embed_fn.weight.dtype
+        )
+        _tmp_embeds = _embed_fn(x_t)
+        _cached_image_mask, _ = self.model.get_placeholder_mask(
+            x_t, inputs_embeds=_tmp_embeds, image_features=_cached_image_embeds
+        )
+    # Compute position_ids once with correct image_grid_thw (3D RoPE)
+    _position_ids, _rope_deltas = self.model.get_rope_index(
+        x_t, image_grid_thw, None
+    )
+    self.model.rope_deltas = _rope_deltas
+    # ── Compute contiguous block ranges in the response region ──
+    # Each block's absolute [start, end) range in x_t is the maximal
+    # contiguous span of positions sharing the same response_block_idx.
+    # Blocks are ordered by block_idx and cover the entire response.
+    _block_ranges = []  # list of (block_idx, abs_start, abs_end)
+    _cur_bi = None
+    _cur_start = None
+    for _p in range(seqlen):
+        _bi = int(response_block_idx[_p].item())
+        if _bi < 0:
+            if _cur_bi is not None:
+                _block_ranges.append((_cur_bi, _cur_start, _p))
+                _cur_bi, _cur_start = None, None
+            continue
+        if _cur_bi is None:
+            _cur_bi, _cur_start = _bi, _p
+        elif _bi != _cur_bi:
+            _block_ranges.append((_cur_bi, _cur_start, _p))
+            _cur_bi, _cur_start = _bi, _p
+    if _cur_bi is not None:
+        _block_ranges.append((_cur_bi, _cur_start, seqlen))
+    # Map block_idx -> section_name for downstream logic (section-specific
+    # behaviors like explanation NULL handling can still be scoped).
+    _block_idx_to_section = {}
+    for _sname, (_sstart, _send) in section_ranges.items():
+        _sabs_start = original_input_length + _sstart
+        _sabs_end = original_input_length + _send
+        for _bi, _bs, _be in _block_ranges:
+            # Assign section by whether the block's range overlaps the section
+            if _bs < _sabs_end and _be > _sabs_start:
+                _block_idx_to_section.setdefault(_bi, _sname)
+    # ── Phase 2: Denoise block-by-block with optional KV cache ──
+    # Without cache (fallback): each forward replays the entire sequence.
+    # With cache: prompt K/V computed once; each block's finalized K/V is
+    # appended after denoising, so later blocks only forward their own
+    # ~block_size tokens against the cache.
+    step = 0
+    past_kv = None
+    prev_last_logit = None  # logit at the position just before the next block
+    if use_kv_cache:
+        # Phase 0: prompt prefill. Includes vision scatter; cache becomes
+        # the reusable foundation for every scaffold block.
+        prompt_tokens = x_t[:, :original_input_length]
+        prompt_embeds = _embed_fn(prompt_tokens)
+        if _cached_image_embeds is not None:
+            prompt_image_mask = _cached_image_mask[:, :original_input_length]
+            prompt_embeds = prompt_embeds.masked_scatter(
+                prompt_image_mask, _cached_image_embeds
+            )
+        prompt_position_ids = _position_ids[..., :original_input_length]
+        # Causal over prompt (matches training's prompt-side attention).
+        # When attention_mask=None, the model's eval_mask auto-builds causal
+        # because use_block_causal_mask=True and update_kv_cache=True.
+        prompt_out = self.forward(
+            inputs_embeds=prompt_embeds,
+            position_ids=prompt_position_ids,
+            attention_mask=None,
+            past_key_values=None,
+            use_cache=True,
+            update_kv_cache=True,
+        )
+        past_kv = prompt_out.past_key_values
+        # Logit at position (original_input_length - 1); used to predict
+        # the first token of the first response block via causal shift.
+        prev_last_logit = prompt_out.logits[:, -1:, :]
+    # ── Iterate blocks in order ──
+    for _block_idx, block_abs_start, block_abs_end in _block_ranges:
+        B = block_abs_end - block_abs_start
+        section_name = _block_idx_to_section.get(_block_idx, None)
+        # Count MASK tokens in this block
+        block_slice = x_t[0, block_abs_start:block_abs_end]
+        n_masks_in_block = int((block_slice == mask_id).sum().item())
+        # ── Iterative unmasking within this block (if any MASKs) ──
+        if n_masks_in_block > 0:
+            max_iter = n_masks_in_block + 5  # safety limit
+            for _ in range(max_iter):
+                current_block_masks = (x_t[:, block_abs_start:block_abs_end] == mask_id)
+                if current_block_masks.sum() == 0:
+                    break
+                if use_kv_cache:
+                    # Feed only this block; past_kv covers prompt + prior blocks.
+                    block_tokens = x_t[:, block_abs_start:block_abs_end]
+                    block_embeds = _embed_fn(block_tokens)
+                    block_position_ids = _position_ids[..., block_abs_start:block_abs_end]
+                    L_cached = past_kv.get_seq_length() if past_kv is not None else 0
+                    # Block-causal + bidirectional-within-block ⇒ this
+                    # block's queries attend to all cached KV plus all
+                    # fresh block KV ⇒ all-True mask of shape [B, L+B].
+                    block_attn = torch.ones(
+                        B, L_cached + B, device=self.device, dtype=torch.bool
+                    )
+                    output = self.forward(
+                        inputs_embeds=block_embeds,
+                        attention_mask=block_attn,
+                        position_ids=block_position_ids,
+                        past_key_values=past_kv,
+                        use_cache=True,
+                        update_kv_cache=False,  # read-only during iteration
+                    )
+                    logits = output.logits  # [1, B, V]
+                    # Shift: pred for abs_pos uses logit at abs_pos-1.
+                    # logit at block_abs_start-1 is prev_last_logit; the
+                    # rest come from this forward's earlier positions.
+                    sec_logits = torch.cat([prev_last_logit, logits[:, :-1, :]], dim=1)
+                else:
+                    # Full-sequence forward (fallback path, same as before)
+                    _cur_embeds = _embed_fn(x_t)
+                    if _cached_image_embeds is not None:
+                        _cur_embeds = _cur_embeds.masked_scatter(
+                            _cached_image_mask, _cached_image_embeds
+                        )
+                    output = self.forward(
+                        input_ids=x_t,
+                        inputs_embeds=_cur_embeds,
+                        attention_mask=attention_mask,
+                        position_ids=_position_ids,
+                        use_cache=False,
+                    )
+                    logits = output.logits
+                    sec_logits = logits[:, block_abs_start:block_abs_end, :]
+                    sec_logits = torch.cat(
+                        [logits[:, block_abs_start - 1:block_abs_start, :],
+                         sec_logits[:, :-1, :]], dim=1
+                    )
+                if temperature > 0:
+                    # Temperature sampling for diverse generation (e.g. GRPO rollouts)
+                    sampling_probs = torch.softmax(sec_logits / temperature, dim=-1)
+                    x_1 = torch.multinomial(
+                        sampling_probs.view(-1, sampling_probs.shape[-1]), num_samples=1
+                    ).view(sampling_probs.shape[:-1])
+                else:
+                    # Greedy (default, backward compatible)
+                    x_1 = sec_logits.argmax(dim=-1)
+                probs = torch.softmax(sec_logits, dim=-1)
+                x1_p = torch.gather(probs, dim=-1, index=x_1.unsqueeze(-1)).squeeze(-1)
+                # Only consider currently-masked positions in this block
+                x1_p = torch.where(current_block_masks, x1_p, -torch.inf)
+                unmask_idx = (x1_p > threshold)
+                if unmask_idx.sum() > 0:
+                    x_t[:, block_abs_start:block_abs_end][unmask_idx] = x_1[unmask_idx]
+                    tokens_per_step.append(int(unmask_idx.sum()))
+                else:
+                    # Fallback: unmask highest-confidence token
+                    pos = x1_p.argmax()
+                    row = 0
+                    col = pos.item()
+                    x_t[:, block_abs_start:block_abs_end][row, col] = x_1[row, col]
+                    tokens_per_step.append(1)
+                step += 1
+                if step > max_tokens:
+                    break
+        # ── Commit this block's K/V to the cache ──
+        # Run one final forward at block's fully-denoised state with
+        # update_kv_cache=True so future blocks can attend to it via cache.
+        # prev_last_logit is refreshed to the logit at the last position
+        # of this block for the NEXT block's first-position prediction.
+        if use_kv_cache:
+            block_tokens = x_t[:, block_abs_start:block_abs_end]
+            block_embeds = _embed_fn(block_tokens)
+            block_position_ids = _position_ids[..., block_abs_start:block_abs_end]
+            L_cached = past_kv.get_seq_length() if past_kv is not None else 0
+            block_attn = torch.ones(
+                B, L_cached + B, device=self.device, dtype=torch.bool
+            )
+            commit_out = self.forward(
+                inputs_embeds=block_embeds,
+                attention_mask=block_attn,
+                position_ids=block_position_ids,
+                past_key_values=past_kv,
+                use_cache=True,
+                update_kv_cache=True,
+            )
+            past_kv = commit_out.past_key_values
+            prev_last_logit = commit_out.logits[:, -1:, :]
+        # NOTE: a previous null_ratio>0.3 early-stopping heuristic was
+        # removed. It computed the ratio globally across the whole
+        # explanation and, when tripped, force-filled every remaining
+        # MASK with NULL — including MASKs in middle positions that
+        # should have held real text — which cut short explanations
+        # mid-sentence. Training always produces 192 value tokens
+        # (real text + <|NULL|> padding at the tail) and the model
+        # learned to emit NULL cleanly at the tail, so the final
+        # NULL-strip below is sufficient. Cost: every sample now
+        # denoises all 6 explanation blocks.
+    # Post-process: strip NULL tokens from the output
+    gen_tokens = x_t[0, original_input_length:].tolist()
+    cleaned = [t for t in gen_tokens if t != null_id and t != mask_id]
+    x_t = torch.cat([
+        input_ids,
+        torch.tensor([cleaned], device=self.device, dtype=torch.long)
+    ], dim=1)
+    gen_length = x_t.shape[1] - original_input_length
+    if return_stats:
+        stats = {
+            "tokens_per_step": tokens_per_step,
+            "total_steps": step,
+            "gen_length": gen_length,
+            "null_tokens_stripped": len(gen_tokens) - len(cleaned),
+            "block_size": block_size,
+        }
+        return x_t, stats
+    return x_t
+@torch.no_grad()
+# ---------------------------------------------------------------------------
+# scaffold_speculative_sample — Scaffold Spec (SS)
+# ---------------------------------------------------------------------------
+def scaffold_speculative_sample(
+    self,
+    input_ids,
+    tokenizer,
+    block_size=32,
+    max_tokens=1024,
+    pixel_values=None,
+    image_grid_thw=None,
+    mask_id=151665,
+    null_id=151666,
+    threshold=0.9,
+    stop_token=151645,
+    explanation_block_size=32,
+    explanation_max_blocks=6,
+    return_stats=False,
+    draft_temperature=0.0,
+    verify_temperature=0.0,
+):
+    """
+    Scaffold-aware self-speculative decoding.
+    Minimal modification of standard self-spec
+    (speculative_block_causal_sample_cache): scaffold (structural JSON)
+    tokens are pre-filled in the draft block instead of MASK and
+    auto-accepted during causal verification.
+    Key design: uses *exactly the same* attention patterns as standard
+    self-spec (block-diff for draft, **causal** for verify via
+    auto eval_mask).  Only the draft block content differs — scaffold
+    positions carry known tokens instead of MASK, giving the draft
+    better context while scaffold tokens are "free" during acceptance.
+    """
+    from .section_utils import (
+        build_deep_json_scaffold,
+        NULL_TOKEN_ID,
+    )
+    scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
+        tokenizer,
+        mask_id=mask_id,
+        null_id=null_id,
+        explanation_block_size=explanation_block_size,
+        explanation_max_blocks=explanation_max_blocks,
+    )
+    scaffold_len = len(scaffold_tokens)
+    original_input_length = input_ids.shape[1]
+    tokens_per_step = []
+    self.model.bd_size = block_size
+    _ss_profile = bool(os.environ.get("SS_PROFILE"))
+    _ss_traj_start = section_ranges.get("trajectory", (None, None))[0]
+    if _ss_profile:
+        import time as _time
+        torch.cuda.synchronize()
+        _ss_t = {"start": _time.perf_counter()}
+        _ss_marked_traj_start = False
+        _ss_n_fwd_prefix = 0
+        _ss_n_fwd_traj = 0
+    # Pre-convert to tensors for vectorized operations in the loop
+    scaffold_tok_t = torch.tensor(
+        scaffold_tokens, device=self.device, dtype=torch.long
+    )
+    scaffold_is_fixed = torch.tensor(
+        scaffold_mask_list, device=self.device, dtype=torch.bool
+    )
+    # ── Phase 1: Prefill prompt (identical to standard self-spec) ──
+    output = self.forward(
+        input_ids=input_ids,
+        pixel_values=pixel_values,
+        image_grid_thw=image_grid_thw,
+        use_cache=True,
+        update_kv_cache=True,
+    )
+    logits, past_key_values = output.logits, output.past_key_values
+    if _ss_profile:
+        torch.cuda.synchronize()
+        _ss_t["after_prefill"] = _time.perf_counter()
+    # First token — use scaffold token (always '{')
+    next_token = torch.tensor(
+        [[scaffold_tokens[0]]], device=self.device, dtype=torch.long
+    )
+    input_ids = torch.cat([input_ids, next_token], dim=1)
+    tokens_per_step.append(1)
+    scaffold_cursor = 1
+    step = 1
+    # ── Phase 2: Self-speculative decoding loop ──
+    # Follows the exact same structure as
+    # speculative_block_causal_sample_cache, with scaffold-aware draft.
+    while scaffold_cursor < scaffold_len:
+        if _ss_profile and (not _ss_marked_traj_start) and (
+            _ss_traj_start is not None and scaffold_cursor >= _ss_traj_start
+        ):
+            torch.cuda.synchronize()
+            _ss_t["enter_traj"] = _time.perf_counter()
+            _ss_marked_traj_start = True
+        prompt_length = input_ids.shape[1]
+        n_draft = min(block_size - 1, scaffold_len - scaffold_cursor)
+        # Build draft block: [seed, scaffold_or_MASK × n_draft]
+        sc_end = scaffold_cursor + n_draft
+        is_fixed = scaffold_is_fixed[scaffold_cursor:sc_end]
+        draft_tensor = torch.where(
+            is_fixed,
+            scaffold_tok_t[scaffold_cursor:sc_end],
+            mask_id,
+        ).unsqueeze(0)
+        x_t = torch.cat([input_ids[:, -1:], draft_tensor], dim=1)
+        mask_idx = (x_t == mask_id)
+        # ── Draft (block-diff bidirectional via auto eval_mask) ──
+        logits = self.forward(
+            input_ids=x_t,
+            use_cache=True,
+            past_key_values=past_key_values,
+            update_kv_cache=False,
+            eval_bd_size=block_size,
+        ).logits
+        tokens_per_step.append(0)
+        step += 1
+        # Shift logits (same as standard self-spec)
+        logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
+        if draft_temperature > 0:
+            # Temperature sampling for draft diversity
+            scaled = logits / draft_temperature
+            draft_probs = torch.softmax(scaled, dim=-1)
+            x_1 = torch.multinomial(
+                draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1
+            ).view(draft_probs.shape[:-1])
+            # Confidence uses unscaled probs for thresholding
+            probs = torch.softmax(logits, dim=-1)
+            x1_p = torch.gather(
+                probs, dim=-1, index=x_1.unsqueeze(-1)
+            ).squeeze(-1)
+        else:
+            x_1 = logits.argmax(dim=-1)
+            probs = torch.softmax(logits, dim=-1)
+            x1_p = torch.gather(
+                probs, dim=-1, index=x_1.unsqueeze(-1)
+            ).squeeze(-1)
+        # Only fill MASK positions; scaffold positions keep their tokens
+        x1_p = torch.where(mask_idx, x1_p, -torch.inf)
+        unmask_idx = (x1_p > 0)  # threshold=0 for draft filling
+        if unmask_idx.sum() > 0:
+            x_t[unmask_idx] = x_1[unmask_idx]
+        else:
+            # Fallback: fill most confident MASK
+            mask_only_p = x1_p.clone()
+            mask_only_p[~mask_idx] = -torch.inf
+            if mask_only_p.max() > -torch.inf:
+                best = mask_only_p.argmax()
+                x_t.view(-1)[best] = x_1.view(-1)[best]
+        # ── Verify (causal via auto eval_mask, commit to cache) ──
+        output = self.forward(
+            input_ids=x_t,
+            use_cache=True,
+            past_key_values=past_key_values,
+            update_kv_cache=True,
+            eval_bd_size=block_size,
+        )
+        past_key_values = output.past_key_values
+        if verify_temperature > 0:
+            verify_logits = output.logits / verify_temperature
+            verify_probs = torch.softmax(verify_logits, dim=-1)
+            ar_block_token = torch.multinomial(
+                verify_probs.view(-1, verify_probs.shape[-1]), num_samples=1
+            ).view(verify_probs.shape[:-1])
+        else:
+            ar_block_token = output.logits.argmax(dim=-1)
+        # ── AR acceptance (scaffold positions auto-pass) ──
+        ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
+        accepted_token_num = 0
+        for i in range(n_draft):
+            if is_fixed[i] or ar_matches[i]:
+                accepted_token_num += 1
+            else:
+                break
+        accepted_token_num += 1  # bonus token
+        tokens_per_step.append(accepted_token_num)
+        # Force scaffold tokens at scaffold positions, AR predictions elsewhere
+        accepted_ids = ar_block_token[:, :accepted_token_num].clone()
+        acc_end = min(scaffold_cursor + accepted_token_num, scaffold_len)
+        acc_fixed = scaffold_is_fixed[scaffold_cursor:acc_end]
+        accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
+            scaffold_tok_t[scaffold_cursor:acc_end][acc_fixed]
+        input_ids = torch.cat([input_ids, accepted_ids], dim=1)
+        scaffold_cursor += accepted_token_num
+        past_key_values = _crop_cache(past_key_values, input_ids.shape[1] - 1)
+        step += 1
+        # Stop conditions
+        if input_ids.shape[1] - original_input_length > max_tokens:
+            break
+        if stop_token in input_ids[:, prompt_length:]:
+            stop_token_idx = (
+                input_ids[:, prompt_length:] == stop_token
+            ).nonzero()[0][1]
+            if (
+                input_ids[:, prompt_length:prompt_length + stop_token_idx]
+                == mask_id
+            ).sum() == 0:
+                break
+    if _ss_profile:
+        torch.cuda.synchronize()
+        _ss_t["end"] = _time.perf_counter()
+        _t_total = _ss_t["end"] - _ss_t["start"]
+        _t_pre = _ss_t["after_prefill"] - _ss_t["start"]
+        _t_traj_in = _ss_t.get("enter_traj")
+        if _t_traj_in is not None:
+            _t_prefix = _t_traj_in - _ss_t["after_prefill"]
+            _t_traj = _ss_t["end"] - _t_traj_in
+        else:
+            _t_prefix = _ss_t["end"] - _ss_t["after_prefill"]
+            _t_traj = 0.0
+        print(
+            f"[ss profile] total={_t_total*1000:.0f}ms  "
+            f"prefill={_t_pre*1000:.0f}ms  "
+            f"prefix-decode={_t_prefix*1000:.0f}ms  "
+            f"traj-decode={_t_traj*1000:.0f}ms",
+            flush=True,
+        )
+    # ── Phase 3: Post-process — truncate at stop, strip NULL ──
+    if stop_token in input_ids[:, original_input_length:]:
+        stop_token_idx = (
+            input_ids[:, original_input_length:] == stop_token
+        ).nonzero()[0][1]
+        input_ids = input_ids[
+            :, :stop_token_idx + original_input_length + 1
+        ]
+    gen_tokens = input_ids[0, original_input_length:].tolist()
+    cleaned = [t for t in gen_tokens if t != null_id and t != mask_id]
+    output_ids = torch.cat(
+        [
+            input_ids[:, :original_input_length],
+            torch.tensor(
+                [cleaned], device=self.device, dtype=torch.long
+            ),
+        ],
+        dim=1,
+    )
+    gen_length = output_ids.shape[1] - original_input_length
+    if return_stats:
+        stats = {
+            "tokens_per_step": tokens_per_step,
+            "total_steps": step,
+            "gen_length": gen_length,
+            "null_tokens_stripped": len(gen_tokens) - len(cleaned),
+            "block_size": block_size,
+            "method": "scaffold_speculative_v5",
+        }
+        return output_ids, stats
+    return output_ids
+@torch.no_grad()
+# ---------------------------------------------------------------------------
+# scaffold_spec_with_ss_multi_traj — SS multi-rollout inference scaling
+# ---------------------------------------------------------------------------
+def scaffold_spec_with_ss_multi_traj(
+    self,
+    input_ids,
+    tokenizer,
+    block_size=32,
+    max_tokens=1024,
+    pixel_values=None,
+    image_grid_thw=None,
+    mask_id=151665,
+    null_id=151666,
+    threshold=0.9,
+    stop_token=151645,
+    explanation_block_size=32,
+    explanation_max_blocks=6,
+    return_stats=False,
+    num_traj_rollouts=4,
+    traj_verify_temperature=0.5,
+    traj_draft_temperature=0.0,
+    merge_weights=None,
+    batch_parallel=False,
+):
+    """Scaffold Spec with shared prefix + N SS rollouts on the trajectory section.
+    Decoding pipeline:
+      0) Prompt prefill                                                [shared]
+      1) Scaffold Spec for sections 1-3 (CoT) at verify_temp = 0       [shared, deterministic]
+      2) Fork KV cache N times                                          [O(N) memory]
+      3) For each fork: continue Scaffold Spec on the trajectory
+         section with verify_temperature = traj_verify_temperature
+         (each rollout draws different samples in the AR-verify step
+         because torch.multinomial is invoked with a global RNG).
+      4) Parse all N trajectories and return their weighted mean.
+    Cost: roughly 1 full SS pass (sections 1-3 are ~88%% of decoded tokens
+    on our schema) + N x trajectory-only SS passes.  For N = 4 this is
+    ~1.5x the cost of a single SS, vs ~4x for naive sequential rerolling.
+    If batch_parallel = True, the N trajectory rollouts are executed in a
+    batched (batch_size = N) manner: one shared model.forward per
+    speculative draft / verify step over an N-replicated trajectory
+    suffix, which removes the per-rollout serial overhead at the cost of
+    replicating the per-layer KV cache N-fold along the batch dimension.
+    Returns: (output_ids, stats) if return_stats else output_ids.
+    """
+    from .section_utils import (
+        build_deep_json_scaffold,
+        SECTION_KEYS,
+    )
+    scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
+        tokenizer,
+        mask_id=mask_id,
+        null_id=null_id,
+        explanation_block_size=explanation_block_size,
+        explanation_max_blocks=explanation_max_blocks,
+    )
+    scaffold_len = len(scaffold_tokens)
+    original_input_length = input_ids.shape[1]
+    tokens_per_step = []
+    self.model.bd_size = block_size
+    scaffold_tok_t = torch.tensor(scaffold_tokens, device=self.device, dtype=torch.long)
+    scaffold_is_fixed = torch.tensor(scaffold_mask_list, device=self.device, dtype=torch.bool)
+    traj_start_in_scaffold = section_ranges["trajectory"][0]
+    _profile = bool(os.environ.get("SS_MT_PROFILE"))
+    if _profile:
+        import time as _time
+        torch.cuda.synchronize()
+        _t_phase = {"start": _time.perf_counter()}
+        _phase_clone_total = 0.0
+        _phase_rollout_each = []
+    # ── Phase 0: Prefill prompt ──
+    output = self.forward(
+        input_ids=input_ids, pixel_values=pixel_values,
+        image_grid_thw=image_grid_thw,
+        use_cache=True, update_kv_cache=True,
+    )
+    logits, past_key_values = output.logits, output.past_key_values
+    if _profile:
+        torch.cuda.synchronize()
+        _t_phase["after_prefill"] = _time.perf_counter()
+    next_token = torch.tensor(
+        [[scaffold_tokens[0]]], device=self.device, dtype=torch.long,
+    )
+    input_ids = torch.cat([input_ids, next_token], dim=1)
+    tokens_per_step.append(1)
+    scaffold_cursor = 1
+    step = 1
+    # ── Phase 1: Scaffold Spec for non-trajectory sections (shared, vt=0) ──
+    while scaffold_cursor < scaffold_len and scaffold_cursor < traj_start_in_scaffold:
+        remaining_before_traj = traj_start_in_scaffold - scaffold_cursor
+        n_draft = min(block_size - 1, remaining_before_traj)
+        if n_draft <= 0:
+            break
+        sc_end = scaffold_cursor + n_draft
+        is_fixed = scaffold_is_fixed[scaffold_cursor:sc_end]
+        draft_tensor = torch.where(
+            is_fixed, scaffold_tok_t[scaffold_cursor:sc_end], mask_id,
+        ).unsqueeze(0)
+        x_t = torch.cat([input_ids[:, -1:], draft_tensor], dim=1)
+        mask_idx = (x_t == mask_id)
+        # Draft (block-bidirectional)
+        logits = self.forward(
+            input_ids=x_t, use_cache=True,
+            past_key_values=past_key_values,
+            update_kv_cache=False, eval_bd_size=block_size,
+        ).logits
+        tokens_per_step.append(0)
+        step += 1
+        logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
+        x_1 = logits.argmax(dim=-1)
+        probs = torch.softmax(logits, dim=-1)
+        x1_p = torch.gather(probs, dim=-1, index=x_1.unsqueeze(-1)).squeeze(-1)
+        x1_p = torch.where(mask_idx, x1_p, -torch.inf)
+        unmask_idx = (x1_p > 0)
+        if unmask_idx.sum() > 0:
+            x_t[unmask_idx] = x_1[unmask_idx]
+        else:
+            mask_only_p = x1_p.clone()
+            mask_only_p[~mask_idx] = -torch.inf
+            if mask_only_p.max() > -torch.inf:
+                best = mask_only_p.argmax()
+                x_t.view(-1)[best] = x_1.view(-1)[best]
+        # Verify (causal, greedy)
+        output = self.forward(
+            input_ids=x_t, use_cache=True,
+            past_key_values=past_key_values,
+            update_kv_cache=True, eval_bd_size=block_size,
+        )
+        past_key_values = output.past_key_values
+        ar_block_token = output.logits.argmax(dim=-1)
+        ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
+        accepted_token_num = 0
+        for i in range(n_draft):
+            if is_fixed[i] or ar_matches[i]:
+                accepted_token_num += 1
+            else:
+                break
+        accepted_token_num += 1
+        max_accept = traj_start_in_scaffold - scaffold_cursor
+        if accepted_token_num > max_accept:
+            accepted_token_num = max_accept
+        tokens_per_step.append(accepted_token_num)
+        accepted_ids = ar_block_token[:, :accepted_token_num].clone()
+        acc_end = min(scaffold_cursor + accepted_token_num, scaffold_len)
+        acc_fixed = scaffold_is_fixed[scaffold_cursor:acc_end]
+        accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
+            scaffold_tok_t[scaffold_cursor:acc_end][acc_fixed]
+        input_ids = torch.cat([input_ids, accepted_ids], dim=1)
+        scaffold_cursor += accepted_token_num
+        past_key_values = _crop_cache(past_key_values, input_ids.shape[1] - 1)
+        step += 1
+        if input_ids.shape[1] - original_input_length > max_tokens:
+            break
+    if _profile:
+        torch.cuda.synchronize()
+        _t_phase["after_phase1"] = _time.perf_counter()
+    # ── Phase 2: Fork KV cache N times (one per trajectory rollout) ──
+    prefix_input_ids = input_ids.clone()
+    prefix_len = prefix_input_ids.shape[1]
+    def _clone_cache(kv):
+        if _profile:
+            torch.cuda.synchronize()
+            _t0 = _time.perf_counter()
+        cloned = []
+        for layer_num in range(len(kv)):
+            cloned.append(tuple(t.clone() for t in kv[layer_num]))
+        ret = DynamicCache(cloned)
+        if _profile:
+            torch.cuda.synchronize()
+            nonlocal _phase_clone_total
+            _phase_clone_total += _time.perf_counter() - _t0
+        return ret
+    # ── Phase 3: N SS rollouts on trajectory section, each with vt > 0 ──
+    # All rollouts start from the same prefix; randomness comes from
+    # the multinomial calls in draft / verify (RNG is process-global).
+    N = max(1, int(num_traj_rollouts))
+    def _run_one_traj_rollout(start_kv, start_input_ids):
+        """Continue Scaffold Spec from start_kv / start_input_ids over the
+        trajectory section, applying traj_*_temperature.  Returns the
+        final ss_input_ids (with trajectory tokens appended) and the
+        extracted trajectory value tokens."""  # i.e. the tuple (local_input, traj_values)
+        local_kv = start_kv
+        local_input = start_input_ids
+        local_cursor = scaffold_cursor  # resume from the enclosing scope's cursor value
+        while local_cursor < scaffold_len:
+            n_draft = min(block_size - 1, scaffold_len - local_cursor)  # x_t below holds n_draft + 1 tokens
+            sc_end = local_cursor + n_draft
+            is_fixed = scaffold_is_fixed[local_cursor:sc_end]
+            draft_tensor = torch.where(
+                is_fixed, scaffold_tok_t[local_cursor:sc_end], mask_id,  # fixed slots keep the scaffold token, the rest become mask_id
+            ).unsqueeze(0)
+            x_t = torch.cat([local_input[:, -1:], draft_tensor], dim=1)  # last committed token + draft window
+            mask_idx = (x_t == mask_id)  # positions the draft pass has to fill
+            # Draft (block-bidirectional, optionally temp-sampled)
+            draft_logits = self.forward(
+                input_ids=x_t, use_cache=True, past_key_values=local_kv,
+                update_kv_cache=False, eval_bd_size=block_size,  # NOTE(review): presumably leaves local_kv unchanged for the draft pass — confirm in forward()
+            ).logits
+            draft_logits = torch.cat(
+                [draft_logits[:, :1, :], draft_logits[:, :-1, :]], dim=1,
+            )  # shift logits right by one along dim 1; slot 0 keeps its own logits
+            if traj_draft_temperature > 0:  # temperature > 0: sample draft tokens; otherwise greedy argmax
+                scaled = draft_logits / traj_draft_temperature
+                draft_probs = torch.softmax(scaled, dim=-1)
+                x_1 = torch.multinomial(
+                    draft_probs.view(-1, draft_probs.shape[-1]),
+                    num_samples=1,
+                ).view(draft_probs.shape[:-1])
+            else:
+                x_1 = draft_logits.argmax(dim=-1)
+            probs = torch.softmax(draft_logits, dim=-1)  # confidence uses the unscaled logits, not the temperature-scaled ones
+            x1_p = torch.gather(
+                probs, dim=-1, index=x_1.unsqueeze(-1),
+            ).squeeze(-1)  # probability assigned to each chosen draft token
+            x1_p = torch.where(mask_idx, x1_p, -torch.inf)  # non-mask positions get -inf so they are never selected
+            unmask_idx = (x1_p > 0)  # masked positions whose chosen token has positive probability
+            if unmask_idx.sum() > 0:
+                x_t[unmask_idx] = x_1[unmask_idx]  # write the draft tokens into those positions
+            else:  # fallback: nothing passed the > 0 test
+                mask_only_p = x1_p.clone()
+                mask_only_p[~mask_idx] = -torch.inf
+                if mask_only_p.max() > -torch.inf:  # skipped when every entry is -inf
+                    x_t.view(-1)[mask_only_p.argmax()] = \
+                        x_1.view(-1)[mask_only_p.argmax()]
+            # Verify (causal, optionally temp-sampled)
+            v_out = self.forward(
+                input_ids=x_t, use_cache=True, past_key_values=local_kv,
+                update_kv_cache=True, eval_bd_size=block_size,  # NOTE(review): presumably appends this block to the cache — confirm in forward()
+            )
+            local_kv = v_out.past_key_values
+            if traj_verify_temperature > 0:  # temperature > 0: sample verify tokens; otherwise greedy argmax
+                v_logits = v_out.logits / traj_verify_temperature
+                v_probs = torch.softmax(v_logits, dim=-1)
+                ar_block_token = torch.multinomial(
+                    v_probs.view(-1, v_probs.shape[-1]),
+                    num_samples=1,
+                ).view(v_probs.shape[:-1])
+            else:
+                ar_block_token = v_out.logits.argmax(dim=-1)
+            ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])  # verify token i vs. draft token in x_t slot i + 1
+            accepted_token_num = 0  # length of the leading accepted run
+            for i in range(n_draft):
+                if is_fixed[i] or ar_matches[i]:  # fixed scaffold slots count as accepted regardless of the verify token
+                    accepted_token_num += 1
+                else:
+                    break  # stop at the first non-fixed mismatch
+            accepted_token_num += 1  # also take the verify pass's token at the next position
+            accepted_ids = ar_block_token[:, :accepted_token_num].clone()
+            acc_end = min(local_cursor + accepted_token_num, scaffold_len)  # clamp to the scaffold end
+            acc_fixed = scaffold_is_fixed[local_cursor:acc_end]
+            accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
+                scaffold_tok_t[local_cursor:acc_end][acc_fixed]  # force scaffold tokens at fixed slots
+            local_input = torch.cat([local_input, accepted_ids], dim=1)
+            local_cursor += accepted_token_num
+            local_kv = _crop_cache(local_kv, local_input.shape[1] - 1)  # NOTE(review): presumably trims the cache to all but the newest token — confirm _crop_cache semantics
+            if local_input.shape[1] - original_input_length > max_tokens:  # generated-token budget exceeded
+                break
+            if stop_token in local_input[:, prefix_len:]:  # only tokens appended after the shared prefix are inspected
+                st_idx = (local_input[:, prefix_len:] == stop_token).nonzero()
+                if st_idx.numel() > 0:
+                    cand_st = st_idx[0][1].item()  # column index of the first stop_token after the prefix
+                    if (local_input[:, prefix_len:prefix_len + cand_st] == mask_id).sum() == 0:
+                        break  # stop once no mask_id precedes that stop token
+        traj_values = [
+            t for i, t in enumerate(local_input[0, original_input_length:].tolist())  # i is the offset from original_input_length
+            if i >= traj_start_in_scaffold and i < scaffold_len  # keep only offsets inside the trajectory part of the scaffold
+            and not scaffold_mask_list[i] and t != null_id and t != mask_id  # drop offsets flagged in scaffold_mask_list and null/mask ids
+        ]
+        return local_input, traj_values
+    # Sequential N rollouts (Option A; batch_parallel=False).
+    rollout_inputs = []
+    rollout_traj_values = []
+    for _i in range(N):
+        if _profile:
+            torch.cuda.synchronize()
+            _t_r0 = _time.perf_counter()
+        cand_kv = _clone_cache(past_key_values)
+        cand_input = prefix_input_ids.clone()
+        cand_input, traj_vals = _run_one_traj_rollout(cand_kv, cand_input)
+        rollout_inputs.append(cand_input)
+        rollout_traj_values.append(traj_vals)
+        step += 1
+        if _profile:
+            torch.cuda.synchronize()
+            _phase_rollout_each.append(_time.perf_counter() - _t_r0)
+    if _profile:
+        torch.cuda.synchronize()
+        _t_phase["after_rollouts"] = _time.perf_counter()
+        _t_total = _t_phase["after_rollouts"] - _t_phase["start"]
+        _t_pre = _t_phase["after_prefill"] - _t_phase["start"]
+        _t_p1 = _t_phase["after_phase1"] - _t_phase["after_prefill"]
+        _t_rolls = _t_phase["after_rollouts"] - _t_phase["after_phase1"]
+        print(
+            f"[ss_mt profile] total={_t_total*1000:.0f}ms  "
+            f"prefill(P0)={_t_pre*1000:.0f}ms  "
+            f"prefix-decode(P1)={_t_p1*1000:.0f}ms  "
+            f"rollouts(P2+P3)={_t_rolls*1000:.0f}ms  "
+            f"of which kv-clone={_phase_clone_total*1000:.0f}ms  "
+            f"per-rollout={[f'{r*1000:.0f}' for r in _phase_rollout_each]}ms",
+            flush=True,
+        )
+    # ── Phase 4: Parse all rollouts, weighted-merge waypoints ──
+    def _decode_trajectory(traj_tokens):
+        text = tokenizer.decode(traj_tokens, skip_special_tokens=False)
+        text = text.replace("<|NULL|>", "").strip()
+        coords = re.findall(r"[+-]?\d+\.?\d*", text)
+        wps = []
+        for i in range(0, len(coords) - 1, 2):
+            wps.append([float(coords[i]), float(coords[i + 1])])
+        return wps
+    rollout_waypoints = [_decode_trajectory(v) for v in rollout_traj_values]
+    if merge_weights is None or len(merge_weights) != N:
+        ws = [1.0 / N] * N
+    else:
+        total = sum(merge_weights)
+        ws = [w / total for w in merge_weights]
+    if rollout_waypoints and all(len(w) > 0 for w in rollout_waypoints):
+        n_wp = min(len(w) for w in rollout_waypoints)
+        merged_waypoints = []
+        for i in range(n_wp):
+            mx = sum(ws[c] * rollout_waypoints[c][i][0] for c in range(N))
+            my = sum(ws[c] * rollout_waypoints[c][i][1] for c in range(N))
+            merged_waypoints.append([mx, my])
+    else:
+        merged_waypoints = next(
+            (w for w in rollout_waypoints if w), [],
+        )
+    # Output text: take rollout 0's full text but replace its trajectory
+    # with the merged waypoints.
+    base_input = rollout_inputs[0]
+    if stop_token in base_input[:, original_input_length:]:
+        st_idx = (base_input[:, original_input_length:] == stop_token).nonzero()[0][1]
+        base_input = base_input[:, :st_idx + original_input_length + 1]
+    base_raw_tokens = base_input[0, original_input_length:].tolist()
+    base_cleaned = [t for t in base_raw_tokens if t != null_id and t != mask_id]
+    base_null_stripped = len(base_raw_tokens) - len(base_cleaned)
+    base_text = tokenizer.decode(base_cleaned, skip_special_tokens=False)
+    traj_parts = [
+        f"[{x:+07.2f},{y:+06.2f}]" for x, y in merged_waypoints
+    ]
+    merged_traj_str = "[" + ", ".join(traj_parts) + "]"
+    replaced_text = re.sub(
+        r'("trajectory"\s*:\s*")(\[\[.*?\]\])',
+        r"\g<1>" + merged_traj_str, base_text,
+    )
+    merged_tokens = tokenizer.encode(replaced_text, add_special_tokens=False)
+    output_ids = torch.cat([
+        input_ids[:, :original_input_length],
+        torch.tensor([merged_tokens], device=self.device, dtype=torch.long),
+    ], dim=1)
+    gen_length = output_ids.shape[1] - original_input_length
+    if return_stats:
+        stats = {
+            "tokens_per_step": tokens_per_step,
+            "total_steps": step,
+            "gen_length": gen_length,
+            "null_tokens_stripped": base_null_stripped,
+            "block_size": block_size,
+            "method": "scaffold_spec_with_ss_multi_traj",
+            "num_traj_rollouts": N,
+            "traj_verify_temperature": traj_verify_temperature,
+            "rollout_waypoints": rollout_waypoints,
+            "merged_waypoints": merged_waypoints,
+            "merge_weights": ws,
+        }
+        return output_ids, stats
+    return output_ids
+@torch.no_grad()
+# ---------------------------------------------------------------------------
+# Bind decoding methods onto the model class.
+#
+# ``modeling.py`` imports this module at the bottom of the file, after the
+# ``Fast_dDriveForConditionalGeneration`` class has been defined.  We
+# attach the three decoding paths as ordinary methods so callers can invoke
+# them as ``model.mdm_sample_deep_scaffold(...)`` etc. without any extra
+# registration step.
+# ---------------------------------------------------------------------------
+def attach_generation_methods(cls):
+    """Attach the three release decoding paths as methods of ``cls``."""
+    cls.mdm_sample_deep_scaffold = mdm_sample_deep_scaffold
+    cls.scaffold_speculative_sample = scaffold_speculative_sample
+    cls.scaffold_spec_with_ss_multi_traj = scaffold_spec_with_ss_multi_traj
+    return cls
+__all__ = [  # names exported by a star-import of this module
+    "mdm_sample_deep_scaffold",
+    "scaffold_speculative_sample",
+    "scaffold_spec_with_ss_multi_traj",
+    "attach_generation_methods",
+]

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08a8945f208e0b7e62c71542d3301d755d95d02bcdb54d7deec28f8a819b4a2d
+size 4972304384

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f399e3ccf37f2016964e531e2cfb5371a3d96d46205cb1b3eba24cd13d0de6aa
+size 4932949248

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1c77c01870520022fd0111459ce27eff492ce932a64740dd26d23598289f3ed
+size 4932949336

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c13c8eff2f5e9435ac7692790d1028ff1dbc5f8bd53db6e5a91b22c322162008
+size 1425040040

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,833 @@

+{
+  "metadata": {
+    "total_parameters": 234663856,
+    "total_size": 16263151616
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors",
+    "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
+    "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
+  }
+}

modeling.py ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "min_pixels": 3136,
+  "max_pixels": 12845056,
+  "patch_size": 14,
+  "temporal_patch_size": 2,
+  "merge_size": 2,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "processor_class": "Qwen2_5_VLProcessor"
+}

section_utils.py ADDED Viewed

	@@ -0,0 +1,803 @@

+"""
+Section-aware block scheduling for JSON structured output.
+Inspired by the S3 (Self-adaptive Schema Scaffolding) paper (arXiv:2507.04504),
+this module provides utilities to:
+1. Parse tokenized JSON output into sections (critical_objects, explanation, etc.)
+2. Assign section-aware block indices for variable block sizes per section
+3. Build JSON scaffolds for inference (pre-fill structural tokens)
+The DVLM-AD output schema has 4 sections:
+  - critical_objects: ~88 tokens (12 yes/no fields, nearly constant)
+  - explanation: ~114 tokens (variable, 72-172)
+  - future_meta_behavior: ~40 tokens (nearly constant)
+  - trajectory: ~80 tokens (nearly constant)
+"""
+import torch
+from typing import Dict, List, Optional, Tuple
+import math
# Top-level keys of the JSON response, in the order they are emitted.
SECTION_KEYS = [
    "critical_objects", "explanation",
    "future_meta_behavior", "trajectory",
]
# Default token budget for each section (based on training data analysis).
# Values are listed in SECTION_KEYS order, so the two can never drift apart:
#   critical_objects=88, explanation=128, future_meta_behavior=40, trajectory=80
DEFAULT_TOKEN_BUDGETS = dict(zip(SECTION_KEYS, (88, 128, 40, 80)))
# Default number of steps for each section, again in SECTION_KEYS order:
#   critical_objects=1, explanation=3, future_meta_behavior=1, trajectory=1
DEFAULT_SECTION_STEPS = dict(zip(SECTION_KEYS, (1, 3, 1, 1)))
def _v1_removed_parse_json_sections(*args, **kwargs):
    """Stub for the removed DS v1 ``parse_json_sections``.

    Accepts and ignores any arguments.

    Raises:
        NotImplementedError: always.
    """
    message = "DS v1 parse_json_sections has been removed. Use deep scaffold v2."
    raise NotImplementedError(message)
def _v1_removed_compute_section_block_idx(*args, **kwargs):
    """Stub for the removed DS v1 ``compute_section_block_idx``.

    Accepts and ignores any arguments.

    Raises:
        NotImplementedError: always; the message names the replacement.
    """
    message = "DS v1 compute_section_block_idx has been removed. Use compute_section_block_idx_deep_static."
    raise NotImplementedError(message)
def _v1_removed_build_json_scaffold(*args, **kwargs):
    """Stub for the removed DS v1 ``build_json_scaffold``.

    Accepts and ignores any arguments.

    Raises:
        NotImplementedError: always; the message names the replacement.
    """
    message = "DS v1 build_json_scaffold has been removed. Use build_deep_json_scaffold."
    raise NotImplementedError(message)
def _v1_removed_compute_section_block_sizes(*args, **kwargs):
    """Stub for the removed DS v1 ``compute_section_block_sizes``.

    Accepts and ignores any arguments.

    Raises:
        NotImplementedError: always.
    """
    message = "DS v1 compute_section_block_sizes has been removed."
    raise NotImplementedError(message)
def build_static_scaffold_sequences(tokenizer) -> Dict[str, List[int]]:
    """Tokenize the four top-level JSON key boundaries.

    Each boundary string is encoded on its own with
    ``add_special_tokens=False``. Used internally by
    :func:`build_deep_scaffold_sequences`.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        Mapping from boundary name (``prefix``, ``between_co_exp``,
        ``between_exp_fmb``, ``between_fmb_traj``) to its token IDs.
    """
    # (boundary name, literal text that introduces the section's value)
    boundary_texts = (
        ("prefix", '{"critical_objects":'),
        ("between_co_exp", ' "explanation":'),
        ("between_exp_fmb", ' "future_meta_behavior":'),
        ("between_fmb_traj", ' "trajectory":'),
    )
    return {
        name: tokenizer.encode(text, add_special_tokens=False)
        for name, text in boundary_texts
    }
def _v1_removed_compute_section_block_idx_static(*args, **kwargs):
    """Stub for the removed DS v1 ``compute_section_block_idx_static``.

    Accepts and ignores any arguments.

    Raises:
        NotImplementedError: always; the message names the replacement.
    """
    message = "DS v1 compute_section_block_idx_static has been removed. Use compute_section_block_idx_deep_static."
    raise NotImplementedError(message)
# Backward-compatible aliases: the public v1 names stay importable, but each
# is bound to a stub that raises NotImplementedError when called, so stale
# callers get a clear "removed, use X instead" message rather than a failed
# import.
parse_json_sections = _v1_removed_parse_json_sections
compute_section_block_idx = _v1_removed_compute_section_block_idx
build_json_scaffold = _v1_removed_build_json_scaffold
compute_section_block_sizes = _v1_removed_compute_section_block_sizes
compute_section_block_idx_static = _v1_removed_compute_section_block_idx_static
# ═══════════════════════════════════════════════════════════════
# Deep scaffold v2: constants and utilities
# ═══════════════════════════════════════════════════════════════
# Default ID for the padding token (presumably <|NULL|>; confirm against the
# tokenizer's added-tokens table).
NULL_TOKEN_ID = 151666
# The 12 critical_objects sub-keys, in the order they are matched.
# Per the original notes each value is exactly 1 token (yes=9693 / no=2152).
CRITICAL_OBJECTS_SUBKEYS = [
    "nearby_vehicle",
    "pedestrian",
    "cyclist",
    "construction",
    "traffic_element",
    "weather_condition",
    "road_hazard",
    "emergency_vehicle",
    "animal",
    "special_vehicle",
    "conflicting_vehicle",
    "door_opening_vehicle",
]
# Token budget of every future_meta_behavior sub-key value, e.g.
# "keep speed" → [4867, 4732, 151667] or "go straight" → [2849, 7833, 151667].
# NOTE(review): those example IDs end in 151667 while NULL_TOKEN_ID above is
# 151666 — confirm which ID the padding token really has.
FMB_VALUE_BUDGET = 3
def build_deep_json_scaffold(
    tokenizer,
    section_token_budgets: Optional[Dict[str, int]] = None,
    mask_id: Optional[int] = None,
    null_id: Optional[int] = None,
    explanation_block_size: int = 32,
    explanation_max_blocks: int = 6,
) -> Tuple[List[int], Dict[str, Tuple[int, int]], List[int]]:
    """Build a deep JSON scaffold for inference (v2).

    A template response is built as a Python dict, serialized and tokenized
    as one string, and then split into frozen scaffold tokens and value
    positions by ``compute_section_block_idx_deep_static``. According to the
    original notes this mirrors the training dataloader
    (``multi_modal_dataset.py``) so that BPE tokenization matches training
    data:

    1. Build a dict with placeholder values.
    2. Pad the explanation with ``<|NULL|>`` text until its value spans
       ``explanation_block_size * explanation_max_blocks`` tokens.
    3. Use FMB placeholder values that already include their ``<|NULL|>``
       padding.
    4. Normalize trajectory coordinates to ``+XXX.XX`` with separating spaces.
    5. Serialize with ``json.dumps(obj, ensure_ascii=False)``.
    6. Tokenize the whole string as one piece.
    7. Run ``compute_section_block_idx_deep_static`` to classify each token.
    8. Replace value positions with ``mask_id``.

    Parameters
    ----------
    tokenizer
        Object exposing ``encode(text, add_special_tokens=...)`` that returns
        a list of token IDs.
    section_token_budgets : dict, optional
        Accepted for API compatibility; currently ignored. The explanation
        budget comes from the two ``explanation_*`` arguments instead.
    mask_id : int, optional
        Token ID written at value positions. When omitted, ``"|<MASK>|"`` is
        encoded and used if it maps to a single token, else 151665.
    null_id : int, optional
        Accepted for API compatibility; currently ignored, because padding is
        produced by embedding literal ``<|NULL|>`` text in the template.
    explanation_block_size : int
        Block size used for the explanation budget.
    explanation_max_blocks : int
        Number of explanation blocks used for the budget.

    Returns
    -------
    scaffold_tokens : list[int]
        Token IDs with ``mask_id`` at value positions.
    section_ranges : dict
        Section name -> (start, end) within scaffold_tokens. Each range runs
        from just after the section's top-level key pattern to the start of
        the next key pattern (or the end of the template).
    scaffold_mask : list[int]
        0 = value (to denoise), 1 = scaffold (frozen).
    """
    import json as _json
    import re as _re

    if mask_id is None:
        mask_tok = tokenizer.encode("|<MASK>|", add_special_tokens=False)
        mask_id = mask_tok[0] if len(mask_tok) == 1 else 151665
    exp_budget = explanation_block_size * explanation_max_blocks  # default 192

    # ── Step 1: Build a Python dict matching training data structure ──
    # Placeholder explanation text (will be replaced with MASK anyway).
    filler_explanation = (
        "The ego vehicle is driving forward on the road. "
        "There are nearby vehicles ahead that may affect the path. "
        "No pedestrians or cyclists are detected in the immediate area. "
        "The road conditions appear normal with no hazards present. "
        "Speed adjustment may be needed based on the traffic ahead. "
        "No lateral maneuvering is required at this time."
    )

    def _build_template(n_exp_nulls: int) -> str:
        """Serialize the template with *n_exp_nulls* NULL markers appended."""
        null_pad = "<|NULL|>" * n_exp_nulls
        data_obj = {
            "critical_objects": {
                "nearby_vehicle": "no", "pedestrian": "no", "cyclist": "no",
                "construction": "no", "traffic_element": "no",
                "weather_condition": "no", "road_hazard": "no",
                "emergency_vehicle": "no", "animal": "no",
                "special_vehicle": "no", "conflicting_vehicle": "no",
                "door_opening_vehicle": "no",
            },
            "explanation": filler_explanation + null_pad,
            "future_meta_behavior": {
                "longitudinal": "come to stop",
                "lateral": "go straight<|NULL|>",
            },
            # Raw trajectory — will be normalized below
            "trajectory": "[[+14.70,-00.04], [+29.55,-00.21], [+44.51,-00.56], [+59.50,-01.06], [+74.39,-01.69]]",
        }
        # Same trajectory normalization as the dataloader (per original notes).
        traj = data_obj["trajectory"]

        def _fmt_coord(m):
            # Keep the sign, zero-pad the magnitude to width 6 with 2 decimals.
            raw = m.group(0)
            sign = raw[0]
            num = float(raw[1:])
            return f"{sign}{num:06.2f}"

        traj = _re.sub(r'[+-]\d+\.\d+', _fmt_coord, traj)
        # Insert a space before every signed number that follows ',' or '['.
        traj = _re.sub(r',([+-])', r', \1', traj)
        traj = _re.sub(r'\[([+-])', r'[ \1', traj)
        data_obj["trajectory"] = traj
        return _json.dumps(data_obj, ensure_ascii=False)

    # ── Step 2: Adjust the NULL count so the explanation hits exp_budget ──
    deep_seqs = build_deep_scaffold_sequences(tokenizer)
    top_seqs = deep_seqs["top"]

    def _count_exp_value_tokens(tok_list):
        """Count explanation VALUE tokens, or return None if a boundary is missing."""
        co_exp_pat = top_seqs["between_co_exp"]
        exp_fmb_pat = top_seqs["between_exp_fmb"]
        co_exp_pos = _find_subseq(tok_list, co_exp_pat, 0)
        if co_exp_pos < 0:
            return None
        exp_start = co_exp_pos + len(co_exp_pat)
        exp_fmb_pos = _find_subseq(tok_list, exp_fmb_pat, exp_start)
        if exp_fmb_pos < 0:
            return None
        # exp_start..exp_fmb_pos includes opening/closing quotes (scaffold)
        # value tokens = total - 2 (quotes)
        return (exp_fmb_pos - exp_start) - 2

    # Measure base explanation tokens (no NULLs)
    toks_0 = tokenizer.encode(_build_template(0), add_special_tokens=False)
    base_exp = _count_exp_value_tokens(toks_0)
    if base_exp is not None:
        needed_nulls = max(0, exp_budget - base_exp)
    else:
        needed_nulls = exp_budget // 2  # fallback
    # Build and measure, then correct the NULL count at most once.
    template = _build_template(needed_nulls)
    template_tokens = tokenizer.encode(template, add_special_tokens=False)
    actual_exp = _count_exp_value_tokens(template_tokens)
    if actual_exp is not None and actual_exp != exp_budget:
        needed_nulls = max(0, needed_nulls + (exp_budget - actual_exp))
        template = _build_template(needed_nulls)
        template_tokens = tokenizer.encode(template, add_special_tokens=False)

    # ── Step 3: Run training scaffold detection ──
    # A dummy prompt of label -100 precedes the template so that only the
    # template counts as the response.
    prompt_len = 10
    all_tokens = [1] * prompt_len + template_tokens
    labels_list = [-100] * prompt_len + template_tokens
    labels = torch.tensor([labels_list])
    token_ids = torch.tensor([all_tokens])
    block_info = compute_section_block_idx_deep_static(
        labels, token_ids, deep_seqs, fallback_block_size=32,
    )
    # The scaffold mask is the 4th returned element. Index it rather than
    # unpacking a fixed arity: the callee is annotated and documented as a
    # 4-tuple, while this call site previously unpacked five values.
    scaffold_mask_tensor = block_info[3]

    # ── Step 4: Extract scaffold/value and replace value with MASK ──
    # One tolist() instead of a per-position .item() call.
    response_flags = scaffold_mask_tensor[
        prompt_len:prompt_len + len(template_tokens)
    ].tolist()
    scaffold_mask_list: List[int] = [1 if flag else 0 for flag in response_flags]
    scaffold_tokens = [
        tok if frozen else mask_id
        for tok, frozen in zip(template_tokens, scaffold_mask_list)
    ]

    # ── Step 5: Compute section ranges ──
    section_ranges: Dict[str, Tuple[int, int]] = {}
    boundary_order = [
        ("prefix", "critical_objects"),
        ("between_co_exp", "explanation"),
        ("between_exp_fmb", "future_meta_behavior"),
        ("between_fmb_traj", "trajectory"),
    ]
    search_from = 0
    prev_section_name = None
    prev_value_start = None
    for boundary_key, section_name in boundary_order:
        pattern = top_seqs.get(boundary_key)
        if pattern is None:
            continue
        pos = _find_subseq(template_tokens, pattern, search_from)
        if pos < 0:
            continue
        # A newly found boundary closes the previous section's range.
        if prev_section_name is not None and prev_value_start is not None:
            section_ranges[prev_section_name] = (prev_value_start, pos)
        value_start = pos + len(pattern)
        prev_section_name = section_name
        prev_value_start = value_start
        search_from = value_start
    # The last section found extends to the end of the template.
    if prev_section_name is not None and prev_value_start is not None:
        section_ranges[prev_section_name] = (prev_value_start, len(template_tokens))
    return scaffold_tokens, section_ranges, scaffold_mask_list
def _find_subseq(seq: List[int], pattern: List[int], start: int = 0) -> int:
    """Return the index of the first occurrence of *pattern* in *seq*.

    Args:
        seq: sequence to search.
        pattern: contiguous run to look for. An empty pattern matches at
            ``start`` as long as ``start <= len(seq)``.
        start: first index to consider. Negative values are treated as 0.

    Returns:
        The non-negative index of the first match at or after *start*, or -1
        if there is none.
    """
    n = len(pattern)
    # Clamp: a negative start would make the slices below wrap around, so a
    # real match could come back as a negative index, which every caller
    # interprets as "not found".
    first = max(start, 0)
    for i in range(first, len(seq) - n + 1):
        if seq[i:i + n] == pattern:
            return i
    return -1
def build_deep_scaffold_sequences(tokenizer) -> Dict[str, object]:
    """Pre-compute the token patterns used for deep scaffold matching.

    Every pattern is produced by encoding a literal JSON fragment with
    ``add_special_tokens=False``.

    Returns a dict with:
      - ``top``: top-level boundary patterns from
        ``build_static_scaffold_sequences``
      - ``co_*``: critical_objects sub-key and closing patterns
      - ``fmb_*``: future_meta_behavior patterns
      - ``traj_*``: trajectory structure patterns, plus
        ``traj_only_boundaries`` for trajectory-only responses
    """
    def _enc(text: str) -> List[int]:
        return tokenizer.encode(text, add_special_tokens=False)

    # ── Top-level boundaries (reuse existing) ──
    top_level = build_static_scaffold_sequences(tokenizer)

    # ── critical_objects sub-key patterns ──
    # The first key is introduced by ' {"key": "', every later key by
    # '", "key": "'. (Original notes: token 5212 = ' {"' and token 497 = '","'
    # are merged tokens in context — IDs not verifiable from this file.)
    co_subkeys = [
        {
            "key": name,
            "pattern": _enc((' {"' if idx == 0 else '", "') + name + '": "'),
            "index": idx,
        }
        for idx, name in enumerate(CRITICAL_OBJECTS_SUBKEYS)
    ]

    return {
        "top": top_level,
        "co_subkeys": co_subkeys,
        "co_closing": _enc('"}'),
        # json.dumps produces '"},' which may merge into a single token
        "co_closing_comma": _enc('"},'),
        # ── future_meta_behavior sub-key patterns ──
        # Expected shape (per original notes):
        #   ' {"longitudinal": "keep speed", "lateral": "go straight"}'
        # Scaffold = everything except the value content between quotes.
        "fmb_prefix": _enc(' {"longitudinal": "'),
        "fmb_closing": _enc('"}'),
        "fmb_closing_comma": _enc('"},'),
        # Between longitudinal value and lateral value
        "fmb_between": _enc('", "lateral": "'),
        # ── trajectory structure patterns ──
        # Trajectory value looks like ' "[[ +14.70, -00.04], [ +29.55, ...]]"'
        # once the dataloader has inserted spaces (per original notes, the
        # separators then tokenize independently: '],'=1125, ' ['=508, ','=11).
        "traj_open": _enc(' "[['),
        "traj_wp_sep": _enc('],'),
        "traj_wp_open": _enc(' ['),
        "traj_coord_comma": _enc(','),
        # Three closing variants, from the longest merge to the shortest
        "traj_close": _enc(']]"}'),
        "traj_close_split": _enc(']]"'),
        "traj_close_split2": _enc(']]'),
        # Trajectory-only output support, e.g. {"trajectory": "..."}.
        "traj_only_boundaries": [
            _enc('{"trajectory":'),
            _enc(' {"trajectory":'),
            _enc('"trajectory":'),
            _enc(' "trajectory":'),
        ],
    }
def _mark_scaffold_range(scaffold_positions: List[int], start: int, length: int):
    """Append the positions [start, start+length) to *scaffold_positions* in place.

    Nothing is appended when *length* is zero or negative. Returns None.
    """
    scaffold_positions.extend(range(start, start + length))
+def compute_section_block_idx_deep_static(
+    labels: torch.Tensor,
+    token_ids: torch.Tensor,
+    deep_scaffold_sequences: Dict[str, object],
+    fallback_block_size: int = 32,
+) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor]:
+    """
+    Deep-scaffold v2 block index computation.
+    Freezes sub-keys within sections:
+      - critical_objects: only yes/no values are denoised
+      - future_meta_behavior: only value tokens are denoised
+      - trajectory: only coordinate digits are denoised
+      - explanation: all content is denoised
+    Block count per section is computed dynamically:
+    ``n_blocks = ceil(num_value_tokens / fallback_block_size)``.
+    Args:
+        labels:                [B, seq_len]
+        token_ids:             [B, seq_len]
+        deep_scaffold_sequences: output of ``build_deep_scaffold_sequences``
+        fallback_block_size:   block size (bd_size), default 32
+    Returns:
+        response_block_idx, turn_idx, n_blocks, scaffold_mask
+    """
+    labels_single = labels[0]
+    token_list = token_ids[0].tolist()
+    seq_len = labels_single.shape[0]
+    device = labels.device
+    response_mask = (labels_single != -100)
+    response_block_idx = torch.full((seq_len,), -1, device=device, dtype=torch.int64)
+    turn_idx = torch.zeros((seq_len,), device=device, dtype=torch.int64)
+    scaffold_mask = torch.zeros((seq_len,), device=device, dtype=torch.bool)
+    response_positions = response_mask.nonzero(as_tuple=True)[0]
+    if len(response_positions) == 0:
+        return response_block_idx, turn_idx, 0, scaffold_mask
+    resp_start = response_positions[0].item()
+    resp_end = response_positions[-1].item() + 1
+    effective_resp_end = resp_end
+    resp_tokens = token_list[resp_start:resp_end]
+    top_seqs = deep_scaffold_sequences["top"]
+    # ── Step 1: Find top-level section boundaries (same as static version) ──
+    boundary_order = [
+        ("prefix",           "critical_objects"),
+        ("between_co_exp",   "explanation"),
+        ("between_exp_fmb",  "future_meta_behavior"),
+        ("between_fmb_traj", "trajectory"),
+    ]
+    sections: Dict[str, Tuple[int, int]] = {}
+    scaffold_positions: List[int] = []
+    # Top-level boundary scaffold tokens should belong to the *following*
+    # section's first block (e.g. `"explanation":` -> explanation block 0).
+    boundary_scaffold_to_section: Dict[str, List[int]] = {}
+    search_from = 0
+    prev_section_name: Optional[str] = None
+    prev_value_start: Optional[int] = None
+    for boundary_key, section_name in boundary_order:
+        pattern = top_seqs.get(boundary_key)
+        if pattern is None:
+            continue
+        pos = _find_subseq(resp_tokens, pattern, search_from)
+        if pos < 0:
+            continue
+        if prev_section_name is not None and prev_value_start is not None:
+            sections[prev_section_name] = (prev_value_start, pos)
+        _mark_scaffold_range(scaffold_positions, pos, len(pattern))
+        boundary_scaffold_to_section.setdefault(section_name, []).extend(
+            list(range(pos, pos + len(pattern)))
+        )
+        value_start = pos + len(pattern)
+        prev_section_name = section_name
+        prev_value_start = value_start
+        search_from = value_start
+    if prev_section_name is not None and prev_value_start is not None:
+        sections[prev_section_name] = (prev_value_start, len(resp_tokens))
+    # New dataset compatibility: response may contain only trajectory.
+    # If the 4-section boundaries are not found, try direct trajectory key match.
+    if "trajectory" not in sections:
+        traj_only_patterns = deep_scaffold_sequences.get("traj_only_boundaries", [])
+        # Reuse legacy boundary pattern as additional fallback (contains
+        # `"trajectory":` in old-format responses).
+        between_fmb_traj = top_seqs.get("between_fmb_traj")
+        if between_fmb_traj:
+            traj_only_patterns = list(traj_only_patterns) + [between_fmb_traj]
+        traj_pos = -1
+        traj_pat: Optional[List[int]] = None
+        for pat in traj_only_patterns:
+            if not pat:
+                continue
+            pos = _find_subseq(resp_tokens, pat, 0)
+            if pos >= 0:
+                traj_pos = pos
+                traj_pat = pat
+                break
+        if traj_pos >= 0 and traj_pat is not None:
+            _mark_scaffold_range(scaffold_positions, traj_pos, len(traj_pat))
+            boundary_scaffold_to_section.setdefault("trajectory", []).extend(
+                list(range(traj_pos, traj_pos + len(traj_pat)))
+            )
+            sections["trajectory"] = (traj_pos + len(traj_pat), len(resp_tokens))
+    # print(f"sections: {sections}")
+    # ── Step 2: Deep scaffold within critical_objects ──
+    if "critical_objects" in sections:
+        co_start, co_end = sections["critical_objects"]
+        co_tokens = resp_tokens[co_start:co_end]
+        co_search = 0
+        for entry in deep_scaffold_sequences["co_subkeys"]:
+            pattern = entry["pattern"]
+            pos = _find_subseq(co_tokens, pattern, co_search)
+            if pos < 0:
+                continue
+            _mark_scaffold_range(scaffold_positions, co_start + pos, len(pattern))
+            # The single value token is right after the pattern — skip it
+            co_search = pos + len(pattern) + 1
+        # Mark closing '"}' or "}," as scaffold
+        co_close = deep_scaffold_sequences["co_closing"]
+        close_pos = _find_subseq(co_tokens, co_close,
+                                  max(0, len(co_tokens) - len(co_close) - 2))
+        if close_pos >= 0:
+            _mark_scaffold_range(scaffold_positions, co_start + close_pos, len(co_close))
+        else:
+            # json.dumps may produce "}," as a single token
+            co_close_comma = deep_scaffold_sequences.get("co_closing_comma")
+            if co_close_comma:
+                close_pos = _find_subseq(co_tokens, co_close_comma,
+                                          max(0, len(co_tokens) - len(co_close_comma) - 2))
+                if close_pos >= 0:
+                    _mark_scaffold_range(scaffold_positions, co_start + close_pos, len(co_close_comma))
+    # ── Step 2b: Explanation opening/closing quotes as scaffold ──
+    # Explanation content is all VALUE, but the surrounding quotes must be
+    # SCAFFOLD so that VALUE tokens are exactly block-aligned (multiple of bd_size).
+    if "explanation" in sections:
+        exp_start, exp_end = sections["explanation"]
+        if exp_start < exp_end:
+            # Opening quote: first token of explanation section (e.g. ' "')
+            scaffold_positions.append(exp_start)
+            # Closing quote+comma: last token (e.g. '",')
+            scaffold_positions.append(exp_start + (exp_end - exp_start) - 1)
+    # ── Step 3: Deep scaffold within future_meta_behavior ──
+    # After dataloader processing, FMB has no <|mdm_start|>/<|mdm_end|> markers.
+    # Format: ' {"longitudinal": "keep speed", "lateral": "go straight"}'
+    # Strategy: use fmb_prefix to find start, fmb_between to split long/lat values,
+    # and fmb_closing to find end. Everything except value content is scaffold.
+    if "future_meta_behavior" in sections:
+        fmb_start, fmb_end = sections["future_meta_behavior"]
+        fmb_tokens = resp_tokens[fmb_start:fmb_end]
+        fmb_scaffold_positions = set()
+        # 1. Mark fmb_prefix as scaffold: ' {"longitudinal": "'
+        fmb_prefix = deep_scaffold_sequences["fmb_prefix"]
+        prefix_pos = _find_subseq(fmb_tokens, fmb_prefix, 0)
+        if prefix_pos >= 0:
+            for i in range(prefix_pos, prefix_pos + len(fmb_prefix)):
+                fmb_scaffold_positions.add(i)
+            long_value_start = prefix_pos + len(fmb_prefix)
+            # 2. Mark fmb_between as scaffold: '", "lateral": "'
+            fmb_between = deep_scaffold_sequences.get("fmb_between")
+            if fmb_between:
+                between_pos = _find_subseq(fmb_tokens, fmb_between, long_value_start)
+                if between_pos >= 0:
+                    for i in range(between_pos, between_pos + len(fmb_between)):
+                        fmb_scaffold_positions.add(i)
+                    lat_value_start = between_pos + len(fmb_between)
+                    # 3. Mark closing '"}'  or "}," as scaffold
+                    fmb_close = deep_scaffold_sequences["fmb_closing"]
+                    close_pos = _find_subseq(fmb_tokens, fmb_close,
+                                              max(0, len(fmb_tokens) - len(fmb_close) - 2))
+                    if close_pos < 0:
+                        fmb_close_comma = deep_scaffold_sequences.get("fmb_closing_comma")
+                        if fmb_close_comma:
+                            close_pos = _find_subseq(fmb_tokens, fmb_close_comma,
+                                                      max(0, len(fmb_tokens) - len(fmb_close_comma) - 2))
+                            if close_pos >= 0:
+                                fmb_close = fmb_close_comma
+                    if close_pos >= 0:
+                        for i in range(close_pos, close_pos + len(fmb_close)):
+                            fmb_scaffold_positions.add(i)
+        for i in fmb_scaffold_positions:
+            scaffold_positions.append(fmb_start + i)
+    # ── Step 4: Deep scaffold within trajectory ──
+    # After dataloader processing (no mdm markers), trajectory is:
+    #   ' "[[+14.70,-00.04], [+29.55,-00.21], ...]"'
+    if "trajectory" in sections:
+        traj_start, traj_end = sections["trajectory"]
+        traj_tokens = resp_tokens[traj_start:traj_end]
+        # Opening "[[
+        traj_open = deep_scaffold_sequences["traj_open"]
+        open_pos = _find_subseq(traj_tokens, traj_open, 0)
+        if open_pos >= 0:
+            _mark_scaffold_range(scaffold_positions, traj_start + open_pos, len(traj_open))
+        # Waypoint separators ], (4 of them between 5 waypoints)
+        traj_wp_sep = deep_scaffold_sequences["traj_wp_sep"]
+        sep_search = 0
+        for _ in range(4):
+            sep_pos = _find_subseq(traj_tokens, traj_wp_sep, sep_search)
+            if sep_pos < 0:
+                break
+            _mark_scaffold_range(scaffold_positions, traj_start + sep_pos, len(traj_wp_sep))
+            sep_search = sep_pos + len(traj_wp_sep)
+        # Intermediate waypoint opening ' [' (4 of them, between 5 waypoints)
+        traj_wp_open = deep_scaffold_sequences.get("traj_wp_open")
+        if traj_wp_open:
+            wo_search = 0
+            for _ in range(4):
+                wo_pos = _find_subseq(traj_tokens, traj_wp_open, wo_search)
+                if wo_pos < 0:
+                    break
+                _mark_scaffold_range(scaffold_positions, traj_start + wo_pos, len(traj_wp_open))
+                wo_search = wo_pos + len(traj_wp_open)
+        # Coordinate comma ',' between x and y within each waypoint (5 of them)
+        traj_coord_comma = deep_scaffold_sequences.get("traj_coord_comma")
+        if traj_coord_comma:
+            cc_search = 0
+            for _ in range(5):
+                cc_pos = _find_subseq(traj_tokens, traj_coord_comma, cc_search)
+                if cc_pos < 0:
+                    break
+                _mark_scaffold_range(scaffold_positions, traj_start + cc_pos, len(traj_coord_comma))
+                cc_search = cc_pos + len(traj_coord_comma)
+        # Closing ]]" or just ]]
+        traj_close = deep_scaffold_sequences["traj_close"]
+        close_pos = _find_subseq(traj_tokens, traj_close,
+                                  max(0, len(traj_tokens) - len(traj_close) - 6))
+        if close_pos < 0:
+            for split_key in ["traj_close_split", "traj_close_split2"]:
+                tcs = deep_scaffold_sequences.get(split_key)
+                if tcs:
+                    close_pos = _find_subseq(traj_tokens, tcs,
+                                              max(0, len(traj_tokens) - len(tcs) - 6))
+                    if close_pos >= 0:
+                        traj_close = tcs
+                        break
+        if close_pos >= 0:
+            _mark_scaffold_range(scaffold_positions, traj_start + close_pos, len(traj_close))
+            # Align training with inference scaffold: exclude trailing tokens
+            # after the JSON closing of trajectory (e.g. "<|im_end|>\n") from
+            # section/block scheduling.
+            effective_resp_end = min(
+                effective_resp_end,
+                resp_start + traj_start + close_pos + len(traj_close),
+            )
+        # Opening quote " (first token of traj value)
+        if len(traj_tokens) > 0:
+            scaffold_positions.append(traj_start)
+    # ── Mark scaffold mask (absolute positions) ──
+    scaffold_positions_set = set(scaffold_positions)
+    for sp in scaffold_positions_set:
+        abs_pos = resp_start + sp
+        if abs_pos < seq_len:
+            scaffold_mask[abs_pos] = True
+    # ── Assign block indices per section ──
+    current_block = 0
+    assigned = set()
+    block_to_section = {}  # block_idx -> section_name (for SASD compatibility)
+    section_first_block: Dict[str, int] = {}
+    for section_name in SECTION_KEYS:
+        if section_name not in sections:
+            continue
+        rel_start, rel_end = sections[section_name]
+        abs_start = resp_start + rel_start
+        abs_end = resp_start + rel_end
+        abs_start = max(abs_start, resp_start)
+        abs_end = min(abs_end, effective_resp_end)
+        num_tokens = abs_end - abs_start
+        if num_tokens <= 0:
+            continue
+        # Count only non-scaffold tokens for block sizing
+        value_positions = [p for p in range(abs_start, abs_end)
+                           if response_mask[p] and (p - resp_start) not in scaffold_positions_set]
+        num_value_tokens = len(value_positions)
+        if num_value_tokens <= 0:
+            section_first_block[section_name] = current_block
+            block_to_section[current_block] = section_name
+            current_block += 1
+            continue
+        # Use fixed block size (bd_size) and compute number of blocks dynamically
+        tokens_per_step = fallback_block_size
+        n_steps = max(1, math.ceil(num_value_tokens / tokens_per_step))
+        for b in range(n_steps):
+            block_to_section[current_block + b] = section_name
+        section_first_block[section_name] = current_block
+        for vi, pos in enumerate(value_positions):
+            block_in_section = min(vi // tokens_per_step, n_steps - 1)
+            response_block_idx[pos] = current_block + block_in_section
+            assigned.add(pos)
+        current_block += n_steps
+    # Assign scaffold tokens within each section to the nearest value token
+    # in the SAME section. This keeps section-closing tokens such as `"},`
+    # with their section instead of drifting to the next section.
+    for section_name in SECTION_KEYS:
+        if section_name not in sections:
+            continue
+        rel_start, rel_end = sections[section_name]
+        abs_start = max(resp_start + rel_start, resp_start)
+        abs_end = min(resp_start + rel_end, resp_end)
+        if abs_end <= abs_start:
+            continue
+        for abs_pos in range(abs_start, abs_end):
+            rel_pos = abs_pos - resp_start
+            if (
+                abs_pos >= seq_len
+                or not response_mask[abs_pos]
+                or abs_pos in assigned
+                or rel_pos not in scaffold_positions_set
+            ):
+                continue
+            best_block = -1
+            max_delta = max(1, abs_end - abs_start)
+            for delta in range(1, max_delta + 1):
+                # Prefer left first so closing punctuation tends to stay with
+                # the preceding content in the same section.
+                for cand in [abs_pos - delta, abs_pos + delta]:
+                    if abs_start <= cand < abs_end and cand in assigned:
+                        best_block = response_block_idx[cand].item()
+                        break
+                if best_block >= 0:
+                    break
+            if best_block < 0:
+                best_block = section_first_block.get(section_name, -1)
+            if best_block >= 0:
+                response_block_idx[abs_pos] = best_block
+                assigned.add(abs_pos)
+    # Top-level boundary tokens are explicitly attached to the following
+    # section's first block, instead of nearest-neighbor assignment.
+    for section_name, rel_positions in boundary_scaffold_to_section.items():
+        first_block = section_first_block.get(section_name)
+        if first_block is None:
+            continue
+        for rel_pos in rel_positions:
+            abs_pos = resp_start + rel_pos
+            if abs_pos >= seq_len or not response_mask[abs_pos]:
+                continue
+            response_block_idx[abs_pos] = first_block
+            assigned.add(abs_pos)
+    # Scaffold tokens → block index of nearest assigned neighbour
+    for sp in scaffold_positions_set:
+        abs_pos = resp_start + sp
+        if (
+            abs_pos >= seq_len
+            or abs_pos >= effective_resp_end
+            or not response_mask[abs_pos]
+            or abs_pos in assigned
+        ):
+            continue
+        best_block = -1
+        for delta in range(1, seq_len):
+            for cand in [abs_pos + delta, abs_pos - delta]:
+                if 0 <= cand < seq_len and cand in assigned:
+                    best_block = response_block_idx[cand].item()
+                    break
+            if best_block >= 0:
+                break
+        if best_block >= 0:
+            response_block_idx[abs_pos] = best_block
+            assigned.add(abs_pos)
+    # Fallback for unassigned response tokens
+    for pos in range(resp_start, effective_resp_end):
+        if response_mask[pos] and pos not in assigned:
+            offset = pos - resp_start
+            response_block_idx[pos] = current_block + offset // fallback_block_size
+            assigned.add(pos)
+    fallback_positions = [p for p in range(resp_start, effective_resp_end)
+                          if response_mask[p] and response_block_idx[p].item() >= current_block]
+    if fallback_positions:
+        current_block = max(response_block_idx[p].item() for p in fallback_positions) + 1
+    n_blocks = current_block
+    # Turn index
+    for i in range(1, seq_len):
+        if response_block_idx[i] != response_block_idx[i - 1]:
+            turn_idx[i] = turn_idx[i - 1] + 1
+        else:
+            turn_idx[i] = turn_idx[i - 1]
+    return response_block_idx, turn_idx, n_blocks, scaffold_mask, block_to_section

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "|<MASK>|",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a25ec1183126b2a0a76961dba7680d62b2209776fc31d39b85be8833b9386ae9
+size 11422266

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,212 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "|<MASK>|",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|NULL|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "|<MASK>|"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff