Instructions to use nvidia/Nemotron-Labs-Diffusion-8B-Base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use nvidia/Nemotron-Labs-Diffusion-8B-Base with Transformers:

# Option 1: let a text-generation pipeline wrap the model.
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="nvidia/Nemotron-Labs-Diffusion-8B-Base",
    trust_remote_code=True,  # needed so the repo's own modeling code can be loaded
)
messages = [{"role": "user", "content": "Who are you?"}]
pipe(messages)

# Option 2: instantiate the model class yourself.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "nvidia/Nemotron-Labs-Diffusion-8B-Base",
    trust_remote_code=True,
    dtype="auto",
)

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use nvidia/Nemotron-Labs-Diffusion-8B-Base with vLLM:

Install from pip and serve model

# Quick start: install vLLM, launch a server for the model, then query it.
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
# NOTE(review): presumably this command stays in the foreground, so the curl
# request below would be issued from a second shell once the server is up — confirm.
vllm serve "nvidia/Nemotron-Labs-Diffusion-8B-Base"
# Call the server using curl (OpenAI-compatible API):
# Posts a single user message to the chat-completions endpoint on localhost:8000.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-8B-Base",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

# Run the model with `docker model run`, referencing the repo through its hf.co path.
docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-8B-Base

SGLang

How to use nvidia/Nemotron-Labs-Diffusion-8B-Base with SGLang:

Install from pip and serve model

# Quick start: install SGLang, launch a server for the model, then query it.
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# The server is launched with host 0.0.0.0 and port 30000.
# NOTE(review): presumably this command stays in the foreground, so the curl
# request below would be issued from a second shell once the server is up — confirm.
python3 -m sglang.launch_server \
    --model-path "nvidia/Nemotron-Labs-Diffusion-8B-Base" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# Posts a single user message to the chat-completions endpoint on localhost:30000.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-8B-Base",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Launch the SGLang server from the lmsysorg/sglang:latest image.
# The container is given all GPUs, publishes port 30000 to the host, and mounts
# the local ~/.cache/huggingface directory at /root/.cache/huggingface.
# Replace the <secret> placeholder in HF_TOKEN with your own Hugging Face token.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Nemotron-Labs-Diffusion-8B-Base" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# Posts a single user message to the chat-completions endpoint on localhost:30000.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-8B-Base",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Nemotron-Labs-Diffusion-8B-Base with Docker Model Runner:
```
# Run the model with Docker Model Runner, referencing the repo through its hf.co path.
docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-8B-Base
```

YongganFu committed 19 days ago

Commit

f318bfe

verified ·

1 Parent(s): d42bc62

Upload model

Browse files

Files changed (4) hide show

chat_utils.py +69 -3
config.json +1 -0
configuration_ministral_dlm.py +5 -0
modeling_ministral_dlm.py +1164 -54

chat_utils.py CHANGED Viewed

@@ -113,10 +113,13 @@ def generate_with_prefix_cache_block_diff(
     shift_logits=False,
     neg_entropy=False,
     causal_context=False,
 ):
     dream_style=shift_logits
-    # Initialize the accumulator
     x_accum = prompt.clone()
     assert gen_length % block_length == 0
     num_blocks = gen_length // block_length
@@ -141,30 +144,66 @@ def generate_with_prefix_cache_block_diff(
             if hasattr(layer.self_attn, 'diffusion_lm'):
                 layer.self_attn.diffusion_lm=True
     # For dream_style: store the "next token logit" of the context
     next_logits_context = None
     if dream_style:
         next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
     for num_block in range(num_blocks):
-        # Create a new block with mask tokens (no seeding)
         mask_block = torch.ones(
             (prompt.shape[0], block_length),
             dtype=prompt.dtype,
             device=prompt.device
         ) * mask_id
         # Append the block of masks
         x_accum = torch.cat([x_accum, mask_block], dim=1)
         current_block_start = prompt.size(1) + num_block * block_length
         block_slice = slice(current_block_start, current_block_start + block_length)
         # Build the initial mask for this block
         mask_block_idx0 = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
         # Precompute the transfer schedule for this block
         if dream_style:
-            # still denoise *all* positions (0..Lb-1), since none are seeded
             schedule_mask = mask_block_idx0
         else:
             schedule_mask = mask_block_idx0
@@ -221,6 +260,16 @@ def generate_with_prefix_cache_block_diff(
                 cur[transfer_idx] = x0[transfer_idx]
                 x_accum[:, block_slice] = cur
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
@@ -234,14 +283,31 @@ def generate_with_prefix_cache_block_diff(
             use_causal_mask=causal_context
         )
         past_key_values = output.past_key_values
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
                     layer.self_attn.diffusion_lm=True
         if dream_style and num_block < num_blocks - 1:
             # refresh context-next logit for the next block
             next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
     return x_accum, nfe

     shift_logits=False,
     neg_entropy=False,
     causal_context=False,
+    eos_token_id=None,
+    max_thinking_tokens=None,
+    end_think_token_id=None,
 ):
     dream_style=shift_logits
     x_accum = prompt.clone()
+    B = prompt.shape[0]
     assert gen_length % block_length == 0
     num_blocks = gen_length // block_length
             if hasattr(layer.self_attn, 'diffusion_lm'):
                 layer.self_attn.diffusion_lm=True
+    # Causal prefill: next token from last position (same as linear_spec_generate).
+    next_token = None
+    if causal_context:
+        last_logit = output.logits[:, -1, :]
+        if temperature > 0:
+            probs = torch.softmax(last_logit / temperature, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
     # For dream_style: store the "next token logit" of the context
     next_logits_context = None
     if dream_style:
         next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
     for num_block in range(num_blocks):
+        # Create a new block with mask tokens; under causal context, seed position 0
+        # with the next-token prediction from the previous causal forward (prefill or
+        # post-block encode), matching linear_spec_generate.
         mask_block = torch.ones(
             (prompt.shape[0], block_length),
             dtype=prompt.dtype,
             device=prompt.device
         ) * mask_id
+        if causal_context:
+            mask_block[:, 0] = next_token[:, 0]
         # Append the block of masks
         x_accum = torch.cat([x_accum, mask_block], dim=1)
         current_block_start = prompt.size(1) + num_block * block_length
         block_slice = slice(current_block_start, current_block_start + block_length)
+        # ---- thinking budget enforcement ----
+        # If we've generated >= max_thinking_tokens without a </think>, inject one.
+        if end_think_token_id is not None and max_thinking_tokens is not None:
+            tokens_before_block = num_block * block_length
+            tokens_after_block = tokens_before_block + block_length
+            if tokens_after_block > max_thinking_tokens:
+                gen_so_far = x_accum[:, prompt.size(1):current_block_start]
+                has_end_think = (
+                    (gen_so_far == end_think_token_id).any(dim=1)
+                    if gen_so_far.size(1) > 0
+                    else torch.zeros(B, dtype=torch.bool, device=prompt.device)
+                )
+                if not has_end_think.all():
+                    if tokens_before_block < max_thinking_tokens:
+                        offset = max_thinking_tokens - tokens_before_block
+                    else:
+                        offset = 0
+                    inject_pos = current_block_start + offset
+                    for b in range(B):
+                        if not has_end_think[b]:
+                            x_accum[b, inject_pos] = end_think_token_id
         # Build the initial mask for this block
         mask_block_idx0 = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
         # Precompute the transfer schedule for this block
         if dream_style:
+            # masked positions only (position 0 may be causal-seeded, not mask_id)
             schedule_mask = mask_block_idx0
         else:
             schedule_mask = mask_block_idx0
                 cur[transfer_idx] = x0[transfer_idx]
                 x_accum[:, block_slice] = cur
+            if eos_token_id is not None:
+                block_tokens = x_accum[:, block_slice]              # (B, Lb)
+                eos_mask = (block_tokens == eos_token_id)           # (B, Lb)
+                any_eos = eos_mask.any(dim=1)                       # (B,)
+                if any_eos.any():
+                    after_eos = eos_mask.cumsum(dim=1).bool()       # (B, Lb)
+                    mask_before = (block_tokens == mask_id) & ~after_eos
+                    if (any_eos & ~mask_before.any(dim=1)).any():
+                        break
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
             use_causal_mask=causal_context
         )
         past_key_values = output.past_key_values
+        nfe += 1
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
                     layer.self_attn.diffusion_lm=True
+            # Next block's first position = greedy/sampled next token from this causal encode
+            last_logit = output.logits[:, -1, :]
+            if temperature > 0:
+                probs = torch.softmax(last_logit / temperature, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
         if dream_style and num_block < num_blocks - 1:
             # refresh context-next logit for the next block
             next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
+        if eos_token_id is not None:
+            gen_so_far = x_accum[:, prompt.size(1):]                    # (B, gen_len_so_far)
+            is_eos = (gen_so_far == eos_token_id)                       # (B, gen_len_so_far)
+            has_eos = is_eos.any(dim=1)                                 # (B,)
+            if has_eos.all():
+                first_eos_pos = is_eos.to(torch.int64).argmax(dim=1)    # (B,)
+                max_eos = first_eos_pos.max().item()
+                return x_accum[:, : prompt.size(1) + max_eos + 1], nfe
     return x_accum, nfe

config.json CHANGED Viewed

@@ -22,6 +22,7 @@
   "dlm_paradigm": "bidirectional",
   "dlm_type": "llada",
   "dp_varying_mask_ratio": false,
   "enforce_mask": false,
   "eos_token_id": 2,
   "global_loss_avg": false,

   "dlm_paradigm": "bidirectional",
   "dlm_type": "llada",
   "dp_varying_mask_ratio": false,
+  "enable_self_spec": false,
   "enforce_mask": false,
   "eos_token_id": 2,
   "global_loss_avg": false,

configuration_ministral_dlm.py CHANGED Viewed

@@ -112,6 +112,9 @@ class MinistralDLMConfig(PretrainedConfig):
             Adaptive permutation ratio for each block.
         ada_perm_ratio_global (`float`, *optional*):
             Adaptive permutation ratio for global.
     """
     model_type = "ministral_dlm"
@@ -181,6 +184,7 @@ class MinistralDLMConfig(PretrainedConfig):
         ada_perm_ratio_per_block=None,
         ada_perm_ratio_global=None,
         ada_dlm_loss_ratio=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -234,6 +238,7 @@ class MinistralDLMConfig(PretrainedConfig):
         self.ada_perm_ratio_per_block = ada_perm_ratio_per_block
         self.ada_perm_ratio_global = ada_perm_ratio_global
         self.ada_dlm_loss_ratio = ada_dlm_loss_ratio
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,

             Adaptive permutation ratio for each block.
         ada_perm_ratio_global (`float`, *optional*):
             Adaptive permutation ratio for global.
+        enable_self_spec (`bool`, *optional*, defaults to `False`):
+            Force MinistralFlexAttention for all paradigms (including bidirectional/autoregressive).
+            Required for self speculative generation; leave False for standard eval to use faster SDPA kernels.
     """
     model_type = "ministral_dlm"
         ada_perm_ratio_per_block=None,
         ada_perm_ratio_global=None,
         ada_dlm_loss_ratio=None,
+        enable_self_spec=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
         self.ada_perm_ratio_per_block = ada_perm_ratio_per_block
         self.ada_perm_ratio_global = ada_perm_ratio_global
         self.ada_dlm_loss_ratio = ada_dlm_loss_ratio
+        self.enable_self_spec = enable_self_spec
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,

modeling_ministral_dlm.py CHANGED Viewed

@@ -13,7 +13,7 @@ from torch import nn
 from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutput
 from transformers.utils import ModelOutput
-from torch.nn.attention.flex_attention import flex_attention, create_block_mask
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
@@ -31,6 +31,7 @@ from .chat_utils import generate_with_prefix_cache_block_diff
 from .modeling_ministral import Ministral3Model, Ministral3PreTrainedModel, Ministral3Attention, apply_rotary_pos_emb, repeat_kv, _get_llama_4_attn_scale
 from .configuration_ministral_dlm import MinistralDLMConfig
 @dataclass
 class MinistralDiffOutputWithPast(ModelOutput):
@@ -49,11 +50,49 @@ class MinistralDiffOutputWithPast(ModelOutput):
 def fused_flex_attention(q, k, v, block_mask=None):
     return flex_attention(q, k, v, block_mask=block_mask)
 # with reference to https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
 class MinistralFlexAttention(Ministral3Attention):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.block_size_orig = self.config.block_size
         if self.config.dlm_paradigm == 'bidirectional':
@@ -69,40 +108,60 @@ class MinistralFlexAttention(Ministral3Attention):
         self.block_size = self.block_size_orig
         self.mode = self.config.dlm_paradigm
         import torch._dynamo.config as dcfg
         dcfg.cache_size_limit = 512
     def set_attention_mode(self, mode, block_size=None):
         self.mode = mode
         self.block_size = block_size
-    def compute_block_mask(self, mode, q_len, block_size=None):
         def bidirectional_mask(b, h, q, kv):
             return (q >= kv) | (q < kv)
         def autoregressive_mask(b, h, q, kv):
             return (q >= kv)
-        def block_diff_mask(block_size, b, h, q_idx, kv_idx, n):
-            """
-            Constructs the specialized block diffusion attention mask for training
-            composed of three masks:
-            - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
-            - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
-            - **Block Causal Mask (M_BC)**: Attention to update x0
-            Args:
-                b, h: Batch and head indices (ignored for mask logic).
-                q_idx, kv_idx: Query and Key indices.
-                seq_len: Total sequence length.
-                block_size: Defines the block structure.
-            Returns:
-                A boolean attention mask.
-            """
-            # Indicate whether token belongs to xt or x0
             x0_flag_q = (q_idx >= n)
             x0_flag_kv = (kv_idx >= n)
@@ -165,15 +224,23 @@ class MinistralFlexAttention(Ministral3Attention):
             attn_mask = autoregressive_mask
         elif mode == 'block_diff':
             assert block_size is not None
-            attn_mask = lambda b, h, q, kv: block_diff_mask(block_size, b, h, q, kv, q_len//2)
         elif mode == 'sbd_block_diff':
             assert block_size is not None
-            attn_mask = lambda b, h, q, kv: sbd_block_diff_mask(block_size, b, h, q, kv, q_len//2)
         else:
             raise ValueError(f"Unknown attention mode: {mode}")
         block_mask = create_block_mask(
-            attn_mask, B=None, H=None, Q_LEN=q_len, KV_LEN=q_len
         )
         return block_mask
@@ -225,40 +292,131 @@ class MinistralFlexAttention(Ministral3Attention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        if self.mode == 'bidirectional':
-            if self.bidirectional_mask is None or q_len != self.bidirectional_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='bidirectional', q_len=q_len)
-            else:
-                block_mask = self.bidirectional_mask
-        elif self.mode == 'autoregressive':
-            if self.autoregressive_mask is None or q_len != self.autoregressive_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='autoregressive', q_len=q_len)
-            else:
-                block_mask = self.autoregressive_mask
-        elif self.mode == 'block_diff':
-            if self.block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.block_diff_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size, q_len=q_len)
-            else:
-                block_mask = self.block_diff_mask
-        elif self.mode == 'sbd_block_diff':
-            if self.sbd_block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.sbd_block_diff_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='sbd_block_diff', block_size=self.block_size, q_len=q_len)
             else:
-                block_mask = self.sbd_block_diff_mask
-        else:
-            raise ValueError(f"Unknown attention mode: {self.mode}")
-        attn_output = fused_flex_attention(query_states, key_states, value_states, block_mask=block_mask)
-        attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-        return attn_output, None
 def gumbel_topk(log_w: torch.Tensor, k: int) -> torch.Tensor:
@@ -285,11 +443,12 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
         diffusion_config = copy.deepcopy(config)
         diffusion_config.diffusion_lm = True
         if config.dlm_paradigm in ['block_diff', 'sbd_block_diff']:
             diffusion_config.attn_class = MinistralFlexAttention
         elif config.dlm_paradigm in ['bidirectional', 'autoregressive']:
-            diffusion_config.attn_class = Ministral3Attention
             if config.dlm_paradigm == 'autoregressive':
                 diffusion_config.diffusion_lm = False
         else:
@@ -713,7 +872,10 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
         )
-    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0):
         out_ids, nfe = generate_with_prefix_cache_block_diff(
                         model=self,
                         prompt=prompt_ids,
@@ -727,8 +889,956 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
                         shift_logits=shift_logits,
                         neg_entropy=False,
                         causal_context=causal_context,
                     )
         return out_ids, nfe
-__all__ = ["MinistralDiffEncoderModel", "MinistralFlexAttention"]

 from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutput
 from transformers.utils import ModelOutput
+from torch.nn.attention.flex_attention import BlockMask, flex_attention, create_block_mask, or_masks
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from .modeling_ministral import Ministral3Model, Ministral3PreTrainedModel, Ministral3Attention, apply_rotary_pos_emb, repeat_kv, _get_llama_4_attn_scale
 from .configuration_ministral_dlm import MinistralDLMConfig
+__all__ = ["MinistralDiffEncoderModel", "MinistralFlexAttention"]
 @dataclass
 class MinistralDiffOutputWithPast(ModelOutput):
 def fused_flex_attention(q, k, v, block_mask=None):
     return flex_attention(q, k, v, block_mask=block_mask)
+def _crop_dynamic_cache(past_key_values: DynamicCache, max_length: int):
+    """Crop a DynamicCache to max_length, compatible with both old and new transformers."""
+    if hasattr(past_key_values, 'crop'):
+        past_key_values.crop(max_length)
+    else:
+        for layer_idx in range(len(past_key_values)):
+            past_key_values.key_cache[layer_idx] = past_key_values.key_cache[layer_idx][:, :, :max_length]
+            past_key_values.value_cache[layer_idx] = past_key_values.value_cache[layer_idx][:, :, :max_length]
+        past_key_values._seen_tokens = max_length
+def _extract_draft_kv_cache(past_key_values: DynamicCache, clean_len: int, block_length: int):
+    """After quadratic decoding, extract only draft tokens (first of each block) from cache."""
+    for layer_idx in range(len(past_key_values)):
+        if hasattr(past_key_values, 'layers'):
+            layer_cache = past_key_values.layers[layer_idx]
+            k, v = layer_cache.keys, layer_cache.values
+        else:
+            k = past_key_values.key_cache[layer_idx]
+            v = past_key_values.value_cache[layer_idx]
+        clean_k, draft_k = k[:, :, :clean_len], k[:, :, clean_len::block_length + 1]
+        clean_v, draft_v = v[:, :, :clean_len], v[:, :, clean_len::block_length + 1]
+        new_k = torch.cat([clean_k, draft_k], dim=2)
+        new_v = torch.cat([clean_v, draft_v], dim=2)
+        if hasattr(past_key_values, 'layers'):
+            layer_cache.keys = new_k
+            layer_cache.values = new_v
+        else:
+            past_key_values.key_cache[layer_idx] = new_k
+            past_key_values.value_cache[layer_idx] = new_v
+    past_key_values._seen_tokens = clean_len + block_length
 # with reference to https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
 class MinistralFlexAttention(Ministral3Attention):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.max_seq_length = getattr(self.config, 'max_seq_length', 4096)
         self.block_size_orig = self.config.block_size
         if self.config.dlm_paradigm == 'bidirectional':
         self.block_size = self.block_size_orig
         self.mode = self.config.dlm_paradigm
+        self._quadratic_block_mask = {}
         import torch._dynamo.config as dcfg
         dcfg.cache_size_limit = 512
+    def _get_sbd_inference_quadratic_decoding_block_mask(self, block_length: int):
+        if block_length not in self._quadratic_block_mask:
+            draft_len = block_length * (block_length + 1)
+            def quadratic(b, h, q_idx, kv_idx):
+                first_clean = torch.logical_and(
+                    kv_idx % (block_length + 1) == 0,
+                    kv_idx < draft_len,
+                )
+                first_clean = torch.logical_and(first_clean, q_idx >= kv_idx)
+                block_q = q_idx // (block_length + 1)
+                block_kv = kv_idx // (block_length + 1)
+                same_block = torch.logical_and(block_q == block_kv, q_idx < draft_len)
+                same_block_except_first = torch.logical_and(
+                    same_block,
+                    q_idx % (block_length + 1) != 0,
+                )
+                draft_part = torch.logical_or(first_clean, same_block_except_first)
+                clean_part = kv_idx >= draft_len
+                return torch.logical_or(draft_part, clean_part)
+            block_mask = create_block_mask(
+                quadratic,
+                B=None,
+                H=None,
+                Q_LEN=draft_len,
+                KV_LEN=draft_len + self.config.max_position_embeddings,
+                device="cuda",
+            )
+            self._quadratic_block_mask[block_length] = block_mask
+        return self._quadratic_block_mask[block_length]
     def set_attention_mode(self, mode, block_size=None):
         self.mode = mode
         self.block_size = block_size
+    def compute_block_mask(self, mode, q_len=None, block_size=None):
         def bidirectional_mask(b, h, q, kv):
             return (q >= kv) | (q < kv)
         def autoregressive_mask(b, h, q, kv):
             return (q >= kv)
+        def block_diff_mask(block_size, b, h, q_idx, kv_idx, n):
             x0_flag_q = (q_idx >= n)
             x0_flag_kv = (kv_idx >= n)
             attn_mask = autoregressive_mask
         elif mode == 'block_diff':
             assert block_size is not None
+            attn_mask = lambda b, h, q, kv: block_diff_mask(block_size, b, h, q, kv, self.max_seq_length)
         elif mode == 'sbd_block_diff':
             assert block_size is not None
+            attn_mask = lambda b, h, q, kv: sbd_block_diff_mask(block_size, b, h, q, kv, self.max_seq_length)
         else:
             raise ValueError(f"Unknown attention mode: {mode}")
+        if q_len is not None:
+            Q_LEN = q_len
+        else:
+            if mode in ['block_diff', 'sbd_block_diff']:
+                Q_LEN = self.max_seq_length * 2
+            else:
+                Q_LEN = self.max_seq_length
         block_mask = create_block_mask(
+            attn_mask, B=None, H=None, Q_LEN=Q_LEN, KV_LEN=Q_LEN
         )
         return block_mask
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        self_spec_inference_mode = getattr(self.config, "self_spec_inference_mode", None)
+        if self_spec_inference_mode is not None:
+            if self_spec_inference_mode == "quadratic":
+                block_length = getattr(self.config, "block_length", None) or getattr(self.config, "block_size", None)
+                if block_length is None:
+                    raise ValueError("SBD quadratic decoding requires block_length in config.")
+                if past_key_values is not None:
+                    seq_len = key_states.shape[2]
+                    draft_len = block_length * (block_length + 1)
+                    clean_keys = key_states[:, :, :-draft_len]
+                    draft_keys = key_states[:, :, -draft_len:]
+                    clean_values = value_states[:, :, :-draft_len]
+                    draft_values = value_states[:, :, -draft_len:]
+                    key_states = torch.cat([draft_keys, clean_keys], dim=2)
+                    value_states = torch.cat([draft_values, clean_values], dim=2)
+                    block_mask: BlockMask = self._get_sbd_inference_quadratic_decoding_block_mask(
+                        block_length=block_length
+                    )
+                    block_mask.seq_lengths = (draft_len, seq_len)
+                else:
+                    seq_len = query_states.shape[2]
+                    draft_len = block_length * (block_length + 1)
+                    clean_len = seq_len - draft_len
+                    def _causal_mask(b, h, q_idx, kv_idx):
+                        return torch.logical_and(q_idx >= kv_idx, q_idx < clean_len)
+                    def _draft2clean_mask(b, h, q_idx, kv_idx):
+                        full_clean = torch.logical_and(q_idx >= clean_len, kv_idx <= clean_len)
+                        first_clean = torch.logical_and(
+                            q_idx >= clean_len, (kv_idx - clean_len) % (block_length + 1) == 0
+                        )
+                        first_clean = torch.logical_and(first_clean, q_idx >= kv_idx)
+                        return torch.logical_or(full_clean, first_clean)
+                    def _draft_mask(b, h, q_idx, kv_idx):
+                        block_q = (q_idx - clean_len) // (block_length + 1)
+                        block_kv = (kv_idx - clean_len) // (block_length + 1)
+                        quadrant = torch.logical_and(q_idx >= clean_len, kv_idx >= clean_len)
+                        same_block = torch.logical_and(block_q == block_kv, quadrant)
+                        same_block_except_first = torch.logical_and(
+                            same_block,
+                            (q_idx - clean_len) % (block_length + 1) != 0,
+                        )
+                        return torch.logical_and(block_q == block_kv, same_block_except_first)
+                    mask = or_masks(_causal_mask, _draft2clean_mask)
+                    mask = or_masks(mask, _draft_mask)
+                    block_mask = create_block_mask(
+                        mask, B=None, H=None, Q_LEN=seq_len, KV_LEN=seq_len,
+                    )
+                key_states = repeat_kv(key_states, self.num_key_value_groups)
+                value_states = repeat_kv(value_states, self.num_key_value_groups)
+                attn_output = flex_attention(query_states, key_states, value_states, block_mask=block_mask)
+                attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+                attn_output = self.o_proj(attn_output)
+                return attn_output, None
+            elif self_spec_inference_mode == "default":
+                block_length = getattr(self.config, "block_length", None) or getattr(self.config, "block_size", None)
+                if block_length is None:
+                    raise ValueError("SBD default decoding requires block_length in config.")
+                seq_len = query_states.shape[2]
+                prefix_len = seq_len - block_length
+                def _clean_q_mask(b, h, q_idx, kv_idx):
+                    return torch.logical_and(q_idx >= kv_idx, q_idx < prefix_len)
+                def _noisy_q_mask(b, h, q_idx, kv_idx):
+                    return q_idx >= prefix_len
+                block_mask = create_block_mask(
+                    or_masks(_clean_q_mask, _noisy_q_mask),
+                    B=None,
+                    H=None,
+                    Q_LEN=seq_len,
+                    KV_LEN=seq_len,
+                )
+                key_states = repeat_kv(key_states, self.num_key_value_groups)
+                value_states = repeat_kv(value_states, self.num_key_value_groups)
+                attn_output = flex_attention(query_states, key_states, value_states, block_mask=block_mask)
+                attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+                attn_output = self.o_proj(attn_output)
+                return attn_output, None
+        else:
+            key_states = repeat_kv(key_states, self.num_key_value_groups)
+            value_states = repeat_kv(value_states, self.num_key_value_groups)
+            if self.mode == 'bidirectional':
+                if self.bidirectional_mask is None or q_len != self.bidirectional_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='bidirectional', q_len=q_len)
+                else:
+                    block_mask = self.bidirectional_mask
+            elif self.mode == 'autoregressive':
+                if self.autoregressive_mask is None or q_len != self.autoregressive_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='autoregressive', q_len=q_len)
+                else:
+                    block_mask = self.autoregressive_mask
+            elif self.mode == 'block_diff':
+                if self.block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.block_diff_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size, q_len=q_len)
+                else:
+                    block_mask = self.block_diff_mask
+            elif self.mode == 'sbd_block_diff':
+                if self.sbd_block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.sbd_block_diff_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='sbd_block_diff', block_size=self.block_size, q_len=q_len)
+                else:
+                    block_mask = self.sbd_block_diff_mask
             else:
+                raise ValueError(f"Unknown attention mode: {self.mode}")
+            attn_output = fused_flex_attention(query_states, key_states, value_states, block_mask=block_mask)
+            attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+            attn_output = self.o_proj(attn_output)
+            return attn_output, None
 def gumbel_topk(log_w: torch.Tensor, k: int) -> torch.Tensor:
         diffusion_config = copy.deepcopy(config)
         diffusion_config.diffusion_lm = True
+        use_flex = getattr(config, 'enable_self_spec', False)
         if config.dlm_paradigm in ['block_diff', 'sbd_block_diff']:
             diffusion_config.attn_class = MinistralFlexAttention
         elif config.dlm_paradigm in ['bidirectional', 'autoregressive']:
+            diffusion_config.attn_class = MinistralFlexAttention if use_flex else Ministral3Attention
             if config.dlm_paradigm == 'autoregressive':
                 diffusion_config.diffusion_lm = False
         else:
         )
+    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0, eos_token_id=None, max_thinking_tokens=None, end_think_token_id=None):
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, 'eos_token_id', None)
         out_ids, nfe = generate_with_prefix_cache_block_diff(
                         model=self,
                         prompt=prompt_ids,
                         shift_logits=shift_logits,
                         neg_entropy=False,
                         causal_context=causal_context,
+                        eos_token_id=eos_token_id,
+                        max_thinking_tokens=max_thinking_tokens,
+                        end_think_token_id=end_think_token_id,
                     )
         return out_ids, nfe
+    @torch.no_grad()
+    def sbd_inference_diffusion_quadratic(
+        self,
+        clean_input_ids: Optional[torch.Tensor],
+        draft_input_ids: torch.Tensor,
+        block_length: int,
+        draft_only: bool = False,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+    ):
+        enc_config = self.encoder.config
+        enc_config.use_sbd_objective = True
+        enc_config.block_length = block_length
+        if draft_only:
+            assert clean_input_ids is not None
+            if use_cache and past_key_values is None:
+                past_key_values = DynamicCache()
+            enc_config.self_spec_inference_mode = "default"
+            input_ids = torch.cat([clean_input_ids, draft_input_ids], dim=-1)
+            outputs = self.encoder(
+                input_ids=input_ids,
+                position_ids=None,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                is_training=False,
+            )
+            hidden_states = outputs.last_hidden_state
+            logits = self.diffusion_head(hidden_states)
+            past_key_values = getattr(outputs, "past_key_values", None)
+            if use_cache and past_key_values is not None:
+                _crop_dynamic_cache(past_key_values, clean_input_ids.shape[1])
+            return logits, past_key_values
+        else:
+            enc_config.self_spec_inference_mode = "quadratic"
+            draft_len = block_length * (block_length + 1)
+            draft_input_ids = torch.cat(
+                [
+                    draft_input_ids.view(-1, block_length, 1),
+                    torch.full(
+                        (draft_input_ids.shape[0], block_length, block_length),
+                        fill_value=self.config.mask_token_id,
+                        device=draft_input_ids.device,
+                    ),
+                ],
+                dim=-1,
+            ).view(-1, draft_len)
+            if use_cache:
+                assert past_key_values is not None, (
+                    "Past key values should be provided when using cache, e.g. run draft_only=True first."
+                )
+                assert clean_input_ids is None, (
+                    "Clean input ids should already be in cache, thus none should be provided."
+                )
+                clean_len = past_key_values.get_seq_length()
+                input_ids = draft_input_ids
+            else:
+                clean_len = clean_input_ids.shape[1]
+                input_ids = torch.cat([clean_input_ids, draft_input_ids], dim=-1)
+            per_block_position_ids = torch.arange(
+                clean_len, clean_len + block_length + 1, device=draft_input_ids.device
+            )[None,].repeat(block_length, 1)
+            per_block_position_ids += torch.arange(block_length, device=draft_input_ids.device).view(-1, 1)
+            if use_cache:
+                position_ids = per_block_position_ids.view(-1)[None,]
+            else:
+                clean_position_ids = torch.arange(clean_len, device=draft_input_ids.device)
+                position_ids = torch.cat([clean_position_ids, per_block_position_ids.view(-1)], dim=-1)[None,]
+            outputs = self.encoder(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                is_training=False,
+            )
+            hidden_states = outputs.last_hidden_state
+            logits = self.diffusion_head(hidden_states)
+            past_key_values = getattr(outputs, "past_key_values", None)
+            if use_cache and past_key_values is not None:
+                _extract_draft_kv_cache(past_key_values, clean_len, block_length)
+            return logits, past_key_values
+    @torch.no_grad()
+    def ar_generate(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        temperature: float = 0.0,
+        eos_token_id: Optional[int] = None,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+    ) -> tuple:
+        """Autoregressive generation calling the encoder directly (injected by build_hf_tidar_repo).
+        Bypasses MinistralDiffEncoderModel.forward() to avoid diffusion-specific
+        code paths. Calls self.encoder (Ministral3Model) with explicit cache_position,
+        position_ids, and use_cache so the KV cache and causal masking behave
+        identically to MistralForCausalLM / vLLM.
+        Returns:
+            (output_ids, nfe) where output_ids includes the prompt.
+        """
+        for layer in self.encoder.layers:
+            if hasattr(layer.self_attn, 'diffusion_lm'):
+                layer.self_attn.diffusion_lm = False
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, 'eos_token_id', None)
+        device = prompt_ids.device
+        batch_size, prompt_len = prompt_ids.shape
+        past_key_values = DynamicCache()
+        cache_position = torch.arange(prompt_len, device=device)
+        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)
+        enc_out = self.encoder(
+            input_ids=prompt_ids,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=True,
+            cache_position=cache_position,
+        )
+        past_key_values = enc_out.past_key_values
+        next_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        generated_tokens = []
+        nfe = 0
+        for step in range(max_new_tokens):
+            nfe += 1
+            if temperature > 0:
+                probs = torch.softmax(next_logit / temperature, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                next_token = torch.argmax(next_logit, dim=-1, keepdim=True)
+            # ---- thinking budget enforcement ----
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if step >= max_thinking_tokens:
+                    if generated_tokens:
+                        gen_tensor = torch.cat(generated_tokens, dim=1)
+                        has_end_think = (gen_tensor == end_think_token_id).any(dim=1)
+                    else:
+                        has_end_think = torch.zeros(batch_size, dtype=torch.bool, device=device)
+                    for b in range(batch_size):
+                        if not has_end_think[b]:
+                            next_token[b] = end_think_token_id
+            generated_tokens.append(next_token)
+            if eos_token_id is not None and (next_token == eos_token_id).all():
+                break
+            if step < max_new_tokens - 1:
+                cur_pos = prompt_len + step
+                step_cache_pos = torch.tensor([cur_pos], device=device)
+                step_pos_ids = step_cache_pos.unsqueeze(0).expand(batch_size, -1)
+                enc_out = self.encoder(
+                    input_ids=next_token,
+                    position_ids=step_pos_ids,
+                    past_key_values=past_key_values,
+                    use_cache=True,
+                    cache_position=step_cache_pos,
+                )
+                past_key_values = enc_out.past_key_values
+                next_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        all_generated = torch.cat(generated_tokens, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe
+    @torch.no_grad()
+    def self_spec_generate(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        steps: int = 128,
+        block_length: int = 16,
+        ar_mix_weight: Optional[float] = None,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+    ):
+        self.config.use_sbd_objective = True
+        self.config.dlm_paradigm = "sbd"
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("Self speculation quadratic decoding currently requires batch_size == 1")
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        x = torch.full(
+            (1, prompt_ids.shape[1] + max_new_tokens + block_length * 2),
+            token_mask_id,
+            dtype=torch.long,
+            device=prompt_ids.device,
+        )
+        x[:, : prompt_ids.shape[1]] = prompt_ids.clone()
+        if max_new_tokens % block_length != 0:
+            raise ValueError("max_new_tokens must be divisible by block_length")
+        num_blocks = max_new_tokens // block_length
+        if steps % num_blocks != 0:
+            raise ValueError("steps must be divisible by (max_new_tokens // block_length)")
+        prompt_len = prompt_ids.shape[1]
+        nfe = 0
+        nfe += 1
+        logits, past_key_values = self.sbd_inference_diffusion_quadratic(
+            clean_input_ids=x[:, :prompt_len],
+            draft_input_ids=x[:, prompt_len : prompt_len + block_length],
+            block_length=block_length,
+            draft_only=True,
+            use_cache=True,
+        )
+        logits_proposal = logits[:, prompt_len - 1 : prompt_len + block_length]
+        logits_proposal[:, 1] = logits_proposal[:, 0]
+        logits_proposal = logits_proposal[:, 1:]
+        x0_proposal = torch.argmax(logits_proposal, dim=-1)
+        x[:, prompt_len : prompt_len + block_length] = x0_proposal
+        total_accept_token = 0
+        while True:
+            nfe += 1
+            block_start = prompt_len + total_accept_token
+            block_end = block_start + block_length
+            draft_input_ids = x[:, block_start:block_end]
+            logits, past_key_values = self.sbd_inference_diffusion_quadratic(
+                clean_input_ids=None,
+                draft_input_ids=draft_input_ids,
+                block_length=block_length,
+                draft_only=False,
+                past_key_values=past_key_values,
+                use_cache=True,
+            )
+            useful_token_logits = logits.view(1, block_length, block_length + 1, -1)
+            if ar_mix_weight is None:
+                useful_token_logits[:, :, 1] = useful_token_logits[:, :, 0]
+            else:
+                if not (0.0 <= ar_mix_weight <= 1.0):
+                    raise ValueError("ar_mix_weight must be between 0 and 1")
+                mix_logits = useful_token_logits[:, :, 0] * ar_mix_weight + useful_token_logits[:, :, 1] * (1 - ar_mix_weight)
+                useful_token_logits[:, :, 0] = mix_logits
+                useful_token_logits[:, :, 1] = mix_logits
+            if temperature > 0:
+                useful_token_logits = useful_token_logits / temperature
+            useful_token_pred = torch.argmax(useful_token_logits, dim=-1)
+            new_draft_input_ids = useful_token_pred[:, 0, 1:]
+            accept_cnt = 1
+            while accept_cnt < block_length:
+                if useful_token_pred[:, accept_cnt - 1, 0].item() != draft_input_ids[:, accept_cnt].item():
+                    break
+                new_draft_input_ids = useful_token_pred[:, accept_cnt, 1:]
+                accept_cnt += 1
+            x[:, block_start : block_start + accept_cnt] = draft_input_ids[:, :accept_cnt]
+            # EoS early stopping: all accepted tokens are finalized left-to-right,
+            # so if any is EoS we can truncate and return immediately.
+            if eos_token_id is not None:
+                accepted = x[0, block_start : block_start + accept_cnt]
+                eos_positions = (accepted == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_positions) > 0:
+                    first_eos_rel = eos_positions[0].item()
+                    total_accept_token += first_eos_rel + 1
+                    output_end = prompt_len + total_accept_token
+                    return x[:, :output_end], nfe
+            x[:, block_start + accept_cnt : block_start + accept_cnt + block_length] = new_draft_input_ids
+            past_key_values.crop(block_start + accept_cnt)
+            # ---- thinking budget enforcement ----
+            # Insert end_think as the first token of the next draft block,
+            # shifting all subsequent tokens right by 1 (discarding the last).
+            # The first draft token is always accepted unconditionally, so
+            # end_think is guaranteed to be finalized in the next iteration
+            # without needing to re-encode or touch the KV cache.
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                tokens_so_far = total_accept_token + accept_cnt
+                if tokens_so_far > max_thinking_tokens:
+                    gen_so_far = x[0, prompt_len : prompt_len + tokens_so_far]
+                    has_end_think = (gen_so_far == end_think_token_id).any()
+                    if not has_end_think:
+                        insert_pos = block_start + accept_cnt
+                        x[0, insert_pos + 1:] = x[0, insert_pos:-1].clone()
+                        x[0, insert_pos] = end_think_token_id
+            total_accept_token += accept_cnt
+            if total_accept_token >= max_new_tokens:
+                break
+        return x[:, : -(block_length * 2)], nfe
+    @torch.no_grad()
+    def linear_spec_generate(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        block_length: int = 32,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+        threshold: float = 0.0,
+    ):
+        """Linear speculative decoding: diffusion draft + AR verification.
+        Each step:
+          1. Draft: forward [last_accepted, mask, ...] with bidirectional attention
+             (diffusion_lm=True, use_cache=False).  Shift AR logits to get
+             per-position predictions; apply confidence filtering.
+          2. Verify: forward the drafted block with causal attention
+             (diffusion_lm=False, use_cache=True, use_causal_mask=True).
+             Accept consecutive AR-matching tokens plus one bonus token.
+        Args:
+            prompt_ids: Input token IDs of shape (1, prompt_len).
+            max_new_tokens: Maximum number of tokens to generate.
+            block_length: Number of tokens per draft/verify block.
+            temperature: Sampling temperature (0 = greedy).
+            mask_token_id: Override for config.mask_token_id.
+            eos_token_id: Override for config.eos_token_id.
+            max_thinking_tokens: Budget for thinking tokens before forcing end_think.
+            end_think_token_id: Token ID inserted when thinking budget is exceeded.
+            threshold: Confidence threshold for accepting draft predictions.
+        Returns:
+            (output_ids, nfe): output_ids includes the prompt; nfe is the number
+            of forward evaluations (matching self_spec_generate interface).
+        """
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("Linear speculative decoding requires batch_size == 1")
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        device = prompt_ids.device
+        prompt_len = prompt_ids.shape[1]
+        dream_style = getattr(self.config, 'dlm_type', 'llada') == 'dream'
+        def _set_diffusion_lm(val: bool):
+            for layer in self.encoder.layers:
+                if hasattr(layer.self_attn, 'diffusion_lm'):
+                    layer.self_attn.diffusion_lm = val
+        # ===== Prefill (causal) =====
+        _set_diffusion_lm(False)
+        enc_out = self.encoder(
+            input_ids=prompt_ids,
+            past_key_values=DynamicCache(),
+            use_cache=True,
+            use_causal_mask=True,
+        )
+        past_key_values = enc_out.past_key_values
+        last_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        nfe = 1
+        if temperature > 0:
+            probs = torch.softmax(last_logit / temperature, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
+        if eos_token_id is not None and next_token.item() == eos_token_id:
+            output_ids = torch.cat([prompt_ids, next_token], dim=1)
+            return output_ids, nfe
+        generated = [next_token]
+        total_gen = 1
+        # ===== Main loop =====
+        while total_gen < max_new_tokens:
+            cache_len = past_key_values.get_seq_length()
+            block = torch.full(
+                (1, block_length), token_mask_id, dtype=torch.long, device=device
+            )
+            block[0, 0] = next_token.item()
+            # -------- Draft (bidirectional, don't update cache) --------
+            _set_diffusion_lm(True)
+            while True:
+                is_mask = block == token_mask_id
+                if not is_mask.any():
+                    break
+                enc_out = self.encoder(
+                    input_ids=block,
+                    past_key_values=past_key_values,
+                    use_cache=False,
+                )
+                nfe += 1
+                draft_logits = self.diffusion_head(enc_out.last_hidden_state)
+                if dream_style:
+                    # DREAM: logit[i] predicts position i+1 → shift to self-prediction
+                    draft_logits = torch.cat(
+                        [draft_logits[:, :1, :], draft_logits[:, :-1, :]], dim=1
+                    )
+                # LLaDA: logit[i] already predicts position i → no shift needed
+                if temperature > 0:
+                    draft_probs = torch.softmax(draft_logits / temperature, dim=-1)
+                    draft_tokens = torch.multinomial(
+                        draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1
+                    ).view(1, block_length)
+                else:
+                    draft_tokens = draft_logits.argmax(dim=-1)
+                    draft_probs = torch.softmax(draft_logits, dim=-1)
+                if threshold > 0:
+                    draft_conf = torch.gather(
+                        draft_probs, -1, draft_tokens.unsqueeze(-1)
+                    ).squeeze(-1)
+                    draft_conf = torch.where(is_mask, draft_conf, -torch.inf)
+                    unmask = draft_conf >= threshold
+                    # Ensure each iteration makes progress even when every masked
+                    # position falls below the confidence threshold.
+                    if not unmask.any():
+                        best_idx = draft_conf.view(-1).argmax()
+                        unmask = torch.zeros_like(is_mask, dtype=torch.bool)
+                        unmask.view(-1)[best_idx] = True
+                    block[unmask] = draft_tokens[unmask]
+                else:
+                    block[is_mask] = draft_tokens[is_mask]
+                    break
+            # -------- Verify (causal, update cache) --------
+            _set_diffusion_lm(False)
+            enc_out = self.encoder(
+                input_ids=block,
+                past_key_values=past_key_values,
+                use_cache=True,
+                use_causal_mask=True,
+            )
+            past_key_values = enc_out.past_key_values
+            nfe += 1
+            verify_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                verify_probs = torch.softmax(verify_logits / temperature, dim=-1)
+                ar_tokens = torch.multinomial(
+                    verify_probs.view(-1, verify_probs.shape[-1]), num_samples=1
+                ).view(1, block_length)
+            else:
+                ar_tokens = verify_logits.argmax(dim=-1)
+            accepted = 0
+            for i in range(block_length - 1):
+                if ar_tokens[0, i].item() == block[0, i + 1].item():
+                    accepted += 1
+                else:
+                    break
+            accepted += 1  # bonus token from AR verification
+            accepted_toks = ar_tokens[:, :accepted]
+            generated.append(accepted_toks)
+            total_gen += accepted
+            _crop_dynamic_cache(past_key_values, cache_len + accepted)
+            next_token = ar_tokens[:, accepted - 1 : accepted]
+            # -------- EOS check --------
+            if eos_token_id is not None:
+                eos_pos = (accepted_toks[0] == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_pos) > 0:
+                    first_eos = eos_pos[0].item()
+                    generated[-1] = accepted_toks[:, : first_eos + 1]
+                    total_gen = total_gen - accepted + first_eos + 1
+                    break
+            # -------- Thinking budget enforcement --------
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if total_gen > max_thinking_tokens:
+                    all_gen = torch.cat(generated, dim=1)
+                    if not (all_gen == end_think_token_id).any():
+                        next_token = torch.tensor(
+                            [[end_think_token_id]], device=device
+                        )
+            if total_gen >= max_new_tokens:
+                break
+        all_generated = torch.cat(generated, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe
+    @torch.no_grad()
+    def linear_spec_generate_mp(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 512,
+        block_length: int = 32,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        max_paths: int = 16,
+        uncertain_threshold: float = 0.7,
+        top_k_candidates: int = 2,
+        threshold: float = 0.0,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+    ):
+        """Linear speculative decoding with multi-path tree verification.
+        Self-contained method — no external file dependencies beyond the model itself.
+        Each iteration costs 2 NFE (1 draft + 1 verify):
+          1. Draft: single-step bidirectional diffusion fills a block of masks.
+          2. Verify: tree-structured AR verification with multiple candidate paths.
+        Multi-path verification identifies low-confidence draft positions and
+        explores top-k alternative tokens. All candidate paths share a trie
+        prefix and are verified in one forward pass via a 4D tree-ancestry
+        attention mask (~40 tokens), picking the path with the longest
+        accepted prefix.
+        Benchmark results (NeMo Skills prompt, enable_thinking=False):
+          GSM8K bl=32: +17.1% UW-TPF vs vanilla (acc 93.9%)
+          MBPP  bl=64: +17.8% UW-TPF vs vanilla (pass@1 78.2%)
+        Args:
+            prompt_ids: (1, prompt_len) input token IDs.
+            max_new_tokens: Maximum tokens to generate.
+            block_length: Draft block size. Use 32 for math, 64 for code.
+            temperature: Sampling temperature (0.0 = greedy).
+            eos_token_id: Stop token ID.
+            max_paths: Tree verification budget. 16 = up to 4 uncertain
+                positions x 2 candidates each.
+            uncertain_threshold: Confidence below which a position is
+                considered uncertain and expanded with alternatives.
+            top_k_candidates: Number of alternative tokens to try at each
+                uncertain position.
+        Returns:
+            output_ids: (1, prompt_len + generated_len) full sequence.
+            nfe: Total number of forward evaluations.
+        """
+        from itertools import product as _product
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("Requires batch_size == 1")
+        device = prompt_ids.device
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        def _set_dlm(val: bool):
+            for layer in self.encoder.layers:
+                if hasattr(layer.self_attn, 'diffusion_lm'):
+                    layer.self_attn.diffusion_lm = val
+        def _crop_cache(kv, length):
+            for li in range(len(kv)):
+                kv.key_cache[li] = kv.key_cache[li][:, :, :length]
+                kv.value_cache[li] = kv.value_cache[li][:, :, :length]
+            kv._seen_tokens = length
+        # ----- tree verify helpers (inlined) -----
+        def _mp_verify(block, draft_probs, draft_conf, past_kv, cache_len):
+            """Multi-path verify via batch-stacking (flash-attention compatible).
+            Unlike tree attention (4D mask), batch-stacking expands the KV cache
+            batch dimension and runs all candidate paths as separate batch entries.
+            This keeps flash attention + GQA enabled, avoiding OOM from the 4D
+            mask path which disables both.
+            Returns (accepted_toks, n_accepted, past_kv, next_tok) or None.
+            """
+            bl = block.shape[1]
+            # Identify uncertain positions
+            is_filled = block[0] != token_mask_id
+            pos_conf = torch.zeros(bl, device=device)
+            pos_conf[0] = float('inf')
+            for p in range(1, bl):
+                if is_filled[p]:
+                    c = draft_conf[0, p].item()
+                    pos_conf[p] = c if c != float('-inf') else float('inf')
+                else:
+                    pos_conf[p] = float('-inf')
+            unc_mask = (pos_conf < uncertain_threshold) & (pos_conf > float('-inf'))
+            unc_pos = unc_mask.nonzero(as_tuple=True)[0].tolist()
+            if not unc_pos:
+                return None
+            import math as _math
+            max_unc = min(len(unc_pos), max(1, int(_math.log2(max_paths))))
+            unc_pos = sorted(unc_pos)[:max_unc]
+            # Build candidate blocks
+            topk_at = {}
+            for p in unc_pos:
+                _, ids = draft_probs[0, p].topk(top_k_candidates)
+                topk_at[p] = ids.tolist()
+            combos = list(_product(*(topk_at[p] for p in sorted(topk_at))))[:max_paths]
+            num_paths = len(combos)
+            if num_paths <= 1:
+                return None
+            candidate_blocks = block.expand(num_paths, -1).clone()
+            pos_list = sorted(topk_at.keys())
+            for pi, combo in enumerate(combos):
+                for ci, p in enumerate(pos_list):
+                    candidate_blocks[pi, p] = combo[ci]
+            # Expand KV cache batch dimension (shared, no copy)
+            for li in range(len(past_kv.key_cache)):
+                past_kv.key_cache[li] = past_kv.key_cache[li].expand(num_paths, -1, -1, -1)
+                past_kv.value_cache[li] = past_kv.value_cache[li].expand(num_paths, -1, -1, -1)
+            # Batched causal verify — uses flash attention + GQA
+            _set_dlm(False)
+            enc_out = self.encoder(
+                input_ids=candidate_blocks,
+                past_key_values=past_kv,
+                use_cache=True,
+                use_causal_mask=True,
+            )
+            past_kv = enc_out.past_key_values
+            vlogits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                vp = torch.softmax(vlogits / temperature, dim=-1)
+                ar_tokens = torch.multinomial(vp.view(-1, vp.shape[-1]), 1).view(num_paths, bl)
+            else:
+                ar_tokens = vlogits.argmax(dim=-1)
+            # Find best path (longest accepted prefix)
+            best_acc, best_pidx = 0, 0
+            for pi in range(num_paths):
+                acc = 0
+                for i in range(bl - 1):
+                    if ar_tokens[pi, i].item() == candidate_blocks[pi, i + 1].item():
+                        acc += 1
+                    else:
+                        break
+                acc += 1
+                if acc > best_acc:
+                    best_acc, best_pidx = acc, pi
+            accepted_toks = ar_tokens[best_pidx:best_pidx+1, :best_acc]
+            # Extract winning path's KV cache slice
+            for li in range(len(past_kv.key_cache)):
+                past_kv.key_cache[li] = past_kv.key_cache[li][best_pidx:best_pidx+1].contiguous()
+                past_kv.value_cache[li] = past_kv.value_cache[li][best_pidx:best_pidx+1].contiguous()
+            _crop_cache(past_kv, cache_len + best_acc)
+            return accepted_toks, best_acc, past_kv, accepted_toks[:, -1:]
+        # ── Prefill (causal) ──
+        _set_dlm(False)
+        enc_out = self.encoder(
+            input_ids=prompt_ids, past_key_values=DynamicCache(),
+            use_cache=True, use_causal_mask=True,
+        )
+        past_key_values = enc_out.past_key_values
+        last_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        nfe = 1
+        if temperature > 0:
+            next_token = torch.multinomial(torch.softmax(last_logit / temperature, dim=-1), 1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
+        if eos_token_id is not None and next_token.item() == eos_token_id:
+            return torch.cat([prompt_ids, next_token], dim=1), nfe
+        generated = [next_token]
+        total_gen = 1
+        # ── Main draft-verify loop ──
+        while total_gen < max_new_tokens:
+            cache_len = past_key_values.get_seq_length()
+            block = torch.full((1, block_length), token_mask_id, dtype=torch.long, device=device)
+            block[0, 0] = next_token.item()
+            # Draft: single-step bidirectional diffusion (1 NFE)
+            _set_dlm(True)
+            enc_out = self.encoder(input_ids=block, past_key_values=past_key_values, use_cache=False)
+            nfe += 1
+            draft_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                draft_probs = torch.softmax(draft_logits / temperature, dim=-1)
+                draft_tokens = torch.multinomial(
+                    draft_probs.view(-1, draft_probs.shape[-1]), 1
+                ).view(1, block_length)
+            else:
+                draft_tokens = draft_logits.argmax(dim=-1)
+                draft_probs = torch.softmax(draft_logits, dim=-1)
+            draft_conf = torch.gather(draft_probs, -1, draft_tokens.unsqueeze(-1)).squeeze(-1)
+            is_mask = block == token_mask_id
+            draft_conf = torch.where(is_mask, draft_conf, -torch.inf)
+            block[is_mask] = draft_tokens[is_mask]
+            # Verify: multi-path batch-stacking (1 NFE, flash-attention compatible)
+            result = _mp_verify(block, draft_probs, draft_conf, past_key_values, cache_len)
+            if result is not None:
+                accepted_toks, accepted, past_key_values, next_token = result
+                nfe += 1
+            else:
+                # No uncertain positions — single-path causal verify
+                _set_dlm(False)
+                enc_out = self.encoder(
+                    input_ids=block, past_key_values=past_key_values,
+                    use_cache=True, use_causal_mask=True,
+                )
+                past_key_values = enc_out.past_key_values
+                nfe += 1
+                vlogits = self.diffusion_head(enc_out.last_hidden_state)
+                if temperature > 0:
+                    vp = torch.softmax(vlogits / temperature, dim=-1)
+                    ar_tokens = torch.multinomial(vp.view(-1, vp.shape[-1]), 1).view(1, block_length)
+                else:
+                    ar_tokens = vlogits.argmax(dim=-1)
+                accepted = 0
+                for i in range(block_length - 1):
+                    if ar_tokens[0, i].item() == block[0, i + 1].item():
+                        accepted += 1
+                    else:
+                        break
+                accepted += 1
+                accepted_toks = ar_tokens[:, :accepted]
+                _crop_cache(past_key_values, cache_len + accepted)
+                next_token = ar_tokens[:, accepted - 1 : accepted]
+            generated.append(accepted_toks)
+            total_gen += accepted
+            if eos_token_id is not None:
+                eos_pos = (accepted_toks[0] == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_pos) > 0:
+                    first_eos = eos_pos[0].item()
+                    generated[-1] = accepted_toks[:, :first_eos + 1]
+                    total_gen = total_gen - accepted + first_eos + 1
+                    break
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if total_gen > max_thinking_tokens:
+                    all_gen = torch.cat(generated, dim=1)
+                    if not (all_gen == end_think_token_id).any():
+                        next_token = torch.tensor(
+                            [[end_think_token_id]], device=device
+                        )
+            if total_gen >= max_new_tokens:
+                break
+        all_generated = torch.cat(generated, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe
+    @torch.no_grad()
+    def linear_spec_generate_lora(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        block_length: int = 32,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        threshold: float = 0.0,
+        rebuild_kv: str = 'none',
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+    ):
+        """Linear speculative decoding: diffusion draft + AR verify.
+        LoRA adapter toggling: ON for draft (bidirectional), OFF for verify (causal).
+        Returns (output_ids, nfe).
+        """
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("linear_spec_generate requires batch_size == 1")
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        device = prompt_ids.device
+        dream_style = getattr(self.config, 'dlm_type', 'llada') == 'dream'
+        def _set_diffusion_lm(val: bool):
+            for layer in self.encoder.layers:
+                if hasattr(layer.self_attn, 'diffusion_lm'):
+                    layer.self_attn.diffusion_lm = val
+        def _toggle_adapters(model, enable: bool):
+            for module in model.modules():
+                if hasattr(module, '_disable_adapters'):
+                    module._disable_adapters = not enable
+        # Prefill (causal, LoRA OFF)
+        _set_diffusion_lm(False)
+        _toggle_adapters(self, False)
+        enc_out = self.encoder(
+            input_ids=prompt_ids,
+            past_key_values=DynamicCache(),
+            use_cache=True,
+            use_causal_mask=True,
+        )
+        past_key_values = enc_out.past_key_values
+        last_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        nfe = 1
+        if temperature > 0:
+            next_token = torch.multinomial(torch.softmax(last_logit / temperature, dim=-1), num_samples=1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
+        if eos_token_id is not None and next_token.item() == eos_token_id:
+            return torch.cat([prompt_ids, next_token], dim=1), nfe
+        generated = [next_token]
+        total_gen = 1
+        while total_gen < max_new_tokens:
+            cache_len = past_key_values.get_seq_length()
+            block = torch.full((1, block_length), token_mask_id, dtype=torch.long, device=device)
+            block[0, 0] = next_token.item()
+            # Draft (bidirectional, LoRA ON)
+            _set_diffusion_lm(True)
+            _toggle_adapters(self, True)
+            enc_out = self.encoder(input_ids=block, past_key_values=past_key_values, use_cache=False)
+            nfe += 1
+            draft_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if dream_style:
+                draft_logits = torch.cat([draft_logits[:, :1, :], draft_logits[:, :-1, :]], dim=1)
+            if temperature > 0:
+                draft_probs = torch.softmax(draft_logits / temperature, dim=-1)
+                draft_tokens = torch.multinomial(draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1).view(1, block_length)
+            else:
+                draft_tokens = draft_logits.argmax(dim=-1)
+                draft_probs = torch.softmax(draft_logits, dim=-1)
+            draft_conf = torch.gather(draft_probs, -1, draft_tokens.unsqueeze(-1)).squeeze(-1)
+            is_mask = block == token_mask_id
+            draft_conf = torch.where(is_mask, draft_conf, -torch.inf)
+            unmask = draft_conf > threshold
+            if unmask.sum() > 0:
+                block[unmask] = draft_tokens[unmask]
+            # Verify (causal, LoRA OFF)
+            _set_diffusion_lm(False)
+            _toggle_adapters(self, False)
+            enc_out = self.encoder(input_ids=block, past_key_values=past_key_values, use_cache=True, use_causal_mask=True)
+            past_key_values = enc_out.past_key_values
+            nfe += 1
+            verify_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                ar_tokens = torch.multinomial(torch.softmax(verify_logits / temperature, dim=-1).view(-1, verify_logits.shape[-1]), num_samples=1).view(1, block_length)
+            else:
+                ar_tokens = verify_logits.argmax(dim=-1)
+            accepted = 0
+            for i in range(block_length - 1):
+                if ar_tokens[0, i].item() == block[0, i + 1].item():
+                    accepted += 1
+                else:
+                    break
+            accepted += 1  # bonus token
+            accepted_toks = ar_tokens[:, :accepted]
+            generated.append(accepted_toks)
+            total_gen += accepted
+            _crop_dynamic_cache(past_key_values, cache_len + accepted)
+            next_token = ar_tokens[:, accepted - 1 : accepted]
+            # EOS check
+            if eos_token_id is not None:
+                eos_pos = (accepted_toks[0] == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_pos) > 0:
+                    first_eos = eos_pos[0].item()
+                    generated[-1] = accepted_toks[:, : first_eos + 1]
+                    total_gen = total_gen - accepted + first_eos + 1
+                    break
+            # Thinking budget enforcement
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if total_gen > max_thinking_tokens:
+                    all_gen = torch.cat(generated, dim=1)
+                    if not (all_gen == end_think_token_id).any():
+                        next_token = torch.tensor([[end_think_token_id]], device=device)
+            if total_gen >= max_new_tokens:
+                break
+        all_generated = torch.cat(generated, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe