Instructions to use nvidia/Nemotron-Labs-Diffusion-3B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use nvidia/Nemotron-Labs-Diffusion-3B with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="nvidia/Nemotron-Labs-Diffusion-3B", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("nvidia/Nemotron-Labs-Diffusion-3B", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use nvidia/Nemotron-Labs-Diffusion-3B with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "nvidia/Nemotron-Labs-Diffusion-3B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-3B

SGLang

How to use nvidia/Nemotron-Labs-Diffusion-3B with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "nvidia/Nemotron-Labs-Diffusion-3B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Nemotron-Labs-Diffusion-3B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Nemotron-Labs-Diffusion-3B with Docker Model Runner:
```
docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-3B
```

YongganFu committed 24 days ago

Commit

1c8770d

verified ·

1 Parent(s): 80ab10a

Upload model

Browse files

Files changed (2) hide show

chat_utils.py +49 -3
modeling_ministral_dlm.py +659 -2

chat_utils.py CHANGED Viewed

@@ -114,10 +114,12 @@ def generate_with_prefix_cache_block_diff(
     neg_entropy=False,
     causal_context=False,
     eos_token_id=None,
 ):
     dream_style=shift_logits
-    # Initialize the accumulator
     x_accum = prompt.clone()
     assert gen_length % block_length == 0
     num_blocks = gen_length // block_length
@@ -142,30 +144,66 @@ def generate_with_prefix_cache_block_diff(
             if hasattr(layer.self_attn, 'diffusion_lm'):
                 layer.self_attn.diffusion_lm=True
     # For dream_style: store the "next token logit" of the context
     next_logits_context = None
     if dream_style:
         next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
     for num_block in range(num_blocks):
-        # Create a new block with mask tokens (no seeding)
         mask_block = torch.ones(
             (prompt.shape[0], block_length),
             dtype=prompt.dtype,
             device=prompt.device
         ) * mask_id
         # Append the block of masks
         x_accum = torch.cat([x_accum, mask_block], dim=1)
         current_block_start = prompt.size(1) + num_block * block_length
         block_slice = slice(current_block_start, current_block_start + block_length)
         # Build the initial mask for this block
         mask_block_idx0 = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
         # Precompute the transfer schedule for this block
         if dream_style:
-            # still denoise *all* positions (0..Lb-1), since none are seeded
             schedule_mask = mask_block_idx0
         else:
             schedule_mask = mask_block_idx0
@@ -245,11 +283,19 @@ def generate_with_prefix_cache_block_diff(
             use_causal_mask=causal_context
         )
         past_key_values = output.past_key_values
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
                     layer.self_attn.diffusion_lm=True
         if dream_style and num_block < num_blocks - 1:
             # refresh context-next logit for the next block

     neg_entropy=False,
     causal_context=False,
     eos_token_id=None,
+    max_thinking_tokens=None,
+    end_think_token_id=None,
 ):
     dream_style=shift_logits
     x_accum = prompt.clone()
+    B = prompt.shape[0]
     assert gen_length % block_length == 0
     num_blocks = gen_length // block_length
             if hasattr(layer.self_attn, 'diffusion_lm'):
                 layer.self_attn.diffusion_lm=True
+    # Causal prefill: next token from last position (same as linear_spec_generate).
+    next_token = None
+    if causal_context:
+        last_logit = output.logits[:, -1, :]
+        if temperature > 0:
+            probs = torch.softmax(last_logit / temperature, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
     # For dream_style: store the "next token logit" of the context
     next_logits_context = None
     if dream_style:
         next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
     for num_block in range(num_blocks):
+        # Create a new block with mask tokens; under causal context, seed position 0
+        # with the next-token prediction from the previous causal forward (prefill or
+        # post-block encode), matching linear_spec_generate.
         mask_block = torch.ones(
             (prompt.shape[0], block_length),
             dtype=prompt.dtype,
             device=prompt.device
         ) * mask_id
+        if causal_context:
+            mask_block[:, 0] = next_token[:, 0]
         # Append the block of masks
         x_accum = torch.cat([x_accum, mask_block], dim=1)
         current_block_start = prompt.size(1) + num_block * block_length
         block_slice = slice(current_block_start, current_block_start + block_length)
+        # ---- thinking budget enforcement ----
+        # If we've generated >= max_thinking_tokens without a </think>, inject one.
+        if end_think_token_id is not None and max_thinking_tokens is not None:
+            tokens_before_block = num_block * block_length
+            tokens_after_block = tokens_before_block + block_length
+            if tokens_after_block > max_thinking_tokens:
+                gen_so_far = x_accum[:, prompt.size(1):current_block_start]
+                has_end_think = (
+                    (gen_so_far == end_think_token_id).any(dim=1)
+                    if gen_so_far.size(1) > 0
+                    else torch.zeros(B, dtype=torch.bool, device=prompt.device)
+                )
+                if not has_end_think.all():
+                    if tokens_before_block < max_thinking_tokens:
+                        offset = max_thinking_tokens - tokens_before_block
+                    else:
+                        offset = 0
+                    inject_pos = current_block_start + offset
+                    for b in range(B):
+                        if not has_end_think[b]:
+                            x_accum[b, inject_pos] = end_think_token_id
         # Build the initial mask for this block
         mask_block_idx0 = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
         # Precompute the transfer schedule for this block
         if dream_style:
+            # masked positions only (position 0 may be causal-seeded, not mask_id)
             schedule_mask = mask_block_idx0
         else:
             schedule_mask = mask_block_idx0
             use_causal_mask=causal_context
         )
         past_key_values = output.past_key_values
+        nfe += 1
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
                     layer.self_attn.diffusion_lm=True
+            # Next block's first position = greedy/sampled next token from this causal encode
+            last_logit = output.logits[:, -1, :]
+            if temperature > 0:
+                probs = torch.softmax(last_logit / temperature, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
         if dream_style and num_block < num_blocks - 1:
             # refresh context-next logit for the next block

modeling_ministral_dlm.py CHANGED Viewed

@@ -31,6 +31,7 @@ from .chat_utils import generate_with_prefix_cache_block_diff
 from .modeling_ministral import Ministral3Model, Ministral3PreTrainedModel, Ministral3Attention, apply_rotary_pos_emb, repeat_kv, _get_llama_4_attn_scale
 from .configuration_ministral_dlm import MinistralDLMConfig
 @dataclass
 class MinistralDiffOutputWithPast(ModelOutput):
@@ -871,7 +872,7 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
         )
-    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0, eos_token_id=None):
         if eos_token_id is None:
             eos_token_id = getattr(self.config, 'eos_token_id', None)
@@ -889,6 +890,8 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
                         neg_entropy=False,
                         causal_context=causal_context,
                         eos_token_id=eos_token_id,
                     )
         return out_ids, nfe
@@ -997,6 +1000,8 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
         max_new_tokens: int = 128,
         temperature: float = 0.0,
         eos_token_id: Optional[int] = None,
     ) -> tuple:
         """Autoregressive generation calling the encoder directly (injected by build_hf_tidar_repo).
@@ -1044,6 +1049,18 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
             else:
                 next_token = torch.argmax(next_logit, dim=-1, keepdim=True)
             generated_tokens.append(next_token)
             if eos_token_id is not None and (next_token == eos_token_id).all():
@@ -1080,6 +1097,8 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
         temperature: float = 0.0,
         mask_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = None,
     ):
         self.config.use_sbd_objective = True
         self.config.dlm_paradigm = "sbd"
@@ -1176,6 +1195,23 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
             x[:, block_start + accept_cnt : block_start + accept_cnt + block_length] = new_draft_input_ids
             past_key_values.crop(block_start + accept_cnt)
             total_accept_token += accept_cnt
             if total_accept_token >= max_new_tokens:
@@ -1184,4 +1220,625 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
         return x[:, : -(block_length * 2)], nfe
-__all__ = ["MinistralDiffEncoderModel", "MinistralFlexAttention"]

 from .modeling_ministral import Ministral3Model, Ministral3PreTrainedModel, Ministral3Attention, apply_rotary_pos_emb, repeat_kv, _get_llama_4_attn_scale
 from .configuration_ministral_dlm import MinistralDLMConfig
+__all__ = ["MinistralDiffEncoderModel", "MinistralFlexAttention"]
 @dataclass
 class MinistralDiffOutputWithPast(ModelOutput):
         )
+    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0, eos_token_id=None, max_thinking_tokens=None, end_think_token_id=None):
         if eos_token_id is None:
             eos_token_id = getattr(self.config, 'eos_token_id', None)
                         neg_entropy=False,
                         causal_context=causal_context,
                         eos_token_id=eos_token_id,
+                        max_thinking_tokens=max_thinking_tokens,
+                        end_think_token_id=end_think_token_id,
                     )
         return out_ids, nfe
         max_new_tokens: int = 128,
         temperature: float = 0.0,
         eos_token_id: Optional[int] = None,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
     ) -> tuple:
         """Autoregressive generation calling the encoder directly (injected by build_hf_tidar_repo).
             else:
                 next_token = torch.argmax(next_logit, dim=-1, keepdim=True)
+            # ---- thinking budget enforcement ----
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if step >= max_thinking_tokens:
+                    if generated_tokens:
+                        gen_tensor = torch.cat(generated_tokens, dim=1)
+                        has_end_think = (gen_tensor == end_think_token_id).any(dim=1)
+                    else:
+                        has_end_think = torch.zeros(batch_size, dtype=torch.bool, device=device)
+                    for b in range(batch_size):
+                        if not has_end_think[b]:
+                            next_token[b] = end_think_token_id
             generated_tokens.append(next_token)
             if eos_token_id is not None and (next_token == eos_token_id).all():
         temperature: float = 0.0,
         mask_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = None,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
     ):
         self.config.use_sbd_objective = True
         self.config.dlm_paradigm = "sbd"
             x[:, block_start + accept_cnt : block_start + accept_cnt + block_length] = new_draft_input_ids
             past_key_values.crop(block_start + accept_cnt)
+            # ---- thinking budget enforcement ----
+            # Insert end_think as the first token of the next draft block,
+            # shifting all subsequent tokens right by 1 (discarding the last).
+            # The first draft token is always accepted unconditionally, so
+            # end_think is guaranteed to be finalized in the next iteration
+            # without needing to re-encode or touch the KV cache.
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                tokens_so_far = total_accept_token + accept_cnt
+                if tokens_so_far > max_thinking_tokens:
+                    gen_so_far = x[0, prompt_len : prompt_len + tokens_so_far]
+                    has_end_think = (gen_so_far == end_think_token_id).any()
+                    if not has_end_think:
+                        insert_pos = block_start + accept_cnt
+                        x[0, insert_pos + 1:] = x[0, insert_pos:-1].clone()
+                        x[0, insert_pos] = end_think_token_id
             total_accept_token += accept_cnt
             if total_accept_token >= max_new_tokens:
         return x[:, : -(block_length * 2)], nfe
+    @torch.no_grad()
+    def linear_spec_generate(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        block_length: int = 32,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+        threshold: float = 0.0,
+    ):
+        """Linear speculative decoding: diffusion draft + AR verification.
+        Each step:
+          1. Draft: forward [last_accepted, mask, ...] with bidirectional attention
+             (diffusion_lm=True, use_cache=False).  Shift AR logits to get
+             per-position predictions; apply confidence filtering.
+          2. Verify: forward the drafted block with causal attention
+             (diffusion_lm=False, use_cache=True, use_causal_mask=True).
+             Accept consecutive AR-matching tokens plus one bonus token.
+        Args:
+            prompt_ids: Input token IDs of shape (1, prompt_len).
+            max_new_tokens: Maximum number of tokens to generate.
+            block_length: Number of tokens per draft/verify block.
+            temperature: Sampling temperature (0 = greedy).
+            mask_token_id: Override for config.mask_token_id.
+            eos_token_id: Override for config.eos_token_id.
+            max_thinking_tokens: Budget for thinking tokens before forcing end_think.
+            end_think_token_id: Token ID inserted when thinking budget is exceeded.
+            threshold: Confidence threshold for accepting draft predictions.
+        Returns:
+            (output_ids, nfe): output_ids includes the prompt; nfe is the number
+            of forward evaluations (matching self_spec_generate interface).
+        """
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("Linear speculative decoding requires batch_size == 1")
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        device = prompt_ids.device
+        prompt_len = prompt_ids.shape[1]
+        dream_style = getattr(self.config, 'dlm_type', 'llada') == 'dream'
+        def _set_diffusion_lm(val: bool):
+            for layer in self.encoder.layers:
+                if hasattr(layer.self_attn, 'diffusion_lm'):
+                    layer.self_attn.diffusion_lm = val
+        # ===== Prefill (causal) =====
+        _set_diffusion_lm(False)
+        enc_out = self.encoder(
+            input_ids=prompt_ids,
+            past_key_values=DynamicCache(),
+            use_cache=True,
+            use_causal_mask=True,
+        )
+        past_key_values = enc_out.past_key_values
+        last_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        nfe = 1
+        if temperature > 0:
+            probs = torch.softmax(last_logit / temperature, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
+        if eos_token_id is not None and next_token.item() == eos_token_id:
+            output_ids = torch.cat([prompt_ids, next_token], dim=1)
+            return output_ids, nfe
+        generated = [next_token]
+        total_gen = 1
+        # ===== Main loop =====
+        while total_gen < max_new_tokens:
+            cache_len = past_key_values.get_seq_length()
+            block = torch.full(
+                (1, block_length), token_mask_id, dtype=torch.long, device=device
+            )
+            block[0, 0] = next_token.item()
+            # -------- Draft (bidirectional, don't update cache) --------
+            _set_diffusion_lm(True)
+            while True:
+                is_mask = block == token_mask_id
+                if not is_mask.any():
+                    break
+                enc_out = self.encoder(
+                    input_ids=block,
+                    past_key_values=past_key_values,
+                    use_cache=False,
+                )
+                nfe += 1
+                draft_logits = self.diffusion_head(enc_out.last_hidden_state)
+                if dream_style:
+                    # DREAM: logit[i] predicts position i+1 → shift to self-prediction
+                    draft_logits = torch.cat(
+                        [draft_logits[:, :1, :], draft_logits[:, :-1, :]], dim=1
+                    )
+                # LLaDA: logit[i] already predicts position i → no shift needed
+                if temperature > 0:
+                    draft_probs = torch.softmax(draft_logits / temperature, dim=-1)
+                    draft_tokens = torch.multinomial(
+                        draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1
+                    ).view(1, block_length)
+                else:
+                    draft_tokens = draft_logits.argmax(dim=-1)
+                    draft_probs = torch.softmax(draft_logits, dim=-1)
+                if threshold > 0:
+                    draft_conf = torch.gather(
+                        draft_probs, -1, draft_tokens.unsqueeze(-1)
+                    ).squeeze(-1)
+                    draft_conf = torch.where(is_mask, draft_conf, -torch.inf)
+                    unmask = draft_conf >= threshold
+                    # Ensure each iteration makes progress even when every masked
+                    # position falls below the confidence threshold.
+                    if not unmask.any():
+                        best_idx = draft_conf.view(-1).argmax()
+                        unmask = torch.zeros_like(is_mask, dtype=torch.bool)
+                        unmask.view(-1)[best_idx] = True
+                    block[unmask] = draft_tokens[unmask]
+                else:
+                    block[is_mask] = draft_tokens[is_mask]
+                    break
+            # -------- Verify (causal, update cache) --------
+            _set_diffusion_lm(False)
+            enc_out = self.encoder(
+                input_ids=block,
+                past_key_values=past_key_values,
+                use_cache=True,
+                use_causal_mask=True,
+            )
+            past_key_values = enc_out.past_key_values
+            nfe += 1
+            verify_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                verify_probs = torch.softmax(verify_logits / temperature, dim=-1)
+                ar_tokens = torch.multinomial(
+                    verify_probs.view(-1, verify_probs.shape[-1]), num_samples=1
+                ).view(1, block_length)
+            else:
+                ar_tokens = verify_logits.argmax(dim=-1)
+            accepted = 0
+            for i in range(block_length - 1):
+                if ar_tokens[0, i].item() == block[0, i + 1].item():
+                    accepted += 1
+                else:
+                    break
+            accepted += 1  # bonus token from AR verification
+            accepted_toks = ar_tokens[:, :accepted]
+            generated.append(accepted_toks)
+            total_gen += accepted
+            _crop_dynamic_cache(past_key_values, cache_len + accepted)
+            next_token = ar_tokens[:, accepted - 1 : accepted]
+            # -------- EOS check --------
+            if eos_token_id is not None:
+                eos_pos = (accepted_toks[0] == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_pos) > 0:
+                    first_eos = eos_pos[0].item()
+                    generated[-1] = accepted_toks[:, : first_eos + 1]
+                    total_gen = total_gen - accepted + first_eos + 1
+                    break
+            # -------- Thinking budget enforcement --------
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if total_gen > max_thinking_tokens:
+                    all_gen = torch.cat(generated, dim=1)
+                    if not (all_gen == end_think_token_id).any():
+                        next_token = torch.tensor(
+                            [[end_think_token_id]], device=device
+                        )
+            if total_gen >= max_new_tokens:
+                break
+        all_generated = torch.cat(generated, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe
+    @torch.no_grad()
+    def linear_spec_generate_mp(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 512,
+        block_length: int = 32,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        max_paths: int = 16,
+        uncertain_threshold: float = 0.7,
+        top_k_candidates: int = 2,
+        threshold: float = 0.0,
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+    ):
+        """Linear speculative decoding with multi-path tree verification.
+        Self-contained method — no external file dependencies beyond the model itself.
+        Each iteration costs 2 NFE (1 draft + 1 verify):
+          1. Draft: single-step bidirectional diffusion fills a block of masks.
+          2. Verify: tree-structured AR verification with multiple candidate paths.
+        Multi-path verification identifies low-confidence draft positions and
+        explores top-k alternative tokens. All candidate paths share a trie
+        prefix and are verified in one forward pass via a 4D tree-ancestry
+        attention mask (~40 tokens), picking the path with the longest
+        accepted prefix.
+        Benchmark results (NeMo Skills prompt, enable_thinking=False):
+          GSM8K bl=32: +17.1% UW-TPF vs vanilla (acc 93.9%)
+          MBPP  bl=64: +17.8% UW-TPF vs vanilla (pass@1 78.2%)
+        Args:
+            prompt_ids: (1, prompt_len) input token IDs.
+            max_new_tokens: Maximum tokens to generate.
+            block_length: Draft block size. Use 32 for math, 64 for code.
+            temperature: Sampling temperature (0.0 = greedy).
+            eos_token_id: Stop token ID.
+            max_paths: Tree verification budget. 16 = up to 4 uncertain
+                positions x 2 candidates each.
+            uncertain_threshold: Confidence below which a position is
+                considered uncertain and expanded with alternatives.
+            top_k_candidates: Number of alternative tokens to try at each
+                uncertain position.
+        Returns:
+            output_ids: (1, prompt_len + generated_len) full sequence.
+            nfe: Total number of forward evaluations.
+        """
+        from itertools import product as _product
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("Requires batch_size == 1")
+        device = prompt_ids.device
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        def _set_dlm(val: bool):
+            for layer in self.encoder.layers:
+                if hasattr(layer.self_attn, 'diffusion_lm'):
+                    layer.self_attn.diffusion_lm = val
+        def _crop_cache(kv, length):
+            for li in range(len(kv)):
+                kv.key_cache[li] = kv.key_cache[li][:, :, :length]
+                kv.value_cache[li] = kv.value_cache[li][:, :, :length]
+            kv._seen_tokens = length
+        # ----- tree verify helpers (inlined) -----
+        def _mp_verify(block, draft_probs, draft_conf, past_kv, cache_len):
+            """Multi-path verify via batch-stacking (flash-attention compatible).
+            Unlike tree attention (4D mask), batch-stacking expands the KV cache
+            batch dimension and runs all candidate paths as separate batch entries.
+            This keeps flash attention + GQA enabled, avoiding OOM from the 4D
+            mask path which disables both.
+            Returns (accepted_toks, n_accepted, past_kv, next_tok) or None.
+            """
+            bl = block.shape[1]
+            # Identify uncertain positions
+            is_filled = block[0] != token_mask_id
+            pos_conf = torch.zeros(bl, device=device)
+            pos_conf[0] = float('inf')
+            for p in range(1, bl):
+                if is_filled[p]:
+                    c = draft_conf[0, p].item()
+                    pos_conf[p] = c if c != float('-inf') else float('inf')
+                else:
+                    pos_conf[p] = float('-inf')
+            unc_mask = (pos_conf < uncertain_threshold) & (pos_conf > float('-inf'))
+            unc_pos = unc_mask.nonzero(as_tuple=True)[0].tolist()
+            if not unc_pos:
+                return None
+            import math as _math
+            max_unc = min(len(unc_pos), max(1, int(_math.log2(max_paths))))
+            unc_pos = sorted(unc_pos)[:max_unc]
+            # Build candidate blocks
+            topk_at = {}
+            for p in unc_pos:
+                _, ids = draft_probs[0, p].topk(top_k_candidates)
+                topk_at[p] = ids.tolist()
+            combos = list(_product(*(topk_at[p] for p in sorted(topk_at))))[:max_paths]
+            num_paths = len(combos)
+            if num_paths <= 1:
+                return None
+            candidate_blocks = block.expand(num_paths, -1).clone()
+            pos_list = sorted(topk_at.keys())
+            for pi, combo in enumerate(combos):
+                for ci, p in enumerate(pos_list):
+                    candidate_blocks[pi, p] = combo[ci]
+            # Expand KV cache batch dimension (shared, no copy)
+            for li in range(len(past_kv.key_cache)):
+                past_kv.key_cache[li] = past_kv.key_cache[li].expand(num_paths, -1, -1, -1)
+                past_kv.value_cache[li] = past_kv.value_cache[li].expand(num_paths, -1, -1, -1)
+            # Batched causal verify — uses flash attention + GQA
+            _set_dlm(False)
+            enc_out = self.encoder(
+                input_ids=candidate_blocks,
+                past_key_values=past_kv,
+                use_cache=True,
+                use_causal_mask=True,
+            )
+            past_kv = enc_out.past_key_values
+            vlogits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                vp = torch.softmax(vlogits / temperature, dim=-1)
+                ar_tokens = torch.multinomial(vp.view(-1, vp.shape[-1]), 1).view(num_paths, bl)
+            else:
+                ar_tokens = vlogits.argmax(dim=-1)
+            # Find best path (longest accepted prefix)
+            best_acc, best_pidx = 0, 0
+            for pi in range(num_paths):
+                acc = 0
+                for i in range(bl - 1):
+                    if ar_tokens[pi, i].item() == candidate_blocks[pi, i + 1].item():
+                        acc += 1
+                    else:
+                        break
+                acc += 1
+                if acc > best_acc:
+                    best_acc, best_pidx = acc, pi
+            accepted_toks = ar_tokens[best_pidx:best_pidx+1, :best_acc]
+            # Extract winning path's KV cache slice
+            for li in range(len(past_kv.key_cache)):
+                past_kv.key_cache[li] = past_kv.key_cache[li][best_pidx:best_pidx+1].contiguous()
+                past_kv.value_cache[li] = past_kv.value_cache[li][best_pidx:best_pidx+1].contiguous()
+            _crop_cache(past_kv, cache_len + best_acc)
+            return accepted_toks, best_acc, past_kv, accepted_toks[:, -1:]
+        # ── Prefill (causal) ──
+        _set_dlm(False)
+        enc_out = self.encoder(
+            input_ids=prompt_ids, past_key_values=DynamicCache(),
+            use_cache=True, use_causal_mask=True,
+        )
+        past_key_values = enc_out.past_key_values
+        last_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        nfe = 1
+        if temperature > 0:
+            next_token = torch.multinomial(torch.softmax(last_logit / temperature, dim=-1), 1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
+        if eos_token_id is not None and next_token.item() == eos_token_id:
+            return torch.cat([prompt_ids, next_token], dim=1), nfe
+        generated = [next_token]
+        total_gen = 1
+        # ── Main draft-verify loop ──
+        while total_gen < max_new_tokens:
+            cache_len = past_key_values.get_seq_length()
+            block = torch.full((1, block_length), token_mask_id, dtype=torch.long, device=device)
+            block[0, 0] = next_token.item()
+            # Draft: single-step bidirectional diffusion (1 NFE)
+            _set_dlm(True)
+            enc_out = self.encoder(input_ids=block, past_key_values=past_key_values, use_cache=False)
+            nfe += 1
+            draft_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                draft_probs = torch.softmax(draft_logits / temperature, dim=-1)
+                draft_tokens = torch.multinomial(
+                    draft_probs.view(-1, draft_probs.shape[-1]), 1
+                ).view(1, block_length)
+            else:
+                draft_tokens = draft_logits.argmax(dim=-1)
+                draft_probs = torch.softmax(draft_logits, dim=-1)
+            draft_conf = torch.gather(draft_probs, -1, draft_tokens.unsqueeze(-1)).squeeze(-1)
+            is_mask = block == token_mask_id
+            draft_conf = torch.where(is_mask, draft_conf, -torch.inf)
+            block[is_mask] = draft_tokens[is_mask]
+            # Verify: multi-path batch-stacking (1 NFE, flash-attention compatible)
+            result = _mp_verify(block, draft_probs, draft_conf, past_key_values, cache_len)
+            if result is not None:
+                accepted_toks, accepted, past_key_values, next_token = result
+                nfe += 1
+            else:
+                # No uncertain positions — single-path causal verify
+                _set_dlm(False)
+                enc_out = self.encoder(
+                    input_ids=block, past_key_values=past_key_values,
+                    use_cache=True, use_causal_mask=True,
+                )
+                past_key_values = enc_out.past_key_values
+                nfe += 1
+                vlogits = self.diffusion_head(enc_out.last_hidden_state)
+                if temperature > 0:
+                    vp = torch.softmax(vlogits / temperature, dim=-1)
+                    ar_tokens = torch.multinomial(vp.view(-1, vp.shape[-1]), 1).view(1, block_length)
+                else:
+                    ar_tokens = vlogits.argmax(dim=-1)
+                accepted = 0
+                for i in range(block_length - 1):
+                    if ar_tokens[0, i].item() == block[0, i + 1].item():
+                        accepted += 1
+                    else:
+                        break
+                accepted += 1
+                accepted_toks = ar_tokens[:, :accepted]
+                _crop_cache(past_key_values, cache_len + accepted)
+                next_token = ar_tokens[:, accepted - 1 : accepted]
+            generated.append(accepted_toks)
+            total_gen += accepted
+            if eos_token_id is not None:
+                eos_pos = (accepted_toks[0] == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_pos) > 0:
+                    first_eos = eos_pos[0].item()
+                    generated[-1] = accepted_toks[:, :first_eos + 1]
+                    total_gen = total_gen - accepted + first_eos + 1
+                    break
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if total_gen > max_thinking_tokens:
+                    all_gen = torch.cat(generated, dim=1)
+                    if not (all_gen == end_think_token_id).any():
+                        next_token = torch.tensor(
+                            [[end_think_token_id]], device=device
+                        )
+            if total_gen >= max_new_tokens:
+                break
+        all_generated = torch.cat(generated, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe
+    @torch.no_grad()
+    def linear_spec_generate_lora(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        block_length: int = 32,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        threshold: float = 0.0,
+        rebuild_kv: str = 'none',
+        max_thinking_tokens: Optional[int] = None,
+        end_think_token_id: Optional[int] = None,
+    ):
+        """Linear speculative decoding: diffusion draft + AR verify.
+        LoRA adapter toggling: ON for draft (bidirectional), OFF for verify (causal).
+        Returns (output_ids, nfe).
+        """
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("linear_spec_generate requires batch_size == 1")
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        device = prompt_ids.device
+        dream_style = getattr(self.config, 'dlm_type', 'llada') == 'dream'
+        def _set_diffusion_lm(val: bool):
+            for layer in self.encoder.layers:
+                if hasattr(layer.self_attn, 'diffusion_lm'):
+                    layer.self_attn.diffusion_lm = val
+        def _toggle_adapters(model, enable: bool):
+            for module in model.modules():
+                if hasattr(module, '_disable_adapters'):
+                    module._disable_adapters = not enable
+        # Prefill (causal, LoRA OFF)
+        _set_diffusion_lm(False)
+        _toggle_adapters(self, False)
+        enc_out = self.encoder(
+            input_ids=prompt_ids,
+            past_key_values=DynamicCache(),
+            use_cache=True,
+            use_causal_mask=True,
+        )
+        past_key_values = enc_out.past_key_values
+        last_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        nfe = 1
+        if temperature > 0:
+            next_token = torch.multinomial(torch.softmax(last_logit / temperature, dim=-1), num_samples=1)
+        else:
+            next_token = torch.argmax(last_logit, dim=-1, keepdim=True)
+        if eos_token_id is not None and next_token.item() == eos_token_id:
+            return torch.cat([prompt_ids, next_token], dim=1), nfe
+        generated = [next_token]
+        total_gen = 1
+        while total_gen < max_new_tokens:
+            cache_len = past_key_values.get_seq_length()
+            block = torch.full((1, block_length), token_mask_id, dtype=torch.long, device=device)
+            block[0, 0] = next_token.item()
+            # Draft (bidirectional, LoRA ON)
+            _set_diffusion_lm(True)
+            _toggle_adapters(self, True)
+            enc_out = self.encoder(input_ids=block, past_key_values=past_key_values, use_cache=False)
+            nfe += 1
+            draft_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if dream_style:
+                draft_logits = torch.cat([draft_logits[:, :1, :], draft_logits[:, :-1, :]], dim=1)
+            if temperature > 0:
+                draft_probs = torch.softmax(draft_logits / temperature, dim=-1)
+                draft_tokens = torch.multinomial(draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1).view(1, block_length)
+            else:
+                draft_tokens = draft_logits.argmax(dim=-1)
+                draft_probs = torch.softmax(draft_logits, dim=-1)
+            draft_conf = torch.gather(draft_probs, -1, draft_tokens.unsqueeze(-1)).squeeze(-1)
+            is_mask = block == token_mask_id
+            draft_conf = torch.where(is_mask, draft_conf, -torch.inf)
+            unmask = draft_conf > threshold
+            if unmask.sum() > 0:
+                block[unmask] = draft_tokens[unmask]
+            # Verify (causal, LoRA OFF)
+            _set_diffusion_lm(False)
+            _toggle_adapters(self, False)
+            enc_out = self.encoder(input_ids=block, past_key_values=past_key_values, use_cache=True, use_causal_mask=True)
+            past_key_values = enc_out.past_key_values
+            nfe += 1
+            verify_logits = self.diffusion_head(enc_out.last_hidden_state)
+            if temperature > 0:
+                ar_tokens = torch.multinomial(torch.softmax(verify_logits / temperature, dim=-1).view(-1, verify_logits.shape[-1]), num_samples=1).view(1, block_length)
+            else:
+                ar_tokens = verify_logits.argmax(dim=-1)
+            accepted = 0
+            for i in range(block_length - 1):
+                if ar_tokens[0, i].item() == block[0, i + 1].item():
+                    accepted += 1
+                else:
+                    break
+            accepted += 1  # bonus token
+            accepted_toks = ar_tokens[:, :accepted]
+            generated.append(accepted_toks)
+            total_gen += accepted
+            _crop_dynamic_cache(past_key_values, cache_len + accepted)
+            next_token = ar_tokens[:, accepted - 1 : accepted]
+            # EOS check
+            if eos_token_id is not None:
+                eos_pos = (accepted_toks[0] == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_pos) > 0:
+                    first_eos = eos_pos[0].item()
+                    generated[-1] = accepted_toks[:, : first_eos + 1]
+                    total_gen = total_gen - accepted + first_eos + 1
+                    break
+            # Thinking budget enforcement
+            if end_think_token_id is not None and max_thinking_tokens is not None:
+                if total_gen > max_thinking_tokens:
+                    all_gen = torch.cat(generated, dim=1)
+                    if not (all_gen == end_think_token_id).any():
+                        next_token = torch.tensor([[end_think_token_id]], device=device)
+            if total_gen >= max_new_tokens:
+                break
+        all_generated = torch.cat(generated, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe