Instructions to use nvidia/Nemotron-Labs-Diffusion-3B-Base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use nvidia/Nemotron-Labs-Diffusion-3B-Base with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="nvidia/Nemotron-Labs-Diffusion-3B-Base", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("nvidia/Nemotron-Labs-Diffusion-3B-Base", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use nvidia/Nemotron-Labs-Diffusion-3B-Base with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "nvidia/Nemotron-Labs-Diffusion-3B-Base"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B-Base",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-3B-Base

SGLang

How to use nvidia/Nemotron-Labs-Diffusion-3B-Base with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "nvidia/Nemotron-Labs-Diffusion-3B-Base" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B-Base",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Nemotron-Labs-Diffusion-3B-Base" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B-Base",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Nemotron-Labs-Diffusion-3B-Base with Docker Model Runner:
```
docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-3B-Base
```

YongganFu committed on Mar 3

Commit

36a1c93

verified ·

1 Parent(s): a11afdb

Upload model

Browse files

Files changed (2) hide show

chat_utils.py +20 -0
modeling_ministral_dlm.py +407 -29

chat_utils.py CHANGED Viewed

@@ -113,6 +113,7 @@ def generate_with_prefix_cache_block_diff(
     shift_logits=False,
     neg_entropy=False,
     causal_context=False,
 ):
     dream_style=shift_logits
     # Initialize the accumulator
@@ -221,6 +222,16 @@ def generate_with_prefix_cache_block_diff(
                 cur[transfer_idx] = x0[transfer_idx]
                 x_accum[:, block_slice] = cur
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
@@ -244,4 +255,13 @@ def generate_with_prefix_cache_block_diff(
             # refresh context-next logit for the next block
             next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
     return x_accum, nfe

     shift_logits=False,
     neg_entropy=False,
     causal_context=False,
+    eos_token_id=None,
 ):
     dream_style=shift_logits
     # Initialize the accumulator
                 cur[transfer_idx] = x0[transfer_idx]
                 x_accum[:, block_slice] = cur
+            if eos_token_id is not None:
+                block_tokens = x_accum[:, block_slice]              # (B, Lb)
+                eos_mask = (block_tokens == eos_token_id)           # (B, Lb)
+                any_eos = eos_mask.any(dim=1)                       # (B,)
+                if any_eos.any():
+                    after_eos = eos_mask.cumsum(dim=1).bool()       # (B, Lb)
+                    mask_before = (block_tokens == mask_id) & ~after_eos
+                    if (any_eos & ~mask_before.any(dim=1)).any():
+                        break
         if causal_context:
             for layer in model_module.encoder.layers:
                 if hasattr(layer.self_attn, 'diffusion_lm'):
             # refresh context-next logit for the next block
             next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)
+        if eos_token_id is not None:
+            gen_so_far = x_accum[:, prompt.size(1):]                    # (B, gen_len_so_far)
+            is_eos = (gen_so_far == eos_token_id)                       # (B, gen_len_so_far)
+            has_eos = is_eos.any(dim=1)                                 # (B,)
+            if has_eos.all():
+                first_eos_pos = is_eos.to(torch.int64).argmax(dim=1)    # (B,)
+                max_eos = first_eos_pos.max().item()
+                return x_accum[:, : prompt.size(1) + max_eos + 1], nfe
     return x_accum, nfe

modeling_ministral_dlm.py CHANGED Viewed

@@ -13,7 +13,7 @@ from torch import nn
 from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutput
 from transformers.utils import ModelOutput
-from torch.nn.attention.flex_attention import flex_attention, create_block_mask
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
@@ -49,6 +49,43 @@ class MinistralDiffOutputWithPast(ModelOutput):
 def fused_flex_attention(q, k, v, block_mask=None):
     return flex_attention(q, k, v, block_mask=block_mask)
 # with reference to https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
 class MinistralFlexAttention(Ministral3Attention):
     def __init__(self, *args, **kwargs):
@@ -69,11 +106,47 @@ class MinistralFlexAttention(Ministral3Attention):
         self.block_size = self.block_size_orig
         self.mode = self.config.dlm_paradigm
         import torch._dynamo.config as dcfg
         dcfg.cache_size_limit = 512
     def set_attention_mode(self, mode, block_size=None):
         self.mode = mode
         self.block_size = block_size
@@ -225,40 +298,131 @@ class MinistralFlexAttention(Ministral3Attention):
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        if self.mode == 'bidirectional':
-            if self.bidirectional_mask is None or q_len != self.bidirectional_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='bidirectional', q_len=q_len)
-            else:
-                block_mask = self.bidirectional_mask
-        elif self.mode == 'autoregressive':
-            if self.autoregressive_mask is None or q_len != self.autoregressive_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='autoregressive', q_len=q_len)
-            else:
-                block_mask = self.autoregressive_mask
-        elif self.mode == 'block_diff':
-            if self.block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.block_diff_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size, q_len=q_len)
-            else:
-                block_mask = self.block_diff_mask
-        elif self.mode == 'sbd_block_diff':
-            if self.sbd_block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.sbd_block_diff_mask.shape[-2]:
-                block_mask = self.compute_block_mask(mode='sbd_block_diff', block_size=self.block_size, q_len=q_len)
             else:
-                block_mask = self.sbd_block_diff_mask
-        else:
-            raise ValueError(f"Unknown attention mode: {self.mode}")
-        attn_output = fused_flex_attention(query_states, key_states, value_states, block_mask=block_mask)
-        attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-        return attn_output, None
 def gumbel_topk(log_w: torch.Tensor, k: int) -> torch.Tensor:
@@ -713,7 +877,7 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
         )
-    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0):
         out_ids, nfe = generate_with_prefix_cache_block_diff(
                         model=self,
                         prompt=prompt_ids,
@@ -727,8 +891,222 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
                         shift_logits=shift_logits,
                         neg_entropy=False,
                         causal_context=causal_context,
                     )
         return out_ids, nfe
 __all__ = ["MinistralDiffEncoderModel", "MinistralFlexAttention"]

 from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutput
 from transformers.utils import ModelOutput
+from torch.nn.attention.flex_attention import BlockMask, flex_attention, create_block_mask, or_masks
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 def fused_flex_attention(q, k, v, block_mask=None):
     return flex_attention(q, k, v, block_mask=block_mask)
+def _crop_dynamic_cache(past_key_values: DynamicCache, max_length: int):
+    """Crop a DynamicCache to max_length, compatible with both old and new transformers.
+
+    The cache object is modified in place; nothing is returned.
+
+    Args:
+        past_key_values: Cache to truncate.
+        max_length: Number of leading entries to keep along dim 2 of every
+            cached key/value tensor (presumably the sequence axis -- confirm).
+    """
+    # Use the cache's own `crop` method whenever the object exposes one.
+    if hasattr(past_key_values, 'crop'):
+        past_key_values.crop(max_length)
+    else:
+        # Fallback for cache objects without `crop`: slice each layer's
+        # key/value tensor by hand through `key_cache` / `value_cache`.
+        for layer_idx in range(len(past_key_values)):
+            past_key_values.key_cache[layer_idx] = past_key_values.key_cache[layer_idx][:, :, :max_length]
+            past_key_values.value_cache[layer_idx] = past_key_values.value_cache[layer_idx][:, :, :max_length]
+        # Keep the cache's token counter in sync with the truncated tensors.
+        past_key_values._seen_tokens = max_length
+def _extract_draft_kv_cache(past_key_values: DynamicCache, clean_len: int, block_length: int):
+    """After quadratic decoding, extract only draft tokens (first of each block) from cache.
+
+    For every layer the cached keys/values are rebuilt in place as the first
+    ``clean_len`` entries along dim 2 followed by every ``(block_length + 1)``-th
+    entry starting at index ``clean_len``. Nothing is returned.
+
+    Args:
+        past_key_values: Cache to rewrite. Both the ``layers`` layout
+            (``.keys`` / ``.values`` per layer) and the ``key_cache`` /
+            ``value_cache`` list layout are handled.
+        clean_len: Number of leading cache entries that are kept unchanged.
+        block_length: Draft block length; the stride used is ``block_length + 1``.
+    """
+    for layer_idx in range(len(past_key_values)):
+        # Read this layer's tensors from whichever cache layout is present.
+        if hasattr(past_key_values, 'layers'):
+            layer_cache = past_key_values.layers[layer_idx]
+            k, v = layer_cache.keys, layer_cache.values
+        else:
+            k = past_key_values.key_cache[layer_idx]
+            v = past_key_values.value_cache[layer_idx]
+        # Strided slice: keep only the first entry of each (block_length + 1)-sized group.
+        clean_k, draft_k = k[:, :, :clean_len], k[:, :, clean_len::block_length + 1]
+        clean_v, draft_v = v[:, :, :clean_len], v[:, :, clean_len::block_length + 1]
+        new_k = torch.cat([clean_k, draft_k], dim=2)
+        new_v = torch.cat([clean_v, draft_v], dim=2)
+        # Write the compacted tensors back using the same layout they were read from.
+        if hasattr(past_key_values, 'layers'):
+            layer_cache.keys = new_k
+            layer_cache.values = new_v
+        else:
+            past_key_values.key_cache[layer_idx] = new_k
+            past_key_values.value_cache[layer_idx] = new_v
+    # NOTE(review): the counter is set for both layouts; whether the `layers`
+    # layout reads `_seen_tokens` is not visible here -- confirm.
+    past_key_values._seen_tokens = clean_len + block_length
 # with reference to https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
 class MinistralFlexAttention(Ministral3Attention):
     def __init__(self, *args, **kwargs):
         self.block_size = self.block_size_orig
         self.mode = self.config.dlm_paradigm
+        self._quadratic_block_mask = {}
         import torch._dynamo.config as dcfg
         dcfg.cache_size_limit = 512
+    def _get_sbd_inference_quadratic_decoding_block_mask(self, block_length: int):
+        """Return the block mask for quadratic decoding, built once per ``block_length``.
+
+        Masks are memoized in ``self._quadratic_block_mask`` keyed by
+        ``block_length``. The mask covers ``block_length * (block_length + 1)``
+        query positions and that many key positions plus
+        ``self.config.max_position_embeddings`` further key positions.
+
+        Args:
+            block_length: Draft block length; positions are grouped in runs of
+                ``block_length + 1``.
+
+        Returns:
+            The (possibly cached) object produced by ``create_block_mask``.
+        """
+        if block_length not in self._quadratic_block_mask:
+            draft_len = block_length * (block_length + 1)
+            def quadratic(b, h, q_idx, kv_idx):
+                # Keys that are the first slot of a group inside the draft
+                # region are visible to queries at or after them.
+                first_clean = torch.logical_and(
+                    kv_idx % (block_length + 1) == 0,
+                    kv_idx < draft_len,
+                )
+                first_clean = torch.logical_and(first_clean, q_idx >= kv_idx)
+                block_q = q_idx // (block_length + 1)
+                block_kv = kv_idx // (block_length + 1)
+                # Queries other than the first slot of a group also see every
+                # key that belongs to their own group.
+                same_block = torch.logical_and(block_q == block_kv, q_idx < draft_len)
+                same_block_except_first = torch.logical_and(
+                    same_block,
+                    q_idx % (block_length + 1) != 0,
+                )
+                draft_part = torch.logical_or(first_clean, same_block_except_first)
+                # Every key at index >= draft_len is visible to all queries.
+                clean_part = kv_idx >= draft_len
+                return torch.logical_or(draft_part, clean_part)
+            # NOTE(review): the device is hard-coded to "cuda" here -- confirm
+            # this path is never reached on other devices.
+            block_mask = create_block_mask(
+                quadratic,
+                B=None,
+                H=None,
+                Q_LEN=draft_len,
+                KV_LEN=draft_len + self.config.max_position_embeddings,
+                device="cuda",
+            )
+            self._quadratic_block_mask[block_length] = block_mask
+        return self._quadratic_block_mask[block_length]
     def set_attention_mode(self, mode, block_size=None):
         self.mode = mode
         self.block_size = block_size
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        tidar_inference_mode = getattr(self.config, "tidar_inference_mode", None)
+        if tidar_inference_mode is not None:
+            if tidar_inference_mode == "quadratic":
+                block_length = getattr(self.config, "block_length", None) or getattr(self.config, "block_size", None)
+                if block_length is None:
+                    raise ValueError("SBD quadratic decoding requires block_length in config.")
+                if past_key_values is not None:
+                    seq_len = key_states.shape[2]
+                    draft_len = block_length * (block_length + 1)
+                    clean_keys = key_states[:, :, :-draft_len]
+                    draft_keys = key_states[:, :, -draft_len:]
+                    clean_values = value_states[:, :, :-draft_len]
+                    draft_values = value_states[:, :, -draft_len:]
+                    key_states = torch.cat([draft_keys, clean_keys], dim=2)
+                    value_states = torch.cat([draft_values, clean_values], dim=2)
+                    block_mask: BlockMask = self._get_sbd_inference_quadratic_decoding_block_mask(
+                        block_length=block_length
+                    )
+                    block_mask.seq_lengths = (draft_len, seq_len)
+                else:
+                    seq_len = query_states.shape[2]
+                    draft_len = block_length * (block_length + 1)
+                    clean_len = seq_len - draft_len
+                    def _causal_mask(b, h, q_idx, kv_idx):
+                        return torch.logical_and(q_idx >= kv_idx, q_idx < clean_len)
+                    def _draft2clean_mask(b, h, q_idx, kv_idx):
+                        full_clean = torch.logical_and(q_idx >= clean_len, kv_idx <= clean_len)
+                        first_clean = torch.logical_and(
+                            q_idx >= clean_len, (kv_idx - clean_len) % (block_length + 1) == 0
+                        )
+                        first_clean = torch.logical_and(first_clean, q_idx >= kv_idx)
+                        return torch.logical_or(full_clean, first_clean)
+                    def _draft_mask(b, h, q_idx, kv_idx):
+                        block_q = (q_idx - clean_len) // (block_length + 1)
+                        block_kv = (kv_idx - clean_len) // (block_length + 1)
+                        quadrant = torch.logical_and(q_idx >= clean_len, kv_idx >= clean_len)
+                        same_block = torch.logical_and(block_q == block_kv, quadrant)
+                        same_block_except_first = torch.logical_and(
+                            same_block,
+                            (q_idx - clean_len) % (block_length + 1) != 0,
+                        )
+                        return torch.logical_and(block_q == block_kv, same_block_except_first)
+                    mask = or_masks(_causal_mask, _draft2clean_mask)
+                    mask = or_masks(mask, _draft_mask)
+                    block_mask = create_block_mask(
+                        mask, B=None, H=None, Q_LEN=seq_len, KV_LEN=seq_len,
+                    )
+                key_states = repeat_kv(key_states, self.num_key_value_groups)
+                value_states = repeat_kv(value_states, self.num_key_value_groups)
+                attn_output = flex_attention(query_states, key_states, value_states, block_mask=block_mask)
+                attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+                attn_output = self.o_proj(attn_output)
+                return attn_output, None
+            elif tidar_inference_mode == "default":
+                block_length = getattr(self.config, "block_length", None) or getattr(self.config, "block_size", None)
+                if block_length is None:
+                    raise ValueError("SBD default decoding requires block_length in config.")
+                seq_len = query_states.shape[2]
+                prefix_len = seq_len - block_length
+                def _clean_q_mask(b, h, q_idx, kv_idx):
+                    return torch.logical_and(q_idx >= kv_idx, q_idx < prefix_len)
+                def _noisy_q_mask(b, h, q_idx, kv_idx):
+                    return q_idx >= prefix_len
+                block_mask = create_block_mask(
+                    or_masks(_clean_q_mask, _noisy_q_mask),
+                    B=None,
+                    H=None,
+                    Q_LEN=seq_len,
+                    KV_LEN=seq_len,
+                )
+                key_states = repeat_kv(key_states, self.num_key_value_groups)
+                value_states = repeat_kv(value_states, self.num_key_value_groups)
+                attn_output = flex_attention(query_states, key_states, value_states, block_mask=block_mask)
+                attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+                attn_output = self.o_proj(attn_output)
+                return attn_output, None
+        else:
+            key_states = repeat_kv(key_states, self.num_key_value_groups)
+            value_states = repeat_kv(value_states, self.num_key_value_groups)
+            if self.mode == 'bidirectional':
+                if self.bidirectional_mask is None or q_len != self.bidirectional_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='bidirectional', q_len=q_len)
+                else:
+                    block_mask = self.bidirectional_mask
+            elif self.mode == 'autoregressive':
+                if self.autoregressive_mask is None or q_len != self.autoregressive_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='autoregressive', q_len=q_len)
+                else:
+                    block_mask = self.autoregressive_mask
+            elif self.mode == 'block_diff':
+                if self.block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.block_diff_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='block_diff', block_size=self.block_size, q_len=q_len)
+                else:
+                    block_mask = self.block_diff_mask
+            elif self.mode == 'sbd_block_diff':
+                if self.sbd_block_diff_mask is None or self.block_size != self.block_size_orig or q_len != self.sbd_block_diff_mask.shape[-2]:
+                    block_mask = self.compute_block_mask(mode='sbd_block_diff', block_size=self.block_size, q_len=q_len)
+                else:
+                    block_mask = self.sbd_block_diff_mask
             else:
+                raise ValueError(f"Unknown attention mode: {self.mode}")
+            attn_output = fused_flex_attention(query_states, key_states, value_states, block_mask=block_mask)
+            attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+            attn_output = self.o_proj(attn_output)
+            return attn_output, None
 def gumbel_topk(log_w: torch.Tensor, k: int) -> torch.Tensor:
         )
+    def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0, eos_token_id=None):
         out_ids, nfe = generate_with_prefix_cache_block_diff(
                         model=self,
                         prompt=prompt_ids,
                         shift_logits=shift_logits,
                         neg_entropy=False,
                         causal_context=causal_context,
+                        eos_token_id=eos_token_id,
                     )
         return out_ids, nfe
+    @torch.no_grad()
+    def sbd_inference_diffusion_quadratic(
+        self,
+        clean_input_ids: Optional[torch.Tensor],
+        draft_input_ids: torch.Tensor,
+        block_length: int,
+        draft_only: bool = False,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+    ):
+        """SBD quadratic inference (injected by build_hf_tidar_repo).
+
+        Runs one encoder forward pass in one of two modes and maps the hidden
+        states to logits with ``self.diffusion_head``. The encoder config is
+        mutated on every call (``use_sbd_objective``, ``block_length``,
+        ``tidar_inference_mode``).
+
+        Args:
+            clean_input_ids: Already-decoded token ids. Required when
+                ``draft_only`` is true; must be ``None`` on the quadratic path
+                when ``use_cache`` is true.
+            draft_input_ids: Draft token ids for the current block.
+            block_length: Draft block length.
+            draft_only: If true, feed ``clean_input_ids`` followed by
+                ``draft_input_ids`` with inference mode ``"default"``;
+                otherwise run the ``"quadratic"`` mode.
+            past_key_values: Existing cache; required on the quadratic path
+                when ``use_cache`` is true.
+            use_cache: Whether the encoder should use/return a KV cache.
+
+        Returns:
+            ``(logits, past_key_values)`` where ``past_key_values`` is read from
+            the encoder output (``None`` if the output has no such attribute).
+        """
+        enc_config = self.encoder.config
+        enc_config.use_sbd_objective = True
+        enc_config.block_length = block_length
+        if draft_only:
+            assert clean_input_ids is not None
+            if use_cache and past_key_values is None:
+                past_key_values = DynamicCache()
+            enc_config.tidar_inference_mode = "default"
+            input_ids = torch.cat([clean_input_ids, draft_input_ids], dim=-1)
+            outputs = self.encoder(
+                input_ids=input_ids,
+                position_ids=None,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                is_training=False,
+            )
+            hidden_states = outputs.last_hidden_state
+            logits = self.diffusion_head(hidden_states)
+            past_key_values = getattr(outputs, "past_key_values", None)
+            # Keep only the clean tokens in the cache; the draft entries are dropped.
+            if use_cache and past_key_values is not None:
+                _crop_dynamic_cache(past_key_values, clean_input_ids.shape[1])
+            return logits, past_key_values
+        else:
+            enc_config.tidar_inference_mode = "quadratic"
+            draft_len = block_length * (block_length + 1)
+            # Expand each draft token into a group of (block_length + 1) ids:
+            # the token itself followed by block_length mask tokens, then
+            # flatten the groups into one sequence of length draft_len.
+            draft_input_ids = torch.cat(
+                [
+                    draft_input_ids.view(-1, block_length, 1),
+                    torch.full(
+                        (draft_input_ids.shape[0], block_length, block_length),
+                        fill_value=self.config.mask_token_id,
+                        device=draft_input_ids.device,
+                    ),
+                ],
+                dim=-1,
+            ).view(-1, draft_len)
+            if use_cache:
+                assert past_key_values is not None, (
+                    "Past key values should be provided when using cache, e.g. run draft_only=True first."
+                )
+                assert clean_input_ids is None, (
+                    "Clean input ids should already be in cache, thus none should be provided."
+                )
+                clean_len = past_key_values.get_seq_length()
+                input_ids = draft_input_ids
+            else:
+                clean_len = clean_input_ids.shape[1]
+                input_ids = torch.cat([clean_input_ids, draft_input_ids], dim=-1)
+            # Group i gets positions clean_len + i ... clean_len + i + block_length.
+            per_block_position_ids = torch.arange(
+                clean_len, clean_len + block_length + 1, device=draft_input_ids.device
+            )[None,].repeat(block_length, 1)
+            per_block_position_ids += torch.arange(block_length, device=draft_input_ids.device).view(-1, 1)
+            if use_cache:
+                position_ids = per_block_position_ids.view(-1)[None,]
+            else:
+                # Without a cache the clean prefix is part of the input, so its
+                # positions 0 ... clean_len - 1 are prepended.
+                clean_position_ids = torch.arange(clean_len, device=draft_input_ids.device)
+                position_ids = torch.cat([clean_position_ids, per_block_position_ids.view(-1)], dim=-1)[None,]
+            outputs = self.encoder(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                is_training=False,
+            )
+            hidden_states = outputs.last_hidden_state
+            logits = self.diffusion_head(hidden_states)
+            past_key_values = getattr(outputs, "past_key_values", None)
+            # Compact the cache so that only the first entry of each group remains
+            # after the clean prefix.
+            if use_cache and past_key_values is not None:
+                _extract_draft_kv_cache(past_key_values, clean_len, block_length)
+            return logits, past_key_values
+    @torch.no_grad()
+    def tidar_generate(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        steps: int = 128,
+        block_length: int = 16,
+        threshold: Optional[float] = None,
+        temperature: float = 0.0,
+        mask_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+    ):
+        """TiDAR quadratic speculative decoding (injected by build_hf_tidar_repo)."""
+        self.config.use_sbd_objective = True
+        self.config.dlm_paradigm = "sbd"
+        if prompt_ids.shape[0] != 1:
+            raise ValueError("TiDAR quadratic decoding currently requires batch_size == 1")
+        token_mask_id = mask_token_id if mask_token_id is not None else self.config.mask_token_id
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, "eos_token_id", None)
+        x = torch.full(
+            (1, prompt_ids.shape[1] + max_new_tokens + block_length * 2),
+            token_mask_id,
+            dtype=torch.long,
+            device=prompt_ids.device,
+        )
+        x[:, : prompt_ids.shape[1]] = prompt_ids.clone()
+        if max_new_tokens % block_length != 0:
+            raise ValueError("max_new_tokens must be divisible by block_length")
+        num_blocks = max_new_tokens // block_length
+        if steps % num_blocks != 0:
+            raise ValueError("steps must be divisible by (max_new_tokens // block_length)")
+        prompt_len = prompt_ids.shape[1]
+        nfe = 0
+        nfe += 1
+        logits, past_key_values = self.sbd_inference_diffusion_quadratic(
+            clean_input_ids=x[:, :prompt_len],
+            draft_input_ids=x[:, prompt_len : prompt_len + block_length],
+            block_length=block_length,
+            draft_only=True,
+            use_cache=True,
+        )
+        logits_proposal = logits[:, prompt_len - 1 : prompt_len + block_length]
+        logits_proposal[:, 1] = logits_proposal[:, 0]
+        logits_proposal = logits_proposal[:, 1:]
+        x0_proposal = torch.argmax(logits_proposal, dim=-1)
+        x[:, prompt_len : prompt_len + block_length] = x0_proposal
+        total_accept_token = 0
+        while True:
+            nfe += 1
+            block_start = prompt_len + total_accept_token
+            block_end = block_start + block_length
+            draft_input_ids = x[:, block_start:block_end]
+            logits, past_key_values = self.sbd_inference_diffusion_quadratic(
+                clean_input_ids=None,
+                draft_input_ids=draft_input_ids,
+                block_length=block_length,
+                draft_only=False,
+                past_key_values=past_key_values,
+                use_cache=True,
+            )
+            useful_token_logits = logits.view(1, block_length, block_length + 1, -1)
+            if threshold is None:
+                useful_token_logits[:, :, 1] = useful_token_logits[:, :, 0]
+            else:
+                if not (0.0 <= threshold <= 1.0):
+                    raise ValueError("threshold must be between 0 and 1")
+                mix_logits = useful_token_logits[:, :, 0] * threshold + useful_token_logits[:, :, 1] * (1 - threshold)
+                useful_token_logits[:, :, 0] = mix_logits
+                useful_token_logits[:, :, 1] = mix_logits
+            if temperature > 0:
+                useful_token_logits = useful_token_logits / temperature
+            useful_token_pred = torch.argmax(useful_token_logits, dim=-1)
+            new_draft_input_ids = useful_token_pred[:, 0, 1:]
+            accept_cnt = 1
+            while accept_cnt < block_length:
+                if useful_token_pred[:, accept_cnt - 1, 0].item() != draft_input_ids[:, accept_cnt].item():
+                    break
+                new_draft_input_ids = useful_token_pred[:, accept_cnt, 1:]
+                accept_cnt += 1
+            x[:, block_start : block_start + accept_cnt] = draft_input_ids[:, :accept_cnt]
+            # EoS early stopping: all accepted tokens are finalized left-to-right,
+            # so if any is EoS we can truncate and return immediately.
+            if eos_token_id is not None:
+                accepted = x[0, block_start : block_start + accept_cnt]
+                eos_positions = (accepted == eos_token_id).nonzero(as_tuple=True)[0]
+                if len(eos_positions) > 0:
+                    first_eos_rel = eos_positions[0].item()
+                    total_accept_token += first_eos_rel + 1
+                    output_end = prompt_len + total_accept_token
+                    return x[:, :output_end], nfe
+            x[:, block_start + accept_cnt : block_start + accept_cnt + block_length] = new_draft_input_ids
+            past_key_values.crop(block_start + accept_cnt)
+            total_accept_token += accept_cnt
+            if total_accept_token >= max_new_tokens:
+                break
+        return x[:, : -(block_length * 2)], nfe
 __all__ = ["MinistralDiffEncoderModel", "MinistralFlexAttention"]