Instructions to use nvidia/Nemotron-Labs-Diffusion-3B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use nvidia/Nemotron-Labs-Diffusion-3B with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# trust_remote_code=True allows Transformers to run Python code shipped inside
# the model repository (this repo includes a custom modeling_ministral_dlm.py).
# Only enable it for repositories you trust.
pipe = pipeline("text-generation", model="nvidia/Nemotron-Labs-Diffusion-3B", trust_remote_code=True)
# Chat-style input: a list of messages, each a dict with "role" and "content".
messages = [
    {"role": "user", "content": "Who are you?"},
]
# The return value is not captured here; assign or print it to inspect the
# generated reply when running this outside a notebook/REPL.
pipe(messages)

# Load model directly
from transformers import AutoModel
# trust_remote_code=True allows Transformers to run the custom modeling code
# shipped in the repository; only enable it for repositories you trust.
# dtype="auto" presumably lets Transformers choose the dtype from the
# checkpoint/config -- confirm against the installed Transformers version
# (older releases spell this argument torch_dtype).
# NOTE(review): the repo's modeling file defines a custom
# generate(prompt_ids, max_new_tokens, steps, block_length, shift_logits,
# threshold, ...) -- verify that the loaded class exposes it before calling.
model = AutoModel.from_pretrained("nvidia/Nemotron-Labs-Diffusion-3B", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use nvidia/Nemotron-Labs-Diffusion-3B with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "nvidia/Nemotron-Labs-Diffusion-3B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-3B

SGLang

How to use nvidia/Nemotron-Labs-Diffusion-3B with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "nvidia/Nemotron-Labs-Diffusion-3B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Nemotron-Labs-Diffusion-3B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-3B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Nemotron-Labs-Diffusion-3B with Docker Model Runner:
```
docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-3B
```

YongganFu committed on Mar 12

Commit

281fcf8

verified ·

1 Parent(s): 6c967fb

Upload model

Browse files

Files changed (3) hide show

config.json +1 -1
generation_config.json +1 -1
modeling_ministral_dlm.py +83 -0

config.json CHANGED Viewed

@@ -24,7 +24,7 @@
   "dp_varying_mask_ratio": false,
   "enable_self_spec": false,
   "enforce_mask": false,
-  "eos_token_id": 2,
   "global_loss_avg": false,
   "head_dim": 128,
   "hidden_act": "silu",

   "dp_varying_mask_ratio": false,
   "enable_self_spec": false,
   "enforce_mask": false,
+  "eos_token_id": 11,
   "global_loss_avg": false,
   "head_dim": 128,
   "hidden_act": "silu",

generation_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
   "bos_token_id": 1,
-  "eos_token_id": 2,
   "transformers_version": "4.55.4",
   "use_cache": false
 }

 {
   "_from_model_config": true,
   "bos_token_id": 1,
+  "eos_token_id": 11,
   "transformers_version": "4.55.4",
   "use_cache": false
 }

modeling_ministral_dlm.py CHANGED Viewed

@@ -872,6 +872,9 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
     def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0, eos_token_id=None):
         out_ids, nfe = generate_with_prefix_cache_block_diff(
                         model=self,
                         prompt=prompt_ids,
@@ -986,6 +989,86 @@ class MinistralDiffEncoderModel(Ministral3PreTrainedModel, GenerationMixin):
             return logits, past_key_values
     @torch.no_grad()
     def self_spec_generate(
         self,

     def generate(self, prompt_ids, max_new_tokens, steps, block_length, shift_logits, threshold, causal_context=True, temperature=0, eos_token_id=None):
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, 'eos_token_id', None)
         out_ids, nfe = generate_with_prefix_cache_block_diff(
                         model=self,
                         prompt=prompt_ids,
             return logits, past_key_values
+    @torch.no_grad()
+    def ar_generate(
+        self,
+        prompt_ids: torch.Tensor,
+        max_new_tokens: int = 128,
+        temperature: float = 0.0,
+        eos_token_id: Optional[int] = None,
+    ) -> tuple:
+        """Autoregressive generation calling the encoder directly (injected by build_hf_tidar_repo).
+        Bypasses MinistralDiffEncoderModel.forward() to avoid diffusion-specific
+        code paths. Calls self.encoder (Ministral3Model) with explicit cache_position,
+        position_ids, and use_cache so the KV cache and causal masking behave
+        identically to MistralForCausalLM / vLLM.
+        Returns:
+            (output_ids, nfe) where output_ids includes the prompt.
+        """
+        for layer in self.encoder.layers:
+            if hasattr(layer.self_attn, 'diffusion_lm'):
+                layer.self_attn.diffusion_lm = False
+        if eos_token_id is None:
+            eos_token_id = getattr(self.config, 'eos_token_id', None)
+        device = prompt_ids.device
+        batch_size, prompt_len = prompt_ids.shape
+        past_key_values = DynamicCache()
+        cache_position = torch.arange(prompt_len, device=device)
+        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)
+        enc_out = self.encoder(
+            input_ids=prompt_ids,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=True,
+            cache_position=cache_position,
+        )
+        past_key_values = enc_out.past_key_values
+        next_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        generated_tokens = []
+        nfe = 0
+        for step in range(max_new_tokens):
+            nfe += 1
+            if temperature > 0:
+                probs = torch.softmax(next_logit / temperature, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                next_token = torch.argmax(next_logit, dim=-1, keepdim=True)
+            generated_tokens.append(next_token)
+            if eos_token_id is not None and (next_token == eos_token_id).all():
+                break
+            if step < max_new_tokens - 1:
+                cur_pos = prompt_len + step
+                step_cache_pos = torch.tensor([cur_pos], device=device)
+                step_pos_ids = step_cache_pos.unsqueeze(0).expand(batch_size, -1)
+                enc_out = self.encoder(
+                    input_ids=next_token,
+                    position_ids=step_pos_ids,
+                    past_key_values=past_key_values,
+                    use_cache=True,
+                    cache_position=step_cache_pos,
+                )
+                past_key_values = enc_out.past_key_values
+                next_logit = self.diffusion_head(enc_out.last_hidden_state[:, -1:, :]).squeeze(1)
+        all_generated = torch.cat(generated_tokens, dim=1)
+        output_ids = torch.cat([prompt_ids, all_generated], dim=1)
+        return output_ids, nfe
     @torch.no_grad()
     def self_spec_generate(
         self,