Instructions for using nvidia/Nemotron-Labs-Diffusion-14B with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.

Libraries

How to use nvidia/Nemotron-Labs-Diffusion-14B with Transformers:

# Option 1: use a pipeline as a high-level helper.
from transformers import pipeline

# trust_remote_code=True allows Transformers to load the custom modeling code
# shipped in the model repository; only enable it for repositories you trust.
pipe = pipeline("text-generation", model="nvidia/Nemotron-Labs-Diffusion-14B", trust_remote_code=True)
# Chat-style input: a list of {"role", "content"} messages.
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Option 2: load the model directly (no pipeline wrapper).
from transformers import AutoModel
# dtype="auto" is passed through to from_pretrained; presumably it selects the
# dtype stored with the checkpoint — confirm against the installed Transformers version.
model = AutoModel.from_pretrained("nvidia/Nemotron-Labs-Diffusion-14B", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use nvidia/Nemotron-Labs-Diffusion-14B with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "nvidia/Nemotron-Labs-Diffusion-14B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-14B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-14B

SGLang

How to use nvidia/Nemotron-Labs-Diffusion-14B with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "nvidia/Nemotron-Labs-Diffusion-14B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-14B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "nvidia/Nemotron-Labs-Diffusion-14B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "nvidia/Nemotron-Labs-Diffusion-14B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use nvidia/Nemotron-Labs-Diffusion-14B with Docker Model Runner:
```
docker model run hf.co/nvidia/Nemotron-Labs-Diffusion-14B
```

Nemotron-Labs-Diffusion-14B

File size: 12,942 Bytes

3ef6080

import numpy as np
import torch
import torch.nn.functional as F


def add_gumbel_noise(logits, temperature):
    """Perturb logits so that an argmax over the result samples a token.

    The Gumbel-max trick samples from a categorical distribution by taking
    the argmax of noised scores. Here the scores are returned in the
    multiplicative form ``exp(logits) / (-log(u)) ** temperature`` with
    ``u ~ Uniform[0, 1)``.

    According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves
    the perplexity score but reduces generation quality, so all arithmetic is
    done in float64.

    Args:
        logits: Tensor of unnormalized scores.
        temperature: Noise exponent. ``0`` disables the noise entirely.

    Returns:
        ``logits`` itself (same object, untouched) when ``temperature == 0``;
        otherwise a float64 tensor of the same shape holding the noised scores.
    """
    if temperature == 0:
        # Greedy decoding: nothing to perturb.
        return logits

    scores64 = logits.to(torch.float64)
    uniform = torch.rand_like(scores64, dtype=torch.float64)
    scaled_noise = torch.pow(-torch.log(uniform), temperature)
    return torch.exp(scores64) / scaled_noise


def get_transfer_index(logits, temperature, remasking, mask_index, x, num_transfer_tokens, threshold=None, neg_entropy=False):
    """Choose which masked positions to commit this step, and with which tokens.

    Candidate tokens are the argmax of the (optionally Gumbel-noised) logits.
    Each position gets a confidence score; per row, the highest-scoring
    masked positions are selected for transfer.

    Args:
        logits: Float tensor of scores; the softmax is taken over the last
            dimension. The source comments describe it as (B, L, V).
        temperature: Forwarded to ``add_gumbel_noise``; ``0`` means greedy.
        remasking: Scoring rule for the candidates:
            ``'low_confidence'`` - probability of the chosen token;
            ``'top_p_margin'`` - top-1 minus top-2 probability, min-max
            normalized over the masked positions of each row;
            ``'random'`` - uniform random scores.
        mask_index: Bool tensor (B, L), True where the position is still masked.
        x: Current tokens (B, L); kept wherever ``mask_index`` is False.
        num_transfer_tokens: Per-row number of positions to select. Ignored
            when ``threshold`` is given (every masked position becomes a
            candidate instead).
        threshold: If not None, a selected position is dropped again when its
            confidence is below this value. The single most confident
            position of each row is always kept.
        neg_entropy: If True, the confidence is the negative entropy of the
            predictive distribution instead of the ``remasking`` score. The
            ``remasking`` score is still computed (and validated) first.

    Returns:
        ``(x0, transfer_index)``: ``x0`` holds the candidate tokens on masked
        positions and ``x`` elsewhere; ``transfer_index`` is a bool tensor of
        the same shape marking the positions to commit.

    Raises:
        NotImplementedError: If ``remasking`` is not one of the known rules.
    """
    logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
    x0 = torch.argmax(logits_with_noise, dim=-1)

    probs = None  # softmax(logits), cached so neg_entropy can reuse it
    if remasking == 'low_confidence':
        probs = F.softmax(logits, dim=-1)
        x0_p = torch.gather(probs, dim=-1, index=x0.unsqueeze(-1)).squeeze(-1)  # (B, L)
    elif remasking == 'top_p_margin':
        probs = F.softmax(logits, dim=-1)                       # (B, L, V)
        top2 = torch.topk(probs, k=2, dim=-1).values            # (B, L, 2)
        margin = top2[..., 0] - top2[..., 1]                    # (B, L)

        # Min-max normalize the margin over the MASKED positions of each row.
        # Unmasked positions are replaced by +/-inf so they never win min/max.
        row_min = torch.where(
            mask_index, margin, torch.full_like(margin, float('inf'))
        ).amin(dim=1, keepdim=True)                             # (B, 1)
        row_max = torch.where(
            mask_index, margin, torch.full_like(margin, float('-inf'))
        ).amax(dim=1, keepdim=True)                             # (B, 1)
        denom = row_max - row_min
        has_spread = denom > 0

        # Masked positions: normalized margin where the row has spread,
        # 1 where all masked margins are equal. Unmasked positions: 0.
        x0_p = torch.where(
            mask_index & has_spread,
            (margin - row_min) / (denom + 1e-12),
            torch.zeros_like(margin),
        )
        x0_p = torch.where(mask_index & (~has_spread), torch.ones_like(x0_p), x0_p)
    elif remasking == 'random':
        x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
    else:
        raise NotImplementedError(remasking)

    if neg_entropy:
        if probs is None:
            probs = F.softmax(logits, dim=-1)
        # sum(p * log p) is the negative entropy; epsilon guards log(0).
        confidence_scores = torch.sum(probs * torch.log(probs + 1e-10), dim=-1)
    else:
        confidence_scores = x0_p

    x0 = torch.where(mask_index, x0, x)
    # Already-decoded positions must never be preferred by topk.
    confidence = torch.where(mask_index, confidence_scores, -np.inf)

    transfer_index = torch.zeros_like(x0, dtype=torch.bool)
    if threshold is not None:
        # Threshold mode: consider every masked position, then filter below.
        num_transfer_tokens = mask_index.sum(dim=1, keepdim=True)

    for row in range(confidence.shape[0]):
        # Convert once to a Python int so topk/slicing do not rely on
        # implicit tensor-to-index coercion.
        k = int(num_transfer_tokens[row])
        top_conf, select_index = torch.topk(confidence[row], k=k)
        transfer_index[row, select_index] = True
        if threshold is not None and k > 1:
            # Drop every candidate except the best one whose confidence is
            # below the threshold. topk already returned those confidences,
            # so this is one vectorized comparison instead of a per-token loop.
            rest = select_index[1:]
            transfer_index[row, rest[top_conf[1:] < threshold]] = False
    return x0, transfer_index


def get_num_transfer_tokens(mask_index, steps: int):
    """Split each row's masked-token count as evenly as possible over ``steps``.

    Every step receives ``count // steps`` tokens, and the first
    ``count % steps`` steps receive one extra, so each row of the result sums
    to that row's number of masked positions.

    Args:
        mask_index: Tensor whose rows are summed along dim 1 to count the
            masked positions (a bool mask in the visible call site).
        steps: Number of denoising steps to spread the tokens over.

    Returns:
        Tensor of shape (rows, steps) on ``mask_index``'s device with the
        number of tokens to transfer at each step.
    """
    masked_per_row = mask_index.sum(dim=1, keepdim=True)        # (rows, 1)
    quotient = masked_per_row // steps
    leftover = masked_per_row % steps

    # Step s gets the extra token iff s < leftover for that row.
    step_ids = torch.arange(steps, device=mask_index.device).unsqueeze(0)   # (1, steps)
    extra = (step_ids < leftover).to(torch.int64)                           # (rows, steps)
    return quotient + extra


def _set_diffusion_lm(model_module, enabled):
    """Set ``self_attn.diffusion_lm`` on every encoder layer that defines it."""
    for layer in model_module.encoder.layers:
        if hasattr(layer.self_attn, 'diffusion_lm'):
            layer.self_attn.diffusion_lm = enabled


def _context_forward(model, model_module, tokens, causal_context, **model_kwargs):
    """Run a cache-producing forward pass; ``diffusion_lm`` is off for its duration if causal."""
    if causal_context:
        _set_diffusion_lm(model_module, False)
    try:
        return model(tokens, use_cache=True, use_causal_mask=causal_context, **model_kwargs)
    finally:
        # Restore the flag even when the forward pass raises, so the model is
        # not left in causal mode for later callers.
        if causal_context:
            _set_diffusion_lm(model_module, True)


def _sample_next_token(last_logit, temperature):
    """Pick one token per row: multinomial sample if ``temperature > 0``, else argmax."""
    if temperature > 0:
        probs = torch.softmax(last_logit / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1)
    return torch.argmax(last_logit, dim=-1, keepdim=True)


@torch.no_grad()
def generate_with_prefix_cache_block_diff(
    model,
    prompt,
    steps=128,
    gen_length=128,
    block_length=128,
    temperature=0.,
    remasking='low_confidence',
    mask_id=126336,
    threshold=None,
    factor=None,
    shift_logits=False,
    neg_entropy=False,
    causal_context=False,
    eos_token_id=None,
    max_thinking_tokens=None,
    end_think_token_id=None,
):
    """Generate ``gen_length`` tokens block by block with a cached prefix.

    The prompt is encoded once to build a KV cache. Then, for each block, a
    run of ``mask_id`` tokens is appended and iteratively denoised: only the
    current block is fed to the model (with the cached context), and at each
    step ``get_transfer_index`` decides which masked positions to commit.
    After a block is finished it is encoded once more to extend the cache.

    Args:
        model: Callable returning an object with ``.logits`` and
            ``.past_key_values``; must accept ``past_key_values``,
            ``use_cache`` and ``use_causal_mask`` keyword arguments. With
            ``causal_context`` it (or ``model.module``) must expose
            ``encoder.layers[*].self_attn``.
        prompt: Token-id tensor of shape (B, prompt_len).
        steps: Total denoising steps, split evenly across blocks.
        gen_length: Number of tokens to generate; must be a multiple of
            ``block_length``.
        block_length: Tokens per block.
        temperature: Sampling temperature; ``0`` is greedy.
        remasking: Confidence rule forwarded to ``get_transfer_index``.
        mask_id: Token id used for not-yet-decoded positions.
        threshold: Optional confidence threshold forwarded to
            ``get_transfer_index``.
        factor: Accepted for interface compatibility; not used here.
        shift_logits: If True, position ``i`` of a block is predicted from the
            logits at position ``i - 1`` (the first position uses the last
            logit of the preceding context).
        neg_entropy: Forwarded to ``get_transfer_index``.
        causal_context: If True, context-encoding passes run with
            ``diffusion_lm`` disabled and ``use_causal_mask=True``, and the
            first position of every block is seeded with the next token
            predicted by that causal pass.
        eos_token_id: If given, enables early stopping on EOS.
        max_thinking_tokens: With ``end_think_token_id``, the generated-token
            budget after which an end-of-thinking token is forced.
        end_think_token_id: Token injected when the thinking budget is
            exceeded and a row has not produced it yet.

    Returns:
        ``(tokens, nfe)``. ``tokens`` is the prompt followed by the generated
        tokens; if every row contains an EOS it is cut right after the latest
        first-EOS position across rows. ``nfe`` counts the denoising forward
        passes plus one cache-update pass per block (the prompt prefill is
        not counted).

    Raises:
        AssertionError: If ``gen_length`` is not a multiple of
            ``block_length`` or ``steps`` is not a multiple of the number of
            blocks.
    """
    dream_style = shift_logits
    x_accum = prompt.clone()
    B = prompt.shape[0]
    prompt_len = prompt.size(1)

    assert gen_length % block_length == 0
    num_blocks = gen_length // block_length

    assert steps % num_blocks == 0
    steps_per_block = steps // num_blocks

    nfe = 0

    model_module = None
    if causal_context:
        model_module = model.module if hasattr(model, "module") else model

    # Compute KV cache for the prompt initially
    output = _context_forward(model, model_module, prompt, causal_context)
    past_key_values = output.past_key_values

    # Causal prefill: the next token comes from the last prompt position.
    next_token = None
    if causal_context:
        next_token = _sample_next_token(output.logits[:, -1, :], temperature)

    # For dream_style: keep the "next token" logit of the context
    next_logits_context = None
    if dream_style:
        next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)

    for num_block in range(num_blocks):
        # New block of mask tokens; under causal context, position 0 is seeded
        # with the next-token prediction from the previous causal forward
        # (prefill or post-block encode).
        mask_block = torch.full(
            (B, block_length), mask_id, dtype=prompt.dtype, device=prompt.device
        )
        if causal_context:
            mask_block[:, 0] = next_token[:, 0]

        x_accum = torch.cat([x_accum, mask_block], dim=1)
        current_block_start = prompt_len + num_block * block_length
        block_slice = slice(current_block_start, current_block_start + block_length)

        # ---- thinking budget enforcement ----
        # If the budget ends inside (or before) this block and a row has not
        # emitted the end-of-thinking token in earlier blocks, inject it.
        if end_think_token_id is not None and max_thinking_tokens is not None:
            tokens_before_block = num_block * block_length
            if tokens_before_block + block_length > max_thinking_tokens:
                gen_so_far = x_accum[:, prompt_len:current_block_start]
                if gen_so_far.size(1) > 0:
                    has_end_think = (gen_so_far == end_think_token_id).any(dim=1)
                else:
                    has_end_think = torch.zeros(B, dtype=torch.bool, device=prompt.device)
                if not has_end_think.all():
                    # Budget boundary inside this block -> inject there;
                    # budget already exceeded -> inject at the block start.
                    offset = max(max_thinking_tokens - tokens_before_block, 0)
                    inject_pos = current_block_start + offset
                    x_accum[~has_end_think, inject_pos] = end_think_token_id

        # Transfer schedule from the positions that are actually masked
        # (seeded / injected positions are not mask_id and are excluded).
        initial_mask = (x_accum[:, block_slice] == mask_id)  # (B, Lb)
        num_transfer_tokens = get_num_transfer_tokens(initial_mask, steps_per_block)  # (B, steps)

        # Denoise the current block
        for i in range(steps_per_block):
            block_tokens = x_accum[:, block_slice]           # (B, Lb)
            mask_block_idx = (block_tokens == mask_id)       # (B, Lb)
            if not mask_block_idx.any():
                break

            nfe += 1

            # Forward only the current noisy block using cached context
            logits_block = model(
                block_tokens,
                past_key_values=past_key_values,
                use_cache=False
            ).logits

            if dream_style:
                # Shift by one so each position is predicted from its
                # predecessor: prepend the context-next logit, drop the last.
                if block_length == 1:
                    logits_use = next_logits_context              # (B, 1, V)
                else:
                    logits_use = torch.cat(
                        [next_logits_context, logits_block[:, :-1, :]],
                        dim=1
                    )  # (B, Lb, V)
            else:
                # same-position prediction
                logits_use = logits_block

            x0, transfer_idx = get_transfer_index(
                logits_use, temperature, remasking, mask_block_idx, block_tokens,
                num_transfer_tokens=num_transfer_tokens[:, i],
                threshold=threshold, neg_entropy=neg_entropy
            )
            x_accum[:, block_slice] = torch.where(transfer_idx, x0, block_tokens)

            if eos_token_id is not None:
                block_tokens = x_accum[:, block_slice]              # (B, Lb)
                eos_mask = (block_tokens == eos_token_id)           # (B, Lb)
                if eos_mask.any():
                    # True from the first EOS of a row onwards.
                    after_eos = eos_mask.cumsum(dim=1).bool()       # (B, Lb)
                    pending = (block_tokens == mask_id) & ~after_eos
                    # Stop only when NO row still has a mask in front of its
                    # EOS (rows without EOS must be fully decoded). Breaking
                    # as soon as a single row is done would leave mask tokens
                    # in the other rows of the batch.
                    if not pending.any():
                        break

        # after block is fully denoised, update KV cache
        output = _context_forward(
            model, model_module, x_accum[:, block_slice], causal_context,
            past_key_values=past_key_values,
        )
        past_key_values = output.past_key_values
        nfe += 1

        if causal_context:
            # Next block's first position = greedy/sampled next token from this causal encode
            next_token = _sample_next_token(output.logits[:, -1, :], temperature)

        if dream_style and num_block < num_blocks - 1:
            # refresh context-next logit for the next block
            next_logits_context = output.logits[:, -1:, :]  # (B, 1, V)

        if eos_token_id is not None:
            gen_so_far = x_accum[:, prompt_len:]                        # (B, gen_len_so_far)
            is_eos = (gen_so_far == eos_token_id)                       # (B, gen_len_so_far)
            if is_eos.any(dim=1).all():
                # argmax over 0/1 returns the first EOS position of each row.
                first_eos_pos = is_eos.to(torch.int64).argmax(dim=1)    # (B,)
                max_eos = first_eos_pos.max().item()
                return x_accum[:, : prompt_len + max_eos + 1], nfe

    return x_accum, nfe