Instructions to use SupraLabs/Supra-Mini-v4-2M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use SupraLabs/Supra-Mini-v4-2M with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="SupraLabs/Supra-Mini-v4-2M")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("SupraLabs/Supra-Mini-v4-2M")
model = AutoModelForCausalLM.from_pretrained("SupraLabs/Supra-Mini-v4-2M")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use SupraLabs/Supra-Mini-v4-2M with vLLM:

Install from pip and serve the model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "SupraLabs/Supra-Mini-v4-2M"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "SupraLabs/Supra-Mini-v4-2M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/SupraLabs/Supra-Mini-v4-2M

SGLang

How to use SupraLabs/Supra-Mini-v4-2M with SGLang:

Install from pip and serve the model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "SupraLabs/Supra-Mini-v4-2M" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "SupraLabs/Supra-Mini-v4-2M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "SupraLabs/Supra-Mini-v4-2M" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "SupraLabs/Supra-Mini-v4-2M",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use SupraLabs/Supra-Mini-v4-2M with Docker Model Runner:
```
docker model run hf.co/SupraLabs/Supra-Mini-v4-2M
```

Supra-Mini-v4-2M

File size: 5,281 Bytes

208972d

"""
© SupraLabs 2026 - Official pretraining code for Supra Mini v3 0.5M
"""

import os
# Both variables are assigned before `import torch` below, so they are already
# in the process environment when the CUDA runtime/allocator is initialised.
# "expandable_segments:True" is a PyTorch CUDA caching-allocator option.
# NOTE(review): older PyTorch releases read PYTORCH_CUDA_ALLOC_CONF rather than
# PYTORCH_ALLOC_CONF — confirm the installed version honours this name.
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
# Expose only GPU index 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

print("[*] Loading libraries...")
import torch
# NOTE(review): `math` is not referenced anywhere in the visible part of this
# file — confirm before removing.
import math
import numpy as np
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from transformers import (
    LlamaConfig,
    LlamaForCausalLM,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import Dataset
from tqdm import tqdm

print("[*] Loading tokenizer...")
# Byte-level BPE tokenizer restored from the vocab/merges files that are
# expected to sit next to this script.
fast_tokenizer = ByteLevelBPETokenizer(
    "./custom_llama_tokenizer-vocab.json",
    "./custom_llama_tokenizer-merges.txt",
)

# Special-token strings handed to the Transformers wrapper.
_special_tokens = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
}
# NOTE(review): the Transformers docs describe `tokenizer_object` as a
# `tokenizers.Tokenizer`; here the ByteLevelBPETokenizer wrapper itself is
# passed — confirm this is accepted by the installed version.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=fast_tokenizer,
    **_special_tokens,
)

# Flat binary file that holds the tokenized corpus as consecutive uint16 ids.
TOKEN_BIN     = "./tokens.bin"
# Number of token ids to collect; at 2 bytes per id this fixes the file size.
TARGET_TOKENS = 1_000_000_000
# Tokens per training chunk served by MemmapDataset.
SEQ_LEN       = 512
# Number of documents passed to one `encode_batch` call.
BATCH_TEXTS   = 1000
# Minimum number of buffered ids before they are copied into the memmap.
FLUSH_EVERY   = 1_000_000


def build_token_bin(fast_tokenizer, path=TOKEN_BIN, target_tokens=TARGET_TOKENS):
    """Stream FineWeb-Edu text, tokenize it, and store the ids in a flat uint16 file.

    The ids are written to a temporary sibling file (``<path>.tmp``) and moved
    onto ``path`` only once the whole write has finished. ``np.memmap`` in
    ``"w+"`` mode creates the file at its full size immediately, so writing
    straight to ``path`` would let an interrupted run leave behind a full-size
    but mostly zero-filled file that the size check below would then reuse.

    Args:
        fast_tokenizer: Object exposing ``encode_batch(list_of_str)`` whose
            results carry an ``ids`` list. Ids are stored as ``uint16`` and
            therefore must fit in the range 0..65535.
        path: Destination file. It is reused untouched when it already exists
            and is large enough to hold ``target_tokens`` uint16 values.
        target_tokens: Number of token ids to write; the resulting file is
            ``target_tokens * 2`` bytes long.

    Returns:
        None.
    """
    bytes_per_token = np.dtype(np.uint16).itemsize
    if os.path.exists(path) and os.path.getsize(path) >= target_tokens * bytes_per_token:
        print(f"[=] Reusing existing token file: {path}")
        return

    print(f"[*] Streaming + tokenizing {target_tokens:,} tokens → {path}")

    dataset = load_dataset(
        "HuggingFaceFW/fineweb-edu", "sample-10BT",
        split="train", streaming=True
    )

    # Write into a scratch file; it only becomes `path` after a complete run.
    tmp_path = f"{path}.tmp"
    mm = np.memmap(tmp_path, dtype=np.uint16, mode="w+", shape=(target_tokens,))

    written = 0
    buf = []    # token ids waiting to be copied into the memmap
    texts = []  # raw documents waiting to be tokenized
    pbar = tqdm(total=target_tokens, desc="[*] Gathering tokens", unit="tok")

    def tokenize_pending():
        """Tokenize the queued documents and append their ids to ``buf``."""
        for enc in fast_tokenizer.encode_batch(texts):
            buf.extend(enc.ids)
        texts.clear()

    def flush_buf():
        """Copy buffered ids into the memmap; return True once the target is met."""
        nonlocal written
        if not buf:
            return False
        # Never write past the end of the file: clip to the remaining room.
        n = min(len(buf), target_tokens - written)
        mm[written:written + n] = np.asarray(buf[:n], dtype=np.uint16)
        written += n
        pbar.update(n)
        del buf[:n]
        return written >= target_tokens

    try:
        for example in dataset:
            texts.append(example["text"])
            if len(texts) >= BATCH_TEXTS:
                tokenize_pending()
                if len(buf) >= FLUSH_EVERY and flush_buf():
                    break

        # The stream ended (or the loop never flushed): drain what is left.
        if written < target_tokens:
            if texts:
                tokenize_pending()
            flush_buf()

        mm.flush()
    finally:
        pbar.close()
        # Drop the mapping before the file is renamed or inspected.
        del mm

    # Publish the finished file in one step.
    os.replace(tmp_path, path)

    if written < target_tokens:
        print(f"[!] Dataset exhausted after {written:,} tokens; the remaining "
              f"{target_tokens - written:,} positions in {path} are zero-filled")
    print(f"[+] Wrote {written:,} tokens to {path} "
          f"({os.path.getsize(path)/1e6:.1f} MB)")


class MemmapDataset(Dataset):
    """Fixed-length token chunks served from a flat uint16 token file.

    Chunk ``i`` covers token positions ``[i * seq_len, (i + 1) * seq_len)``.
    The file is memory-mapped read-only on first access rather than in
    ``__init__``; the original author marks this lazy open as
    multiprocessing-safe.

    Args:
        path: Token file containing consecutive ``uint16`` ids.
        total_tokens: Number of ids assumed to be present in the file.
        seq_len: Tokens per chunk.
    """

    def __init__(self, path, total_tokens, seq_len=SEQ_LEN):
        self.path     = path
        self.seq_len  = seq_len
        # Integer division drops trailing tokens that do not fill a chunk.
        self.n_chunks = total_tokens // seq_len
        self._data    = None  # opened lazily by the `data` property

    @property
    def data(self):
        """Return the read-only memmap, opening it on first use."""
        if self._data is None:
            self._data = np.memmap(
                self.path, dtype=np.uint16, mode="r",
                shape=(self.n_chunks * self.seq_len,)
            )
        return self._data

    def __len__(self):
        """Return the number of complete chunks."""
        return self.n_chunks

    def __getitem__(self, idx):
        """Return chunk ``idx`` as ``{"input_ids", "labels"}`` int64 tensors.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Without this check an out-of-range index would slice the
                memmap into a short or empty tensor instead of failing.
        """
        requested = idx
        if idx < 0:
            idx += self.n_chunks
        if not 0 <= idx < self.n_chunks:
            raise IndexError(
                f"chunk index {requested} out of range for {self.n_chunks} chunks"
            )

        start = idx * self.seq_len
        # Widen to int64 (this also copies the slice out of the read-only map).
        arr = np.asarray(self.data[start:start + self.seq_len], dtype=np.int64)
        ids = torch.from_numpy(arr)
        # Labels are an independent copy of the inputs.
        return {"input_ids": ids, "labels": ids.clone()}


def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Args:
        batch: Sequence of dicts, each holding ``"input_ids"`` and ``"labels"``
            tensors of identical shape.

    Returns:
        Dict with the same two keys, each mapped to the samples stacked along
        a new leading dimension.
    """
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in ("input_ids", "labels")
    }


# Materialize the token file (skipped when a large-enough file already exists),
# then expose it as fixed-length training chunks.
print(f"[*] Preparing {TARGET_TOKENS:,} tokens (streaming, memmap-backed)...")
build_token_bin(fast_tokenizer, path=TOKEN_BIN, target_tokens=TARGET_TOKENS)
dataset = MemmapDataset(
    path=TOKEN_BIN,
    total_tokens=TARGET_TOKENS,
    seq_len=SEQ_LEN,
)
print(f"[+] Dataset ready: {len(dataset):,} chunks of {SEQ_LEN} tokens")

print("[*] Setting up model...")
# Small Llama-style decoder built from scratch (no pretrained weights loaded).
config = LlamaConfig(
    # Size the embedding table from the tokenizer actually loaded above.
    vocab_size=len(tokenizer.get_vocab()),
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=5,
    num_attention_heads=8,
    # Same count as attention heads, i.e. no grouped-query sharing.
    num_key_value_heads=8,
    # Tied to the training chunk length so the two cannot drift apart.
    max_position_embeddings=SEQ_LEN,
    # Input embeddings and output projection share one weight matrix.
    tie_word_embeddings=True,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = LlamaForCausalLM(config)
print(f"[*] Model parameters: {model.num_parameters():,}")

print("[*] Defining training arguments...")
training_args = TrainingArguments(
    output_dir="./Supra-Mini-v3-0.5M",
    num_train_epochs=2,
    # 256 sequences x 4 accumulation steps = 1024 sequences per optimizer
    # step on each device.
    per_device_train_batch_size=256,
    gradient_accumulation_steps=4,
    # Checkpoint every 500 steps, keeping only the two most recent.
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    push_to_hub=False,
    report_to="none",
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so the division cannot raise TypeError.
    dataloader_num_workers=(os.cpu_count() or 1) // 2,
    dataloader_pin_memory=True,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.02,
)

# Wire the model, arguments, memmap-backed dataset and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_fn,
)

print("[*] Starting training...")
trainer.train()

# Model weights first, then the tokenizer, both into the same directory.
final_dir = "./Supra-Mini-v3-0.5M-FINAL"
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(final_dir)
print("[*] Training finished.")