| """
|
| HuggingFace-compatible tokenizer wrapper for tiktoken cl100k_base.
|
|
|
| Wraps tiktoken so it works with HF's generate(), lm-evaluation-harness,
|
| and the Hub (tokenizer.json / tokenizer_config.json).
|
|
|
| Usage:
|
| from hf_tokenizer import SentinelBrainTokenizer
|
| tok = SentinelBrainTokenizer()
|
| ids = tok("Hello world", return_tensors="pt")
|
| """

import json
import os
from typing import Optional, List, Dict, Union

import tiktoken
from transformers import PreTrainedTokenizer


class SentinelBrainTokenizer(PreTrainedTokenizer):
    """HuggingFace PreTrainedTokenizer wrapping tiktoken cl100k_base."""

    vocab_files_names = {"vocab_file": "tiktoken_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        eos_token: str = "<|endoftext|>",
        pad_token: str = "<|endoftext|>",
        model_max_length: int = 1024,
        **kwargs,
    ):
        # The encoding must be available before super().__init__(), since the
        # base class may call get_vocab()/convert_tokens_to_ids while it
        # registers the special tokens.
        self._enc = tiktoken.get_encoding("cl100k_base")
        self._vocab_size = self._enc.n_vocab

        self._special_tokens = {
            "<|endoftext|>": self._enc.eot_token,
        }

        super().__init__(
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Return vocab dict. tiktoken doesn't expose the full vocab easily,
        so we return a partial mapping for the special tokens."""
        vocab = {}
        for tok, idx in self._special_tokens.items():
            vocab[tok] = idx
        return vocab

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize into string tokens (HF convention).
        We return token IDs as decimal strings, since tiktoken's native
        tokens are byte sequences rather than printable strings."""
        token_ids = self._enc.encode(text, allowed_special={"<|endoftext|>"})
        return [str(tid) for tid in token_ids]
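    # Illustration of the round-trip convention used by the methods below
    # (the token IDs shown are indicative, not verified against cl100k_base):
    #   _tokenize("Hello world")                     -> ["9906", "1917"]
    #   _convert_token_to_id("9906")                 -> 9906
    #   convert_tokens_to_string(["9906", "1917"])   -> "Hello world"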

    def _convert_token_to_id(self, token: str) -> int:
        """Convert string token → ID."""
        if token in self._special_tokens:
            return self._special_tokens[token]
        try:
            return int(token)
        except ValueError:
            return self._enc.eot_token

    def _convert_id_to_token(self, index: int) -> str:
        """Convert ID → string token."""
        try:
            return self._enc.decode([index])
        except Exception:
            return "<|unk|>"

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert token strings back to text."""
        ids = []
        for t in tokens:
            try:
                ids.append(int(t))
            except ValueError:
                if t in self._special_tokens:
                    ids.append(self._special_tokens[t])
        try:
            return self._enc.decode(ids)
        except Exception:
            return ""

    def encode(self, text: Union[str, List[str]], add_special_tokens: bool = True,
               **kwargs) -> Union[List[int], List[List[int]]]:
        """Fast-path encode using tiktoken directly."""
        if isinstance(text, str):
            ids = self._enc.encode(text, allowed_special={"<|endoftext|>"})
            return ids
        return [self._enc.encode(t, allowed_special={"<|endoftext|>"}) for t in text]

    def decode(self, token_ids: Union[List[int], int], skip_special_tokens: bool = False,
               **kwargs) -> str:
        """Fast-path decode using tiktoken directly."""
        if hasattr(token_ids, "tolist"):
            # Accept torch/numpy tensors coming out of generate().
            token_ids = token_ids.tolist()
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [t for t in token_ids if t != self._enc.eot_token]
        try:
            return self._enc.decode(token_ids)
        except Exception:
            return ""

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Save a minimal vocab file so from_pretrained works."""
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, prefix + "tiktoken_vocab.json")
        vocab_data = {
            "encoding": "cl100k_base",
            "vocab_size": self._vocab_size,
            "eos_token_id": self._enc.eot_token,
            "special_tokens": self._special_tokens,
        }
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(vocab_data, f, indent=2)
        return (vocab_file,)
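    # For reference, the JSON written above looks roughly like this (values
    # taken from tiktoken's cl100k_base; treat them as indicative):
    #   {"encoding": "cl100k_base", "vocab_size": 100277,
    #    "eos_token_id": 100257, "special_tokens": {"<|endoftext|>": 100257}}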

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load from a directory. Falls back to creating a fresh tokenizer."""
        try:
            return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        except Exception:
            return cls(**kwargs)
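

if __name__ == "__main__":
    # Minimal smoke test (illustrative only; assumes tiktoken and transformers
    # are installed, and avoids torch by not requesting tensors).
    import tempfile

    tok = SentinelBrainTokenizer()

    text = "Hello world<|endoftext|>"
    ids = tok.encode(text)
    assert tok.decode(ids) == text
    assert tok.decode(ids, skip_special_tokens=True) == "Hello world"

    batch = tok("Hello world")
    print(batch["input_ids"], batch["attention_mask"])

    with tempfile.TemporaryDirectory() as d:
        tok.save_pretrained(d)
        reloaded = SentinelBrainTokenizer.from_pretrained(d)
        assert reloaded.vocab_size == tok.vocab_size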