Text Generation
Transformers
Safetensors
English
sentinel_brain
sentinel-prime
Mixture of Experts
sparse-mixture-of-experts
from-scratch
custom-architecture
custom_code
Instructions to use qubitpage/sentinel-prime-350m with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use qubitpage/sentinel-prime-350m with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="qubitpage/sentinel-prime-350m", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("qubitpage/sentinel-prime-350m", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use qubitpage/sentinel-prime-350m with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "qubitpage/sentinel-prime-350m"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "qubitpage/sentinel-prime-350m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/qubitpage/sentinel-prime-350m
- SGLang
How to use qubitpage/sentinel-prime-350m with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "qubitpage/sentinel-prime-350m" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "qubitpage/sentinel-prime-350m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "qubitpage/sentinel-prime-350m" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "qubitpage/sentinel-prime-350m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use qubitpage/sentinel-prime-350m with Docker Model Runner:
docker model run hf.co/qubitpage/sentinel-prime-350m
Upload hf_model.py with huggingface_hub
Browse files
- hf_model.py +29 -11
hf_model.py
CHANGED
|
@@ -357,18 +357,36 @@ class SentinelBrainForCausalLM(PreTrainedModel, GenerationMixin):
|
|
| 357 |
B, T = input_ids.shape
|
| 358 |
x = self.tok_emb(input_ids)
|
| 359 |
|
| 360 |
-
# Determine if we have valid past KV caches
|
|
|
|
| 361 |
has_past = False
|
| 362 |
past_len = 0
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
rope_cos, rope_sin = self.rope(past_len + T)
|
| 374 |
rope_cos = rope_cos[:, :, past_len:past_len + T].to(x.device)
|
|
@@ -379,7 +397,7 @@ class SentinelBrainForCausalLM(PreTrainedModel, GenerationMixin):
|
|
| 379 |
total_z = 0.0
|
| 380 |
|
| 381 |
for i, layer in enumerate(self.layers):
|
| 382 |
-
kv_cache =
|
| 383 |
x, new_kv, aux, z = layer(x, rope_cos, rope_sin, kv_cache=kv_cache)
|
| 384 |
new_kv_caches.append(new_kv)
|
| 385 |
total_aux += aux
|
|
|
|
| 357 |
B, T = input_ids.shape
|
| 358 |
x = self.tok_emb(input_ids)
|
| 359 |
|
| 360 |
+
# Determine if we have valid past KV caches.
|
| 361 |
+
# Support: list-of-tuples (legacy), tuple-of-tuples, and DynamicCache (new transformers).
|
| 362 |
has_past = False
|
| 363 |
past_len = 0
|
| 364 |
+
_legacy_past = None # normalized to list-of-tuples form
|
| 365 |
+
|
| 366 |
+
if past_key_values is not None:
|
| 367 |
+
# New API: DynamicCache or similar Cache object
|
| 368 |
+
if hasattr(past_key_values, "to_legacy_cache"):
|
| 369 |
+
try:
|
| 370 |
+
legacy = past_key_values.to_legacy_cache()
|
| 371 |
+
if legacy is not None and len(legacy) > 0:
|
| 372 |
+
_legacy_past = list(legacy)
|
| 373 |
+
first = _legacy_past[0]
|
| 374 |
+
if first is not None and len(first) > 0 and first[0] is not None:
|
| 375 |
+
has_past = True
|
| 376 |
+
past_len = first[0].shape[2]
|
| 377 |
+
except Exception:
|
| 378 |
+
pass
|
| 379 |
+
# Legacy API: list/tuple of (k, v) tuples
|
| 380 |
+
elif isinstance(past_key_values, (list, tuple)) and len(past_key_values) > 0:
|
| 381 |
+
_legacy_past = list(past_key_values)
|
| 382 |
+
first = _legacy_past[0]
|
| 383 |
+
if first is not None:
|
| 384 |
+
if isinstance(first, (tuple, list)) and len(first) > 0 and first[0] is not None:
|
| 385 |
+
has_past = True
|
| 386 |
+
past_len = first[0].shape[2]
|
| 387 |
+
elif hasattr(first, "shape"):
|
| 388 |
+
has_past = True
|
| 389 |
+
past_len = first.shape[2]
|
| 390 |
|
| 391 |
rope_cos, rope_sin = self.rope(past_len + T)
|
| 392 |
rope_cos = rope_cos[:, :, past_len:past_len + T].to(x.device)
|
|
|
|
| 397 |
total_z = 0.0
|
| 398 |
|
| 399 |
for i, layer in enumerate(self.layers):
|
| 400 |
+
kv_cache = _legacy_past[i] if (has_past and _legacy_past is not None and i < len(_legacy_past)) else None
|
| 401 |
x, new_kv, aux, z = layer(x, rope_cos, rope_sin, kv_cache=kv_cache)
|
| 402 |
new_kv_caches.append(new_kv)
|
| 403 |
total_aux += aux
|