| """ |
| HuggingFace Space β Gemma 4 26B A4B Coding API |
| Model : unsloth/gemma-4-26B-A4B-it-GGUF β UD-IQ3_XXS (11.2 GB) |
| RAM : fits in 16 GB with ~4 GB left for KV cache at ctx=4096 |
| Params: temp=0.3, top_p=0.9, min_p=0.1, top_k=20 (tuned for coding per reddit) |
| |
| Endpoints |
| GET / β landing page |
| GET /health β status (also used by self-ping) |
| GET /v1/models β OpenAI model list |
| POST /v1/chat/completions β OpenAI-compatible |
| POST /v1/messages β Anthropic-compatible β Claude Code uses this |
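
Quick smoke test once the model has loaded (minimal sketch; substitute your own
Space URL for the placeholder below, no API key is enforced by this server):

    import requests
    r = requests.post(
        "https://YOUR-SPACE.hf.space/v1/chat/completions",
        json={"model": "gemma-4-26b",
              "messages": [{"role": "user", "content": "write binary search"}]},
        timeout=300,
    )
    print(r.json()["choices"][0]["message"]["content"])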
| """ |
|
|
import os, json, time, uuid, asyncio, threading, requests
from contextlib import asynccontextmanager
from typing import Optional, List, Union, Any, Dict


import httpx
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# ---------- configuration ----------
MODEL_REPO = os.getenv("MODEL_REPO", "unsloth/gemma-4-26B-A4B-it-GGUF")
MODEL_FILE = os.getenv("MODEL_FILE", "gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf")
MODEL_DIR = "/app/models"
MODEL_PATH = f"{MODEL_DIR}/{MODEL_FILE}"
SPACE_URL = os.getenv("SPACE_URL", "")
HF_TOKEN = os.getenv("HF_TOKEN", "")


N_CTX = int(os.getenv("N_CTX", "4096"))
N_THREADS = int(os.getenv("N_THREADS", "2"))


DEFAULT_TEMP = float(os.getenv("DEFAULT_TEMP", "0.3"))
DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.9"))
DEFAULT_MIN_P = float(os.getenv("DEFAULT_MIN_P", "0.1"))
DEFAULT_TOP_K = int(os.getenv("DEFAULT_TOP_K", "20"))


# Anything smaller than this is treated as a partial / failed download.
MIN_MODEL_BYTES = 10 * 1024 ** 3


MODEL_ALIAS = "gemma-4-26b"
llm = None   # populated by load_model() in a background thread

# ---------- model download ----------
def download_model():
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Reuse the cached file if it looks complete; otherwise re-download.
    if os.path.exists(MODEL_PATH):
        size = os.path.getsize(MODEL_PATH)
        if size >= MIN_MODEL_BYTES:
            print(f"[model] Cached model found ({size / 1e9:.2f} GB) – skipping download.", flush=True)
            return
        print(f"[model] Incomplete file detected ({size / 1e9:.2f} GB) – re-downloading...", flush=True)
        os.remove(MODEL_PATH)

    url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    tmp_path = MODEL_PATH + ".tmp"

    print("[model] Connecting to HuggingFace...", flush=True)

    with requests.get(url, stream=True, headers=headers, timeout=60) as r:
        r.raise_for_status()
        total = int(r.headers.get("content-length", 0))
        total_gb = total / (1024 ** 3)

        print(f"[model] Downloading {MODEL_FILE}", flush=True)
        print(f"[model] Total size : {total_gb:.2f} GB", flush=True)
        print(f"[model] Destination: {MODEL_PATH}", flush=True)
        print(f"[model] {'─' * 52}", flush=True)

        downloaded = 0
        last_step = -1
        chunk_size = 8 * 1024 * 1024
        start = time.monotonic()

        with open(tmp_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:
                    continue
                f.write(chunk)
                downloaded += len(chunk)

                # Print one progress line roughly every 5% (log-friendly, no carriage returns).
                if total > 0:
                    pct = downloaded / total * 100
                    step = int(pct) // 5
                    if step > last_step:
                        last_step = step
                        filled = step
                        empty = 20 - filled
                        bar = "█" * filled + "░" * empty
                        gb_done = downloaded / (1024 ** 3)
                        speed_mb = downloaded / max(time.monotonic() - start, 1e-9) / 1e6
                        print(
                            f"[model] |{bar}| {pct:5.1f}% "
                            f"{gb_done:.2f}/{total_gb:.2f} GB "
                            f"({speed_mb:.0f} MB/s)",
                            flush=True,
                        )

    # Move the finished download into place only after it completed.
    os.rename(tmp_path, MODEL_PATH)
    final_size = os.path.getsize(MODEL_PATH)
    print(f"[model] {'─' * 52}", flush=True)
    print(f"[model] Download complete! {final_size / 1e9:.2f} GB saved to {MODEL_PATH}", flush=True)


# ---------- model load ----------
def load_model():
    global llm
    from llama_cpp import Llama
    download_model()
    print(f"[model] Loading {MODEL_FILE} into RAM (ctx={N_CTX}, threads={N_THREADS})...", flush=True)
    llm = Llama(
        model_path = MODEL_PATH,
        n_ctx = N_CTX,
        n_threads = N_THREADS,
        n_batch = 512,
        n_gpu_layers = 0,      # CPU-only Space
        verbose = False,
        chat_format = None,    # use the chat template embedded in the GGUF metadata
    )
    print("[model] ✓ Gemma 4 26B ready!", flush=True)


# ---------- keep-alive self-ping ----------
async def self_ping_loop():
    # Hit our own /health every 25 minutes so the Space does not idle out.
    while True:
        await asyncio.sleep(25 * 60)
        if SPACE_URL:
            try:
                async with httpx.AsyncClient(timeout=15) as c:
                    r = await c.get(f"{SPACE_URL}/health")
                    print(f"[ping] {r.status_code}", flush=True)
            except Exception as e:
                print(f"[ping] failed: {e}", flush=True)


# ---------- FastAPI app ----------
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load the model in a daemon thread so /health keeps answering while the
    # ~11 GB download and load are still in progress.
    threading.Thread(target=load_model, daemon=True).start()
    asyncio.create_task(self_ping_loop())
    yield


app = FastAPI(title="Gemma 4 Coding API", lifespan=lifespan)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)


# ---------- request helpers ----------
def _check_model():
    if llm is None:
        raise HTTPException(
            503,
            detail="Model still loading – first boot downloads ~11 GB, wait ~5-10 min"
        )

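# Anthropic-style clients (including Claude Code) may send message content either
# as a plain string or as a list of typed blocks; an illustrative payload shape:
#   [{"type": "text", "text": "run the tests"},
#    {"type": "tool_result", "content": [{"type": "text", "text": "3 passed"}]}]
# _extract_text() flattens either form into a single prompt string for llama.cpp.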
def _extract_text(content) -> str:
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, dict):
                if block.get("type") == "text":
                    parts.append(block.get("text", ""))
                elif block.get("type") == "tool_result":
                    parts.append(_extract_text(block.get("content", "")))
            else:
                parts.append(str(block))
        return "".join(parts)
    return str(content)


# ---------- health ----------
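# Illustrative response while the download is still running:
#   {"status": "ok", "model_loaded": false,
#    "model": "gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf", "ctx": 4096}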
@app.get("/health")
async def health():
    return {
        "status": "ok",
        "model_loaded": llm is not None,
        "model": MODEL_FILE,
        "ctx": N_CTX,
    }


# ---------- OpenAI-compatible endpoints ----------
class OAIMessage(BaseModel):
    role: str
    content: Union[str, List[Any]]


class OAIRequest(BaseModel):
    model: str = MODEL_ALIAS
    messages: List[OAIMessage]
    temperature: float = DEFAULT_TEMP
    top_p: float = DEFAULT_TOP_P
    min_p: float = DEFAULT_MIN_P   # llama.cpp sampler extension, not part of the official OpenAI schema
    top_k: int = DEFAULT_TOP_K     # llama.cpp sampler extension
    max_tokens: int = 2048
    stream: bool = False
    stop: Optional[List[str]] = None


@app.get("/v1/models")
async def oai_models():
    return {
        "object": "list",
        "data": [{
            "id": MODEL_ALIAS,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "google-deepmind",
        }],
    }


@app.post("/v1/chat/completions")
async def oai_chat(req: OAIRequest):
    _check_model()
    msgs = [
        {"role": m.role, "content": _extract_text(m.content)}
        for m in req.messages
    ]
    kwargs = dict(
        messages = msgs,
        temperature = req.temperature,
        top_p = req.top_p,
        min_p = req.min_p,
        top_k = req.top_k,
        max_tokens = req.max_tokens,
        stop = req.stop,
    )

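    # Streamed responses go out as OpenAI-style SSE: one "chat.completion.chunk"
    # JSON object per delta line, terminated by a literal "data: [DONE]" sentinel.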
    if req.stream:
        async def gen():
            rid = f"chatcmpl-{uuid.uuid4().hex[:8]}"
            ts = int(time.time())
            for chunk in llm.create_chat_completion(**kwargs, stream=True):
                data = {
                    "id": rid,
                    "object": "chat.completion.chunk",
                    "created": ts,
                    "model": req.model,
                    "choices": [{
                        "index": 0,
                        "delta": chunk["choices"][0]["delta"],
                        "finish_reason": chunk["choices"][0]["finish_reason"],
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(gen(), media_type="text/event-stream")

    result = llm.create_chat_completion(**kwargs, stream=False)
    return JSONResponse(result)


# ---------- Anthropic-compatible endpoint (used by Claude Code) ----------
class AnthropicMessage(BaseModel):
    role: str
    content: Union[str, List[Dict]]


class AnthropicRequest(BaseModel):
    model: str = MODEL_ALIAS
    messages: List[AnthropicMessage]
    system: Optional[str] = None
    max_tokens: int = 2048
    temperature: float = DEFAULT_TEMP
    top_p: float = DEFAULT_TOP_P
    top_k: int = DEFAULT_TOP_K
    stream: bool = False
    stop_sequences: Optional[List[str]] = None


@app.post("/v1/messages")
async def anthropic_messages(req: AnthropicRequest):
    _check_model()
    msgs = []
    if req.system:
        msgs.append({"role": "system", "content": req.system})
    for m in req.messages:
        msgs.append({"role": m.role, "content": _extract_text(m.content)})

    kwargs = dict(
        messages = msgs,
        temperature = req.temperature,
        top_p = req.top_p,
        min_p = DEFAULT_MIN_P,   # Anthropic requests carry no min_p field, so use the server default
        top_k = req.top_k,
        max_tokens = req.max_tokens,
        stop = req.stop_sequences,
    )

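    # Minimal Anthropic-style SSE stream, in the event order Claude Code expects:
    #   message_start -> content_block_start -> content_block_delta (repeated)
    #   -> content_block_stop -> message_delta -> message_stop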
    if req.stream:
        async def gen():
            msg_id = f"msg_{uuid.uuid4().hex[:20]}"
            yield f"data: {json.dumps({'type':'message_start','message':{'id':msg_id,'type':'message','role':'assistant','content':[],'model':req.model,'stop_reason':None,'usage':{'input_tokens':0,'output_tokens':0}}})}\n\n"
            yield f"data: {json.dumps({'type':'content_block_start','index':0,'content_block':{'type':'text','text':''}})}\n\n"
            full = ""
            for chunk in llm.create_chat_completion(**kwargs, stream=True):
                dt = chunk["choices"][0]["delta"].get("content", "")
                if dt:
                    full += dt
                    yield f"data: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':dt}})}\n\n"
            yield f"data: {json.dumps({'type':'content_block_stop','index':0})}\n\n"
            # Word count is only a rough stand-in for the real output token count.
            yield f"data: {json.dumps({'type':'message_delta','delta':{'stop_reason':'end_turn','stop_sequence':None},'usage':{'output_tokens':len(full.split())}})}\n\n"
            yield f"data: {json.dumps({'type':'message_stop'})}\n\n"
        return StreamingResponse(
            gen(),
            media_type="text/event-stream",
            headers={"anthropic-version": "2023-06-01"},
        )

    result = llm.create_chat_completion(**kwargs, stream=False)
    text = result["choices"][0]["message"]["content"]
    usage = result.get("usage", {})
    return JSONResponse({
        "id": f"msg_{uuid.uuid4().hex[:20]}",
        "type": "message",
        "role": "assistant",
        "content": [{"type": "text", "text": text}],
        "model": req.model,
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    })

# ---------- landing page ----------
@app.get("/", response_class=HTMLResponse)
async def landing():
    sc = "#22c55e" if llm is not None else "#f59e0b"
    st = "Model ready" if llm is not None else "Loading model... (~5-10 min on first boot)"
    return LANDING_HTML.replace("{{SC}}", sc).replace("{{ST}}", st)

LANDING_HTML = r"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Gemma 4 26B Coding API</title>
<style>
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;background:#0d0d12;color:#e2e2ed;min-height:100vh;display:flex;flex-direction:column;align-items:center;padding:3.5rem 1.5rem 4rem}
h1{font-size:2.1rem;font-weight:700;background:linear-gradient(130deg,#818cf8 20%,#34d399 80%);-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:.35rem;letter-spacing:-.5px}
.tagline{color:#6b7280;font-size:.93rem;margin-bottom:2.5rem;text-align:center;line-height:1.5}
.badge{display:inline-flex;align-items:center;gap:.45rem;background:#151520;border:1px solid #2a2a3a;border-radius:999px;padding:.3rem .9rem;font-size:.8rem;margin:.25rem}
.dot{width:7px;height:7px;border-radius:50%;background:{{SC}};flex-shrink:0}
.badges{display:flex;flex-wrap:wrap;justify-content:center;margin-bottom:2.8rem}
.cards{display:grid;grid-template-columns:repeat(auto-fit,minmax(290px,1fr));gap:1.1rem;width:100%;max-width:920px;margin-bottom:2.8rem}
.card{background:#13131c;border:1px solid #252535;border-radius:14px;padding:1.3rem 1.5rem}
.card-title{font-size:.72rem;font-weight:600;text-transform:uppercase;letter-spacing:.1em;color:#6b7280;margin-bottom:.75rem}
pre{background:#090910;border:1px solid #1e1e2e;border-radius:9px;padding:.85rem 1rem;font-family:'JetBrains Mono','Fira Code',monospace;font-size:.78rem;color:#a5b4fc;line-height:1.65;overflow-x:auto;white-space:pre-wrap;word-break:break-all}
.ep-table{width:100%;max-width:920px;border-collapse:collapse;margin-bottom:2rem}
.ep-table thead th{font-size:.72rem;text-transform:uppercase;letter-spacing:.08em;color:#4b5563;padding:.5rem .8rem;border-bottom:1px solid #1e1e2e;text-align:left}
.ep-table tbody tr{border-bottom:1px solid #161622}
.ep-table tbody td{padding:.7rem .8rem;font-size:.84rem}
.method{display:inline-block;font-size:.68rem;font-weight:700;padding:.18rem .5rem;border-radius:5px;min-width:42px;text-align:center}
.get{background:#064e3b;color:#34d399}.post{background:#1e3a5f;color:#60a5fa}
.path{font-family:monospace;color:#e2e8f0;font-size:.85rem}
.note{font-size:.78rem;color:#4b5563}
.tip{background:#131a1f;border:1px solid #1d3040;border-radius:10px;padding:1rem 1.25rem;width:100%;max-width:920px;font-size:.82rem;color:#7dd3fc;line-height:1.6;margin-bottom:1.2rem}
footer{margin-top:2.5rem;font-size:.75rem;color:#374151;text-align:center;line-height:1.8}
</style>
</head>
<body>
<h1>Gemma 4 26B A4B</h1>
<p class="tagline">Coding-tuned · Anthropic & OpenAI compatible · HuggingFace Spaces</p>
<div class="badges">
  <span class="badge"><span class="dot"></span>{{ST}}</span>
  <span class="badge" style="color:#9ca3af">IQ3_XXS · 11.2 GB</span>
  <span class="badge" style="color:#9ca3af">ctx 4096 · 2 vCPU · 16 GB RAM</span>
  <span class="badge" style="color:#9ca3af">temp 0.3 · top-k 20 · min-p 0.1</span>
</div>
<div class="cards">
  <div class="card">
    <div class="card-title">Claude Code setup</div>
    <pre>export ANTHROPIC_BASE_URL=\
https://YOUR-USER-space-name.hf.space
export ANTHROPIC_API_KEY=gemma4-local

claude --model gemma-4-26b</pre>
  </div>
  <div class="card">
    <div class="card-title">OpenAI Python client</div>
    <pre>from openai import OpenAI
client = OpenAI(
    base_url="https://YOUR-SPACE.hf.space/v1",
    api_key="gemma4-local",
)
r = client.chat.completions.create(
    model="gemma-4-26b",
    messages=[{"role":"user",
               "content":"write binary search"}],
)</pre>
  </div>
  <div class="card">
    <div class="card-title">curl quick test</div>
    <pre>curl YOUR-SPACE.hf.space/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-4-26b",
    "messages": [
      {"role":"user","content":"hello"}
    ]
  }'</pre>
  </div>
</div>
<div class="tip">
  <strong>First boot:</strong> The model (~11.2 GB) downloads on first start – allow 5–10 min.
  Watch the container logs for a live progress bar.
  <code style="background:#0d1b26;padding:1px 5px;border-radius:4px">/health</code> returns
  <code style="background:#0d1b26;padding:1px 5px;border-radius:4px">model_loaded: false</code>
  until ready. Subsequent restarts load from disk in ~60 s.
</div>
<table class="ep-table">
  <thead><tr><th>Method</th><th>Path</th><th>Notes</th></tr></thead>
  <tbody>
    <tr><td><span class="method get">GET</span></td><td class="path">/health</td><td class="note">Status + model_loaded</td></tr>
    <tr><td><span class="method get">GET</span></td><td class="path">/v1/models</td><td class="note">Model list (OpenAI)</td></tr>
    <tr><td><span class="method post">POST</span></td><td class="path">/v1/chat/completions</td><td class="note">OpenAI-compatible · streaming supported</td></tr>
    <tr><td><span class="method post">POST</span></td><td class="path">/v1/messages</td><td class="note">Anthropic-compatible · used by Claude Code</td></tr>
  </tbody>
</table>
<footer>
  Gemma 4 26B A4B · unsloth UD-IQ3_XXS · llama-cpp-python + OpenBLAS<br>
  Self-pings /health every 25 min · April 2026
</footer>
</body>
</html>"""