""" HuggingFace Space — Gemma 4 26B A4B Coding API Model : unsloth/gemma-4-26B-A4B-it-GGUF → UD-IQ3_XXS (11.2 GB) RAM : fits in 16 GB with ~4 GB left for KV cache at ctx=4096 Params: temp=0.3, top_p=0.9, min_p=0.1, top_k=20 (tuned for coding per reddit) Endpoints GET / → landing page GET /health → status (also used by self-ping) GET /v1/models → OpenAI model list POST /v1/chat/completions → OpenAI-compatible POST /v1/messages → Anthropic-compatible ← Claude Code uses this """ import os, sys, json, time, uuid, asyncio, threading, requests from contextlib import asynccontextmanager from typing import Optional, List, Union, Any, Dict import httpx from fastapi import FastAPI, HTTPException from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel # ── Config ──────────────────────────────────────────────────────────────────── MODEL_REPO = os.getenv("MODEL_REPO", "unsloth/gemma-4-26B-A4B-it-GGUF") MODEL_FILE = os.getenv("MODEL_FILE", "gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf") MODEL_DIR = "/app/models" MODEL_PATH = f"{MODEL_DIR}/{MODEL_FILE}" SPACE_URL = os.getenv("SPACE_URL", "") HF_TOKEN = os.getenv("HF_TOKEN", "") N_CTX = int(os.getenv("N_CTX", "4096")) N_THREADS = int(os.getenv("N_THREADS", "2")) DEFAULT_TEMP = float(os.getenv("DEFAULT_TEMP", "0.3")) DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.9")) DEFAULT_MIN_P = float(os.getenv("DEFAULT_MIN_P", "0.1")) DEFAULT_TOP_K = int(os.getenv("DEFAULT_TOP_K", "20")) # Minimum expected size for a complete model file (10 GB safety margin) MIN_MODEL_BYTES = 10 * 1024 ** 3 MODEL_ALIAS = "gemma-4-26b" llm = None # ── Model download ──────────────────────────────────────────────────────────── def download_model(): os.makedirs(MODEL_DIR, exist_ok=True) # Check for existing complete file if os.path.exists(MODEL_PATH): size = os.path.getsize(MODEL_PATH) if size >= MIN_MODEL_BYTES: print(f"[model] Cached model found ({size / 1e9:.2f} GB) — skipping download.", flush=True) return print(f"[model] Incomplete file detected ({size / 1e9:.2f} GB) — re-downloading...", flush=True) os.remove(MODEL_PATH) url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}" headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {} tmp_path = MODEL_PATH + ".tmp" print(f"[model] Connecting to HuggingFace...", flush=True) with requests.get(url, stream=True, headers=headers, timeout=60) as r: r.raise_for_status() total = int(r.headers.get("content-length", 0)) total_gb = total / (1024 ** 3) print(f"[model] Downloading {MODEL_FILE}", flush=True) print(f"[model] Total size : {total_gb:.2f} GB", flush=True) print(f"[model] Destination: {MODEL_PATH}", flush=True) print(f"[model] {'─' * 52}", flush=True) downloaded = 0 last_step = -1 # tracks which 5%-band was last printed chunk_size = 8 * 1024 * 1024 # 8 MB chunks with open(tmp_path, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): if not chunk: continue f.write(chunk) downloaded += len(chunk) if total > 0: pct = downloaded / total * 100 step = int(pct) // 5 # 0–20 if step > last_step: last_step = step filled = step empty = 20 - filled bar = "█" * filled + "░" * empty gb_done = downloaded / (1024 ** 3) speed_mb = (downloaded / (time.monotonic() + 1e-9)) / 1e6 print( f"[model] |{bar}| {pct:5.1f}% " f"{gb_done:.2f}/{total_gb:.2f} GB", flush=True, ) # Atomic rename — avoids half-written files on crash/restart os.rename(tmp_path, MODEL_PATH) final_size = os.path.getsize(MODEL_PATH) print(f"[model] {'─' 
* 52}", flush=True) print(f"[model] Download complete! {final_size / 1e9:.2f} GB saved to {MODEL_PATH}", flush=True) # ── Model load ──────────────────────────────────────────────────────────────── def load_model(): global llm from llama_cpp import Llama download_model() print(f"[model] Loading {MODEL_FILE} into RAM (ctx={N_CTX}, threads={N_THREADS})...", flush=True) llm = Llama( model_path = MODEL_PATH, n_ctx = N_CTX, n_threads = N_THREADS, n_batch = 512, n_gpu_layers = 0, verbose = False, chat_format = None, ) print(f"[model] ✓ Gemma 4 26B ready!", flush=True) # ── Self-ping ───────────────────────────────────────────────────────────────── async def self_ping_loop(): while True: await asyncio.sleep(25 * 60) if SPACE_URL: try: async with httpx.AsyncClient(timeout=15) as c: r = await c.get(f"{SPACE_URL}/health") print(f"[ping] {r.status_code}", flush=True) except Exception as e: print(f"[ping] failed: {e}", flush=True) # ── App ─────────────────────────────────────────────────────────────────────── @asynccontextmanager async def lifespan(app: FastAPI): threading.Thread(target=load_model, daemon=True).start() asyncio.create_task(self_ping_loop()) yield app = FastAPI(title="Gemma 4 Coding API", lifespan=lifespan) app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"], allow_credentials=True, ) # ── Helpers ─────────────────────────────────────────────────────────────────── def _check_model(): if llm is None: raise HTTPException( 503, detail="Model still loading — first boot downloads ~11 GB, wait ~5-10 min" ) def _extract_text(content) -> str: if isinstance(content, str): return content if isinstance(content, list): parts = [] for block in content: if isinstance(block, dict): if block.get("type") == "text": parts.append(block.get("text", "")) elif block.get("type") == "tool_result": parts.append(_extract_text(block.get("content", ""))) else: parts.append(str(block)) return "".join(parts) return str(content) # ── Health ──────────────────────────────────────────────────────────────────── @app.get("/health") async def health(): return { "status": "ok", "model_loaded": llm is not None, "model": MODEL_FILE, "ctx": N_CTX, } # ══ OpenAI-compatible /v1/chat/completions ══════════════════════════════════ class OAIMessage(BaseModel): role: str content: Union[str, List[Any]] class OAIRequest(BaseModel): model: str = MODEL_ALIAS messages: List[OAIMessage] temperature: float = DEFAULT_TEMP top_p: float = DEFAULT_TOP_P min_p: float = DEFAULT_MIN_P top_k: int = DEFAULT_TOP_K max_tokens: int = 2048 stream: bool = False stop: Optional[List[str]] = None @app.get("/v1/models") async def oai_models(): return { "object": "list", "data": [{ "id": MODEL_ALIAS, "object": "model", "created": int(time.time()), "owned_by": "google-deepmind", }], } @app.post("/v1/chat/completions") async def oai_chat(req: OAIRequest): _check_model() msgs = [ {"role": m.role, "content": _extract_text(m.content)} for m in req.messages ] kwargs = dict( messages = msgs, temperature = req.temperature, top_p = req.top_p, min_p = req.min_p, top_k = req.top_k, max_tokens = req.max_tokens, stop = req.stop, ) if req.stream: async def gen(): rid = f"chatcmpl-{uuid.uuid4().hex[:8]}" ts = int(time.time()) for chunk in llm.create_chat_completion(**kwargs, stream=True): data = { "id": rid, "object": "chat.completion.chunk", "created": ts, "model": req.model, "choices": [{ "index": 0, "delta": chunk["choices"][0]["delta"], "finish_reason": chunk["choices"][0]["finish_reason"], }], } yield f"data: 

# ══ Anthropic-compatible /v1/messages (Claude Code) ═══════════════════════════
class AnthropicMessage(BaseModel):
    role: str
    content: Union[str, List[Dict]]

class AnthropicRequest(BaseModel):
    model: str = MODEL_ALIAS
    messages: List[AnthropicMessage]
    # Anthropic allows `system` as a plain string or a list of content blocks;
    # accept both so block-style system prompts from Claude Code still parse.
    system: Optional[Union[str, List[Dict]]] = None
    max_tokens: int = 2048
    temperature: float = DEFAULT_TEMP
    top_p: float = DEFAULT_TOP_P
    top_k: int = DEFAULT_TOP_K
    stream: bool = False
    stop_sequences: Optional[List[str]] = None

@app.post("/v1/messages")
async def anthropic_messages(req: AnthropicRequest):
    _check_model()
    msgs = []
    if req.system:
        msgs.append({"role": "system", "content": _extract_text(req.system)})
    for m in req.messages:
        msgs.append({"role": m.role, "content": _extract_text(m.content)})
    kwargs = dict(
        messages    = msgs,
        temperature = req.temperature,
        top_p       = req.top_p,
        min_p       = DEFAULT_MIN_P,
        top_k       = req.top_k,
        max_tokens  = req.max_tokens,
        stop        = req.stop_sequences,
    )
    if req.stream:
        async def gen():
            msg_id = f"msg_{uuid.uuid4().hex[:20]}"
            yield f"data: {json.dumps({'type': 'message_start', 'message': {'id': msg_id, 'type': 'message', 'role': 'assistant', 'content': [], 'model': req.model, 'stop_reason': None, 'usage': {'input_tokens': 0, 'output_tokens': 0}}})}\n\n"
            yield f"data: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
            full = ""
            for chunk in llm.create_chat_completion(**kwargs, stream=True):
                dt = chunk["choices"][0]["delta"].get("content", "")
                if dt:
                    full += dt
                    yield f"data: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': dt}})}\n\n"
            yield f"data: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
            # Streamed chunks carry no token counts, so the whitespace-split
            # word count stands in as a rough output_tokens approximation
            yield f"data: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn', 'stop_sequence': None}, 'usage': {'output_tokens': len(full.split())}})}\n\n"
            yield f"data: {json.dumps({'type': 'message_stop'})}\n\n"
        return StreamingResponse(
            gen(),
            media_type="text/event-stream",
            headers={"anthropic-version": "2023-06-01"},
        )

    result = llm.create_chat_completion(**kwargs, stream=False)
    text = result["choices"][0]["message"]["content"]
    usage = result.get("usage", {})
    return JSONResponse({
        "id": f"msg_{uuid.uuid4().hex[:20]}",
        "type": "message",
        "role": "assistant",
        "content": [{"type": "text", "text": text}],
        "model": req.model,
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    })
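
# Example request against /v1/messages (the hostname is a placeholder):
#
#   curl https://YOUR-SPACE.hf.space/v1/messages \
#     -H "Content-Type: application/json" \
#     -d '{"model": "gemma-4-26b", "max_tokens": 256,
#          "messages": [{"role": "user", "content": "hello"}]}'
#
# Streaming responses follow the Anthropic SSE event order emitted above:
# message_start → content_block_start → content_block_delta* →
# content_block_stop → message_delta → message_stop.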

# ══ Landing page ══════════════════════════════════════════════════════════════
@app.get("/", response_class=HTMLResponse)
async def landing():
    sc = "#22c55e" if llm is not None else "#f59e0b"
    st = "Model ready" if llm is not None else "Loading model... (~5-10 min on first boot)"
    return LANDING_HTML.replace("{{SC}}", sc).replace("{{ST}}", st)

LANDING_HTML = r"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Gemma 4 26B Coding API</title>
<style>
  body { font-family: system-ui, sans-serif; max-width: 720px; margin: 2rem auto; padding: 0 1rem; }
  pre  { background: #f4f4f5; padding: .75rem; border-radius: 6px; overflow-x: auto; }
  table { border-collapse: collapse; width: 100%; }
  th, td { border: 1px solid #e4e4e7; padding: .4rem .6rem; text-align: left; }
  .dot { color: {{SC}}; }
</style>
</head>
<body>
<h1>Gemma 4 26B A4B</h1>
<p>Coding-tuned · Anthropic &amp; OpenAI compatible · HuggingFace Spaces</p>
<p>
  <span class="dot">●</span> {{ST}} ·
  IQ3_XXS · 11.2 GB ·
  ctx 4096 · 2 vCPU · 16 GB RAM ·
  temp 0.3 · top-k 20 · min-p 0.1
</p>

<h2>Claude Code setup</h2>
<pre>export ANTHROPIC_BASE_URL=\
  https://YOUR-USER-space-name.hf.space
export ANTHROPIC_API_KEY=gemma4-local

claude --model gemma-4-26b</pre>

<h2>OpenAI Python client</h2>
<pre>from openai import OpenAI
client = OpenAI(
  base_url="https://YOUR-SPACE.hf.space/v1",
  api_key="gemma4-local",
)
r = client.chat.completions.create(
  model="gemma-4-26b",
  messages=[{"role":"user",
    "content":"write binary search"}],
)</pre>

<h2>curl quick test</h2>
<pre>curl YOUR-SPACE.hf.space/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-4-26b",
    "messages": [
      {"role":"user","content":"hello"}
    ]
  }'</pre>

<p><strong>First boot:</strong> The model (~11.2 GB) downloads on first start — allow 5–10 min.
Watch the container logs for a live progress bar. <code>/health</code> returns
<code>model_loaded: false</code> until ready. Subsequent restarts load from disk in ~60 s.</p>

<table>
  <tr><th>Method</th><th>Path</th><th>Notes</th></tr>
  <tr><td>GET</td><td>/health</td><td>Status + model_loaded</td></tr>
  <tr><td>GET</td><td>/v1/models</td><td>Model list (OpenAI)</td></tr>
  <tr><td>POST</td><td>/v1/chat/completions</td><td>OpenAI-compatible · streaming supported</td></tr>
  <tr><td>POST</td><td>/v1/messages</td><td>Anthropic-compatible · used by Claude Code</td></tr>
</table>
</body>
</html>
"""