init
app.py ADDED
@@ -0,0 +1,381 @@
"""
HuggingFace Space — Gemma 4 26B A4B Coding API

Model : unsloth/gemma-4-26B-A4B-it-GGUF — UD-IQ3_XXS (11.2 GB)
RAM   : fits in 16 GB with ~4 GB left for KV cache at ctx=4096
Params: temp=0.3, top_p=0.9, min_p=0.1, top_k=20 (tuned for coding per reddit)

Endpoints
  GET  /                    → landing page
  GET  /health              → status (also used by the self-ping)
  GET  /v1/models           → OpenAI model list
  POST /v1/chat/completions → OpenAI-compatible
  POST /v1/messages         → Anthropic-compatible — Claude Code uses this
"""

import os, json, time, uuid, asyncio, threading
from contextlib import asynccontextmanager
from typing import Optional, List, Union, Any, Dict

import httpx
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# ── Config ────────────────────────────────────────────────────────────────────
MODEL_REPO = os.getenv("MODEL_REPO", "unsloth/gemma-4-26B-A4B-it-GGUF")
MODEL_FILE = os.getenv("MODEL_FILE", "gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf")
MODEL_DIR  = "/app/models"
MODEL_PATH = f"{MODEL_DIR}/{MODEL_FILE}"
SPACE_URL  = os.getenv("SPACE_URL", "")  # public URL of this Space, used by the self-ping

# Context 4096 keeps the KV cache ≤ 2 GB — safe alongside the 11.2 GB model on 16 GB RAM
N_CTX     = int(os.getenv("N_CTX", "4096"))
N_THREADS = int(os.getenv("N_THREADS", "2"))

# Coding-optimised sampling defaults (the OP's settings from the reddit thread)
DEFAULT_TEMP  = float(os.getenv("DEFAULT_TEMP", "0.3"))
DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.9"))
DEFAULT_MIN_P = float(os.getenv("DEFAULT_MIN_P", "0.1"))
DEFAULT_TOP_K = int(os.getenv("DEFAULT_TOP_K", "20"))

MODEL_ALIAS = "gemma-4-26b"
llm = None

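# Every knob above reads from the environment first, so it can be tuned per
# deployment without touching code — e.g. via the Space's variable settings
# (values below are illustrative only, not recommendations):
#   N_CTX=8192  N_THREADS=4  DEFAULT_TEMP=0.2  DEFAULT_TOP_K=40
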
# ── Model download + load ─────────────────────────────────────────────────────
def download_model():
    from huggingface_hub import hf_hub_download
    os.makedirs(MODEL_DIR, exist_ok=True)
    if not os.path.exists(MODEL_PATH):
        print(f"[model] Downloading {MODEL_FILE} (~11.2 GB)...")
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            local_dir=MODEL_DIR,
        )
        print("[model] Download complete.")

def load_model():
    global llm
    from llama_cpp import Llama
    download_model()
    print("[model] Loading Gemma 4 26B IQ3_XXS into RAM...")
    llm = Llama(
        model_path   = MODEL_PATH,
        n_ctx        = N_CTX,
        n_threads    = N_THREADS,
        n_batch      = 512,
        n_gpu_layers = 0,     # HF free tier is CPU-only
        verbose      = False,
        chat_format  = None,  # auto-detect from GGUF metadata (Gemma 4 template)
    )
    print(f"[model] Gemma 4 26B ready — ctx={N_CTX}, threads={N_THREADS}")

# ── Self-ping ─────────────────────────────────────────────────────────────────
async def self_ping_loop():
    while True:
        await asyncio.sleep(25 * 60)
        if SPACE_URL:
            try:
                async with httpx.AsyncClient(timeout=15) as c:
                    r = await c.get(f"{SPACE_URL}/health")
                    print(f"[ping] {r.status_code}")
            except Exception as e:
                print(f"[ping] failed: {e}")

# ── App ───────────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app: FastAPI):
    threading.Thread(target=load_model, daemon=True).start()
    asyncio.create_task(self_ping_loop())
    yield

app = FastAPI(title="Gemma 4 Coding API", lifespan=lifespan)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)

# ── Helpers ───────────────────────────────────────────────────────────────────
def _check_model():
    if llm is None:
        raise HTTPException(
            503,
            detail="Model still loading — first boot downloads ~11 GB, wait ~5-10 min"
        )

def _extract_text(content) -> str:
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, dict):
                if block.get("type") == "text":
                    parts.append(block.get("text", ""))
                elif block.get("type") == "tool_result":
                    parts.append(_extract_text(block.get("content", "")))
            else:
                parts.append(str(block))
        return "".join(parts)
    return str(content)

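# Illustrative example of the flattening above (not executed anywhere):
#   _extract_text([{"type": "text", "text": "ran tests "},
#                  {"type": "tool_result", "content": "exit code 0"}])
#   -> "ran tests exit code 0"
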
# ── Health ────────────────────────────────────────────────────────────────────
@app.get("/health")
async def health():
    return {
        "status": "ok",
        "model_loaded": llm is not None,
        "model": MODEL_FILE,
        "ctx": N_CTX,
    }

# ── OpenAI-compatible /v1/chat/completions ────────────────────────────────────
class OAIMessage(BaseModel):
    role: str
    content: Union[str, List[Any]]

class OAIRequest(BaseModel):
    model: str = MODEL_ALIAS
    messages: List[OAIMessage]
    temperature: float = DEFAULT_TEMP
    top_p: float = DEFAULT_TOP_P
    min_p: float = DEFAULT_MIN_P
    top_k: int = DEFAULT_TOP_K
    max_tokens: int = 2048
    stream: bool = False
    stop: Optional[List[str]] = None

@app.get("/v1/models")
async def oai_models():
    return {
        "object": "list",
        "data": [{
            "id": MODEL_ALIAS,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "google-deepmind",
        }],
    }

@app.post("/v1/chat/completions")
async def oai_chat(req: OAIRequest):
    _check_model()
    msgs = [
        {"role": m.role, "content": _extract_text(m.content)}
        for m in req.messages
    ]
    kwargs = dict(
        messages    = msgs,
        temperature = req.temperature,
        top_p       = req.top_p,
        min_p       = req.min_p,
        top_k       = req.top_k,
        max_tokens  = req.max_tokens,
        stop        = req.stop,
    )

    if req.stream:
        async def gen():
            rid = f"chatcmpl-{uuid.uuid4().hex[:8]}"
            ts = int(time.time())
            for chunk in llm.create_chat_completion(**kwargs, stream=True):
                data = {
                    "id": rid,
                    "object": "chat.completion.chunk",
                    "created": ts,
                    "model": req.model,
                    "choices": [{
                        "index": 0,
                        "delta": chunk["choices"][0]["delta"],
                        "finish_reason": chunk["choices"][0]["finish_reason"],
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(gen(), media_type="text/event-stream")

    result = llm.create_chat_completion(**kwargs, stream=False)
    return JSONResponse(result)

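# Rough smoke test for the endpoint above, once /health reports model_loaded
# (a sketch — assumes the usual Spaces port 7860 when run locally):
#   import httpx
#   r = httpx.post("http://localhost:7860/v1/chat/completions",
#                  json={"model": "gemma-4-26b",
#                        "messages": [{"role": "user", "content": "hello"}]})
#   print(r.json()["choices"][0]["message"]["content"])
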
# ── Anthropic-compatible /v1/messages (Claude Code) ───────────────────────────
class AnthropicMessage(BaseModel):
    role: str
    content: Union[str, List[Dict]]

class AnthropicRequest(BaseModel):
    model: str = MODEL_ALIAS
    messages: List[AnthropicMessage]
    system: Optional[str] = None
    max_tokens: int = 2048
    temperature: float = DEFAULT_TEMP
    top_p: float = DEFAULT_TOP_P
    top_k: int = DEFAULT_TOP_K
    stream: bool = False
    stop_sequences: Optional[List[str]] = None

@app.post("/v1/messages")
async def anthropic_messages(req: AnthropicRequest):
    _check_model()
    msgs = []
    if req.system:
        msgs.append({"role": "system", "content": req.system})
    for m in req.messages:
        msgs.append({"role": m.role, "content": _extract_text(m.content)})

    kwargs = dict(
        messages    = msgs,
        temperature = req.temperature,
        top_p       = req.top_p,
        min_p       = DEFAULT_MIN_P,  # always apply min_p for coding accuracy
        top_k       = req.top_k,
        max_tokens  = req.max_tokens,
        stop        = req.stop_sequences,
    )

    if req.stream:
        async def gen():
            msg_id = f"msg_{uuid.uuid4().hex[:20]}"
            yield f"data: {json.dumps({'type':'message_start','message':{'id':msg_id,'type':'message','role':'assistant','content':[],'model':req.model,'stop_reason':None,'usage':{'input_tokens':0,'output_tokens':0}}})}\n\n"
            yield f"data: {json.dumps({'type':'content_block_start','index':0,'content_block':{'type':'text','text':''}})}\n\n"
            full = ""
            for chunk in llm.create_chat_completion(**kwargs, stream=True):
                dt = chunk["choices"][0]["delta"].get("content", "")
                if dt:
                    full += dt
                    yield f"data: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':dt}})}\n\n"
            yield f"data: {json.dumps({'type':'content_block_stop','index':0})}\n\n"
            yield f"data: {json.dumps({'type':'message_delta','delta':{'stop_reason':'end_turn','stop_sequence':None},'usage':{'output_tokens':len(full.split())}})}\n\n"
            yield f"data: {json.dumps({'type':'message_stop'})}\n\n"
        return StreamingResponse(
            gen(),
            media_type="text/event-stream",
            headers={"anthropic-version": "2023-06-01"},
        )

    result = llm.create_chat_completion(**kwargs, stream=False)
    text = result["choices"][0]["message"]["content"]
    usage = result.get("usage", {})
    return JSONResponse({
        "id": f"msg_{uuid.uuid4().hex[:20]}",
        "type": "message",
        "role": "assistant",
        "content": [{"type": "text", "text": text}],
        "model": req.model,
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    })

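# The streaming branch above emits the minimal Anthropic SSE sequence:
#   message_start → content_block_start → content_block_delta (×N)
#   → content_block_stop → message_delta → message_stop
# Note: output_tokens in message_delta is a whitespace word count, not a true
# token count — close enough for clients that only display it.
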
# ── Landing page ──────────────────────────────────────────────────────────────
@app.get("/", response_class=HTMLResponse)
async def landing():
    sc = "#22c55e" if llm is not None else "#f59e0b"
    st = "Model ready" if llm is not None else "Loading model... (~5-10 min on first boot)"
    return LANDING_HTML.replace("{{SC}}", sc).replace("{{ST}}", st)

LANDING_HTML = r"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Gemma 4 26B Coding API</title>
<style>
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;background:#0d0d12;color:#e2e2ed;min-height:100vh;display:flex;flex-direction:column;align-items:center;padding:3.5rem 1.5rem 4rem}
h1{font-size:2.1rem;font-weight:700;background:linear-gradient(130deg,#818cf8 20%,#34d399 80%);-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:.35rem;letter-spacing:-.5px}
.tagline{color:#6b7280;font-size:.93rem;margin-bottom:2.5rem;text-align:center;line-height:1.5}
.badge{display:inline-flex;align-items:center;gap:.45rem;background:#151520;border:1px solid #2a2a3a;border-radius:999px;padding:.3rem .9rem;font-size:.8rem;margin:.25rem}
.dot{width:7px;height:7px;border-radius:50%;background:{{SC}};flex-shrink:0}
.badges{display:flex;flex-wrap:wrap;justify-content:center;margin-bottom:2.8rem}
.cards{display:grid;grid-template-columns:repeat(auto-fit,minmax(290px,1fr));gap:1.1rem;width:100%;max-width:920px;margin-bottom:2.8rem}
.card{background:#13131c;border:1px solid #252535;border-radius:14px;padding:1.3rem 1.5rem}
.card-title{font-size:.72rem;font-weight:600;text-transform:uppercase;letter-spacing:.1em;color:#6b7280;margin-bottom:.75rem}
pre{background:#090910;border:1px solid #1e1e2e;border-radius:9px;padding:.85rem 1rem;font-family:'JetBrains Mono','Fira Code',monospace;font-size:.78rem;color:#a5b4fc;line-height:1.65;overflow-x:auto;white-space:pre-wrap;word-break:break-all}
.ep-table{width:100%;max-width:920px;border-collapse:collapse;margin-bottom:2rem}
.ep-table thead th{font-size:.72rem;text-transform:uppercase;letter-spacing:.08em;color:#4b5563;padding:.5rem .8rem;border-bottom:1px solid #1e1e2e;text-align:left}
.ep-table tbody tr{border-bottom:1px solid #161622}
.ep-table tbody td{padding:.7rem .8rem;font-size:.84rem}
.method{display:inline-block;font-size:.68rem;font-weight:700;padding:.18rem .5rem;border-radius:5px;min-width:42px;text-align:center}
.get{background:#064e3b;color:#34d399}.post{background:#1e3a5f;color:#60a5fa}
.path{font-family:monospace;color:#e2e8f0;font-size:.85rem}
.note{font-size:.78rem;color:#4b5563}
.tip{background:#131a1f;border:1px solid #1d3040;border-radius:10px;padding:1rem 1.25rem;width:100%;max-width:920px;font-size:.82rem;color:#7dd3fc;line-height:1.6;margin-bottom:1.2rem}
footer{margin-top:2.5rem;font-size:.75rem;color:#374151;text-align:center;line-height:1.8}
</style>
</head>
<body>
<h1>Gemma 4 26B A4B</h1>
<p class="tagline">Coding-tuned &middot; Anthropic &amp; OpenAI compatible &middot; HuggingFace Spaces</p>

<div class="badges">
<span class="badge"><span class="dot"></span>{{ST}}</span>
<span class="badge" style="color:#9ca3af">IQ3_XXS &middot; 11.2 GB</span>
<span class="badge" style="color:#9ca3af">ctx 4096 &middot; 2 vCPU &middot; 16 GB RAM</span>
<span class="badge" style="color:#9ca3af">temp 0.3 &middot; top-k 20 &middot; min-p 0.1</span>
</div>

<div class="cards">
<div class="card">
<div class="card-title">Claude Code setup</div>
<pre>export ANTHROPIC_BASE_URL=\
https://YOUR-USER-space-name.hf.space
export ANTHROPIC_API_KEY=gemma4-local

claude --model gemma-4-26b</pre>
</div>
<div class="card">
<div class="card-title">OpenAI Python client</div>
<pre>from openai import OpenAI
client = OpenAI(
    base_url="https://YOUR-SPACE.hf.space/v1",
    api_key="gemma4-local",
)
r = client.chat.completions.create(
    model="gemma-4-26b",
    messages=[{"role":"user",
               "content":"write binary search"}],
)</pre>
</div>
<div class="card">
<div class="card-title">curl quick test</div>
<pre>curl YOUR-SPACE.hf.space/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-4-26b",
    "messages": [
      {"role":"user",
       "content":"hello"}
    ]
  }'</pre>
</div>
</div>

<div class="tip">
<strong>First boot:</strong> The model (~11.2 GB) downloads from HuggingFace on first start &mdash; allow 5&ndash;10 min.
<code style="background:#0d1b26;padding:1px 5px;border-radius:4px">/health</code> returns
<code style="background:#0d1b26;padding:1px 5px;border-radius:4px">model_loaded: false</code>
until ready. Subsequent restarts load from disk in ~60 s. The Space self-pings every 25 min to prevent sleep.
</div>

<table class="ep-table">
<thead><tr><th>Method</th><th>Path</th><th>Notes</th></tr></thead>
<tbody>
<tr><td><span class="method get">GET</span></td><td class="path">/health</td><td class="note">Status + model_loaded</td></tr>
<tr><td><span class="method get">GET</span></td><td class="path">/v1/models</td><td class="note">Model list (OpenAI)</td></tr>
<tr><td><span class="method post">POST</span></td><td class="path">/v1/chat/completions</td><td class="note">OpenAI-compatible &middot; streaming supported</td></tr>
<tr><td><span class="method post">POST</span></td><td class="path">/v1/messages</td><td class="note">Anthropic-compatible &middot; used by Claude Code</td></tr>
</tbody>
</table>

<footer>
Gemma 4 26B A4B &middot; unsloth UD-IQ3_XXS &middot; llama-cpp-python + OpenBLAS<br>
Self-pings /health every 25 min &middot; April 2026
</footer>
</body>
</html>"""
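
# Local dev entry point — on Spaces the container's start command launches the
# server, so this block is a convenience sketch only (port 7860 is the usual
# HF Spaces convention; adjust for your setup):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)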