tusarway committed on
Commit
7bf4f43
Β·
verified Β·
1 Parent(s): 4f20ce0
Files changed (1) hide show
  1. app.py +381 -0
app.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Space β€” Gemma 4 26B A4B Coding API
3
+ Model : unsloth/gemma-4-26B-A4B-it-GGUF β†’ UD-IQ3_XXS (11.2 GB)
4
+ RAM : fits in 16 GB with ~4 GB left for KV cache at ctx=4096
5
+ Params: temp=0.3, top_p=0.9, min_p=0.1, top_k=20 (tuned for coding per reddit)
6
+
7
+ Endpoints
8
+ GET / β†’ landing page
9
+ GET /health β†’ status (also used by self-ping)
10
+ GET /v1/models β†’ OpenAI model list
11
+ POST /v1/chat/completions β†’ OpenAI-compatible
12
+ POST /v1/messages β†’ Anthropic-compatible ← Claude Code uses this
13
+ """
14
+
15
+ import os, json, time, uuid, asyncio, threading
16
+ from contextlib import asynccontextmanager
17
+ from typing import Optional, List, Union, Any, Dict
18
+
19
+ import httpx
20
+ from fastapi import FastAPI, HTTPException
21
+ from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
22
+ from fastapi.middleware.cors import CORSMiddleware
23
+ from pydantic import BaseModel
24
+
25
# ── Config ────────────────────────────────────────────────────────────────────
# Which GGUF to serve; both overridable via env for easy quant swaps.
MODEL_REPO = os.getenv("MODEL_REPO", "unsloth/gemma-4-26B-A4B-it-GGUF")
MODEL_FILE = os.getenv("MODEL_FILE", "gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf")
MODEL_DIR = "/app/models"                 # download target inside the Space container
MODEL_PATH = f"{MODEL_DIR}/{MODEL_FILE}"
SPACE_URL = os.getenv("SPACE_URL", "")    # public URL used by the self-ping keep-alive

# Context 4096 keeps KV cache ≤2 GB — safe with 11.2 GB model on 16 GB RAM
N_CTX = int(os.getenv("N_CTX", "4096"))
N_THREADS = int(os.getenv("N_THREADS", "2"))

# Coding-optimised defaults (OP's settings from reddit thread)
DEFAULT_TEMP = float(os.getenv("DEFAULT_TEMP", "0.3"))
DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.9"))
DEFAULT_MIN_P = float(os.getenv("DEFAULT_MIN_P", "0.1"))
DEFAULT_TOP_K = int(os.getenv("DEFAULT_TOP_K", "20"))

MODEL_ALIAS = "gemma-4-26b"   # model id exposed on both API surfaces
llm = None                    # set by load_model() once weights are in RAM
44
+
45
+ # ── Model download + load ─────────────────────────────────────────────────────
46
def download_model():
    """Fetch the GGUF weights from the Hub unless they are already on disk."""
    from huggingface_hub import hf_hub_download

    os.makedirs(MODEL_DIR, exist_ok=True)
    if os.path.exists(MODEL_PATH):
        return  # cached from a previous boot — nothing to do
    print(f"[model] Downloading {MODEL_FILE} (~11.2 GB)...")
    hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR)
    print("[model] Download complete.")
57
+
58
def load_model():
    """Download the weights if needed, then publish the model via the global `llm`."""
    global llm
    from llama_cpp import Llama

    download_model()
    print("[model] Loading Gemma 4 26B IQ3_XXS into RAM...")
    model = Llama(
        model_path=MODEL_PATH,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        n_batch=512,
        n_gpu_layers=0,    # HF free tier is CPU-only
        verbose=False,
        chat_format=None,  # auto-detect from GGUF metadata (Gemma 4 template)
    )
    # Assign only after full construction so /health flips to loaded atomically.
    llm = model
    print(f"[model] Gemma 4 26B ready — ctx={N_CTX}, threads={N_THREADS}")
73
+
74
+ # ── Self-ping ─────────────────────────────────────────────────────────────────
75
async def self_ping_loop():
    """Hit our own /health every 25 minutes so the Space never idles out."""
    while True:
        await asyncio.sleep(25 * 60)
        if not SPACE_URL:
            continue  # no public URL configured — nothing to ping
        try:
            async with httpx.AsyncClient(timeout=15) as client:
                resp = await client.get(f"{SPACE_URL}/health")
                print(f"[ping] {resp.status_code}")
        except Exception as exc:
            print(f"[ping] failed: {exc}")
85
+
86
+ # ── App ───────────────────────────────────────────────────────────────────────
87
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Start model loading and the keep-alive pinger; stop the pinger on shutdown.

    The model load runs in a daemon thread so the server can answer /health
    immediately. The ping task reference MUST be kept: the event loop holds
    only a weak reference to tasks, so a discarded create_task() result can
    be garbage-collected mid-flight.
    """
    threading.Thread(target=load_model, daemon=True).start()
    ping_task = asyncio.create_task(self_ping_loop())
    try:
        yield
    finally:
        # Cancel the pinger cleanly instead of leaving it to die with the loop.
        ping_task.cancel()
        try:
            await ping_task
        except asyncio.CancelledError:
            pass
92
+
93
# FastAPI application; lifespan kicks off model loading and the self-ping task.
app = FastAPI(title="Gemma 4 Coding API", lifespan=lifespan)
# Wide-open CORS so browser clients on any origin can call the API.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True relies
# on the middleware echoing the request Origin — confirm this combination is
# intended; credentials are not actually used by this key-less API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
101
+
102
+ # ── Helpers ───────────────────────────────────────────────────────────────────
103
def _check_model():
    """Raise HTTP 503 while the model is still downloading or loading."""
    if llm is not None:
        return
    raise HTTPException(
        503,
        detail="Model still loading — first boot downloads ~11 GB, wait ~5-10 min",
    )
109
+
110
+ def _extract_text(content) -> str:
111
+ if isinstance(content, str):
112
+ return content
113
+ if isinstance(content, list):
114
+ parts = []
115
+ for block in content:
116
+ if isinstance(block, dict):
117
+ if block.get("type") == "text":
118
+ parts.append(block.get("text", ""))
119
+ elif block.get("type") == "tool_result":
120
+ parts.append(_extract_text(block.get("content", "")))
121
+ else:
122
+ parts.append(str(block))
123
+ return "".join(parts)
124
+ return str(content)
125
+
126
+ # ── Health ────────────────────────────────────────────────────────────────────
127
@app.get("/health")
async def health():
    """Liveness probe (also the self-ping target); reports model-load state."""
    loaded = llm is not None
    return {
        "status": "ok",
        "model_loaded": loaded,
        "model": MODEL_FILE,
        "ctx": N_CTX,
    }
135
+
136
+ # ══ OpenAI-compatible /v1/chat/completions ══════════════════════════════════
137
class OAIMessage(BaseModel):
    """One OpenAI-format chat message; content may be a string or a block list."""
    role: str
    content: Union[str, List[Any]]
140
+
141
class OAIRequest(BaseModel):
    """Body of POST /v1/chat/completions (the subset this server honours).

    Sampling defaults come from the env-configurable coding-tuned constants.
    """
    model: str = MODEL_ALIAS
    messages: List[OAIMessage]
    temperature: float = DEFAULT_TEMP
    top_p: float = DEFAULT_TOP_P
    min_p: float = DEFAULT_MIN_P
    top_k: int = DEFAULT_TOP_K
    max_tokens: int = 2048
    stream: bool = False
    stop: Optional[List[str]] = None
151
+
152
@app.get("/v1/models")
async def oai_models():
    """OpenAI-style model listing — a single aliased entry."""
    entry = {
        "id": MODEL_ALIAS,
        "object": "model",
        "created": int(time.time()),
        "owned_by": "google-deepmind",
    }
    return {"object": "list", "data": [entry]}
163
+
164
@app.post("/v1/chat/completions")
async def oai_chat(req: OAIRequest):
    """OpenAI-compatible chat completion (streaming and non-streaming).

    llama.cpp inference is CPU-bound and blocking; it is pushed onto worker
    threads with asyncio.to_thread so a long generation does not stall the
    event loop (which would freeze /health and the keep-alive self-ping).
    """
    _check_model()
    msgs = [
        {"role": m.role, "content": _extract_text(m.content)}
        for m in req.messages
    ]
    kwargs = dict(
        messages=msgs,
        temperature=req.temperature,
        top_p=req.top_p,
        min_p=req.min_p,
        top_k=req.top_k,
        max_tokens=req.max_tokens,
        stop=req.stop,
    )

    if req.stream:
        async def gen():
            rid = f"chatcmpl-{uuid.uuid4().hex[:8]}"
            ts = int(time.time())
            done = object()  # sentinel: underlying generator exhausted
            it = iter(llm.create_chat_completion(**kwargs, stream=True))
            while True:
                # next() performs the actual (blocking) token step — keep it off the loop.
                chunk = await asyncio.to_thread(next, it, done)
                if chunk is done:
                    break
                data = {
                    "id": rid,
                    "object": "chat.completion.chunk",
                    "created": ts,
                    "model": req.model,
                    "choices": [{
                        "index": 0,
                        "delta": chunk["choices"][0]["delta"],
                        "finish_reason": chunk["choices"][0]["finish_reason"],
                    }],
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(gen(), media_type="text/event-stream")

    result = await asyncio.to_thread(llm.create_chat_completion, **kwargs, stream=False)
    return JSONResponse(result)
203
+
204
+ # ══ Anthropic-compatible /v1/messages (Claude Code) ═══════════════════════
205
class AnthropicMessage(BaseModel):
    """One Anthropic-format message; content may be a string or content blocks."""
    role: str
    content: Union[str, List[Dict]]
208
+
209
class AnthropicRequest(BaseModel):
    """Body of POST /v1/messages (the subset Claude Code sends).

    NOTE(review): the real Anthropic API also allows `system` to be a list of
    content blocks; here it is typed as a plain string — confirm callers.
    """
    model: str = MODEL_ALIAS
    messages: List[AnthropicMessage]
    system: Optional[str] = None
    max_tokens: int = 2048
    temperature: float = DEFAULT_TEMP
    top_p: float = DEFAULT_TOP_P
    top_k: int = DEFAULT_TOP_K
    stream: bool = False
    stop_sequences: Optional[List[str]] = None
219
+
220
@app.post("/v1/messages")
async def anthropic_messages(req: AnthropicRequest):
    """Anthropic-compatible messages endpoint (the protocol Claude Code speaks).

    Blocking llama.cpp calls are pushed onto worker threads with
    asyncio.to_thread so concurrent requests and the self-ping keep being
    served while tokens are generated.
    """
    _check_model()
    msgs = []
    if req.system:
        # _extract_text is a no-op for the current str type but also tolerates
        # block lists should the field ever be widened to match Anthropic's API.
        msgs.append({"role": "system", "content": _extract_text(req.system)})
    for m in req.messages:
        msgs.append({"role": m.role, "content": _extract_text(m.content)})

    kwargs = dict(
        messages=msgs,
        temperature=req.temperature,
        top_p=req.top_p,
        min_p=DEFAULT_MIN_P,  # always apply min_p for coding accuracy
        top_k=req.top_k,
        max_tokens=req.max_tokens,
        stop=req.stop_sequences,
    )

    if req.stream:
        async def gen():
            msg_id = f"msg_{uuid.uuid4().hex[:20]}"
            yield f"data: {json.dumps({'type':'message_start','message':{'id':msg_id,'type':'message','role':'assistant','content':[],'model':req.model,'stop_reason':None,'usage':{'input_tokens':0,'output_tokens':0}}})}\n\n"
            yield f"data: {json.dumps({'type':'content_block_start','index':0,'content_block':{'type':'text','text':''}})}\n\n"
            full = ""
            done = object()  # sentinel: underlying generator exhausted
            it = iter(llm.create_chat_completion(**kwargs, stream=True))
            while True:
                # next() performs the actual (blocking) token step — keep it off the loop.
                chunk = await asyncio.to_thread(next, it, done)
                if chunk is done:
                    break
                dt = chunk["choices"][0]["delta"].get("content", "")
                if dt:
                    full += dt
                    yield f"data: {json.dumps({'type':'content_block_delta','index':0,'delta':{'type':'text_delta','text':dt}})}\n\n"
            yield f"data: {json.dumps({'type':'content_block_stop','index':0})}\n\n"
            # NOTE(review): output_tokens is a whitespace-split estimate, not a real token count.
            yield f"data: {json.dumps({'type':'message_delta','delta':{'stop_reason':'end_turn','stop_sequence':None},'usage':{'output_tokens':len(full.split())}})}\n\n"
            yield f"data: {json.dumps({'type':'message_stop'})}\n\n"
        return StreamingResponse(
            gen(),
            media_type="text/event-stream",
            headers={"anthropic-version": "2023-06-01"},
        )

    result = await asyncio.to_thread(llm.create_chat_completion, **kwargs, stream=False)
    text = result["choices"][0]["message"]["content"]
    usage = result.get("usage", {})
    return JSONResponse({
        "id": f"msg_{uuid.uuid4().hex[:20]}",
        "type": "message",
        "role": "assistant",
        "content": [{"type": "text", "text": text}],
        "model": req.model,
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "usage": {
            "input_tokens": usage.get("prompt_tokens", 0),
            "output_tokens": usage.get("completion_tokens", 0),
        },
    })
275
+
276
+ # ══ Landing page ══════════════════════════════════════════════════════════════
277
@app.get("/", response_class=HTMLResponse)
async def landing():
    """Render the static landing page with the live model-status badge."""
    if llm is not None:
        colour, status = "#22c55e", "Model ready"
    else:
        colour, status = "#f59e0b", "Loading model... (~5-10 min on first boot)"
    return LANDING_HTML.replace("{{SC}}", colour).replace("{{ST}}", status)
282
+
283
# Static landing-page template. Placeholders {{SC}} (status dot colour) and
# {{ST}} (status text) are substituted by landing() at request time.
LANDING_HTML = r"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Gemma 4 26B Coding API</title>
<style>
*{box-sizing:border-box;margin:0;padding:0}
body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;background:#0d0d12;color:#e2e2ed;min-height:100vh;display:flex;flex-direction:column;align-items:center;padding:3.5rem 1.5rem 4rem}
h1{font-size:2.1rem;font-weight:700;background:linear-gradient(130deg,#818cf8 20%,#34d399 80%);-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:.35rem;letter-spacing:-.5px}
.tagline{color:#6b7280;font-size:.93rem;margin-bottom:2.5rem;text-align:center;line-height:1.5}
.badge{display:inline-flex;align-items:center;gap:.45rem;background:#151520;border:1px solid #2a2a3a;border-radius:999px;padding:.3rem .9rem;font-size:.8rem;margin:.25rem}
.dot{width:7px;height:7px;border-radius:50%;background:{{SC}};flex-shrink:0}
.badges{display:flex;flex-wrap:wrap;justify-content:center;margin-bottom:2.8rem}
.cards{display:grid;grid-template-columns:repeat(auto-fit,minmax(290px,1fr));gap:1.1rem;width:100%;max-width:920px;margin-bottom:2.8rem}
.card{background:#13131c;border:1px solid #252535;border-radius:14px;padding:1.3rem 1.5rem}
.card-title{font-size:.72rem;font-weight:600;text-transform:uppercase;letter-spacing:.1em;color:#6b7280;margin-bottom:.75rem}
pre{background:#090910;border:1px solid #1e1e2e;border-radius:9px;padding:.85rem 1rem;font-family:'JetBrains Mono','Fira Code',monospace;font-size:.78rem;color:#a5b4fc;line-height:1.65;overflow-x:auto;white-space:pre-wrap;word-break:break-all}
.ep-table{width:100%;max-width:920px;border-collapse:collapse;margin-bottom:2rem}
.ep-table thead th{font-size:.72rem;text-transform:uppercase;letter-spacing:.08em;color:#4b5563;padding:.5rem .8rem;border-bottom:1px solid #1e1e2e;text-align:left}
.ep-table tbody tr{border-bottom:1px solid #161622}
.ep-table tbody td{padding:.7rem .8rem;font-size:.84rem}
.method{display:inline-block;font-size:.68rem;font-weight:700;padding:.18rem .5rem;border-radius:5px;min-width:42px;text-align:center}
.get{background:#064e3b;color:#34d399}.post{background:#1e3a5f;color:#60a5fa}
.path{font-family:monospace;color:#e2e8f0;font-size:.85rem}
.note{font-size:.78rem;color:#4b5563}
.tip{background:#131a1f;border:1px solid #1d3040;border-radius:10px;padding:1rem 1.25rem;width:100%;max-width:920px;font-size:.82rem;color:#7dd3fc;line-height:1.6;margin-bottom:1.2rem}
footer{margin-top:2.5rem;font-size:.75rem;color:#374151;text-align:center;line-height:1.8}
</style>
</head>
<body>
<h1>Gemma 4 26B A4B</h1>
<p class="tagline">Coding-tuned · Anthropic &amp; OpenAI compatible · HuggingFace Spaces</p>

<div class="badges">
<span class="badge"><span class="dot"></span>{{ST}}</span>
<span class="badge" style="color:#9ca3af">IQ3_XXS · 11.2 GB</span>
<span class="badge" style="color:#9ca3af">ctx 4096 · 2 vCPU · 16 GB RAM</span>
<span class="badge" style="color:#9ca3af">temp 0.3 · top-k 20 · min-p 0.1</span>
</div>

<div class="cards">
<div class="card">
<div class="card-title">Claude Code setup</div>
<pre>export ANTHROPIC_BASE_URL=\
https://YOUR-USER-space-name.hf.space
export ANTHROPIC_API_KEY=gemma4-local

claude --model gemma-4-26b</pre>
</div>
<div class="card">
<div class="card-title">OpenAI Python client</div>
<pre>from openai import OpenAI
client = OpenAI(
base_url="https://YOUR-SPACE.hf.space/v1",
api_key="gemma4-local",
)
r = client.chat.completions.create(
model="gemma-4-26b",
messages=[{"role":"user",
"content":"write binary search"}],
)</pre>
</div>
<div class="card">
<div class="card-title">curl quick test</div>
<pre>curl YOUR-SPACE.hf.space/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gemma-4-26b",
"messages": [
{"role":"user",
"content":"hello"}
]
}'</pre>
</div>
</div>

<div class="tip">
<strong>First boot:</strong> The model (~11.2 GB) downloads from HuggingFace on first start — allow 5–10 min.
<code style="background:#0d1b26;padding:1px 5px;border-radius:4px">/health</code> returns
<code style="background:#0d1b26;padding:1px 5px;border-radius:4px">model_loaded: false</code>
until ready. Subsequent restarts load from disk in ~60 s. Self-pings every 25 min to prevent sleep.
</div>

<table class="ep-table">
<thead><tr><th>Method</th><th>Path</th><th>Notes</th></tr></thead>
<tbody>
<tr><td><span class="method get">GET</span></td><td class="path">/health</td><td class="note">Status + model_loaded</td></tr>
<tr><td><span class="method get">GET</span></td><td class="path">/v1/models</td><td class="note">Model list (OpenAI)</td></tr>
<tr><td><span class="method post">POST</span></td><td class="path">/v1/chat/completions</td><td class="note">OpenAI-compatible · streaming supported</td></tr>
<tr><td><span class="method post">POST</span></td><td class="path">/v1/messages</td><td class="note">Anthropic-compatible · used by Claude Code</td></tr>
</tbody>
</table>

<footer>
Gemma 4 26B A4B · unsloth UD-IQ3_XXS · llama-cpp-python + OpenBLAS<br>
Self-pings /health every 25 min · April 2026
</footer>
</body>
</html>"""