"""axentx coder-zero-gpu-1 — proxy to HF Inference Router for Qwen3-Coder.

No model loading on Space (avoid GPU init issues). Just a thin wrapper that
forwards to HF Router with the Space owner's token. Adds independent
rate-limit bucket for the pipeline.
"""
import json
import os
import urllib.error
import urllib.request

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import gradio as gr

HF_TOKEN = os.environ.get("HF_TOKEN", "")  # auto-set by Space
MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")

# OpenAI-compatible chat-completions endpoint every request is forwarded to.
ROUTER_URL = "https://router.huggingface.co/v1/chat/completions"


def _call_hf_router(messages, max_tokens=1024, temperature=0.3):
    """Forward a chat-completion request to the HF Router and return its reply.

    Args:
        messages: List of chat messages, sent to the router unchanged.
        max_tokens: Value forwarded as ``max_tokens``.
        temperature: Value forwarded as ``temperature``.

    Returns:
        The router's response body parsed from JSON.

    Raises:
        urllib.error.HTTPError: The router answered with an error status.
        urllib.error.URLError: The router could not be reached.
        ValueError: The response body was not valid JSON.
    """
    body = json.dumps({
        "model": MODEL,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }).encode()
    req = urllib.request.Request(
        ROUTER_URL,
        data=body,
        method="POST",
        headers={
            "Authorization": f"Bearer {HF_TOKEN}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=60) as r:
        return json.loads(r.read())


app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


class ChatRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""

    messages: list
    max_tokens: int = 1024
    temperature: float = 0.3
    # Accepted from the client but never forwarded: the upstream model is
    # always the module-level MODEL.
    model: str = "axentx-coder-1"


@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    """Proxy a chat-completion request to the HF Router.

    Upstream failures are translated into HTTP errors instead of surfacing
    as an opaque 500: an error status from the router is relayed with its
    own status code and body, while transport failures and undecodable
    responses become 502.
    """
    try:
        return _call_hf_router(req.messages, req.max_tokens, req.temperature)
    except urllib.error.HTTPError as err:
        # HTTPError must be handled before OSError (it is a subclass).
        detail = err.read().decode("utf-8", errors="replace")
        raise HTTPException(status_code=err.code, detail=detail) from err
    except OSError as err:
        # Covers URLError (DNS/connection failures) and socket timeouts.
        raise HTTPException(
            status_code=502, detail=f"HF Router unreachable: {err}"
        ) from err
    except ValueError as err:
        raise HTTPException(
            status_code=502, detail="HF Router returned invalid JSON"
        ) from err


@app.get("/health")
def h():
    """Report liveness together with the configured backend and model."""
    return {"status": "ok", "backend": "hf-router", "model": MODEL}


def _ui(message, history):
    """Gradio chat callback: replay ``history``, add ``message``, return reply.

    History entries without a truthy ``role`` are skipped. Upstream failures
    are re-raised as ``gr.Error`` so the chat UI shows a readable message.
    """
    msgs = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in (history or [])
        if turn.get("role")
    ]
    msgs.append({"role": "user", "content": message})
    try:
        r = _call_hf_router(msgs)
    except urllib.error.HTTPError as err:
        raise gr.Error(f"HF Router error {err.code}: {err.reason}") from err
    except OSError as err:
        raise gr.Error(f"HF Router unreachable: {err}") from err
    except ValueError as err:
        raise gr.Error("HF Router returned invalid JSON") from err
    return r["choices"][0]["message"]["content"]


demo = gr.ChatInterface(
    _ui,
    title=f"axentx Coder Proxy — {MODEL}",
    type="messages",
)
app = gr.mount_gradio_app(app, demo, path="/")