"""axentx coder-zero-gpu-1 — proxy to HF Inference Router for Qwen3-Coder.
No model loading on Space (avoid GPU init issues). Just a thin wrapper
that forwards to HF Router with the Space owner's token. Adds independent
rate-limit bucket for the pipeline.
"""
import os
import json
import urllib.request

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import gradio as gr

HF_TOKEN = os.environ.get("HF_TOKEN", "")  # set automatically by the Space
MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")

def _call_hf_router(messages, max_tokens=1024, temperature=0.3):
    """POST an OpenAI-style chat request to the HF Router and return its JSON."""
    body = json.dumps({
        "model": MODEL, "messages": messages,
        "max_tokens": max_tokens, "temperature": temperature,
    }).encode()
    req = urllib.request.Request(
        "https://router.huggingface.co/v1/chat/completions",
        data=body, method="POST",
        headers={"Authorization": f"Bearer {HF_TOKEN}",
                 "Content-Type": "application/json"},
    )
    # Router errors (HTTP 4xx/5xx) surface here as urllib.error.HTTPError.
    with urllib.request.urlopen(req, timeout=60) as r:
        return json.loads(r.read())
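
# For reference, the router's reply is OpenAI-compatible; the only fields this
# file relies on are:
#   {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}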

app = FastAPI()
# Allow cross-origin calls from any client.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
)

class ChatRequest(BaseModel):
    messages: list
    max_tokens: int = 1024
    temperature: float = 0.3
    model: str = "axentx-coder-1"  # ignored; the router call always uses MODEL

@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    """OpenAI-compatible chat endpoint; returns the router's response verbatim."""
    return _call_hf_router(req.messages, req.max_tokens, req.temperature)
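
# Hypothetical client sketch (not run by the Space itself) showing the request
# shape the endpoint above accepts; SPACE_URL is a placeholder for the deployed
# Space's URL:
#
#   import json, urllib.request
#   req = urllib.request.Request(
#       f"{SPACE_URL}/v1/chat/completions",
#       data=json.dumps({"messages": [{"role": "user", "content": "hi"}]}).encode(),
#       headers={"Content-Type": "application/json"}, method="POST",
#   )
#   print(json.load(urllib.request.urlopen(req)))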

@app.get("/health")
def health():
    return {"status": "ok", "backend": "hf-router", "model": MODEL}

def _ui(message, history):
    """Gradio chat handler: replay the messages-format history, append the new turn."""
    msgs = [{"role": h["role"], "content": h["content"]}
            for h in (history or []) if h.get("role")]
    msgs.append({"role": "user", "content": message})
    r = _call_hf_router(msgs)
    return r["choices"][0]["message"]["content"]

demo = gr.ChatInterface(_ui, title=f"axentx Coder Proxy — {MODEL}", type="messages")
app = gr.mount_gradio_app(app, demo, path="/")
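
# Because the endpoint mirrors the OpenAI chat-completions shape, any
# OpenAI-compatible client should work as well; a sketch, assuming the standard
# `openai` package (not a dependency of this Space), with "<space>.hf.space" as
# a placeholder host:
#
#   from openai import OpenAI
#   client = OpenAI(base_url="https://<space>.hf.space/v1", api_key="unused")
#   out = client.chat.completions.create(
#       model="axentx-coder-1",
#       messages=[{"role": "user", "content": "Refactor this loop."}],
#   )
#   print(out.choices[0].message.content)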