"""axentx coder-zero-gpu — thin proxy to HF Inference Router.

Why simple gradio? Previous attempts mixed FastAPI + gradio mount which
crashed on zero-a10g startup. This version is a pure gradio app — gradio
auto-exposes a Predict API at /api/predict that we call from the LLM chain.

ZeroGPU is not actually used (we don't load a model — just proxy) so any
tier works.
"""
import json
import os
import urllib.error
import urllib.request

import gradio as gr

# Both values are read once at import time from the environment.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")

# OpenAI-compatible chat-completions endpoint that every request is sent to.
ROUTER_URL = "https://router.huggingface.co/v1/chat/completions"

# Upper bound on how much of an HTTP error body is echoed back to the caller.
_MAX_ERROR_DETAIL = 500


def chat(prompt: str, system: str = "", max_tokens: int = 1024,
         temperature: float = 0.3) -> str:
    """Send one chat-completion request to the router and return the reply.

    Args:
        prompt: User message content.
        system: Optional system message; omitted from the request when empty.
        max_tokens: Completion token limit; coerced with ``int()``.
        temperature: Sampling temperature; coerced with ``float()``.

    Returns:
        The ``content`` of the first choice in the response. On any failure
        (bad parameter values, network/HTTP errors, unexpected response
        shape) a string starting with ``"ERROR: "`` is returned instead of
        raising, so the caller always gets a textual result.
    """
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    try:
        # Coercion lives inside the error boundary: gr.Number yields None
        # when the field is cleared, and int(None) would otherwise escape
        # as an unhandled exception instead of an "ERROR: ..." string.
        body = json.dumps({
            "model": MODEL,
            "messages": messages,
            "max_tokens": int(max_tokens),
            "temperature": float(temperature),
        }).encode()
        req = urllib.request.Request(
            ROUTER_URL,
            data=body,
            method="POST",
            headers={
                "Authorization": f"Bearer {HF_TOKEN}",
                "Content-Type": "application/json",
            },
        )
        with urllib.request.urlopen(req, timeout=60) as r:
            d = json.loads(r.read())
        return d["choices"][0]["message"]["content"]
    except urllib.error.HTTPError as e:
        # str(e) only carries the status line; the response body usually
        # holds the actual reason, so append it (truncated) when readable.
        message = f"ERROR: {type(e).__name__}: {e}"
        try:
            detail = e.read().decode("utf-8", errors="replace").strip()
        except Exception:
            # Best effort only: an unreadable body must not mask the error.
            detail = ""
        if detail:
            return f"{message} | {detail[:_MAX_ERROR_DETAIL]}"
        return message
    except Exception as e:
        # Top-level boundary: report every other failure as text.
        return f"ERROR: {type(e).__name__}: {e}"


# Simple gradio app — Space's auto-exposed API at /api/predict accepts
# {data: [prompt, system, max_tokens, temperature]} and returns {data:[output]}
demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(label="prompt", lines=4),
        gr.Textbox(label="system", lines=2, value=""),
        gr.Number(label="max_tokens", value=1024),
        gr.Number(label="temperature", value=0.3),
    ],
    outputs=gr.Textbox(label="output", lines=10),
    title="axentx coder zero-gpu",
    description="Thin proxy to HF Router. Use /api/predict from LLM chain.",
)


if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)