axentx-deploy
redeploy: simple gradio proxy
2cfc50a
"""axentx coder-zero-gpu — thin proxy to HF Inference Router.
Why simple gradio? Previous attempts mixed FastAPI + gradio mount which
crashed on zero-a10g startup. This version is pure gradio app — gradio
auto-exposes a Predict API at /api/predict that we call from the LLM
chain. ZeroGPU not actually used (we don't load a model — just proxy)
so any tier works.
"""
import json
import os
import urllib.request
import gradio as gr
# HF_TOKEN must be set as a Space secret; router requests fail without it.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Router model id, overridable via the MODEL_ID env var.
MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
def chat(prompt: str, system: str = "", max_tokens: int = 1024,
         temperature: float = 0.3) -> str:
    """Send one chat-completion request to the HF Router and return the text."""
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    # OpenAI-compatible payload; gr.Number delivers floats, so cast explicitly.
    body = json.dumps({
        "model": MODEL,
        "messages": messages,
        "max_tokens": int(max_tokens),
        "temperature": float(temperature),
    }).encode()
    req = urllib.request.Request(
        "https://router.huggingface.co/v1/chat/completions",
        data=body, method="POST",
        headers={"Authorization": f"Bearer {HF_TOKEN}",
                 "Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=60) as r:
            d = json.loads(r.read())
            # Standard chat-completions response shape:
            # {"choices": [{"message": {"content": "..."}}], ...}
            return d["choices"][0]["message"]["content"]
    except Exception as e:
        # Surface the error as text so the caller sees it instead of a 500.
        return f"ERROR: {type(e).__name__}: {e}"
# Simple gradio app. The Space's auto-exposed API at /api/predict accepts
# {"data": [prompt, system, max_tokens, temperature]} and returns
# {"data": [output]}; a client sketch is at the bottom of this file.
demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(label="prompt", lines=4),
        gr.Textbox(label="system", lines=2, value=""),
        gr.Number(label="max_tokens", value=1024),
        gr.Number(label="temperature", value=0.3),
    ],
    outputs=gr.Textbox(label="output", lines=10),
    title="axentx coder zero-gpu",
    description="Thin proxy to HF Router. Use /api/predict from LLM chain.",
)
if __name__ == "__main__":
demo.queue().launch(server_name="0.0.0.0", server_port=7860)
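
# Client sketch, not executed by the app: one way the LLM chain could call
# this Space over the /api/predict contract described above. The Space URL
# is hypothetical, and newer gradio releases route API calls through
# /call/<api_name> (or gradio_client) instead, so verify the route against
# the deployed gradio version.
#
#   import json, urllib.request
#   SPACE = "https://example-axentx-coder.hf.space"  # hypothetical URL
#   payload = json.dumps(
#       {"data": ["write fizzbuzz in python", "", 512, 0.2]}).encode()
#   req = urllib.request.Request(
#       f"{SPACE}/api/predict", data=payload, method="POST",
#       headers={"Content-Type": "application/json"})
#   with urllib.request.urlopen(req, timeout=120) as r:
#       print(json.loads(r.read())["data"][0])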