axentx-deploy committed on
Commit 58bc994
1 Parent(s): c92f8a7

redeploy: simple gradio proxy

Files changed (3)
  1. README.md +7 -8
  2. app.py +46 -49
  3. requirements.txt +1 -3
README.md CHANGED
@@ -1,14 +1,13 @@
 ---
-title: axentx Coder Proxy 1
-emoji: 🦉
-colorFrom: indigo
-colorTo: purple
+title: axentx coder zero-gpu
+emoji: 🛠️
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.9.1
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
-short_description: HF Router proxy for Qwen3-Coder (no GPU needed)
+license: apache-2.0
 ---
 
-OpenAI-compatible proxy to HF Inference Router. Adds independent rate-limit
-bucket for the axentx pipeline.
+axentx LLM proxy Space. Routes /api/predict to HF Inference Router.
app.py CHANGED
@@ -1,62 +1,59 @@
-"""axentx coder-zero-gpu-1 — proxy to HF Inference Router for Qwen3-Coder.
 
-No model loading on Space (avoid GPU init issues). Just a thin wrapper
-that forwards to HF Router with the Space owner's token. Adds independent
-rate-limit bucket for the pipeline.
+"""axentx coder-zero-gpu — thin proxy to HF Inference Router.
+
+Why simple gradio? Previous attempts mixed FastAPI + gradio mount, which
+crashed on zero-a10g startup. This version is a pure gradio app; gradio
+auto-exposes a Predict API at /api/predict that we call from the LLM
+chain. ZeroGPU is not actually used (we don't load a model — just proxy),
+so any tier works.
 """
-import os, json, urllib.request
-from fastapi import FastAPI
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
+import json
+import os
+import urllib.request
 import gradio as gr
 
-HF_TOKEN = os.environ.get("HF_TOKEN", "")  # auto-set by Space
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
 MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
 
 
-def _call_hf_router(messages, max_tokens=1024, temperature=0.3):
+def chat(prompt: str, system: str = "", max_tokens: int = 1024,
+         temperature: float = 0.3) -> str:
+    messages = []
+    if system:
+        messages.append({"role": "system", "content": system})
+    messages.append({"role": "user", "content": prompt})
     body = json.dumps({
-        "model": MODEL, "messages": messages,
-        "max_tokens": max_tokens, "temperature": temperature,
+        "model": MODEL,
+        "messages": messages,
+        "max_tokens": int(max_tokens),
+        "temperature": float(temperature),
     }).encode()
     req = urllib.request.Request(
         "https://router.huggingface.co/v1/chat/completions",
         data=body, method="POST",
         headers={"Authorization": f"Bearer {HF_TOKEN}",
-                 "Content-Type": "application/json"},
-    )
-    with urllib.request.urlopen(req, timeout=60) as r:
-        return json.loads(r.read())
-
-
-app = FastAPI()
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
-
-
-class ChatRequest(BaseModel):
-    messages: list
-    max_tokens: int = 1024
-    temperature: float = 0.3
-    model: str = "axentx-coder-1"
-
-
-@app.post("/v1/chat/completions")
-def chat(req: ChatRequest):
-    return _call_hf_router(req.messages, req.max_tokens, req.temperature)
-
-
-@app.get("/health")
-def h():
-    return {"status": "ok", "backend": "hf-router", "model": MODEL}
-
-
-def _ui(message, history):
-    msgs = [{"role": h["role"], "content": h["content"]}
-            for h in (history or []) if h.get("role")]
-    msgs.append({"role": "user", "content": message})
-    r = _call_hf_router(msgs)
-    return r["choices"][0]["message"]["content"]
-
-
-demo = gr.ChatInterface(_ui, title=f"axentx Coder Proxy — {MODEL}", type="messages")
-app = gr.mount_gradio_app(app, demo, path="/")
+                 "Content-Type": "application/json"})
+    try:
+        with urllib.request.urlopen(req, timeout=60) as r:
+            d = json.loads(r.read())
+        return d["choices"][0]["message"]["content"]
+    except Exception as e:
+        return f"ERROR: {type(e).__name__}: {e}"
+
+
+# Simple gradio app — Space's auto-exposed API at /api/predict accepts
+# {data: [prompt, system, max_tokens, temperature]} and returns {data: [output]}
+demo = gr.Interface(
+    fn=chat,
+    inputs=[
+        gr.Textbox(label="prompt", lines=4),
+        gr.Textbox(label="system", lines=2, value=""),
+        gr.Number(label="max_tokens", value=1024),
+        gr.Number(label="temperature", value=0.3),
+    ],
+    outputs=gr.Textbox(label="output", lines=10),
+    title="axentx coder zero-gpu",
+    description="Thin proxy to HF Router. Use /api/predict from LLM chain.",
+)
+
+if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
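For reference, a minimal caller for the /api/predict contract described in the code comment above; this is a sketch, not part of the commit. The Space URL is a placeholder, and it assumes the pinned gradio 4.x build still serves the legacy POST /api/predict route (newer gradio versions route API calls through the queue at /call/predict instead).

# example_client.py: hypothetical caller, not part of this commit.
# SPACE_URL is a placeholder; the real hostname depends on the owner/Space name.
import json
import urllib.request

SPACE_URL = "https://example-user-axentx-coder-zero-gpu.hf.space"

def ask(prompt: str, system: str = "", max_tokens: int = 1024,
        temperature: float = 0.3) -> str:
    # The Space's auto-exposed API wraps positional inputs in {"data": [...]}.
    body = json.dumps({"data": [prompt, system, max_tokens, temperature]}).encode()
    req = urllib.request.Request(
        f"{SPACE_URL}/api/predict",
        data=body, method="POST",
        headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req, timeout=60) as r:
        # The response is {"data": [output]}, matching the comment in app.py.
        return json.loads(r.read())["data"][0]

if __name__ == "__main__":
    print(ask("Reverse a string in Python, one line."))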
requirements.txt CHANGED
@@ -1,4 +1,2 @@
-fastapi
-pydantic>=2
-gradio>=5.0.0
+gradio>=4.44,<5
 huggingface_hub>=0.25
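Callers that prefer not to hand-roll the HTTP contract can use gradio_client, which negotiates gradio's queue protocol automatically; it is a client-side dependency, so it does not belong in this requirements.txt. A sketch, assuming a hypothetical Space id and gr.Interface's default api_name of /predict:

# Hypothetical usage of gradio_client against the proxy Space
# (pip install gradio_client); the Space id is a placeholder.
from gradio_client import Client

client = Client("example-user/axentx-coder-zero-gpu")  # placeholder Space id
out = client.predict(
    "Write a haiku about proxies.",  # prompt
    "",                              # system
    1024,                            # max_tokens
    0.3,                             # temperature
    api_name="/predict",             # gr.Interface's default endpoint name
)
print(out)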