axentx-deploy committed on
Commit 58bc994
1 Parent(s): c92f8a7

redeploy: simple gradio proxy

Files changed (3)
  1. README.md +7 -8
  2. app.py +46 -49
  3. requirements.txt +1 -3
README.md CHANGED
@@ -1,14 +1,13 @@
 ---
-title: axentx Coder Proxy 1
-emoji: 🦉
-colorFrom: indigo
-colorTo: purple
+title: axentx coder zero-gpu
+emoji: 🛠️
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.9.1
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
-short_description: HF Router proxy for Qwen3-Coder (no GPU needed)
+license: apache-2.0
 ---
 
-OpenAI-compatible proxy to HF Inference Router. Adds independent rate-limit
-bucket for the axentx pipeline.
+axentx LLM proxy Space. Routes /api/predict to HF Inference Router.
app.py CHANGED
@@ -1,62 +1,59 @@
-"""axentx coder-zero-gpu-1 — proxy to HF Inference Router for Qwen3-Coder.
 
-No model loading on Space (avoid GPU init issues). Just a thin wrapper
-that forwards to HF Router with the Space owner's token. Adds independent
-rate-limit bucket for the pipeline.
+"""axentx coder-zero-gpu — thin proxy to HF Inference Router.
+
+Why simple gradio? Previous attempts mixed FastAPI + gradio mount, which
+crashed on zero-a10g startup. This version is a pure gradio app; gradio
+auto-exposes a Predict API at /api/predict that we call from the LLM
+chain. ZeroGPU is not actually used (we don't load a model — just proxy),
+so any tier works.
 """
-import os, json, urllib.request
-from fastapi import FastAPI
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
+import json
+import os
+import urllib.request
 import gradio as gr
 
-HF_TOKEN = os.environ.get("HF_TOKEN", "")  # auto-set by Space
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
 MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
 
 
-def _call_hf_router(messages, max_tokens=1024, temperature=0.3):
+def chat(prompt: str, system: str = "", max_tokens: int = 1024,
+         temperature: float = 0.3) -> str:
+    messages = []
+    if system:
+        messages.append({"role": "system", "content": system})
+    messages.append({"role": "user", "content": prompt})
     body = json.dumps({
-        "model": MODEL, "messages": messages,
-        "max_tokens": max_tokens, "temperature": temperature,
+        "model": MODEL,
+        "messages": messages,
+        "max_tokens": int(max_tokens),
+        "temperature": float(temperature),
     }).encode()
     req = urllib.request.Request(
         "https://router.huggingface.co/v1/chat/completions",
         data=body, method="POST",
         headers={"Authorization": f"Bearer {HF_TOKEN}",
-                 "Content-Type": "application/json"},
-    )
-    with urllib.request.urlopen(req, timeout=60) as r:
-        return json.loads(r.read())
-
-
-app = FastAPI()
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
-
-
-class ChatRequest(BaseModel):
-    messages: list
-    max_tokens: int = 1024
-    temperature: float = 0.3
-    model: str = "axentx-coder-1"
-
-
-@app.post("/v1/chat/completions")
-def chat(req: ChatRequest):
-    return _call_hf_router(req.messages, req.max_tokens, req.temperature)
-
-
-@app.get("/health")
-def h():
-    return {"status": "ok", "backend": "hf-router", "model": MODEL}
-
-
-def _ui(message, history):
-    msgs = [{"role": h["role"], "content": h["content"]}
-            for h in (history or []) if h.get("role")]
-    msgs.append({"role": "user", "content": message})
-    r = _call_hf_router(msgs)
-    return r["choices"][0]["message"]["content"]
-
-
-demo = gr.ChatInterface(_ui, title=f"axentx Coder Proxy — {MODEL}", type="messages")
-app = gr.mount_gradio_app(app, demo, path="/")
+                 "Content-Type": "application/json"})
+    try:
+        with urllib.request.urlopen(req, timeout=60) as r:
+            d = json.loads(r.read())
+        return d["choices"][0]["message"]["content"]
+    except Exception as e:
+        return f"ERROR: {type(e).__name__}: {e}"
+
+
+# Simple gradio app — Space's auto-exposed API at /api/predict accepts
+# {data: [prompt, system, max_tokens, temperature]} and returns {data: [output]}
+demo = gr.Interface(
+    fn=chat,
+    inputs=[
+        gr.Textbox(label="prompt", lines=4),
+        gr.Textbox(label="system", lines=2, value=""),
+        gr.Number(label="max_tokens", value=1024),
+        gr.Number(label="temperature", value=0.3),
+    ],
+    outputs=gr.Textbox(label="output", lines=10),
+    title="axentx coder zero-gpu",
+    description="Thin proxy to HF Router. Use /api/predict from LLM chain.",
+)
+
+if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
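For reference, a minimal caller for the /api/predict contract described in the code comment above; this is a sketch, not part of the commit. The Space URL is a placeholder, and it assumes the pinned gradio 4.x build still serves the legacy POST /api/predict route (newer gradio versions route API calls through the queue at /call/predict instead).

# example_client.py: hypothetical caller, not part of this commit.
# SPACE_URL is a placeholder; the real hostname depends on the owner/Space name.
import json
import urllib.request

SPACE_URL = "https://example-user-axentx-coder-zero-gpu.hf.space"

def ask(prompt: str, system: str = "", max_tokens: int = 1024,
        temperature: float = 0.3) -> str:
    # The Space's auto-exposed API wraps positional inputs in {"data": [...]}.
    body = json.dumps({"data": [prompt, system, max_tokens, temperature]}).encode()
    req = urllib.request.Request(
        f"{SPACE_URL}/api/predict",
        data=body, method="POST",
        headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req, timeout=60) as r:
        # The response is {"data": [output]}, matching the comment in app.py.
        return json.loads(r.read())["data"][0]

if __name__ == "__main__":
    print(ask("Reverse a string in Python, one line."))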
requirements.txt CHANGED
@@ -1,4 +1,2 @@
-fastapi
-pydantic>=2
-gradio>=5.0.0
+gradio>=4.44,<5
 huggingface_hub>=0.25
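Callers that prefer not to hand-roll the HTTP contract can use gradio_client, which negotiates gradio's queue protocol automatically; it is a client-side dependency, so it does not belong in this requirements.txt. A sketch, assuming a hypothetical Space id and gr.Interface's default api_name of /predict:

# Hypothetical usage of gradio_client against the proxy Space
# (pip install gradio_client); the Space id is a placeholder.
from gradio_client import Client

client = Client("example-user/axentx-coder-zero-gpu")  # placeholder Space id
out = client.predict(
    "Write a haiku about proxies.",  # prompt
    "",                              # system
    1024,                            # max_tokens
    0.3,                             # temperature
    api_name="/predict",             # gr.Interface's default endpoint name
)
print(out)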