ashirato committed · Commit 10a8fdd · verified · 1 Parent(s): c3cb41d

switch to thin HF Router proxy (no model loading)

Files changed (3)
  1. README.md +5 -11
  2. app.py +30 -52
  3. requirements.txt +0 -5
README.md CHANGED

@@ -1,20 +1,14 @@
 ---
-title: axentx Coder ZeroGPU 2
-emoji: 🐬
+title: axentx Coder Proxy 1
+emoji: 🦉
 colorFrom: indigo
 colorTo: purple
 sdk: gradio
 sdk_version: 5.9.1
 app_file: app.py
 pinned: false
-short_description: DeepSeek-Coder-V2-Lite-Instruct on ZeroGPU
+short_description: HF Router proxy for Qwen3-Coder (no GPU needed)
 ---
 
-# axentx coder-zero-gpu-2
-
-OpenAI-compatible code generation endpoint backed by `Qwen2.5-Coder-32B-Instruct-AWQ`.
-
-## Endpoints
-- `POST /v1/chat/completions` — OpenAI-compatible chat
-- `GET /health` — model + status
-- `/` — Gradio chat UI
+OpenAI-compatible proxy to HF Inference Router. Adds independent rate-limit
+bucket for the axentx pipeline.
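The new README drops the per-endpoint list, but the surface is unchanged: the Space still speaks the OpenAI chat-completions protocol. A minimal client sketch, not part of this commit, assuming the `openai` Python package (v1+) and a hypothetical Space URL:

```python
# Client sketch (illustrative, not in this commit). Assumes openai >= 1.0
# and a placeholder base_url; replace with the real Space URL.
from openai import OpenAI

client = OpenAI(
    base_url="https://YOUR-SPACE.hf.space/v1",  # hypothetical placeholder
    api_key="unused",  # the proxy authenticates upstream with its own HF_TOKEN
)

resp = client.chat.completions.create(
    model="Qwen/Qwen3-Coder-30B-A3B-Instruct",
    messages=[{"role": "user", "content": "Write a binary search in Python."}],
    max_tokens=256,
    temperature=0.3,
)
print(resp.choices[0].message.content)
```

Any OpenAI-compatible SDK or a plain HTTP POST works the same way; since the proxy injects its own token upstream, the client-side key is a dummy value.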
 
 
 
 
 
 
app.py CHANGED

@@ -1,53 +1,36 @@
-"""axentx coder-zero-gpu-1 — Qwen2.5-Coder-7B-Instruct on ZeroGPU.
+"""axentx coder-zero-gpu-1 — proxy to HF Inference Router for Qwen3-Coder.
 
-Smaller model = faster cold start = more calls/min. 7B is plenty for
-feature-builder code-gen workload. OpenAI-compatible /v1/chat/completions
-endpoint for direct chain integration.
+No model loading on Space (avoid GPU init issues). Just a thin wrapper
+that forwards to HF Router with the Space owner's token. Adds independent
+rate-limit bucket for the pipeline.
 """
-import os, time
-import spaces
-import torch
-import gradio as gr
+import os, json, urllib.request
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/deepseek-coder-6.7b-instruct")
+import gradio as gr
 
-print(f"[init] loading {MODEL_ID}")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map="cuda",
-    trust_remote_code=True,
-)
-print("[init] ready")
+HF_TOKEN = os.environ.get("HF_TOKEN", "")  # auto-set by Space
+MODEL = os.environ.get("MODEL_ID", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
 
 
-@spaces.GPU(duration=60)
-def _generate(messages, max_tokens=1024, temperature=0.3):
-    prompt = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-    out = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        temperature=max(temperature, 0.01),
-        do_sample=temperature > 0,
-        pad_token_id=tokenizer.eos_token_id,
-    )
-    return tokenizer.decode(
-        out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True,
+def _call_hf_router(messages, max_tokens=1024, temperature=0.3):
+    body = json.dumps({
+        "model": MODEL, "messages": messages,
+        "max_tokens": max_tokens, "temperature": temperature,
+    }).encode()
+    req = urllib.request.Request(
+        "https://router.huggingface.co/v1/chat/completions",
+        data=body, method="POST",
+        headers={"Authorization": f"Bearer {HF_TOKEN}",
+                 "Content-Type": "application/json"},
     )
+    with urllib.request.urlopen(req, timeout=60) as r:
+        return json.loads(r.read())
 
 
 app = FastAPI()
-app.add_middleware(
-    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
-)
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 
 class ChatRequest(BaseModel):
@@ -58,27 +41,22 @@ class ChatRequest(BaseModel):
 
 
 @app.post("/v1/chat/completions")
-def chat_completions(req: ChatRequest):
-    t0 = time.time()
-    text = _generate(req.messages, req.max_tokens, req.temperature)
-    return {
-        "id": f"axentx-{int(t0)}", "object": "chat.completion",
-        "created": int(t0), "model": req.model,
-        "choices": [{"index": 0, "message": {"role": "assistant", "content": text}, "finish_reason": "stop"}],
-        "usage": {"prompt_tokens": 0, "completion_tokens": len(text.split()), "total_tokens": len(text.split())},
-    }
+def chat(req: ChatRequest):
+    return _call_hf_router(req.messages, req.max_tokens, req.temperature)
 
 
 @app.get("/health")
-def health():
-    return {"status": "ok", "model": MODEL_ID}
+def h():
+    return {"status": "ok", "backend": "hf-router", "model": MODEL}
 
 
 def _ui(message, history):
-    msgs = [{"role": h["role"], "content": h["content"]} for h in (history or []) if h.get("role")]
+    msgs = [{"role": h["role"], "content": h["content"]}
+            for h in (history or []) if h.get("role")]
    msgs.append({"role": "user", "content": message})
-    return _generate(msgs)
+    r = _call_hf_router(msgs)
+    return r["choices"][0]["message"]["content"]
 
 
-demo = gr.ChatInterface(_ui, title=f"axentx Coder — {MODEL_ID}", type="messages")
+demo = gr.ChatInterface(_ui, title=f"axentx Coder Proxy — {MODEL}", type="messages")
 app = gr.mount_gradio_app(app, demo, path="/")
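Since the Space no longer touches a GPU, the wiring can be smoke-tested locally before pushing. A sketch, assuming `httpx` is installed (Starlette's `TestClient` depends on it); `/health` needs no network, while `/v1/chat/completions` would also need a valid `HF_TOKEN`:

```python
# Local smoke test sketch (not in the commit). Requires httpx, which
# Starlette's TestClient uses under the hood.
from fastapi.testclient import TestClient

from app import app  # importing app.py builds the FastAPI app with Gradio mounted at "/"

client = TestClient(app)
r = client.get("/health")
assert r.status_code == 200
assert r.json()["backend"] == "hf-router"
print(r.json())
```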
requirements.txt CHANGED

@@ -1,9 +1,4 @@
-torch
-transformers>=4.45.0
-accelerate
-spaces
 fastapi
 pydantic>=2
 gradio>=5.0.0
 huggingface_hub>=0.25
-sentencepiece
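With torch/transformers gone, a local run needs only the web stack. A hypothetical launcher sketch, assuming `uvicorn` (pulled in as a gradio dependency) and port 7860, the port Spaces serves on; app.py itself ships no `__main__` block:

```python
# Hypothetical local entry point (not part of this commit).
import uvicorn

if __name__ == "__main__":
    # Serve the FastAPI app (Gradio UI mounted at "/") on the Spaces port.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)
```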