switch to Qwen2.5-Coder-7B-Instruct (drop AWQ)
app.py +20 -41
requirements.txt +0 -1
app.py CHANGED
@@ -1,10 +1,10 @@
-"""axentx coder-zero-gpu-1 — Qwen2.5-Coder-32B-Instruct-AWQ on ZeroGPU.
+"""axentx coder-zero-gpu-1 — Qwen2.5-Coder-7B-Instruct on ZeroGPU.
 
-
-
+Smaller model = faster cold start = more calls/min. 7B is plenty for
+feature-builder code-gen workload. OpenAI-compatible /v1/chat/completions
+endpoint for direct chain integration.
 """
-import os
-import time
+import os, time
 import spaces
 import torch
 import gradio as gr
@@ -13,21 +13,20 @@ from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ")
+MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-Coder-7B-Instruct")
 
-print(f"[init] loading
+print(f"[init] loading {MODEL_ID}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-print(f"[init] loading model")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
     device_map="cuda",
     trust_remote_code=True,
 )
-print(
+print("[init] ready")
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=60)
 def _generate(messages, max_tokens=1024, temperature=0.3):
     prompt = tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
@@ -40,14 +39,12 @@ def _generate(messages, max_tokens=1024, temperature=0.3):
         do_sample=temperature > 0,
         pad_token_id=tokenizer.eos_token_id,
     )
-    text = tokenizer.decode(
-        out[0][inputs.input_ids.shape[1]:],
-        skip_special_tokens=True,
+    return tokenizer.decode(
+        out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True,
     )
-    return text
 
 
-app = FastAPI(
+app = FastAPI()
 app.add_middleware(
     CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
 )
@@ -65,20 +62,10 @@ def chat_completions(req: ChatRequest):
     t0 = time.time()
     text = _generate(req.messages, req.max_tokens, req.temperature)
     return {
-        "id": f"axentx-{int(t0)}",
-        "object": "chat.completion",
-        "created": int(t0),
-        "model": req.model,
-        "choices": [{
-            "index": 0,
-            "message": {"role": "assistant", "content": text},
-            "finish_reason": "stop",
-        }],
-        "usage": {
-            "prompt_tokens": 0,
-            "completion_tokens": len(text.split()),
-            "total_tokens": len(text.split()),
-        },
+        "id": f"axentx-{int(t0)}", "object": "chat.completion",
+        "created": int(t0), "model": req.model,
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": text}, "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 0, "completion_tokens": len(text.split()), "total_tokens": len(text.split())},
     }
 
 
@@ -87,19 +74,11 @@ def health():
     return {"status": "ok", "model": MODEL_ID}
 
 
-def _ui_chat(message, history):
-    msgs = []
-    for h in history:
-        if h.get("role") and h.get("content"):
-            msgs.append({"role": h["role"], "content": h["content"]})
+def _ui(message, history):
+    msgs = [{"role": h["role"], "content": h["content"]} for h in (history or []) if h.get("role")]
     msgs.append({"role": "user", "content": message})
-    return _generate(msgs
+    return _generate(msgs)
 
 
-demo = gr.ChatInterface(
-    _ui_chat,
-    title="axentx Coder — Qwen2.5-Coder-32B-Instruct (ZeroGPU)",
-    type="messages",
-)
-
+demo = gr.ChatInterface(_ui, title=f"axentx Coder — {MODEL_ID}", type="messages")
 app = gr.mount_gradio_app(app, demo, path="/")
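The response payload above matches the OpenAI chat-completions shape, so any plain HTTP client can drive the Space once it is up. A minimal sketch with requests — the base URL is a placeholder for the actual Space host, and note the server just echoes model back rather than routing on it:

import requests

# Placeholder host — substitute the real Space URL.
BASE = "https://OWNER-coder-zero-gpu-1.hf.space"

resp = requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": "qwen2.5-coder-7b-instruct",
        "messages": [{"role": "user", "content": "Write a function that reverses a linked list."}],
        "max_tokens": 512,
        "temperature": 0.3,
    },
    timeout=120,  # ZeroGPU cold starts can take a while
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

The /health route shown in the diff gives a cheap readiness probe ({"status": "ok", "model": ...}) before paying for a GPU call.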
requirements.txt CHANGED
@@ -6,5 +6,4 @@ fastapi
 pydantic>=2
 gradio>=5.0.0
 huggingface_hub>=0.25
-autoawq
 sentencepiece
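One caveat for readers of the handler hunk: ChatRequest is defined outside the changed lines, so its fields never appear in this diff. A plausible shape inferred purely from the req.messages / req.max_tokens / req.temperature / req.model accesses — an assumption, not the file's actual definition:

from typing import Dict, List
from pydantic import BaseModel

class ChatRequest(BaseModel):
    # Field names inferred from usage in chat_completions(); defaults are guesses.
    model: str = "qwen2.5-coder-7b-instruct"
    messages: List[Dict[str, str]]
    max_tokens: int = 1024
    temperature: float = 0.3

The usage block in the response is also only a rough count: len(text.split()) approximates completion tokens, and prompt_tokens is hard-coded to 0.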