Spaces:
Runtime error
Runtime error
File size: 2,637 Bytes
c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 c3cb41d cef92a0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | """axentx coder-zero-gpu-1 — Qwen2.5-Coder-7B-Instruct on ZeroGPU.
Smaller model = faster cold start = more calls/min. 7B is plenty for
feature-builder code-gen workload. OpenAI-compatible /v1/chat/completions
endpoint for direct chain integration.
"""
import os, time
import spaces
import torch
import gradio as gr
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Checkpoint to serve; the MODEL_ID environment variable overrides the fallback.
# NOTE(review): the module docstring mentions Qwen2.5-Coder-7B-Instruct while
# the fallback below is a DeepSeek Coder checkpoint — confirm which is intended.
MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/deepseek-coder-6.7b-instruct")

print(f"[init] loading {MODEL_ID}")

# Tokenizer and weights are loaded once at import time; the request handlers
# below reuse these module-level objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda", trust_remote_code=True
)

print("[init] ready")
@spaces.GPU(duration=60)
def _generate(messages, max_tokens=1024, temperature=0.3):
    """Generate one assistant reply for a chat conversation.

    Args:
        messages: Conversation as a list of ``{"role": ..., "content": ...}``
            dicts, passed straight to ``tokenizer.apply_chat_template``.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. Values ``<= 0`` disable sampling;
            positive values are clamped to at least ``0.01``.

    Returns:
        The decoded completion text, without the prompt and without special
        tokens.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already contains the special tokens it needs;
    # letting the tokenizer add them again would duplicate them (e.g. a
    # second BOS), so they are switched off for this second pass.
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to("cuda")

    sampling = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": sampling,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if sampling:
        # Temperature is only meaningful when sampling; passing it alongside
        # do_sample=False is ignored and only produces a warning.
        gen_kwargs["temperature"] = max(temperature, 0.01)

    out = model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt + completion; keep only the new tokens.
    prompt_len = inputs.input_ids.shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
# HTTP application; CORS is left fully open (any origin, method and header).
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request body accepted by POST /v1/chat/completions.
# (Documented with comments rather than a docstring so the generated schema
# description stays as it was.)
class ChatRequest(BaseModel):
    # Conversation turns; handed unchanged to _generate().
    messages: list
    # Cap on newly generated tokens.
    max_tokens: int = 1024
    # Sampling temperature forwarded to _generate().
    temperature: float = 0.3
    # Model label echoed back in the response; it does not select a checkpoint.
    model: str = "axentx-coder-2"
# Chat-completions endpoint: runs _generate() on the request and wraps the
# reply in a "chat.completion" shaped JSON payload.
@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    # Timestamp is taken before generation, so "created" marks request start.
    started = time.time()
    created = int(started)

    reply = _generate(req.messages, req.max_tokens, req.temperature)

    # NOTE: usage is a rough estimate — whitespace-separated words stand in
    # for tokens and the prompt is reported as 0.
    word_count = len(reply.split())
    choice = {
        "index": 0,
        "message": {"role": "assistant", "content": reply},
        "finish_reason": "stop",
    }
    usage = {
        "prompt_tokens": 0,
        "completion_tokens": word_count,
        "total_tokens": word_count,
    }

    return {
        "id": f"axentx-{created}",
        "object": "chat.completion",
        "created": created,
        "model": req.model,
        "choices": [choice],
        "usage": usage,
    }
# Liveness probe: always answers "ok" together with the configured model id.
@app.get("/health")
def health():
    payload = {"status": "ok", "model": MODEL_ID}
    return payload
def _ui(message, history):
    # Gradio chat callback: rebuild the conversation from prior turns, append
    # the new user message, and return the model's reply.
    conversation = []
    for turn in history or []:
        # Skip entries that carry no role.
        if not turn.get("role"):
            continue
        # Keep only role/content; any other keys on the turn are dropped.
        conversation.append({"role": turn["role"], "content": turn["content"]})
    conversation.append({"role": "user", "content": message})
    return _generate(conversation)
# Chat UI backed by _ui, mounted at the root path of the FastAPI app.
demo = gr.ChatInterface(
    _ui,
    title=f"axentx Coder — {MODEL_ID}",
    type="messages",
)
app = gr.mount_gradio_app(app, demo, path="/")
|