"""axentx coder-zero-gpu-1 — Qwen2.5-Coder-7B-Instruct on ZeroGPU.
Smaller model = faster cold start = more calls/min. 7B is plenty for
feature-builder code-gen workload. OpenAI-compatible /v1/chat/completions
endpoint for direct chain integration.
"""
import os
import time

import spaces
import torch
import gradio as gr
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/deepseek-coder-6.7b-instruct")

print(f"[init] loading {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    trust_remote_code=True,
)
print("[init] ready")

@spaces.GPU(duration=60)
def _generate(messages, max_tokens=1024, temperature=0.3):
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    out = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        # Clamp so sampling never sees temperature == 0; greedy decoding is
        # used when the caller passes a non-positive temperature.
        temperature=max(temperature, 0.01),
        do_sample=temperature > 0,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    return tokenizer.decode(
        out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
    )

app = FastAPI()
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
)

class ChatRequest(BaseModel):
    messages: list
    max_tokens: int = 1024
    temperature: float = 0.3
    model: str = "axentx-coder-2"

@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    t0 = time.time()
    text = _generate(req.messages, req.max_tokens, req.temperature)
    # usage is approximated with a whitespace word count: fine for dashboards,
    # not for billing.
    return {
        "id": f"axentx-{int(t0)}", "object": "chat.completion",
        "created": int(t0), "model": req.model,
        "choices": [{"index": 0, "message": {"role": "assistant", "content": text}, "finish_reason": "stop"}],
        "usage": {"prompt_tokens": 0, "completion_tokens": len(text.split()), "total_tokens": len(text.split())},
    }

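# Smoke test from the command line (a sketch; <space-host> is a placeholder):
#
#   curl -s https://<space-host>/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"messages": [{"role": "user", "content": "FizzBuzz in Python"}]}'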
@app.get("/health")
def health():
    return {"status": "ok", "model": MODEL_ID}

def _ui(message, history):
    # Gradio "messages"-format history is already a list of {role, content}
    # dicts; skip any entries without a role.
    msgs = [{"role": h["role"], "content": h["content"]} for h in (history or []) if h.get("role")]
    msgs.append({"role": "user", "content": message})
    return _generate(msgs)

demo = gr.ChatInterface(_ui, title=f"axentx Coder — {MODEL_ID}", type="messages")
app = gr.mount_gradio_app(app, demo, path="/")
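
# Local run outside Spaces (a sketch; on HF the Space runtime serves the app
# itself, and uvicorn is assumed to be installed alongside FastAPI):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)  # 7860 = default Spaces port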