File size: 2,637 Bytes
c3cb41d
cef92a0
c3cb41d
 
 
cef92a0
c3cb41d
cef92a0
 
 
 
 
 
 
 
c3cb41d
cef92a0
c3cb41d
cef92a0
 
 
 
 
 
 
c3cb41d
cef92a0
 
c3cb41d
cef92a0
 
 
 
 
 
 
 
 
 
 
 
c3cb41d
 
cef92a0
 
 
c3cb41d
cef92a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3cb41d
 
 
 
cef92a0
 
 
 
 
 
 
 
c3cb41d
 
cef92a0
c3cb41d
cef92a0
 
c3cb41d
cef92a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""axentx coder-zero-gpu-1 — Qwen2.5-Coder-7B-Instruct on ZeroGPU.

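Smaller model = faster cold start = more calls/min. 7B is plenty for the
feature-builder code-gen workload. Exposes an OpenAI-compatible
/v1/chat/completions endpoint for direct chain integration.

NOTE: the model actually loaded comes from the MODEL_ID environment variable,
whose in-code default is deepseek-ai/deepseek-coder-6.7b-instruct rather than
the Qwen2.5-Coder-7B-Instruct named in the title above.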
"""
import os, time
import spaces
import torch
import gradio as gr
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model repo to serve; overridable through the MODEL_ID env var.
MODEL_ID = os.getenv("MODEL_ID", "deepseek-ai/deepseek-coder-6.7b-instruct")

# The tokenizer and the weights are loaded once, at import time, and are then
# shared as module globals by the request handlers defined further down.
print(f"[init] loading {MODEL_ID}")
# NOTE: trust_remote_code=True allows the model repo to run its own Python
# code while loading, so MODEL_ID should only point at a trusted repo.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
print("[init] ready")


@spaces.GPU(duration=60)
def _generate(messages, max_tokens=1024, temperature=0.3):
    """Run one chat completion and return only the newly generated text.

    Args:
        messages: Chat turns to render through the tokenizer's chat template
            (presumably ``{"role": ..., "content": ...}`` dicts — confirm
            against callers).
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. Values <= 0 disable sampling.

    Returns:
        The decoded continuation, without the prompt and with special tokens
        skipped.
    """
    rendered = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered, return_tensors="pt").to("cuda")
    prompt_len = encoded.input_ids.shape[1]

    use_sampling = temperature > 0
    # The temperature handed to generate() is floored at 0.01 so that a zero
    # or negative request value is never passed through as-is.
    floored_temperature = max(temperature, 0.01)

    generated = model.generate(
        **encoded,
        max_new_tokens=max_tokens,
        temperature=floored_temperature,
        do_sample=use_sampling,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the first `prompt_len` positions (the prompt) and decode the rest.
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


# Application object that carries the JSON API routes declared below.
app = FastAPI()

# CORS is left fully open: every origin, HTTP method and request header is
# allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


class ChatRequest(BaseModel):
    """Request body accepted by the /v1/chat/completions endpoint."""

    # Conversation turns; only declared as a plain list, so the element
    # shape is not validated by this model.
    messages: list
    # Forwarded to _generate as the cap on newly generated tokens.
    max_tokens: int = 1024
    # Forwarded to _generate as the sampling temperature.
    temperature: float = 0.3
    # Echoed back in the response's "model" field; it does not choose which
    # weights are loaded (that is MODEL_ID).
    model: str = "axentx-coder-2"


@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """Serve one non-streaming chat completion in the OpenAI response shape.

    Args:
        req: Parsed request body (messages, max_tokens, temperature, model).

    Returns:
        A dict with the keys ``id``, ``object``, ``created``, ``model``,
        ``choices`` (a single assistant message) and ``usage``.

    Token usage is counted with the module tokenizer instead of being
    hard-coded or estimated from whitespace-separated words, and the response
    id uses a nanosecond timestamp so two requests arriving within the same
    second no longer share an id.
    """
    started_ns = time.time_ns()
    created = started_ns // 1_000_000_000  # whole seconds since the epoch
    text = _generate(req.messages, req.max_tokens, req.temperature)

    # Render and tokenize the prompt exactly as _generate does, so the count
    # matches what was actually fed to the model.
    prompt = tokenizer.apply_chat_template(
        req.messages, tokenize=False, add_generation_prompt=True
    )
    prompt_tokens = len(tokenizer(prompt)["input_ids"])
    # Re-tokenizing the decoded text is a close approximation of the number
    # of generated tokens (special tokens were already stripped from `text`).
    completion_tokens = len(
        tokenizer(text, add_special_tokens=False)["input_ids"]
    )

    return {
        "id": f"axentx-{started_ns}",
        "object": "chat.completion",
        "created": created,
        "model": req.model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                # NOTE(review): always "stop", even when generation was cut
                # off by max_tokens; _generate does not report why it ended.
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }


@app.get("/health")
def health():
    """Return a fixed "ok" status together with the configured MODEL_ID."""
    return dict(status="ok", model=MODEL_ID)


def _ui(message, history):
    """Chat callback for the Gradio UI: answer `message` given `history`.

    Args:
        message: The new user message.
        history: Earlier turns as dict-like entries with "role" and "content"
            keys; may be None or empty.

    Returns:
        The text produced by _generate for the rebuilt conversation.
    """
    conversation = []
    for turn in history or []:
        # Entries without a truthy "role" are skipped.
        if not turn.get("role"):
            continue
        # Copy only role and content; any other keys on the entry are dropped.
        conversation.append({"role": turn["role"], "content": turn["content"]})

    conversation.append({"role": "user", "content": message})
    return _generate(conversation)


# Chat UI whose replies come from _ui; history is exchanged in the
# "messages" format.
demo = gr.ChatInterface(
    _ui,
    title=f"axentx Coder — {MODEL_ID}",
    type="messages",
)

# Attach the UI to the FastAPI app at the root path and rebind `app` to the
# object returned by gradio.
app = gr.mount_gradio_app(app, demo, path="/")