"""axentx coder-zero-gpu-1 — Qwen2.5-Coder-7B-Instruct on ZeroGPU. Smaller model = faster cold start = more calls/min. 7B is plenty for feature-builder code-gen workload. OpenAI-compatible /v1/chat/completions endpoint for direct chain integration. """ import os, time import spaces import torch import gradio as gr from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from transformers import AutoModelForCausalLM, AutoTokenizer MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/deepseek-coder-6.7b-instruct") print(f"[init] loading {MODEL_ID}") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16, device_map="cuda", trust_remote_code=True, ) print("[init] ready") @spaces.GPU(duration=60) def _generate(messages, max_tokens=1024, temperature=0.3): prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = tokenizer(prompt, return_tensors="pt").to("cuda") out = model.generate( **inputs, max_new_tokens=max_tokens, temperature=max(temperature, 0.01), do_sample=temperature > 0, pad_token_id=tokenizer.eos_token_id, ) return tokenizer.decode( out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True, ) app = FastAPI() app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"] ) class ChatRequest(BaseModel): messages: list max_tokens: int = 1024 temperature: float = 0.3 model: str = "axentx-coder-2" @app.post("/v1/chat/completions") def chat_completions(req: ChatRequest): t0 = time.time() text = _generate(req.messages, req.max_tokens, req.temperature) return { "id": f"axentx-{int(t0)}", "object": "chat.completion", "created": int(t0), "model": req.model, "choices": [{"index": 0, "message": {"role": "assistant", "content": text}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 0, "completion_tokens": len(text.split()), "total_tokens": len(text.split())}, } @app.get("/health") def health(): return {"status": "ok", "model": MODEL_ID} def _ui(message, history): msgs = [{"role": h["role"], "content": h["content"]} for h in (history or []) if h.get("role")] msgs.append({"role": "user", "content": message}) return _generate(msgs) demo = gr.ChatInterface(_ui, title=f"axentx Coder — {MODEL_ID}", type="messages") app = gr.mount_gradio_app(app, demo, path="/")