# NOTE(review): HF Spaces page residue from the scrape — status at capture
# time was "Runtime error". Kept as a comment so the file stays parseable.
"""axentx coder-zero-gpu-1 — code-gen model on ZeroGPU.

Default model is deepseek-ai/deepseek-coder-6.7b-instruct (override via the
MODEL_ID env var). Smaller model = faster cold start = more calls/min; a ~7B
model is plenty for the feature-builder code-gen workload. Exposes an
OpenAI-compatible /v1/chat/completions endpoint for direct chain integration.
"""
| import os, time | |
| import spaces | |
| import torch | |
| import gradio as gr | |
| from fastapi import FastAPI | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
# Model selection — env-overridable so the Space can be repointed without a
# code change. Default is a ~7B coder model for fast ZeroGPU cold starts.
MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/deepseek-coder-6.7b-instruct")

print(f"[init] loading {MODEL_ID}")

# trust_remote_code: some coder checkpoints ship custom tokenizer/model code.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# bf16 weights placed directly on the GPU device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    trust_remote_code=True,
)

print("[init] ready")
@spaces.GPU
def _generate(messages, max_tokens=1024, temperature=0.3):
    """Run one chat completion on the loaded model and return the reply text.

    Args:
        messages: OpenAI-style list of {"role": ..., "content": ...} dicts.
        max_tokens: cap on newly generated tokens.
        temperature: sampling temperature; values <= 0 select greedy decoding.

    Returns:
        The assistant reply (echoed prompt stripped, special tokens removed).

    Fix: `import spaces` was present but unused — on ZeroGPU the CUDA entry
    point must carry @spaces.GPU or no GPU is attached and the .to("cuda")
    / generate() calls fail at request time.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # inference_mode: skip autograd bookkeeping during generation.
    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            # generate() rejects temperature == 0 while sampling; clamp low.
            temperature=max(temperature, 0.01),
            do_sample=temperature > 0,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tail, not the echoed prompt.
    return tokenizer.decode(
        out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True,
    )
# API app with permissive CORS so browser-based chain clients can call it
# cross-origin.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
class ChatRequest(BaseModel):
    """Request body mirroring the OpenAI chat-completions schema."""

    # OpenAI-style [{"role": ..., "content": ...}] message list.
    messages: list
    max_tokens: int = 1024
    temperature: float = 0.3
    # Echoed back in the response; not used for model selection.
    model: str = "axentx-coder-2"
@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """OpenAI-compatible chat completion endpoint.

    Fix: the handler was defined but never registered on `app`; without the
    route decorator the /v1/chat/completions URL promised by the module
    docstring returned 404.

    Args:
        req: parsed ChatRequest body.

    Returns:
        A chat.completion-shaped dict with a single assistant choice.
    """
    t0 = time.time()
    text = _generate(req.messages, req.max_tokens, req.temperature)
    return {
        "id": f"axentx-{int(t0)}",
        "object": "chat.completion",
        "created": int(t0),
        "model": req.model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }
        ],
        # Usage is a whitespace-word approximation, not real tokenizer counts.
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": len(text.split()),
            "total_tokens": len(text.split()),
        },
    }
@app.get("/health")
def health():
    """Liveness probe reporting which model this Space serves.

    Fix: like chat_completions, this handler was never registered on `app`;
    the route decorator makes /health reachable.
    """
    return {"status": "ok", "model": MODEL_ID}
def _ui(message, history):
    """Gradio ChatInterface callback: replay history, then the new user turn.

    `history` is a list of {"role", "content"} dicts (type="messages" mode);
    turns without a role are dropped.
    """
    conversation = []
    for turn in history or []:
        if turn.get("role"):
            conversation.append({"role": turn["role"], "content": turn["content"]})
    conversation.append({"role": "user", "content": message})
    return _generate(conversation)
# Chat UI for manual smoke-testing, mounted at the Space root alongside the
# FastAPI routes.
demo = gr.ChatInterface(_ui, title=f"axentx Coder — {MODEL_ID}", type="messages")
app = gr.mount_gradio_app(app, demo, path="/")