| """axentx coder-zero-gpu-1 — Qwen2.5-Coder-32B-Instruct-AWQ on ZeroGPU. | |
| Exposes OpenAI-compatible /v1/chat/completions so the axentx pipeline's | |
| LLM chain can hit it like any other upstream provider. | |
| """ | |
import os
import time

import spaces
import torch
import gradio as gr
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# NOTE: this default differs from the Qwen model named in the docstring;
# override it via the MODEL_ID env var to serve a different checkpoint.
MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")

print(f"[init] loading tokenizer: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

print("[init] loading model")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    trust_remote_code=True,
)
print("[init] ready")
@spaces.GPU  # required on ZeroGPU: allocates a GPU for the duration of the call
def _generate(messages, max_tokens=1024, temperature=0.3):
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    out = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        # Clamp so a 0.0 temperature never reaches the sampler; greedy decoding
        # is selected via do_sample=False in that case.
        temperature=max(temperature, 0.01),
        do_sample=temperature > 0,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, skipping the prompt.
    text = tokenizer.decode(
        out[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True,
    )
    return text
app = FastAPI(title="axentx coder ZeroGPU")
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
)
class ChatRequest(BaseModel):
    messages: list
    max_tokens: int = 1024
    temperature: float = 0.3
    model: str = "axentx-coder-2"
@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    t0 = time.time()
    text = _generate(req.messages, req.max_tokens, req.temperature)
    return {
        "id": f"axentx-{int(t0)}",
        "object": "chat.completion",
        "created": int(t0),
        "model": req.model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": "stop",
        }],
        # Token counts are rough placeholders: prompt tokens are not tracked
        # and completion tokens are approximated by whitespace splitting.
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": len(text.split()),
            "total_tokens": len(text.split()),
        },
    }
@app.get("/health")
def health():
    return {"status": "ok", "model": MODEL_ID}
def _ui_chat(message, history):
    # Gradio "messages"-type history is a list of {"role", "content"} dicts.
    msgs = []
    for h in history:
        if h.get("role") and h.get("content"):
            msgs.append({"role": h["role"], "content": h["content"]})
    msgs.append({"role": "user", "content": message})
    return _generate(msgs, max_tokens=1024, temperature=0.3)
demo = gr.ChatInterface(
    _ui_chat,
    title="axentx Coder — Qwen2.5-Coder-32B-Instruct (ZeroGPU)",
    type="messages",
)

# Serve the Gradio UI at "/" on the same FastAPI app that hosts the API routes.
app = gr.mount_gradio_app(app, demo, path="/")
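# Because the endpoint above is OpenAI-compatible, the axentx chain (or any
# OpenAI-style client) can target it by overriding base_url. A sketch using
# the openai package; <space-host> and the dummy api_key are assumptions:
#
#   from openai import OpenAI
#   client = OpenAI(base_url="https://<space-host>/v1", api_key="unused")
#   resp = client.chat.completions.create(
#       model="axentx-coder-2",
#       messages=[{"role": "user", "content": "Refactor this function."}],
#   )
#   print(resp.choices[0].message.content)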