ashirato commited on
Commit
cef92a0
·
verified ·
1 Parent(s): 3e5db13

init: DeepSeek-Coder-V2-Lite via FastAPI/Gradio

Browse files
Files changed (3) hide show
  1. README.md +14 -7
  2. app.py +105 -0
  3. requirements.txt +9 -0
README.md CHANGED
@@ -1,13 +1,20 @@
1
  ---
2
- title: Coder Zero Gpu 2
3
- emoji: 📈
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.13'
9
  app_file: app.py
10
  pinned: false
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
+ title: axentx Coder ZeroGPU 2
3
+ emoji: 🐬
4
+ colorFrom: indigo
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.44.0
 
8
  app_file: app.py
9
  pinned: false
10
+ short_description: DeepSeek-Coder-V2-Lite-Instruct on ZeroGPU
11
  ---
12
 
13
+ # axentx coder-zero-gpu-2
14
+
15
+ OpenAI-compatible code generation endpoint backed by `DeepSeek-Coder-V2-Lite-Instruct`.
16
+
17
+ ## Endpoints
18
+ - `POST /v1/chat/completions` — OpenAI-compatible chat
19
+ - `GET /health` — model + status
20
+ - `/` — Gradio chat UI
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """axentx coder-zero-gpu-1 — Qwen2.5-Coder-32B-Instruct-AWQ on ZeroGPU.
2
+
3
+ Exposes OpenAI-compatible /v1/chat/completions so the axentx pipeline's
4
+ LLM chain can hit it like any other upstream provider.
5
+ """
6
+ import os
7
+ import time
8
+ import spaces
9
+ import torch
10
+ import gradio as gr
11
+ from fastapi import FastAPI
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+ from pydantic import BaseModel
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer
15
+
16
+ MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")
17
+
18
+ print(f"[init] loading tokenizer: {MODEL_ID}")
19
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
20
+ print(f"[init] loading model")
21
+ model = AutoModelForCausalLM.from_pretrained(
22
+ MODEL_ID,
23
+ torch_dtype=torch.bfloat16,
24
+ device_map="cuda",
25
+ trust_remote_code=True,
26
+ )
27
+ print(f"[init] ready")
28
+
29
+
30
@spaces.GPU(duration=120)
def _generate(messages, max_tokens=1024, temperature=0.3):
    """Run one chat completion on the GPU and return only the new text.

    Args:
        messages: OpenAI-style list of ``{"role", "content"}`` dicts.
        max_tokens: cap on newly generated tokens.
        temperature: ``<= 0`` selects greedy decoding; otherwise sampling
            (clamped to a 0.01 floor, matching the previous behavior).

    Returns:
        The decoded assistant reply with the prompt tokens stripped.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Only pass temperature when sampling: with do_sample=False,
        # transformers ignores it and logs a warning on every call.
        gen_kwargs["temperature"] = max(temperature, 0.01)
    # inference_mode avoids building autograd state during generation,
    # cutting GPU memory use on the ZeroGPU slice.
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    # Slice off the prompt so only the newly generated tokens are decoded.
    text = tokenizer.decode(
        out[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True,
    )
    return text
48
+
49
+
50
+ app = FastAPI(title="axentx coder ZeroGPU")
51
+ app.add_middleware(
52
+ CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
53
+ )
54
+
55
+
56
class ChatRequest(BaseModel):
    """Minimal OpenAI-style /v1/chat/completions request body."""

    messages: list  # OpenAI-format [{"role": ..., "content": ...}] dicts
    max_tokens: int = 1024
    temperature: float = 0.3
    model: str = "axentx-coder-2"  # echoed back in the response; does not select a model
61
+
62
+
63
+ @app.post("/v1/chat/completions")
64
+ def chat_completions(req: ChatRequest):
65
+ t0 = time.time()
66
+ text = _generate(req.messages, req.max_tokens, req.temperature)
67
+ return {
68
+ "id": f"axentx-{int(t0)}",
69
+ "object": "chat.completion",
70
+ "created": int(t0),
71
+ "model": req.model,
72
+ "choices": [{
73
+ "index": 0,
74
+ "message": {"role": "assistant", "content": text},
75
+ "finish_reason": "stop",
76
+ }],
77
+ "usage": {
78
+ "prompt_tokens": 0,
79
+ "completion_tokens": len(text.split()),
80
+ "total_tokens": len(text.split()),
81
+ },
82
+ }
83
+
84
+
85
+ @app.get("/health")
86
+ def health():
87
+ return {"status": "ok", "model": MODEL_ID}
88
+
89
+
90
def _ui_chat(message, history):
    """Gradio ChatInterface callback: replay UI history plus the new user turn."""
    # Keep only well-formed turns (truthy role AND content) from the history.
    conversation = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in history
        if turn.get("role") and turn.get("content")
    ]
    conversation.append({"role": "user", "content": message})
    return _generate(conversation, max_tokens=1024, temperature=0.3)
97
+
98
+
99
# Gradio chat UI, mounted at "/" on top of the FastAPI app so the Space
# serves both the API routes and the browser UI from one process.
demo = gr.ChatInterface(
    _ui_chat,
    # Title matches the model actually served (MODEL_ID's DeepSeek default),
    # not the Qwen model this Space was originally copied from.
    title="axentx Coder — DeepSeek-Coder-V2-Lite-Instruct (ZeroGPU)",
    type="messages",
)

app = gr.mount_gradio_app(app, demo, path="/")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers>=4.45.0
3
+ accelerate
4
+ spaces
5
+ fastapi
6
+ pydantic>=2
7
+ gradio>=4.40
8
+ autoawq
9
+ sentencepiece