ashirato committed
Commit c3cb41d · verified · 1 Parent(s): ce30beb

switch to deepseek-coder-6.7b-instruct

Files changed (2):
  1. app.py +20 -41
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,10 +1,10 @@
-"""axentx coder-zero-gpu-1 — Qwen2.5-Coder-32B-Instruct-AWQ on ZeroGPU.
+"""axentx coder-zero-gpu-1 — deepseek-coder-6.7b-instruct on ZeroGPU.
 
-Exposes OpenAI-compatible /v1/chat/completions so the axentx pipeline's
-LLM chain can hit it like any other upstream provider.
+Smaller model = faster cold start = more calls/min. 6.7B is plenty for the
+feature-builder code-gen workload. OpenAI-compatible /v1/chat/completions
+endpoint for direct chain integration.
 """
-import os
-import time
+import os, time
 import spaces
 import torch
 import gradio as gr
@@ -13,21 +13,20 @@ from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")
+MODEL_ID = os.environ.get("MODEL_ID", "deepseek-ai/deepseek-coder-6.7b-instruct")
 
-print(f"[init] loading tokenizer: {MODEL_ID}")
+print(f"[init] loading {MODEL_ID}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-print(f"[init] loading model")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
     device_map="cuda",
     trust_remote_code=True,
 )
-print(f"[init] ready")
+print("[init] ready")
 
 
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=60)
 def _generate(messages, max_tokens=1024, temperature=0.3):
     prompt = tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
@@ -40,14 +39,12 @@ def _generate(messages, max_tokens=1024, temperature=0.3):
         do_sample=temperature > 0,
         pad_token_id=tokenizer.eos_token_id,
     )
-    text = tokenizer.decode(
-        out[0][inputs.input_ids.shape[1]:],
-        skip_special_tokens=True,
+    return tokenizer.decode(
+        out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True,
     )
-    return text
 
 
-app = FastAPI(title="axentx coder ZeroGPU")
+app = FastAPI()
 app.add_middleware(
     CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
 )
@@ -65,20 +62,10 @@ def chat_completions(req: ChatRequest):
     t0 = time.time()
     text = _generate(req.messages, req.max_tokens, req.temperature)
     return {
-        "id": f"axentx-{int(t0)}",
-        "object": "chat.completion",
-        "created": int(t0),
-        "model": req.model,
-        "choices": [{
-            "index": 0,
-            "message": {"role": "assistant", "content": text},
-            "finish_reason": "stop",
-        }],
-        "usage": {
-            "prompt_tokens": 0,
-            "completion_tokens": len(text.split()),
-            "total_tokens": len(text.split()),
-        },
+        "id": f"axentx-{int(t0)}", "object": "chat.completion",
+        "created": int(t0), "model": req.model,
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": text}, "finish_reason": "stop"}],
+        "usage": {"prompt_tokens": 0, "completion_tokens": len(text.split()), "total_tokens": len(text.split())},
     }
 
 
@@ -87,19 +74,11 @@ def health():
     return {"status": "ok", "model": MODEL_ID}
 
 
-def _ui_chat(message, history):
-    msgs = []
-    for h in history:
-        if h.get("role") and h.get("content"):
-            msgs.append({"role": h["role"], "content": h["content"]})
+def _ui(message, history):
+    msgs = [{"role": h["role"], "content": h["content"]} for h in (history or []) if h.get("role")]
     msgs.append({"role": "user", "content": message})
-    return _generate(msgs, max_tokens=1024, temperature=0.3)
+    return _generate(msgs)
 
 
-demo = gr.ChatInterface(
-    _ui_chat,
-    title="axentx Coder — Qwen2.5-Coder-32B-Instruct (ZeroGPU)",
-    type="messages",
-)
-
+demo = gr.ChatInterface(_ui, title=f"axentx Coder — {MODEL_ID}", type="messages")
 app = gr.mount_gradio_app(app, demo, path="/")
 
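The endpoint contract is unchanged by this commit, so the axentx chain can keep calling the Space like any OpenAI-style upstream. As a minimal client sketch using requests: the base URL is a placeholder, and since the server above does no authentication, no API key is sent.

# Smoke test against the OpenAI-compatible route defined in app.py.
# BASE is a hypothetical placeholder; substitute the actual Space URL.
import requests

BASE = "https://<user>-coder-zero-gpu-1.hf.space"

resp = requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": "deepseek-ai/deepseek-coder-6.7b-instruct",
        "messages": [{"role": "user", "content": "Write a Python function that reverses a string."}],
        "max_tokens": 256,
        "temperature": 0.3,
    },
    timeout=120,  # ZeroGPU cold starts can be slow
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])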
requirements.txt CHANGED
@@ -6,5 +6,4 @@ fastapi
 pydantic>=2
 gradio>=5.0.0
 huggingface_hub>=0.25
-autoawq
 sentencepiece
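Dropping autoawq is consistent with the model switch: it was presumably only needed for the earlier AWQ-quantized checkpoint, and deepseek-coder-6.7b-instruct loads directly in bfloat16. A quick sanity check that the Space restarted with the expected model, using the same placeholder base URL as above:

# Probe the /health route defined in app.py; it echoes the active MODEL_ID.
import requests

BASE = "https://<user>-coder-zero-gpu-1.hf.space"  # hypothetical Space URL
print(requests.get(f"{BASE}/health", timeout=30).json())
# expected: {"status": "ok", "model": "deepseek-ai/deepseek-coder-6.7b-instruct"}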