ashirato committed on
Commit d45a2f7 · verified · 1 Parent(s): 0535836

fix: switch to Qwen2.5-Coder-3B (faster cold load, fits A10G in <60s)
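The headline numbers are plain arithmetic: BF16 stores 2 bytes per parameter, so the weights alone come to roughly 2 × the parameter count. A back-of-envelope check (ours, not part of the commit; it ignores KV cache, activations, and CUDA overhead):

```python
# Weights-only VRAM at BF16 = 2 bytes per parameter.
for name, params in [("3B", 3e9), ("7B", 7e9)]:
    print(f"{name}: ~{params * 2 / 1e9:.0f} GB")
# 3B: ~6 GB, 7B: ~14 GB. Both fit an A10G's 24 GB; the 3B simply
# transfers less than half the bytes, hence the faster cold load.
```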

Files changed (1)
  1. app.py +36 -46
app.py CHANGED
@@ -1,101 +1,91 @@
- """Surrogate-1 ZeroGPU Space — lazy-load variant.
+ """Surrogate-1 ZeroGPU Space — minimal, works.
 
- Don't load the 7B model at import time (causes Space init OOM/timeout).
- Load lazily inside the @spaces.GPU function; ZeroGPU spins a fresh GPU
- worker per request anyway, so loading there is correct.
+ Switched to Qwen2.5-Coder-3B (~6GB BF16 vs ~14GB for 7B) for a faster
+ cold load (≤60s on A10G). The v1 LoRA targets the 7B base, so it is
+ applied only when a 7B base is selected. For long-form / hard tasks, the
+ chat ladder includes a 7B fallback via free APIs; this Space serves the fast path.
  """
  import os
  import gradio as gr
  import spaces
  import torch
 
- BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-3B-Instruct")
  LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
 
  SYSTEM = (
      "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
-     "Cite real APIs only; no phantom imports. When uncertain, say "
-     "'I don't know' rather than confabulate."
+     "Cite real APIs only. Say 'I don't know' rather than confabulate."
  )
 
- # Module-level cache so repeated calls within the same GPU worker reuse it
+ # Module-level cache
  _model = None
- _tokenizer = None
+ _tok = None
 
 
- def _load():
-     """Lazy-load on the first @spaces.GPU call (runs on the GPU worker)."""
-     global _model, _tokenizer
+ def _load_lazy():
+     """Load only inside the @spaces.GPU function (i.e., on a GPU worker)."""
+     global _model, _tok
      if _model is not None:
-         return _model, _tokenizer
+         return _model, _tok
      from transformers import AutoModelForCausalLM, AutoTokenizer
-     print(f"[lazy-load] tokenizer: {BASE_MODEL}")
-     tok = AutoTokenizer.from_pretrained(
+     _tok = AutoTokenizer.from_pretrained(
          BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
-     if tok.pad_token_id is None:
-         tok.pad_token_id = tok.eos_token_id
-     print("[lazy-load] base model on cuda")
-     m = AutoModelForCausalLM.from_pretrained(
+     if _tok.pad_token_id is None:
+         _tok.pad_token_id = _tok.eos_token_id
+     _model = AutoModelForCausalLM.from_pretrained(
          BASE_MODEL, torch_dtype=torch.bfloat16,
          token=HF_TOKEN or None, trust_remote_code=True,
          device_map="cuda")
-     try:
-         from peft import PeftModel
-         print(f"[lazy-load] LoRA: {LORA_REPO}")
-         m = PeftModel.from_pretrained(m, LORA_REPO, token=HF_TOKEN or None)
-         print("[lazy-load] LoRA applied")
-     except Exception as e:
-         print(f"[lazy-load] LoRA failed (using base only): {e}")
-     _model, _tokenizer = m, tok
-     return _model, _tokenizer
+     # LoRA optional — base-model size mismatch (3B vs 7B) makes the v1 LoRA
+     # incompatible. We serve the base 3B for now; the 7B Space applies the LoRA.
+     if BASE_MODEL.endswith("7B-Instruct"):
+         try:
+             from peft import PeftModel
+             _model = PeftModel.from_pretrained(_model, LORA_REPO,
+                                                token=HF_TOKEN or None)
+         except Exception as e:
+             print(f"[load] LoRA skip: {e}")
+     return _model, _tok
 
 
- @spaces.GPU(duration=180)
- def respond(message, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
+ @spaces.GPU(duration=300)
+ def respond(message, history, max_new_tokens=512, temperature=0.4):
      if not message or not message.strip():
          return ""
-     model, tok = _load()
+     model, tok = _load_lazy()
      msgs = [{"role": "system", "content": SYSTEM}]
      for u, a in (history or []):
          if u: msgs.append({"role": "user", "content": u})
          if a: msgs.append({"role": "assistant", "content": a})
      msgs.append({"role": "user", "content": message})
 
-     prompt = tok.apply_chat_template(
-         msgs, tokenize=False, add_generation_prompt=True)
+     prompt = tok.apply_chat_template(msgs, tokenize=False,
+                                      add_generation_prompt=True)
      inputs = tok(prompt, return_tensors="pt", truncation=True,
-                  max_length=24000).to("cuda")
+                  max_length=8000).to("cuda")
      out = model.generate(
          **inputs,
          max_new_tokens=int(max_new_tokens),
          temperature=float(temperature) if temperature > 0 else 1e-5,
-         top_p=float(top_p),
          do_sample=temperature > 0,
          pad_token_id=tok.pad_token_id,
          eos_token_id=tok.eos_token_id,
-         use_cache=True,
      )
      new_tokens = out[0][inputs["input_ids"].shape[1]:]
      return tok.decode(new_tokens, skip_special_tokens=True).strip()
 
 
- desc = (
-     f"**Base**: `{BASE_MODEL}` &nbsp; **LoRA**: `{LORA_REPO}`<br>"
-     f"**Hardware**: ZeroGPU A10G (PRO, 25K min/mo @ $0). "
-     f"First request takes ~30-60s (cold model load), subsequent ~3-10s."
- )
-
  demo = gr.ChatInterface(
      fn=respond,
-     title="Surrogate-1 — DevSecOps + Code Agent",
-     description=desc,
+     title=f"Surrogate-1 — {BASE_MODEL.split('/')[-1]}",
+     description=f"ZeroGPU A10G · {BASE_MODEL}. First request ~30-60s cold load.",
      additional_inputs=[
          gr.Slider(64, 2048, value=512, step=64, label="max new tokens"),
          gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature"),
-         gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p"),
      ],
  )
 
  if __name__ == "__main__":
-     demo.queue(max_size=20).launch()
+     demo.queue(max_size=10).launch()
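A note on the `BASE_MODEL.endswith("7B-Instruct")` guard above: the adapter repo itself records which base it was trained against, so the compatibility check could be derived rather than hard-coded. A minimal sketch using peft's `PeftConfig` (the helper name and the equality rule are ours, not in the commit):

```python
from peft import PeftConfig

def lora_matches_base(lora_repo: str, base_model: str) -> bool:
    """True when the adapter's recorded base model matches what we loaded."""
    cfg = PeftConfig.from_pretrained(lora_repo)  # reads adapter_config.json
    return cfg.base_model_name_or_path == base_model
```

Swapping the suffix check for `lora_matches_base(LORA_REPO, BASE_MODEL)` would keep `_load_lazy` correct even if either repo is changed via environment variable.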
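To smoke-test the deployed endpoint, `gradio_client` can drive a `ChatInterface` programmatically through its `/chat` endpoint. The Space id below is a placeholder (an assumption, not confirmed by the commit), and the trailing positional values map to the two sliders; `client.view_api()` prints the exact signature if in doubt:

```python
from gradio_client import Client

client = Client("axentx/surrogate-1")  # placeholder Space id, not confirmed
reply = client.predict(
    "Write a one-liner to list world-writable files.",  # message
    512,   # max new tokens
    0.4,   # temperature
    api_name="/chat",
)
print(reply)
```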