Commit 0535836 (verified) · ashirato committed · 1 Parent(s): 0367d10

fix: lazy load (avoid Space init OOM with 7B+LoRA)

Files changed (1): app.py +46 -44
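
The change is a single pattern: nothing model-related runs at import time; the first request on a ZeroGPU worker loads the tokenizer, base model, and LoRA, and caches them in module globals so later requests on the same worker reuse them. A minimal sketch of that pattern follows, with a toy loader standing in for the real from_pretrained + PeftModel calls; the helper names here are illustrative, not the committed code.

import spaces

_model = None  # module-level cache; lives for the lifetime of one GPU worker


def _toy_load():
    # Stand-in for the expensive AutoModelForCausalLM.from_pretrained(...) + LoRA load.
    return {"name": "toy-model"}


def _get_model():
    global _model
    if _model is None:        # first request on this worker pays the load cost
        _model = _toy_load()
    return _model             # later requests reuse the cached object


@spaces.GPU(duration=180)     # heavy work happens here, on the GPU worker, not at import
def respond(message, history):
    model = _get_model()
    return f"{model['name']} received: {message!r}"
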
app.py CHANGED
@@ -1,14 +1,13 @@
-"""Surrogate-1 ZeroGPU Space — Qwen2.5-Coder-7B + v1 LoRA.
+"""Surrogate-1 ZeroGPU Space — lazy-load variant.
 
-Rewritten 2026-04-30 to use gr.ChatInterface (simpler signature, avoids
-the gradio_client._json_schema_to_python_type recursion bug that broke
-the previous custom-Blocks app.py).
+Don't load the 7B model at import time (it causes Space init OOM/timeout).
+Load lazily inside the @spaces.GPU function instead. ZeroGPU spins up a fresh
+GPU worker per request anyway, so loading there is correct.
 """
 import os
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 
 BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
 LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
@@ -20,79 +19,82 @@ SYSTEM = (
     "'I don't know' rather than confabulate."
 )
 
-print(f"[boot] tokenizer: {BASE_MODEL}")
-tokenizer = AutoTokenizer.from_pretrained(
-    BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
+# Module-level cache so repeated calls within the same GPU worker reuse the loaded model.
+_model = None
+_tokenizer = None
 
-print(f"[boot] base model on CPU: {BASE_MODEL}")
-model = AutoModelForCausalLM.from_pretrained(
-    BASE_MODEL, torch_dtype=torch.bfloat16,
-    token=HF_TOKEN or None, trust_remote_code=True,
-    device_map="cpu")
 
-LORA_ACTIVE = False
-try:
-    from peft import PeftModel
-    print(f"[boot] LoRA: {LORA_REPO}")
-    model = PeftModel.from_pretrained(model, LORA_REPO, token=HF_TOKEN or None)
-    LORA_ACTIVE = True
-    print("[boot] LoRA applied")
-except Exception as e:
-    print(f"[boot] LoRA failed (using base only): {e}")
+def _load():
+    """Lazy load on first @spaces.GPU call (running on GPU worker)."""
+    global _model, _tokenizer
+    if _model is not None:
+        return _model, _tokenizer
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    print(f"[lazy-load] tokenizer: {BASE_MODEL}")
+    tok = AutoTokenizer.from_pretrained(
+        BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
+    if tok.pad_token_id is None:
+        tok.pad_token_id = tok.eos_token_id
+    print(f"[lazy-load] base model on cuda")
+    m = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL, torch_dtype=torch.bfloat16,
+        token=HF_TOKEN or None, trust_remote_code=True,
+        device_map="cuda")
+    try:
+        from peft import PeftModel
+        print(f"[lazy-load] LoRA: {LORA_REPO}")
+        m = PeftModel.from_pretrained(m, LORA_REPO, token=HF_TOKEN or None)
+        print("[lazy-load] LoRA applied")
+    except Exception as e:
+        print(f"[lazy-load] LoRA failed (using base only): {e}")
+    _model, _tokenizer = m, tok
+    return _model, _tokenizer
 
 
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=180)
 def respond(message, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
+    if not message or not message.strip():
+        return ""
+    model, tok = _load()
     msgs = [{"role": "system", "content": SYSTEM}]
     for u, a in (history or []):
         if u: msgs.append({"role": "user", "content": u})
        if a: msgs.append({"role": "assistant", "content": a})
     msgs.append({"role": "user", "content": message})
 
-    prompt = tokenizer.apply_chat_template(
+    prompt = tok.apply_chat_template(
         msgs, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
-                       max_length=24000).to("cuda")
-    model.to("cuda")
-
+    inputs = tok(prompt, return_tensors="pt", truncation=True,
+                 max_length=24000).to("cuda")
     out = model.generate(
         **inputs,
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature) if temperature > 0 else 1e-5,
         top_p=float(top_p),
         do_sample=temperature > 0,
-        pad_token_id=tokenizer.pad_token_id,
-        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tok.pad_token_id,
+        eos_token_id=tok.eos_token_id,
         use_cache=True,
     )
     new_tokens = out[0][inputs["input_ids"].shape[1]:]
-    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+    return tok.decode(new_tokens, skip_special_tokens=True).strip()
 
 
 desc = (
-    f"**Base**: `{BASE_MODEL}`    "
-    f"**LoRA**: `{LORA_REPO}` "
-    f"{'✅ active' if LORA_ACTIVE else '⚠️ base only'}<br>"
-    f"**Hardware**: ZeroGPU A10G (PRO subscription, 25K min/mo @ $0)"
+    f"**Base**: `{BASE_MODEL}` &nbsp; **LoRA**: `{LORA_REPO}`<br>"
+    f"**Hardware**: ZeroGPU A10G (PRO, 25K min/mo @ $0). "
+    f"First request takes ~30-60s (cold model load), subsequent ~3-10s."
 )
 
 demo = gr.ChatInterface(
     fn=respond,
-    title="Surrogate-1 — DevSecOps + SRE + Code Agent",
+    title="Surrogate-1 — DevSecOps + Code Agent",
     description=desc,
     additional_inputs=[
         gr.Slider(64, 2048, value=512, step=64, label="max new tokens"),
         gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature"),
         gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p"),
     ],
-    examples=[
-        "Write a Terraform module for an S3 bucket with KMS encryption + versioning.",
-        "Diagnose: AWS Lambda cold start latency 3s. Architecture suggestions?",
-        "Review this IAM policy for least-privilege violations: <paste here>",
-        "Implement rate-limit per-API-key in FastAPI with Redis.",
-    ],
 )
 
 if __name__ == "__main__":
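
Usage note: a gr.ChatInterface app also exposes an HTTP API, so the Space can be queried from Python once it is running. A hedged sketch follows; the Space id is a placeholder (not taken from this commit), it assumes the default "/chat" endpoint that gr.ChatInterface registers, and it leaves the slider inputs at their default values.

from gradio_client import Client

client = Client("axentx/surrogate-1")  # placeholder Space id
reply = client.predict(
    "Implement per-API-key rate limiting in FastAPI with Redis.",
    api_name="/chat",
)
print(reply)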