ashirato committed on
Commit 0367d10 · verified · 1 Parent(s): 16a3ce3

fix: use gr.ChatInterface (simpler sig, avoids _json_schema bug)
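For context: gr.ChatInterface takes a single chat function with a fixed (message, history, *additional_inputs) signature, so gradio_client introspects a flat, well-known schema instead of the custom Blocks endpoint graph that tripped the _json_schema_to_python_type recursion. A minimal sketch of the pattern this commit adopts (toy echo function, not this Space's actual code; see the diff below):

import gradio as gr

# ChatInterface calls fn with the latest user message plus the chat
# history and renders the returned string as the bot reply.
def respond(message, history):
    return f"echo: {message}"

demo = gr.ChatInterface(fn=respond, title="minimal ChatInterface")

if __name__ == "__main__":
    demo.queue().launch()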

Files changed (1)
  app.py +46 -104
app.py CHANGED
@@ -1,14 +1,14 @@
-"""Surrogate-1 ZeroGPU Space — Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA.
+"""Surrogate-1 ZeroGPU Space — Qwen2.5-Coder-7B + v1 LoRA.

-Loads on CPU, swaps to ZeroGPU (A10G) per request via @spaces.GPU.
-PRO subscription on owner account = 25K GPU-min/mo at $0.
+Rewritten 2026-04-30 to use gr.ChatInterface (simpler signature, avoids
+the gradio_client._json_schema_to_python_type recursion bug that broke
+the previous custom-Blocks app.py).
 """
 import os
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from threading import Thread
+from transformers import AutoModelForCausalLM, AutoTokenizer

 BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
 LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
@@ -16,60 +16,48 @@ HF_TOKEN = os.environ.get("HF_TOKEN", "")

 SYSTEM = (
     "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
-    "You handle Terraform/CDK/CFN, Kubernetes, IAM least-privilege, CVE "
-    "remediation, SLO/runbooks, and full-stack code in Python/TypeScript/"
-    "Go/Rust. Cite real APIs only — no phantom imports. When uncertain, "
-    "say 'I don't know' rather than confabulate."
+    "Cite real APIs only — no phantom imports. When uncertain, say "
+    "'I don't know' rather than confabulate."
 )

-print(f"[boot] loading tokenizer: {BASE_MODEL}")
+print(f"[boot] tokenizer: {BASE_MODEL}")
 tokenizer = AutoTokenizer.from_pretrained(
     BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
+if tokenizer.pad_token_id is None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id

-print(f"[boot] loading base model on CPU (will move to GPU on call): {BASE_MODEL}")
+print(f"[boot] base model on CPU: {BASE_MODEL}")
 model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL, torch_dtype=torch.bfloat16,
     token=HF_TOKEN or None, trust_remote_code=True,
     device_map="cpu")

-# Try to apply LoRA — graceful fallback to base if LoRA repo unavailable
 LORA_ACTIVE = False
 try:
     from peft import PeftModel
-    print(f"[boot] applying LoRA: {LORA_REPO}")
+    print(f"[boot] LoRA: {LORA_REPO}")
     model = PeftModel.from_pretrained(model, LORA_REPO, token=HF_TOKEN or None)
     LORA_ACTIVE = True
-    print("[boot] LoRA applied OK")
+    print("[boot] LoRA applied")
 except Exception as e:
-    print(f"[boot] LoRA apply failed (using base only): {e}")
-
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
+    print(f"[boot] LoRA failed (using base only): {e}")


-def render_messages(history: list[tuple[str, str]], user_msg: str) -> str:
+@spaces.GPU(duration=120)
+def respond(message, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
     msgs = [{"role": "system", "content": SYSTEM}]
-    for u, a in history:
+    for u, a in (history or []):
         if u: msgs.append({"role": "user", "content": u})
         if a: msgs.append({"role": "assistant", "content": a})
-    msgs.append({"role": "user", "content": user_msg})
-    return tokenizer.apply_chat_template(
-        msgs, tokenize=False, add_generation_prompt=True)
-
+    msgs.append({"role": "user", "content": message})

-@spaces.GPU(duration=120)
-def chat(user_msg, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
-    if not user_msg or not user_msg.strip():
-        yield ""; return
-
-    prompt_text = render_messages(history or [], user_msg)
-    inputs = tokenizer(prompt_text, return_tensors="pt", truncation=True,
+    prompt = tokenizer.apply_chat_template(
+        msgs, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                        max_length=24000).to("cuda")
     model.to("cuda")

-    streamer = TextIteratorStreamer(
-        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=60)
-    gen_kwargs = dict(
+    out = model.generate(
         **inputs,
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature) if temperature > 0 else 1e-5,
@@ -77,81 +65,35 @@ def chat(user_msg, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
         do_sample=temperature > 0,
         pad_token_id=tokenizer.pad_token_id,
         eos_token_id=tokenizer.eos_token_id,
-        streamer=streamer,
         use_cache=True,
     )
-    th = Thread(target=model.generate, kwargs=gen_kwargs)
-    th.start()
-    out = ""
-    for chunk in streamer:
-        out += chunk
-        yield out
-    th.join()
+    new_tokens = out[0][inputs["input_ids"].shape[1]:]
+    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


-CSS = """
-.gradio-container { max-width: 1100px !important; }
-.contain { font-family: ui-monospace, SFMono-Regular, monospace; }
-"""
-
-with gr.Blocks(title="Surrogate-1 ZeroGPU", css=CSS,
-               theme=gr.themes.Base()) as demo:
-    gr.Markdown(
-        f"# Surrogate-1 ZeroGPU\n"
-        f"**Base**: `{BASE_MODEL}`  \n"
-        f"**LoRA**: `{LORA_REPO}` {'✅ active' if LORA_ACTIVE else '⚠️ base only (LoRA load failed)'}  \n"
-        f"**Hardware**: ZeroGPU A10G (PRO subscription, 25K min/mo @ $0)\n"
-    )
-
-    with gr.Row():
-        with gr.Column(scale=4):
-            chatbot = gr.Chatbot(
-                height=560,
-                show_label=False,
-                avatar_images=(None, None),
-                bubble_full_width=False,
-            )
-            msg = gr.Textbox(
-                placeholder="ask Surrogate-1 anything: code, devops, security, sre...",
-                show_label=False,
-                lines=2,
-            )
-            with gr.Row():
-                submit = gr.Button("send", variant="primary")
-                clear = gr.Button("clear")
-        with gr.Column(scale=1):
-            max_new = gr.Slider(64, 2048, value=512, step=64, label="max new tokens")
-            temp = gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature")
-            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p")
-
-    def _user(user_msg, hist):
-        return "", (hist or []) + [(user_msg, None)]
-
-    def _bot(hist, mn, t, tp):
-        if not hist: return
-        user_msg = hist[-1][0]
-        h_for_render = hist[:-1]
-        partial = ""
-        for chunk in chat(user_msg, h_for_render, mn, t, tp):
-            partial = chunk
-            hist[-1] = (user_msg, partial)
-            yield hist
-
-    submit.click(_user, [msg, chatbot], [msg, chatbot], queue=False) \
-        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
-    msg.submit(_user, [msg, chatbot], [msg, chatbot], queue=False) \
-        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
-    clear.click(lambda: None, None, chatbot, queue=False)
-
-    gr.Markdown(
-        "---\n"
-        "**API**: any caller can hit `/api/predict` on this Space (Gradio API).  \n"
-        "**Programmatic**: `from gradio_client import Client; "
-        "Client('ashirato/surrogate-1-zero-gpu').predict(...)`.  \n"
-        "**Source**: [github.com/axentx/surrogate-1](https://github.com/axentx) "
-        "(orchestration on `axentx/surrogate-1`, inference here)."
-    )
-
+desc = (
+    f"**Base**: `{BASE_MODEL}`     "
+    f"**LoRA**: `{LORA_REPO}` "
+    f"{'✅ active' if LORA_ACTIVE else '⚠️ base only'}<br>"
+    f"**Hardware**: ZeroGPU A10G (PRO subscription, 25K min/mo @ $0)"
+)
+
+demo = gr.ChatInterface(
+    fn=respond,
+    title="Surrogate-1 — DevSecOps + SRE + Code Agent",
+    description=desc,
+    additional_inputs=[
+        gr.Slider(64, 2048, value=512, step=64, label="max new tokens"),
+        gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature"),
+        gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p"),
+    ],
+    examples=[
+        "Write a Terraform module for an S3 bucket with KMS encryption + versioning.",
+        "Diagnose: AWS Lambda cold start latency 3s. Architecture suggestions?",
+        "Review this IAM policy for least-privilege violations: <paste here>",
+        "Implement rate-limit per-API-key in FastAPI with Redis.",
+    ],
+)

 if __name__ == "__main__":
     demo.queue(max_size=20).launch()
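
With ChatInterface in place, remote callers use the standard chat endpoint rather than the old custom Blocks routes. A sketch of programmatic access via gradio_client; "/chat" is Gradio's default api_name for ChatInterface and the trailing argument order mirrors additional_inputs above, but both are assumptions here, so verify with client.view_api() before depending on them:

from gradio_client import Client

client = Client("ashirato/surrogate-1-zero-gpu")

# Positional args: the user message first, then the additional_inputs
# sliders in the order declared in app.py (max new tokens, temperature,
# top_p). Check client.view_api() for the exact signature.
reply = client.predict(
    "Write a Terraform module for an S3 bucket with KMS encryption.",
    512,
    0.4,
    0.9,
    api_name="/chat",
)
print(reply)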