surrogate1 committed on
Commit f091fe3 · verified · 1 Parent(s): df84ec9

feat: specialize as synth+judge worker (3 endpoints, 14B INT4)

Files changed (1)
  1. app.py +179 -59
app.py CHANGED
@@ -1,91 +1,211 @@
- """Surrogate-1 ZeroGPU Space — minimal, works.

- Switched to Qwen2.5-Coder-3B (~6GB BF16 vs 14GB on 7B) for faster cold
- load (≤60s on A10G). Same Surrogate-1 v1 LoRA applies — only base model
- size differs. For long form / hard tasks, the chat ladder includes 7B
- fallback via free APIs; this Space serves the fast path.
  """
- import os
  import gradio as gr
  import spaces
  import torch

- BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-3B-Instruct")
- LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
  HF_TOKEN = os.environ.get("HF_TOKEN", "")

- SYSTEM = (
-     "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
-     "Cite real APIs only. Say 'I don't know' rather than confabulate."
  )

- # Module-level cache
  _model = None
  _tok = None


  def _load_lazy():
-     """Load only inside @spaces.GPU function (i.e., on GPU worker)."""
      global _model, _tok
      if _model is not None:
          return _model, _tok
-     from transformers import AutoModelForCausalLM, AutoTokenizer
      _tok = AutoTokenizer.from_pretrained(
          BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
      if _tok.pad_token_id is None:
          _tok.pad_token_id = _tok.eos_token_id
      _model = AutoModelForCausalLM.from_pretrained(
-         BASE_MODEL, torch_dtype=torch.bfloat16,
-         token=HF_TOKEN or None, trust_remote_code=True,
-         device_map="cuda")
-     # LoRA optional — base model size mismatch (3B vs 7B) makes v1 LoRA
-     # incompatible. We serve the base 3B for now; on 7B Space we apply LoRA.
-     if BASE_MODEL.endswith("7B-Instruct"):
-         try:
-             from peft import PeftModel
-             _model = PeftModel.from_pretrained(_model, LORA_REPO,
-                                                token=HF_TOKEN or None)
-         except Exception as e:
-             print(f"[load] LoRA skip: {e}")
      return _model, _tok


- @spaces.GPU(duration=300)
- def respond(message, history, max_new_tokens=512, temperature=0.4):
-     if not message or not message.strip():
-         return ""
      model, tok = _load_lazy()
-     msgs = [{"role": "system", "content": SYSTEM}]
-     for u, a in (history or []):
-         if u: msgs.append({"role": "user", "content": u})
-         if a: msgs.append({"role": "assistant", "content": a})
-     msgs.append({"role": "user", "content": message})
-
-     prompt = tok.apply_chat_template(msgs, tokenize=False,
-                                      add_generation_prompt=True)
      inputs = tok(prompt, return_tensors="pt", truncation=True,
-                  max_length=8000).to("cuda")
      out = model.generate(
-         **inputs,
-         max_new_tokens=int(max_new_tokens),
-         temperature=float(temperature) if temperature > 0 else 1e-5,
-         do_sample=temperature > 0,
-         pad_token_id=tok.pad_token_id,
-         eos_token_id=tok.eos_token_id,
      )
-     new_tokens = out[0][inputs["input_ids"].shape[1]:]
-     return tok.decode(new_tokens, skip_special_tokens=True).strip()
-
-
- demo = gr.ChatInterface(
-     fn=respond,
-     title=f"Surrogate-1 — {BASE_MODEL.split('/')[-1]}",
-     description=f"ZeroGPU A10G — {BASE_MODEL}. First request ~30-60s cold load.",
-     additional_inputs=[
-         gr.Slider(64, 2048, value=512, step=64, label="max new tokens"),
-         gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature"),
-     ],
- )

  if __name__ == "__main__":
-     demo.queue(max_size=10).launch()
+ """surrogate1 ZeroGPU Space — synth + judge worker.
2
 
3
+ Qwen2.5-Coder-14B INT4 specialized for:
4
+ POST /run/synth_batch — Magpie-style synthesizer (empty user prompt
5
+ diverse instructions + responses)
6
+ POST /run/judge_pair — LLM-as-judge (prompt + response + criteria
7
+ → 0-10 score + rationale)
8
+ • POST /run/best_of_n — generate N candidates + score, return best
9
+
10
+ Cron loops on the 3 harvester Spaces hit these endpoints. Drains the
11
+ combined 50K A10G-min/mo PRO budget into actual training data.
12
  """
13
+ import os, json, random, re
  import gradio as gr
  import spaces
  import torch

+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-14B-Instruct")
  HF_TOKEN = os.environ.get("HF_TOKEN", "")

+ DOMAIN_SEEDS = {
+     "code-python": "a Python coding task (function, class, or script)",
+     "code-typescript": "a TypeScript / Node.js coding task",
+     "code-rust": "a Rust coding task",
+     "code-go": "a Go coding task",
+     "devops-tf": "a Terraform / IaC task",
+     "devops-k8s": "a Kubernetes manifest or operator task",
+     "devops-cdk": "an AWS CDK construct task",
+     "ci-github": "a GitHub Actions workflow task",
+     "sec-iam": "an IAM least-privilege policy task",
+     "sec-cve": "a CVE remediation task",
+     "sre-runbook": "an incident runbook task",
+     "sre-slo": "an SLI/SLO/error-budget definition task",
+     "data-sql": "a parameterized SQL query task",
+     "ai-eng": "a RAG / vector / LoRA / vLLM task",
+     "api-rest": "a REST API design task",
+     "test-pytest": "a pytest test-writing task",
+ }
+
+ JUDGE_PRINCIPLES = (
+     "Score the response on a 0-10 scale across these axes:\n"
+     " correctness (does it work?), security (no leaked secrets, IAM least-priv, "
+     "input validated), idiomatic (best practices for the stack), "
+     "completeness (handles edge cases), citation (real APIs only).\n"
+     "Return ONLY JSON: {\"score\": float, \"axes\": {...}, \"why\": \"...\"}"
  )

  _model = None
  _tok = None


  def _load_lazy():
      global _model, _tok
      if _model is not None:
          return _model, _tok
+     from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
      _tok = AutoTokenizer.from_pretrained(
          BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
      if _tok.pad_token_id is None:
          _tok.pad_token_id = _tok.eos_token_id
      _model = AutoModelForCausalLM.from_pretrained(
+         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
+         device_map="cuda",
+         quantization_config=BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.bfloat16,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_use_double_quant=True))
      return _model, _tok


+ def _generate(prompt: str, max_new=512, temp=0.7) -> str:
      model, tok = _load_lazy()
      inputs = tok(prompt, return_tensors="pt", truncation=True,
+                  max_length=12000).to("cuda")
      out = model.generate(
+         **inputs, max_new_tokens=int(max_new), temperature=float(temp),
+         top_p=0.9, do_sample=temp > 0,
+         pad_token_id=tok.pad_token_id, eos_token_id=tok.eos_token_id)
+     return tok.decode(out[0][inputs["input_ids"].shape[1]:],
+                       skip_special_tokens=True).strip()
+
+
+ @spaces.GPU(duration=300)
+ def synth_batch(domain: str = "code-python", count: int = 10) -> str:
+     """Magpie-style: model generates BOTH instruction and response.
+     Returns JSONL string of {prompt, response, source, meta}.
+     """
+     model, tok = _load_lazy()
+     seed_text = DOMAIN_SEEDS.get(domain, DOMAIN_SEEDS["code-python"])
+     out_lines = []
+     for i in range(int(count)):
+         # Step 1: generate instruction (Magpie — empty user message)
+         instr_msgs = [
+             {"role": "system",
+              "content": f"You are an expert who generates realistic, "
+                         f"diverse, high-quality user requests about {seed_text}. "
+                         "Output ONLY one user request — no preamble, no JSON, "
+                         "just the request as a single paragraph."},
+             {"role": "user", "content": ""},
+         ]
+         instr_prompt = tok.apply_chat_template(
+             instr_msgs, tokenize=False, add_generation_prompt=True)
+         instruction = _generate(instr_prompt, max_new=180, temp=0.95)
+         instruction = instruction.strip().split("\n")[0][:600]
+         if len(instruction) < 30: continue
+
+         # Step 2: generate response to that instruction
+         resp_msgs = [
+             {"role": "system", "content": "You are Surrogate-1, an expert "
+              "DevSecOps + coding agent. Cite real APIs. No phantom imports."},
+             {"role": "user", "content": instruction},
+         ]
+         resp_prompt = tok.apply_chat_template(
+             resp_msgs, tokenize=False, add_generation_prompt=True)
+         response = _generate(resp_prompt, max_new=512, temp=0.4)
+         if len(response) < 50: continue
+
+         out_lines.append(json.dumps({
+             "prompt": instruction,
+             "response": response,
+             "source": "magpie-zerogpu",
+             "meta": {"domain": domain, "seed": seed_text, "ix": i},
+         }, ensure_ascii=False))
+     return "\n".join(out_lines)
+
+
+ @spaces.GPU(duration=120)
+ def judge_pair(prompt: str, response: str, criteria: str = "default") -> str:
+     """Score a (prompt, response) pair. Returns JSON string."""
+     if not prompt or not response:
+         return json.dumps({"score": 0.0, "why": "empty input"})
+     model, tok = _load_lazy()
+     judge_msgs = [
+         {"role": "system",
+          "content": "You are a strict senior code reviewer. " + JUDGE_PRINCIPLES},
+         {"role": "user",
+          "content": f"PROMPT:\n{prompt[:2000]}\n\nRESPONSE:\n{response[:4000]}\n\n"
+                     f"Score per the rubric. JSON only."},
+     ]
+     j_prompt = tok.apply_chat_template(
+         judge_msgs, tokenize=False, add_generation_prompt=True)
+     raw = _generate(j_prompt, max_new=400, temp=0.1)
+     # Extract the first JSON object (greedy match, so the nested "axes" object is kept)
+     m = re.search(r"\{.*\"score\".*\}", raw, re.DOTALL)
+     if m:
+         try:
+             d = json.loads(m.group(0))
+             return json.dumps(d, ensure_ascii=False)
+         except Exception:
+             pass
+     return json.dumps({"score": 5.0, "why": raw[:300], "raw": True})
+
+
+ @spaces.GPU(duration=300)
+ def best_of_n(prompt: str, n: int = 4, max_new: int = 512) -> str:
+     """Generate N candidates, score each, return best."""
+     if not prompt: return json.dumps({"error": "empty prompt"})
+     model, tok = _load_lazy()
+     sys_msg = ("You are Surrogate-1. Solve the user's task with production-"
+                "quality code. Cite real APIs.")
+     cands = []
+     for i in range(int(n)):
+         msgs = [{"role": "system", "content": sys_msg},
+                 {"role": "user", "content": prompt}]
+         p = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
+         out = _generate(p, max_new=int(max_new), temp=0.7 + 0.05 * i)
+         cands.append(out)
+     # Score each (cheap heuristic: length + has-code-block)
+     scored = []
+     for c in cands:
+         s = min(1.0, len(c) / 800)
+         if "```" in c: s += 0.2
+         if "import " in c or "def " in c or "function " in c: s += 0.1
+         scored.append((s, c))
+     scored.sort(key=lambda x: -x[0])
+     return json.dumps({"best": scored[0][1], "best_score": scored[0][0],
+                        "all": [c for _, c in scored]}, ensure_ascii=False)
+
+
+ with gr.Blocks(title="Surrogate-1 synth + judge worker") as demo:
+     gr.Markdown(
+         f"# Surrogate-1 — synth + judge worker (24/7)\n"
+         f"**{BASE_MODEL}** on ZeroGPU A10G (4-bit). 3 endpoints:\n"
+         f"- `/run/synth_batch` — generate training pairs (Magpie)\n"
+         f"- `/run/judge_pair` — score (prompt, response)\n"
+         f"- `/run/best_of_n` — generate N + pick best"
      )
+     with gr.Tab("synth"):
+         d = gr.Dropdown(list(DOMAIN_SEEDS.keys()), value="code-python", label="domain")
+         n = gr.Slider(1, 30, value=10, step=1, label="count")
+         bt = gr.Button("synth_batch")
+         out = gr.Textbox(label="JSONL", lines=20)
+         bt.click(synth_batch, inputs=[d, n], outputs=out, api_name="synth_batch")
+     with gr.Tab("judge"):
+         p = gr.Textbox(label="prompt", lines=3)
+         r = gr.Textbox(label="response", lines=8)
+         c = gr.Textbox(label="criteria", value="default")
+         bj = gr.Button("judge_pair")
+         oj = gr.Textbox(label="JSON score", lines=8)
+         bj.click(judge_pair, inputs=[p, r, c], outputs=oj, api_name="judge_pair")
+     with gr.Tab("best-of-n"):
+         bp = gr.Textbox(label="prompt", lines=3)
+         bn = gr.Slider(2, 8, value=4, step=1, label="n")
+         bm = gr.Slider(128, 1024, value=512, step=64, label="max new")
+         bb = gr.Button("best_of_n")
+         bo = gr.Textbox(label="JSON best", lines=15)
+         bb.click(best_of_n, inputs=[bp, bn, bm], outputs=bo, api_name="best_of_n")

  if __name__ == "__main__":
+     demo.queue(max_size=8).launch()
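Usage sketch (not part of this diff): one way a harvester cron step could drive the three endpoints with `gradio_client`. The Space id `surrogate1/synth-judge-worker`, the token, the shard path, and the 7.0 keep-threshold are placeholders, not values from the commit.

```python
# Hypothetical harvester-side cron step; Space id, token, shard path and the
# 7.0 keep-threshold are assumptions, not taken from this commit.
import json
from gradio_client import Client

client = Client("surrogate1/synth-judge-worker", hf_token="hf_...")  # placeholder Space id

# 1) Ask the worker for a small Magpie batch in one domain.
jsonl = client.predict("devops-k8s", 10, api_name="/synth_batch")

# 2) Re-score each pair with the judge endpoint; keep only strong ones.
kept = []
for line in jsonl.splitlines():
    if not line.strip():
        continue
    pair = json.loads(line)
    verdict = json.loads(client.predict(
        pair["prompt"], pair["response"], "default", api_name="/judge_pair"))
    if float(verdict.get("score", 0)) >= 7.0:
        pair["judge"] = verdict
        kept.append(pair)

# 3) Append the survivors to the local training shard.
with open("train_shard.jsonl", "a", encoding="utf-8") as f:
    for pair in kept:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")
```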