surrogate1 committed on
Commit
e84dab6
·
verified ·
1 Parent(s): dc8decc

feat: 32B AWQ synth+judge

Browse files
Files changed (1) hide show
  1. app.py +69 -114
app.py CHANGED
@@ -1,48 +1,40 @@
1
- """surrogate1 ZeroGPU Space synth + judge worker.
2
 
3
- Qwen2.5-Coder-14B INT4 specialized for:
4
- POST /run/synth_batch — Magpie-style synthesizer (empty user prompt
5
- → diverse instructions + responses)
6
- • POST /run/judge_pair — LLM-as-judge (prompt + response + criteria
7
- → 0-10 score + rationale)
8
- • POST /run/best_of_n — generate N candidates + score, return best
9
-
10
- Cron loops on the 3 harvester Spaces hit these endpoints. Drains the
11
- combined 50K A10G-min/mo PRO budget into actual training data.
12
  """
13
- import os, json, random, re
14
  import gradio as gr
15
  import spaces
16
  import torch
17
 
18
- BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-7B-Instruct")
19
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
20
 
21
  DOMAIN_SEEDS = {
22
- "code-python": "a Python coding task (function, class, or script)",
23
  "code-typescript": "a TypeScript / Node.js coding task",
24
- "code-rust": "a Rust coding task",
25
- "code-go": "a Go coding task",
26
- "devops-tf": "a Terraform / IaC task",
27
- "devops-k8s": "a Kubernetes manifest or operator task",
28
- "devops-cdk": "an AWS CDK construct task",
29
- "ci-github": "a GitHub Actions workflow task",
30
- "sec-iam": "an IAM least-privilege policy task",
31
- "sec-cve": "a CVE remediation task",
32
- "sre-runbook": "an incident runbook task",
33
- "sre-slo": "an SLI/SLO/error-budget definition task",
34
- "data-sql": "a parameterized SQL query task",
35
- "ai-eng": "a RAG / vector / LoRA / vLLM task",
36
- "api-rest": "a REST API design task",
37
- "test-pytest": "a pytest test-writing task",
38
  }
39
 
40
  JUDGE_PRINCIPLES = (
41
- "Score the response on a 0-10 scale across these axes:\n"
42
- " correctness (does it work?), security (no leaked secrets, IAM least-priv, "
43
- "input validated), idiomatic (best practices for the stack), "
44
- "completeness (handles edge cases), citation (real APIs only).\n"
45
- "Return ONLY JSON: {\"score\": float, \"axes\": {...}, \"why\": \"...\"}"
46
  )
47
 
48
  _model = None
@@ -53,26 +45,21 @@ def _load_lazy():
53
  global _model, _tok
54
  if _model is not None:
55
  return _model, _tok
56
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
57
  _tok = AutoTokenizer.from_pretrained(
58
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
59
  if _tok.pad_token_id is None:
60
  _tok.pad_token_id = _tok.eos_token_id
61
  _model = AutoModelForCausalLM.from_pretrained(
62
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
63
- device_map="cuda",
64
- quantization_config=BitsAndBytesConfig(
65
- load_in_4bit=True,
66
- bnb_4bit_compute_dtype=torch.bfloat16,
67
- bnb_4bit_quant_type="nf4",
68
- bnb_4bit_use_double_quant=True))
69
  return _model, _tok
70
 
71
 
72
- def _generate(prompt: str, max_new=512, temp=0.7) -> str:
73
  model, tok = _load_lazy()
74
  inputs = tok(prompt, return_tensors="pt", truncation=True,
75
- max_length=12000).to("cuda")
76
  out = model.generate(
77
  **inputs, max_new_tokens=int(max_new), temperature=float(temp),
78
  top_p=0.9, do_sample=temp > 0,
@@ -82,130 +69,98 @@ def _generate(prompt: str, max_new=512, temp=0.7) -> str:
82
 
83
 
84
  @spaces.GPU(duration=300)
85
- def synth_batch(domain: str = "code-python", count: int = 10) -> str:
86
- """Magpie-style: model generates BOTH instruction and response.
87
- Returns JSONL string of {prompt, response, source, meta}.
88
- """
89
  model, tok = _load_lazy()
90
  seed_text = DOMAIN_SEEDS.get(domain, DOMAIN_SEEDS["code-python"])
91
  out_lines = []
92
  for i in range(int(count)):
93
- # Step 1: generate instruction (Magpie — empty user message)
94
  instr_msgs = [
95
  {"role": "system",
96
- "content": f"You are an expert who generates realistic, "
97
- f"diverse, high-quality user requests about {seed_text}. "
98
- "Output ONLY one user request — no preamble, no JSON, "
99
- "just the request as a single paragraph."},
100
  {"role": "user", "content": ""},
101
  ]
102
- instr_prompt = tok.apply_chat_template(
103
- instr_msgs, tokenize=False, add_generation_prompt=True)
104
- instruction = _generate(instr_prompt, max_new=180, temp=0.95)
105
- instruction = instruction.strip().split("\n")[0][:600]
106
  if len(instruction) < 30: continue
107
 
108
- # Step 2: generate response to that instruction
109
  resp_msgs = [
110
- {"role": "system", "content": "You are Surrogate-1, an expert "
111
- "DevSecOps + coding agent. Cite real APIs. No phantom imports."},
112
  {"role": "user", "content": instruction},
113
  ]
114
- resp_prompt = tok.apply_chat_template(
115
- resp_msgs, tokenize=False, add_generation_prompt=True)
116
- response = _generate(resp_prompt, max_new=512, temp=0.4)
117
  if len(response) < 50: continue
118
 
119
  out_lines.append(json.dumps({
120
- "prompt": instruction,
121
- "response": response,
122
- "source": "magpie-zerogpu",
123
- "meta": {"domain": domain, "seed": seed_text, "ix": i},
124
  }, ensure_ascii=False))
125
  return "\n".join(out_lines)
126
 
127
 
128
  @spaces.GPU(duration=120)
129
- def judge_pair(prompt: str, response: str, criteria: str = "default") -> str:
130
- """Score a (prompt, response) pair. Returns JSON string."""
131
  if not prompt or not response:
132
- return json.dumps({"score": 0.0, "why": "empty input"})
133
  model, tok = _load_lazy()
134
- judge_msgs = [
135
- {"role": "system",
136
- "content": "You are a strict senior code reviewer. " + JUDGE_PRINCIPLES},
137
- {"role": "user",
138
- "content": f"PROMPT:\n{prompt[:2000]}\n\nRESPONSE:\n{response[:4000]}\n\n"
139
- f"Score per the rubric. JSON only."},
140
  ]
141
- j_prompt = tok.apply_chat_template(
142
- judge_msgs, tokenize=False, add_generation_prompt=True)
143
- raw = _generate(j_prompt, max_new=400, temp=0.1)
144
- # Extract first JSON object
145
  m = re.search(r"\{[^{}]*\"score\"[^{}]*\}", raw, re.DOTALL)
146
  if m:
147
- try:
148
- d = json.loads(m.group(0))
149
- return json.dumps(d, ensure_ascii=False)
150
- except Exception:
151
- pass
152
  return json.dumps({"score": 5.0, "why": raw[:300], "raw": True})
153
 
154
 
155
  @spaces.GPU(duration=300)
156
- def best_of_n(prompt: str, n: int = 4, max_new: int = 512) -> str:
157
- """Generate N candidates, score each, return best."""
158
- if not prompt: return json.dumps({"error": "empty prompt"})
159
  model, tok = _load_lazy()
160
- sys_msg = ("You are Surrogate-1. Solve the user's task with production-"
161
- "quality code. Cite real APIs.")
162
  cands = []
163
  for i in range(int(n)):
164
- msgs = [{"role": "system", "content": sys_msg},
165
- {"role": "user", "content": prompt}]
166
  p = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
167
- out = _generate(p, max_new=int(max_new), temp=0.7 + 0.05 * i)
168
- cands.append(out)
169
- # Score each (cheap heuristic: length + has-code-block)
170
  scored = []
171
  for c in cands:
172
  s = min(1.0, len(c) / 800)
173
  if "```" in c: s += 0.2
174
- if "import " in c or "def " in c or "function " in c: s += 0.1
175
  scored.append((s, c))
176
  scored.sort(key=lambda x: -x[0])
177
  return json.dumps({"best": scored[0][1], "best_score": scored[0][0],
178
  "all": [c for _, c in scored]}, ensure_ascii=False)
179
 
180
 
181
- with gr.Blocks(title="Surrogate-1 synth + judge worker") as demo:
182
  gr.Markdown(
183
- f"# Surrogate-1 — synth + judge worker (24/7)\n"
184
- f"**{BASE_MODEL}** on ZeroGPU A10G (4-bit). 3 endpoints:\n"
185
- f"- `/run/synth_batch` — generate training pairs (Magpie)\n"
186
- f"- `/run/judge_pair` — score (prompt, response)\n"
187
- f"- `/run/best_of_n` — generate N + pick best"
188
  )
189
  with gr.Tab("synth"):
190
  d = gr.Dropdown(list(DOMAIN_SEEDS.keys()), value="code-python", label="domain")
191
- n = gr.Slider(1, 30, value=10, step=1, label="count")
192
- bt = gr.Button("synth_batch")
193
- out = gr.Textbox(label="JSONL", lines=20)
194
- bt.click(synth_batch, inputs=[d, n], outputs=out, api_name="synth_batch")
195
  with gr.Tab("judge"):
196
- p = gr.Textbox(label="prompt", lines=3)
197
- r = gr.Textbox(label="response", lines=8)
198
  c = gr.Textbox(label="criteria", value="default")
199
- bj = gr.Button("judge_pair")
200
- oj = gr.Textbox(label="JSON score", lines=8)
201
- bj.click(judge_pair, inputs=[p, r, c], outputs=oj, api_name="judge_pair")
202
  with gr.Tab("best-of-n"):
203
  bp = gr.Textbox(label="prompt", lines=3)
204
- bn = gr.Slider(2, 8, value=4, step=1, label="n")
205
- bm = gr.Slider(128, 1024, value=512, step=64, label="max new")
206
- bb = gr.Button("best_of_n")
207
- bo = gr.Textbox(label="JSON best", lines=15)
208
- bb.click(best_of_n, inputs=[bp, bn, bm], outputs=bo, api_name="best_of_n")
209
 
210
  if __name__ == "__main__":
211
- demo.queue(max_size=8).launch()
 
1
+ """surrogate1 ZeroGPU — Qwen2.5-Coder-32B AWQ synth+judge worker.
2
 
3
+ Higher-quality synth pairs + better judge calibration with 32B vs 7B.
4
+ AWQ pre-quantized = no bitsandbytes runtime quant = no startup error.
 
 
 
 
 
 
 
5
  """
6
+ import os, json, re
7
  import gradio as gr
8
  import spaces
9
  import torch
10
 
11
# Model repo to load. The AWQ build ships pre-quantized weights, so no
# runtime bitsandbytes quantization step is needed (see commit message).
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ")
# Optional HF auth token; empty string is treated as "no token" downstream.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# domain key -> short task description interpolated into the synth
# system prompt (see synth_batch); keys double as the UI dropdown choices.
DOMAIN_SEEDS = {
    "code-python": "a Python coding task",
    "code-typescript": "a TypeScript / Node.js coding task",
    "code-rust": "a Rust coding task",
    "code-go": "a Go coding task",
    "devops-tf": "a Terraform / IaC task",
    "devops-k8s": "a Kubernetes manifest or operator task",
    "devops-cdk": "an AWS CDK construct task",
    "ci-github": "a GitHub Actions workflow task",
    "sec-iam": "an IAM least-privilege policy task",
    "sec-cve": "a CVE remediation task",
    "sre-runbook": "an incident runbook task",
    "sre-slo": "an SLI/SLO/error-budget definition task",
    "data-sql": "a parameterized SQL query task",
    "ai-eng": "a RAG / vector / LoRA / vLLM task",
    "api-rest": "a REST API design task",
    "test-pytest": "a pytest test-writing task",
}

# Scoring rubric appended to the judge system prompt; instructs the model
# to reply with a single JSON object (parsed by regex in judge_pair).
JUDGE_PRINCIPLES = (
    "Score the response 0-10 on: correctness, security (no leaked secrets, "
    "IAM least-priv, input validated), idiomatic, completeness, citation "
    "(real APIs only). Return ONLY JSON: "
    "{\"score\": float, \"axes\": {...}, \"why\": \"...\"}"
)

# Lazily-initialized model singleton, populated on first use.
_model = None
 
45
  global _model, _tok
46
  if _model is not None:
47
  return _model, _tok
48
+ from transformers import AutoModelForCausalLM, AutoTokenizer
49
  _tok = AutoTokenizer.from_pretrained(
50
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
51
  if _tok.pad_token_id is None:
52
  _tok.pad_token_id = _tok.eos_token_id
53
  _model = AutoModelForCausalLM.from_pretrained(
54
  BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
55
+ device_map="cuda", torch_dtype=torch.bfloat16)
 
 
 
 
 
56
  return _model, _tok
57
 
58
 
59
+ def _generate(prompt, max_new=512, temp=0.7):
60
  model, tok = _load_lazy()
61
  inputs = tok(prompt, return_tensors="pt", truncation=True,
62
+ max_length=8000).to("cuda")
63
  out = model.generate(
64
  **inputs, max_new_tokens=int(max_new), temperature=float(temp),
65
  top_p=0.9, do_sample=temp > 0,
 
69
 
70
 
71
@spaces.GPU(duration=300)
def synth_batch(domain="code-python", count=10):
    """Magpie-style self-synthesis: for each sample the model first invents a
    user request about *domain* (system prompt + empty user turn), then
    answers its own request at a lower temperature.

    Returns a JSONL string of {prompt, response, source, meta} records.
    Instructions shorter than 30 chars and responses shorter than 50 chars
    are silently dropped, so fewer than *count* lines may come back.
    """
    model, tok = _load_lazy()  # warm the weights up front; tok renders chats
    seed_text = DOMAIN_SEEDS.get(domain, DOMAIN_SEEDS["code-python"])
    rows = []
    for ix in range(int(count)):
        # Pass 1: instruction synthesis — empty user message (Magpie trick).
        instr_rendered = tok.apply_chat_template(
            [
                {
                    "role": "system",
                    "content": (
                        f"Generate ONE realistic, diverse, high-quality user request about {seed_text}. "
                        "Output ONLY the request as a single paragraph. No preamble."
                    ),
                },
                {"role": "user", "content": ""},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
        raw_instruction = _generate(instr_rendered, max_new=180, temp=0.95)
        # Keep only the first line, capped at 600 chars.
        instruction = raw_instruction.strip().split("\n")[0][:600]
        if len(instruction) < 30:
            continue

        # Pass 2: answer the synthesized instruction deterministically-ish.
        resp_rendered = tok.apply_chat_template(
            [
                {
                    "role": "system",
                    "content": (
                        "You are Surrogate-1, expert DevSecOps + coding agent. "
                        "Cite real APIs. No phantom imports."
                    ),
                },
                {"role": "user", "content": instruction},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
        response = _generate(resp_rendered, max_new=512, temp=0.4)
        if len(response) < 50:
            continue

        rows.append(json.dumps(
            {
                "prompt": instruction,
                "response": response,
                "source": "magpie-zerogpu-32b",
                "meta": {"domain": domain, "ix": ix},
            },
            ensure_ascii=False,
        ))
    return "\n".join(rows)
102
 
103
 
104
@spaces.GPU(duration=120)
def judge_pair(prompt, response, criteria="default"):
    """LLM-as-judge: score a (prompt, response) pair against JUDGE_PRINCIPLES.

    Returns a JSON string like {"score": float, "axes": {...}, "why": "..."}.
    Empty input short-circuits to score 0.0 without loading the model. If no
    parseable JSON object is found in the model output, returns a neutral
    5.0 with a 300-char excerpt of the raw text and "raw": true.
    NOTE(review): `criteria` is accepted for API compatibility but currently
    unused — the rubric is always JUDGE_PRINCIPLES.
    """
    if not prompt or not response:
        return json.dumps({"score": 0.0, "why": "empty"})
    model, tok = _load_lazy()
    j_msgs = [
        {"role": "system", "content": "You are a strict senior code reviewer. " + JUDGE_PRINCIPLES},
        {"role": "user", "content": f"PROMPT:\n{prompt[:2000]}\n\nRESPONSE:\n{response[:4000]}\n\nScore. JSON only."},
    ]
    jp = tok.apply_chat_template(j_msgs, tokenize=False, add_generation_prompt=True)
    raw = _generate(jp, max_new=400, temp=0.1)
    # Extract the first flat JSON object that contains a "score" key.
    m = re.search(r"\{[^{}]*\"score\"[^{}]*\}", raw, re.DOTALL)
    if m:
        try:
            return json.dumps(json.loads(m.group(0)), ensure_ascii=False)
        except json.JSONDecodeError:
            # Was a bare `except:` — that also swallowed SystemExit /
            # KeyboardInterrupt. Only json.loads can raise here; on a
            # malformed object fall through to the neutral fallback.
            pass
    return json.dumps({"score": 5.0, "why": raw[:300], "raw": True})
120
 
121
 
122
@spaces.GPU(duration=300)
def best_of_n(prompt, n=4, max_new=512):
    """Sample N completions at slightly increasing temperatures, rank them
    with a cheap text heuristic (length, code fence, import/def markers),
    and return JSON {best, best_score, all} with candidates best-first.
    An empty prompt returns {"error": "empty"} without touching the model.
    """
    if not prompt:
        return json.dumps({"error": "empty"})
    model, tok = _load_lazy()
    system_prompt = "You are Surrogate-1. Solve the task with production-quality code."

    def _heuristic(text):
        # Capped length term plus bonuses for code-shaped output.
        score = min(1.0, len(text) / 800)
        if "```" in text:
            score += 0.2
        if "import " in text or "def " in text:
            score += 0.1
        return score

    candidates = []
    for draw in range(int(n)):
        chat = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}]
        rendered = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        # Temperature climbs per draw (0.70, 0.75, ...) for diversity.
        candidates.append(_generate(rendered, max_new=int(max_new), temp=0.7 + 0.05 * draw))

    # Stable descending sort, so ties keep generation order — same as the
    # original's sort(key=lambda x: -x[0]).
    ranked = sorted(((_heuristic(c), c) for c in candidates),
                    key=lambda pair: pair[0], reverse=True)
    return json.dumps({"best": ranked[0][1], "best_score": ranked[0][0],
                       "all": [c for _, c in ranked]}, ensure_ascii=False)
141
 
142
 
143
with gr.Blocks(title="Surrogate-1 32B synth+judge") as demo:
    # Header: model name plus the three cron-driven API endpoints exposed.
    gr.Markdown(
        "# Surrogate-1 — synth + judge (Qwen2.5-Coder-32B AWQ)\n"
        "3 endpoints: `/run/synth_batch`, `/run/judge_pair`, `/run/best_of_n`"
    )
    with gr.Tab("synth"):
        synth_domain = gr.Dropdown(list(DOMAIN_SEEDS.keys()), value="code-python", label="domain")
        synth_count = gr.Slider(1, 20, value=10, step=1, label="count")
        synth_btn = gr.Button("synth_batch")
        synth_out = gr.Textbox(label="JSONL", lines=20)
        synth_btn.click(synth_batch, [synth_domain, synth_count], synth_out,
                        api_name="synth_batch")
    with gr.Tab("judge"):
        judge_prompt = gr.Textbox(label="prompt", lines=3)
        judge_resp = gr.Textbox(label="response", lines=8)
        judge_criteria = gr.Textbox(label="criteria", value="default")
        judge_btn = gr.Button("judge_pair")
        judge_out = gr.Textbox(label="JSON", lines=8)
        judge_btn.click(judge_pair, [judge_prompt, judge_resp, judge_criteria], judge_out,
                        api_name="judge_pair")
    with gr.Tab("best-of-n"):
        bon_prompt = gr.Textbox(label="prompt", lines=3)
        bon_n = gr.Slider(2, 6, value=4, step=1, label="n")
        bon_max = gr.Slider(128, 1024, value=512, step=64, label="max_new")
        bon_btn = gr.Button("best_of_n")
        bon_out = gr.Textbox(label="JSON", lines=15)
        bon_btn.click(best_of_n, [bon_prompt, bon_n, bon_max], bon_out,
                      api_name="best_of_n")


if __name__ == "__main__":
    demo.queue(max_size=6).launch()