Spaces:

surrogate1
/

surrogate-1-zero-gpu

Running on Zero

App Files Community

surrogate1 committed 9 days ago

Commit

e84dab6

verified ·

1 Parent(s): dc8decc

feat: 32B AWQ synth+judge

Browse files

Files changed (1) hide show

app.py +69 -114

app.py CHANGED Viewed

@@ -1,48 +1,40 @@
-"""surrogate1 ZeroGPU Space — synth + judge worker.
-Qwen2.5-Coder-14B INT4 specialized for:
-  • POST /run/synth_batch  — Magpie-style synthesizer (empty user prompt
-                             → diverse instructions + responses)
-  • POST /run/judge_pair   — LLM-as-judge (prompt + response + criteria
-                             → 0-10 score + rationale)
-  • POST /run/best_of_n    — generate N candidates + score, return best
-Cron loops on the 3 harvester Spaces hit these endpoints. Drains the
-combined 50K A10G-min/mo PRO budget into actual training data.
 """
-import os, json, random, re
 import gradio as gr
 import spaces
 import torch
-BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-7B-Instruct")
 HF_TOKEN   = os.environ.get("HF_TOKEN", "")
 DOMAIN_SEEDS = {
-    "code-python":   "a Python coding task (function, class, or script)",
     "code-typescript": "a TypeScript / Node.js coding task",
-    "code-rust":     "a Rust coding task",
-    "code-go":       "a Go coding task",
-    "devops-tf":     "a Terraform / IaC task",
-    "devops-k8s":    "a Kubernetes manifest or operator task",
-    "devops-cdk":    "an AWS CDK construct task",
-    "ci-github":     "a GitHub Actions workflow task",
-    "sec-iam":       "an IAM least-privilege policy task",
-    "sec-cve":       "a CVE remediation task",
-    "sre-runbook":   "an incident runbook task",
-    "sre-slo":       "an SLI/SLO/error-budget definition task",
-    "data-sql":      "a parameterized SQL query task",
-    "ai-eng":        "a RAG / vector / LoRA / vLLM task",
-    "api-rest":      "a REST API design task",
-    "test-pytest":   "a pytest test-writing task",
 }
 JUDGE_PRINCIPLES = (
-    "Score the response on a 0-10 scale across these axes:\n"
-    "  correctness (does it work?), security (no leaked secrets, IAM least-priv, "
-    "input validated), idiomatic (best practices for the stack), "
-    "completeness (handles edge cases), citation (real APIs only).\n"
-    "Return ONLY JSON: {\"score\": float, \"axes\": {...}, \"why\": \"...\"}"
 )
 _model = None
@@ -53,26 +45,21 @@ def _load_lazy():
     global _model, _tok
     if _model is not None:
         return _model, _tok
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
     _tok = AutoTokenizer.from_pretrained(
         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
     if _tok.pad_token_id is None:
         _tok.pad_token_id = _tok.eos_token_id
     _model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
-        device_map="cuda",
-        quantization_config=BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.bfloat16,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True))
     return _model, _tok
-def _generate(prompt: str, max_new=512, temp=0.7) -> str:
     model, tok = _load_lazy()
     inputs = tok(prompt, return_tensors="pt", truncation=True,
-                 max_length=12000).to("cuda")
     out = model.generate(
         **inputs, max_new_tokens=int(max_new), temperature=float(temp),
         top_p=0.9, do_sample=temp > 0,
@@ -82,130 +69,98 @@ def _generate(prompt: str, max_new=512, temp=0.7) -> str:
 @spaces.GPU(duration=300)
-def synth_batch(domain: str = "code-python", count: int = 10) -> str:
-    """Magpie-style: model generates BOTH instruction and response.
-    Returns JSONL string of {prompt, response, source, meta}.
-    """
     model, tok = _load_lazy()
     seed_text = DOMAIN_SEEDS.get(domain, DOMAIN_SEEDS["code-python"])
     out_lines = []
     for i in range(int(count)):
-        # Step 1: generate instruction (Magpie — empty user message)
         instr_msgs = [
             {"role": "system",
-             "content": f"You are an expert who generates realistic, "
-                        f"diverse, high-quality user requests about {seed_text}. "
-                        "Output ONLY one user request — no preamble, no JSON, "
-                        "just the request as a single paragraph."},
             {"role": "user", "content": ""},
         ]
-        instr_prompt = tok.apply_chat_template(
-            instr_msgs, tokenize=False, add_generation_prompt=True)
-        instruction = _generate(instr_prompt, max_new=180, temp=0.95)
-        instruction = instruction.strip().split("\n")[0][:600]
         if len(instruction) < 30: continue
-        # Step 2: generate response to that instruction
         resp_msgs = [
-            {"role": "system", "content": "You are Surrogate-1, an expert "
-             "DevSecOps + coding agent. Cite real APIs. No phantom imports."},
             {"role": "user", "content": instruction},
         ]
-        resp_prompt = tok.apply_chat_template(
-            resp_msgs, tokenize=False, add_generation_prompt=True)
-        response = _generate(resp_prompt, max_new=512, temp=0.4)
         if len(response) < 50: continue
         out_lines.append(json.dumps({
-            "prompt":   instruction,
-            "response": response,
-            "source":   "magpie-zerogpu",
-            "meta":     {"domain": domain, "seed": seed_text, "ix": i},
         }, ensure_ascii=False))
     return "\n".join(out_lines)
 @spaces.GPU(duration=120)
-def judge_pair(prompt: str, response: str, criteria: str = "default") -> str:
-    """Score a (prompt, response) pair. Returns JSON string."""
     if not prompt or not response:
-        return json.dumps({"score": 0.0, "why": "empty input"})
     model, tok = _load_lazy()
-    judge_msgs = [
-        {"role": "system",
-         "content": "You are a strict senior code reviewer. " + JUDGE_PRINCIPLES},
-        {"role": "user",
-         "content": f"PROMPT:\n{prompt[:2000]}\n\nRESPONSE:\n{response[:4000]}\n\n"
-                    f"Score per the rubric. JSON only."},
     ]
-    j_prompt = tok.apply_chat_template(
-        judge_msgs, tokenize=False, add_generation_prompt=True)
-    raw = _generate(j_prompt, max_new=400, temp=0.1)
-    # Extract first JSON object
     m = re.search(r"\{[^{}]*\"score\"[^{}]*\}", raw, re.DOTALL)
     if m:
-        try:
-            d = json.loads(m.group(0))
-            return json.dumps(d, ensure_ascii=False)
-        except Exception:
-            pass
     return json.dumps({"score": 5.0, "why": raw[:300], "raw": True})
 @spaces.GPU(duration=300)
-def best_of_n(prompt: str, n: int = 4, max_new: int = 512) -> str:
-    """Generate N candidates, score each, return best."""
-    if not prompt: return json.dumps({"error": "empty prompt"})
     model, tok = _load_lazy()
-    sys_msg = ("You are Surrogate-1. Solve the user's task with production-"
-               "quality code. Cite real APIs.")
     cands = []
     for i in range(int(n)):
-        msgs = [{"role": "system", "content": sys_msg},
-                {"role": "user", "content": prompt}]
         p = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
-        out = _generate(p, max_new=int(max_new), temp=0.7 + 0.05 * i)
-        cands.append(out)
-    # Score each (cheap heuristic: length + has-code-block)
     scored = []
     for c in cands:
         s = min(1.0, len(c) / 800)
         if "```" in c: s += 0.2
-        if "import " in c or "def " in c or "function " in c: s += 0.1
         scored.append((s, c))
     scored.sort(key=lambda x: -x[0])
     return json.dumps({"best": scored[0][1], "best_score": scored[0][0],
                         "all": [c for _, c in scored]}, ensure_ascii=False)
-with gr.Blocks(title="Surrogate-1 synth + judge worker") as demo:
     gr.Markdown(
-        f"# Surrogate-1 — synth + judge worker (24/7)\n"
-        f"**{BASE_MODEL}** on ZeroGPU A10G (4-bit). 3 endpoints:\n"
-        f"- `/run/synth_batch` — generate training pairs (Magpie)\n"
-        f"- `/run/judge_pair` — score (prompt, response)\n"
-        f"- `/run/best_of_n`  — generate N + pick best"
     )
     with gr.Tab("synth"):
         d = gr.Dropdown(list(DOMAIN_SEEDS.keys()), value="code-python", label="domain")
-        n = gr.Slider(1, 30, value=10, step=1, label="count")
-        bt = gr.Button("synth_batch")
-        out = gr.Textbox(label="JSONL", lines=20)
-        bt.click(synth_batch, inputs=[d, n], outputs=out, api_name="synth_batch")
     with gr.Tab("judge"):
-        p = gr.Textbox(label="prompt", lines=3)
-        r = gr.Textbox(label="response", lines=8)
         c = gr.Textbox(label="criteria", value="default")
-        bj = gr.Button("judge_pair")
-        oj = gr.Textbox(label="JSON score", lines=8)
-        bj.click(judge_pair, inputs=[p, r, c], outputs=oj, api_name="judge_pair")
     with gr.Tab("best-of-n"):
         bp = gr.Textbox(label="prompt", lines=3)
-        bn = gr.Slider(2, 8, value=4, step=1, label="n")
-        bm = gr.Slider(128, 1024, value=512, step=64, label="max new")
-        bb = gr.Button("best_of_n")
-        bo = gr.Textbox(label="JSON best", lines=15)
-        bb.click(best_of_n, inputs=[bp, bn, bm], outputs=bo, api_name="best_of_n")
 if __name__ == "__main__":
-    demo.queue(max_size=8).launch()

+"""surrogate1 ZeroGPU — Qwen2.5-Coder-32B AWQ synth+judge worker.
+Higher-quality synth pairs + better judge calibration with 32B vs 7B.
+AWQ pre-quantized = no bitsandbytes runtime quant = no startup error.
 """
+import os, json, re
 import gradio as gr
 import spaces
 import torch
+BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ")
 HF_TOKEN   = os.environ.get("HF_TOKEN", "")
 DOMAIN_SEEDS = {
+    "code-python": "a Python coding task",
     "code-typescript": "a TypeScript / Node.js coding task",
+    "code-rust": "a Rust coding task",
+    "code-go": "a Go coding task",
+    "devops-tf": "a Terraform / IaC task",
+    "devops-k8s": "a Kubernetes manifest or operator task",
+    "devops-cdk": "an AWS CDK construct task",
+    "ci-github": "a GitHub Actions workflow task",
+    "sec-iam": "an IAM least-privilege policy task",
+    "sec-cve": "a CVE remediation task",
+    "sre-runbook": "an incident runbook task",
+    "sre-slo": "an SLI/SLO/error-budget definition task",
+    "data-sql": "a parameterized SQL query task",
+    "ai-eng": "a RAG / vector / LoRA / vLLM task",
+    "api-rest": "a REST API design task",
+    "test-pytest": "a pytest test-writing task",
 }
 JUDGE_PRINCIPLES = (
+    "Score the response 0-10 on: correctness, security (no leaked secrets, "
+    "IAM least-priv, input validated), idiomatic, completeness, citation "
+    "(real APIs only). Return ONLY JSON: "
+    "{\"score\": float, \"axes\": {...}, \"why\": \"...\"}"
 )
 _model = None
     global _model, _tok
     if _model is not None:
         return _model, _tok
+    from transformers import AutoModelForCausalLM, AutoTokenizer
     _tok = AutoTokenizer.from_pretrained(
         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
     if _tok.pad_token_id is None:
         _tok.pad_token_id = _tok.eos_token_id
     _model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
+        device_map="cuda", torch_dtype=torch.bfloat16)
     return _model, _tok
+def _generate(prompt, max_new=512, temp=0.7):
     model, tok = _load_lazy()
     inputs = tok(prompt, return_tensors="pt", truncation=True,
+                 max_length=8000).to("cuda")
     out = model.generate(
         **inputs, max_new_tokens=int(max_new), temperature=float(temp),
         top_p=0.9, do_sample=temp > 0,
 @spaces.GPU(duration=300)
+def synth_batch(domain="code-python", count=10):
     model, tok = _load_lazy()
     seed_text = DOMAIN_SEEDS.get(domain, DOMAIN_SEEDS["code-python"])
     out_lines = []
     for i in range(int(count)):
         instr_msgs = [
             {"role": "system",
+             "content": f"Generate ONE realistic, diverse, high-quality user request about {seed_text}. "
+                        "Output ONLY the request as a single paragraph. No preamble."},
             {"role": "user", "content": ""},
         ]
+        ip = tok.apply_chat_template(instr_msgs, tokenize=False, add_generation_prompt=True)
+        instruction = _generate(ip, max_new=180, temp=0.95).strip().split("\n")[0][:600]
         if len(instruction) < 30: continue
         resp_msgs = [
+            {"role": "system", "content": "You are Surrogate-1, expert DevSecOps + coding agent. "
+             "Cite real APIs. No phantom imports."},
             {"role": "user", "content": instruction},
         ]
+        rp = tok.apply_chat_template(resp_msgs, tokenize=False, add_generation_prompt=True)
+        response = _generate(rp, max_new=512, temp=0.4)
         if len(response) < 50: continue
         out_lines.append(json.dumps({
+            "prompt": instruction, "response": response,
+            "source": "magpie-zerogpu-32b",
+            "meta": {"domain": domain, "ix": i},
         }, ensure_ascii=False))
     return "\n".join(out_lines)
 @spaces.GPU(duration=120)
+def judge_pair(prompt, response, criteria="default"):
     if not prompt or not response:
+        return json.dumps({"score": 0.0, "why": "empty"})
     model, tok = _load_lazy()
+    j_msgs = [
+        {"role": "system", "content": "You are a strict senior code reviewer. " + JUDGE_PRINCIPLES},
+        {"role": "user", "content": f"PROMPT:\n{prompt[:2000]}\n\nRESPONSE:\n{response[:4000]}\n\nScore. JSON only."},
     ]
+    jp = tok.apply_chat_template(j_msgs, tokenize=False, add_generation_prompt=True)
+    raw = _generate(jp, max_new=400, temp=0.1)
     m = re.search(r"\{[^{}]*\"score\"[^{}]*\}", raw, re.DOTALL)
     if m:
+        try: return json.dumps(json.loads(m.group(0)), ensure_ascii=False)
+        except: pass
     return json.dumps({"score": 5.0, "why": raw[:300], "raw": True})
 @spaces.GPU(duration=300)
+def best_of_n(prompt, n=4, max_new=512):
+    if not prompt: return json.dumps({"error": "empty"})
     model, tok = _load_lazy()
+    sys_msg = "You are Surrogate-1. Solve the task with production-quality code."
     cands = []
     for i in range(int(n)):
+        msgs = [{"role": "system", "content": sys_msg}, {"role": "user", "content": prompt}]
         p = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
+        cands.append(_generate(p, max_new=int(max_new), temp=0.7 + 0.05 * i))
     scored = []
     for c in cands:
         s = min(1.0, len(c) / 800)
         if "```" in c: s += 0.2
+        if "import " in c or "def " in c: s += 0.1
         scored.append((s, c))
     scored.sort(key=lambda x: -x[0])
     return json.dumps({"best": scored[0][1], "best_score": scored[0][0],
                         "all": [c for _, c in scored]}, ensure_ascii=False)
+with gr.Blocks(title="Surrogate-1 32B synth+judge") as demo:
     gr.Markdown(
+        f"# Surrogate-1 — synth + judge (Qwen2.5-Coder-32B AWQ)\n"
+        f"3 endpoints: `/run/synth_batch`, `/run/judge_pair`, `/run/best_of_n`"
     )
     with gr.Tab("synth"):
         d = gr.Dropdown(list(DOMAIN_SEEDS.keys()), value="code-python", label="domain")
+        n = gr.Slider(1, 20, value=10, step=1, label="count")
+        gr.Button("synth_batch").click(synth_batch, [d, n], gr.Textbox(label="JSONL", lines=20),
+                                         api_name="synth_batch")
     with gr.Tab("judge"):
+        p = gr.Textbox(label="prompt", lines=3); r = gr.Textbox(label="response", lines=8)
         c = gr.Textbox(label="criteria", value="default")
+        gr.Button("judge_pair").click(judge_pair, [p, r, c], gr.Textbox(label="JSON", lines=8),
+                                        api_name="judge_pair")
     with gr.Tab("best-of-n"):
         bp = gr.Textbox(label="prompt", lines=3)
+        bn = gr.Slider(2, 6, value=4, step=1, label="n")
+        bm = gr.Slider(128, 1024, value=512, step=64, label="max_new")
+        gr.Button("best_of_n").click(best_of_n, [bp, bn, bm], gr.Textbox(label="JSON", lines=15),
+                                        api_name="best_of_n")
 if __name__ == "__main__":
+    demo.queue(max_size=6).launch()