surrogate1 committed on
Commit f091fe3 · verified · 1 Parent(s): df84ec9

feat: specialize as synth+judge worker (3 endpoints, 14B INT4)

Files changed (1)
  1. app.py +179 -59
app.py CHANGED
@@ -1,91 +1,211 @@
- """Surrogate-1 ZeroGPU Space — minimal, works.

- Switched to Qwen2.5-Coder-3B (~6GB BF16 vs 14GB on 7B) for faster cold
- load (≤60s on A10G). Same Surrogate-1 v1 LoRA applies — only base model
- size differs. For long form / hard tasks, the chat ladder includes 7B
- fallback via free APIs; this Space serves the fast path.
  """
- import os
  import gradio as gr
  import spaces
  import torch

- BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-3B-Instruct")
- LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
  HF_TOKEN = os.environ.get("HF_TOKEN", "")

- SYSTEM = (
-     "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
-     "Cite real APIs only. Say 'I don't know' rather than confabulate."
  )

- # Module-level cache
  _model = None
  _tok = None


  def _load_lazy():
-     """Load only inside @spaces.GPU function (i.e., on GPU worker)."""
      global _model, _tok
      if _model is not None:
          return _model, _tok
-     from transformers import AutoModelForCausalLM, AutoTokenizer
      _tok = AutoTokenizer.from_pretrained(
          BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
      if _tok.pad_token_id is None:
          _tok.pad_token_id = _tok.eos_token_id
      _model = AutoModelForCausalLM.from_pretrained(
-         BASE_MODEL, torch_dtype=torch.bfloat16,
-         token=HF_TOKEN or None, trust_remote_code=True,
-         device_map="cuda")
-     # LoRA optional — base model size mismatch (3B vs 7B) makes v1 LoRA
-     # incompatible. We serve the base 3B for now; on 7B Space we apply LoRA.
-     if BASE_MODEL.endswith("7B-Instruct"):
-         try:
-             from peft import PeftModel
-             _model = PeftModel.from_pretrained(_model, LORA_REPO,
-                                                token=HF_TOKEN or None)
-         except Exception as e:
-             print(f"[load] LoRA skip: {e}")
      return _model, _tok


- @spaces.GPU(duration=300)
- def respond(message, history, max_new_tokens=512, temperature=0.4):
-     if not message or not message.strip():
-         return ""
      model, tok = _load_lazy()
-     msgs = [{"role": "system", "content": SYSTEM}]
-     for u, a in (history or []):
-         if u: msgs.append({"role": "user", "content": u})
-         if a: msgs.append({"role": "assistant", "content": a})
-     msgs.append({"role": "user", "content": message})
-
-     prompt = tok.apply_chat_template(msgs, tokenize=False,
-                                      add_generation_prompt=True)
      inputs = tok(prompt, return_tensors="pt", truncation=True,
-                  max_length=8000).to("cuda")
      out = model.generate(
-         **inputs,
-         max_new_tokens=int(max_new_tokens),
-         temperature=float(temperature) if temperature > 0 else 1e-5,
-         do_sample=temperature > 0,
-         pad_token_id=tok.pad_token_id,
-         eos_token_id=tok.eos_token_id,
      )
-     new_tokens = out[0][inputs["input_ids"].shape[1]:]
-     return tok.decode(new_tokens, skip_special_tokens=True).strip()
-
-
- demo = gr.ChatInterface(
-     fn=respond,
-     title=f"Surrogate-1 — {BASE_MODEL.split('/')[-1]}",
-     description=f"ZeroGPU A10G — {BASE_MODEL}. First request ~30-60s cold load.",
-     additional_inputs=[
-         gr.Slider(64, 2048, value=512, step=64, label="max new tokens"),
-         gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature"),
-     ],
- )

  if __name__ == "__main__":
-     demo.queue(max_size=10).launch()
+ """surrogate1 ZeroGPU Space — synth + judge worker.
2
 
3
+ Qwen2.5-Coder-14B INT4 specialized for:
4
+ POST /run/synth_batch — Magpie-style synthesizer (empty user prompt
5
+ diverse instructions + responses)
6
+ POST /run/judge_pair — LLM-as-judge (prompt + response + criteria
7
+ → 0-10 score + rationale)
8
+ • POST /run/best_of_n — generate N candidates + score, return best
9
+
10
+ Cron loops on the 3 harvester Spaces hit these endpoints. Drains the
11
+ combined 50K A10G-min/mo PRO budget into actual training data.
12
  """
13
+ import os, json, random, re
  import gradio as gr
  import spaces
  import torch

+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-14B-Instruct")
  HF_TOKEN = os.environ.get("HF_TOKEN", "")

+ DOMAIN_SEEDS = {
+     "code-python": "a Python coding task (function, class, or script)",
+     "code-typescript": "a TypeScript / Node.js coding task",
+     "code-rust": "a Rust coding task",
+     "code-go": "a Go coding task",
+     "devops-tf": "a Terraform / IaC task",
+     "devops-k8s": "a Kubernetes manifest or operator task",
+     "devops-cdk": "an AWS CDK construct task",
+     "ci-github": "a GitHub Actions workflow task",
+     "sec-iam": "an IAM least-privilege policy task",
+     "sec-cve": "a CVE remediation task",
+     "sre-runbook": "an incident runbook task",
+     "sre-slo": "an SLI/SLO/error-budget definition task",
+     "data-sql": "a parameterized SQL query task",
+     "ai-eng": "a RAG / vector / LoRA / vLLM task",
+     "api-rest": "a REST API design task",
+     "test-pytest": "a pytest test-writing task",
+ }
+
+ JUDGE_PRINCIPLES = (
+     "Score the response on a 0-10 scale across these axes:\n"
+     " correctness (does it work?), security (no leaked secrets, IAM least-priv, "
+     "input validated), idiomatic (best practices for the stack), "
+     "completeness (handles edge cases), citation (real APIs only).\n"
+     "Return ONLY JSON: {\"score\": float, \"axes\": {...}, \"why\": \"...\"}"
  )

  _model = None
  _tok = None


  def _load_lazy():
      global _model, _tok
      if _model is not None:
          return _model, _tok
+     from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
      _tok = AutoTokenizer.from_pretrained(
          BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
      if _tok.pad_token_id is None:
          _tok.pad_token_id = _tok.eos_token_id
      _model = AutoModelForCausalLM.from_pretrained(
+         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
+         device_map="cuda",
+         quantization_config=BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.bfloat16,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_use_double_quant=True))
      return _model, _tok


+ def _generate(prompt: str, max_new=512, temp=0.7) -> str:
      model, tok = _load_lazy()
      inputs = tok(prompt, return_tensors="pt", truncation=True,
+                  max_length=12000).to("cuda")
      out = model.generate(
+         **inputs, max_new_tokens=int(max_new), temperature=float(temp),
+         top_p=0.9, do_sample=temp > 0,
+         pad_token_id=tok.pad_token_id, eos_token_id=tok.eos_token_id)
+     return tok.decode(out[0][inputs["input_ids"].shape[1]:],
+                       skip_special_tokens=True).strip()
+
+
+ @spaces.GPU(duration=300)
+ def synth_batch(domain: str = "code-python", count: int = 10) -> str:
+     """Magpie-style: model generates BOTH instruction and response.
+     Returns JSONL string of {prompt, response, source, meta}.
+     """
+     model, tok = _load_lazy()
+     seed_text = DOMAIN_SEEDS.get(domain, DOMAIN_SEEDS["code-python"])
+     out_lines = []
+     for i in range(int(count)):
+         # Step 1: generate instruction (Magpie — empty user message)
+         instr_msgs = [
+             {"role": "system",
+              "content": f"You are an expert who generates realistic, "
+                         f"diverse, high-quality user requests about {seed_text}. "
+                         "Output ONLY one user request — no preamble, no JSON, "
+                         "just the request as a single paragraph."},
+             {"role": "user", "content": ""},
+         ]
+         instr_prompt = tok.apply_chat_template(
+             instr_msgs, tokenize=False, add_generation_prompt=True)
+         instruction = _generate(instr_prompt, max_new=180, temp=0.95)
+         instruction = instruction.strip().split("\n")[0][:600]
+         if len(instruction) < 30: continue
+
+         # Step 2: generate response to that instruction
+         resp_msgs = [
+             {"role": "system", "content": "You are Surrogate-1, an expert "
+              "DevSecOps + coding agent. Cite real APIs. No phantom imports."},
+             {"role": "user", "content": instruction},
+         ]
+         resp_prompt = tok.apply_chat_template(
+             resp_msgs, tokenize=False, add_generation_prompt=True)
+         response = _generate(resp_prompt, max_new=512, temp=0.4)
+         if len(response) < 50: continue
+
+         out_lines.append(json.dumps({
+             "prompt": instruction,
+             "response": response,
+             "source": "magpie-zerogpu",
+             "meta": {"domain": domain, "seed": seed_text, "ix": i},
+         }, ensure_ascii=False))
+     return "\n".join(out_lines)
+
+
+ @spaces.GPU(duration=120)
+ def judge_pair(prompt: str, response: str, criteria: str = "default") -> str:
+     """Score a (prompt, response) pair. Returns JSON string."""
+     if not prompt or not response:
+         return json.dumps({"score": 0.0, "why": "empty input"})
+     model, tok = _load_lazy()
+     judge_msgs = [
+         {"role": "system",
+          "content": "You are a strict senior code reviewer. " + JUDGE_PRINCIPLES},
+         {"role": "user",
+          "content": f"PROMPT:\n{prompt[:2000]}\n\nRESPONSE:\n{response[:4000]}\n\n"
+                     f"Score per the rubric. JSON only."},
+     ]
+     j_prompt = tok.apply_chat_template(
+         judge_msgs, tokenize=False, add_generation_prompt=True)
+     raw = _generate(j_prompt, max_new=400, temp=0.1)
+     # Extract the first JSON object (greedy match, so the nested "axes" object is kept)
+     m = re.search(r"\{.*\"score\".*\}", raw, re.DOTALL)
+     if m:
+         try:
+             d = json.loads(m.group(0))
+             return json.dumps(d, ensure_ascii=False)
+         except Exception:
+             pass
+     return json.dumps({"score": 5.0, "why": raw[:300], "raw": True})
+
+
+ @spaces.GPU(duration=300)
+ def best_of_n(prompt: str, n: int = 4, max_new: int = 512) -> str:
+     """Generate N candidates, score each, return best."""
+     if not prompt: return json.dumps({"error": "empty prompt"})
+     model, tok = _load_lazy()
+     sys_msg = ("You are Surrogate-1. Solve the user's task with production-"
+                "quality code. Cite real APIs.")
+     cands = []
+     for i in range(int(n)):
+         msgs = [{"role": "system", "content": sys_msg},
+                 {"role": "user", "content": prompt}]
+         p = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
+         out = _generate(p, max_new=int(max_new), temp=0.7 + 0.05 * i)
+         cands.append(out)
+     # Score each (cheap heuristic: length + has-code-block)
+     scored = []
+     for c in cands:
+         s = min(1.0, len(c) / 800)
+         if "```" in c: s += 0.2
+         if "import " in c or "def " in c or "function " in c: s += 0.1
+         scored.append((s, c))
+     scored.sort(key=lambda x: -x[0])
+     return json.dumps({"best": scored[0][1], "best_score": scored[0][0],
+                        "all": [c for _, c in scored]}, ensure_ascii=False)
+
+
+ with gr.Blocks(title="Surrogate-1 synth + judge worker") as demo:
+     gr.Markdown(
+         f"# Surrogate-1 — synth + judge worker (24/7)\n"
+         f"**{BASE_MODEL}** on ZeroGPU A10G (4-bit). 3 endpoints:\n"
+         f"- `/run/synth_batch` — generate training pairs (Magpie)\n"
+         f"- `/run/judge_pair` — score (prompt, response)\n"
+         f"- `/run/best_of_n` — generate N + pick best"
      )
+     with gr.Tab("synth"):
+         d = gr.Dropdown(list(DOMAIN_SEEDS.keys()), value="code-python", label="domain")
+         n = gr.Slider(1, 30, value=10, step=1, label="count")
+         bt = gr.Button("synth_batch")
+         out = gr.Textbox(label="JSONL", lines=20)
+         bt.click(synth_batch, inputs=[d, n], outputs=out, api_name="synth_batch")
+     with gr.Tab("judge"):
+         p = gr.Textbox(label="prompt", lines=3)
+         r = gr.Textbox(label="response", lines=8)
+         c = gr.Textbox(label="criteria", value="default")
+         bj = gr.Button("judge_pair")
+         oj = gr.Textbox(label="JSON score", lines=8)
+         bj.click(judge_pair, inputs=[p, r, c], outputs=oj, api_name="judge_pair")
+     with gr.Tab("best-of-n"):
+         bp = gr.Textbox(label="prompt", lines=3)
+         bn = gr.Slider(2, 8, value=4, step=1, label="n")
+         bm = gr.Slider(128, 1024, value=512, step=64, label="max new")
+         bb = gr.Button("best_of_n")
+         bo = gr.Textbox(label="JSON best", lines=15)
+         bb.click(best_of_n, inputs=[bp, bn, bm], outputs=bo, api_name="best_of_n")

  if __name__ == "__main__":
+     demo.queue(max_size=8).launch()
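Usage sketch (not part of this diff): one way a harvester cron step could drive the three endpoints with `gradio_client`. The Space id `surrogate1/synth-judge-worker`, the token, the shard path, and the 7.0 keep-threshold are placeholders, not values from the commit.

```python
# Hypothetical harvester-side cron step; Space id, token, shard path and the
# 7.0 keep-threshold are assumptions, not taken from this commit.
import json
from gradio_client import Client

client = Client("surrogate1/synth-judge-worker", hf_token="hf_...")  # placeholder Space id

# 1) Ask the worker for a small Magpie batch in one domain.
jsonl = client.predict("devops-k8s", 10, api_name="/synth_batch")

# 2) Re-score each pair with the judge endpoint; keep only strong ones.
kept = []
for line in jsonl.splitlines():
    if not line.strip():
        continue
    pair = json.loads(line)
    verdict = json.loads(client.predict(
        pair["prompt"], pair["response"], "default", api_name="/judge_pair"))
    if float(verdict.get("score", 0)) >= 7.0:
        pair["judge"] = verdict
        kept.append(pair)

# 3) Append the survivors to the local training shard.
with open("train_shard.jsonl", "a", encoding="utf-8") as f:
    for pair in kept:
        f.write(json.dumps(pair, ensure_ascii=False) + "\n")
```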