Ashira Pitchayapakayakul committed
Commit 87b89b2 · 1 Parent(s): 99cf609

feat: clone working ashirato ZeroGPU app.py — 2nd PRO endpoint


Mirror of ashirato/surrogate-1-zero-gpu (which has been RUNNING with
gr.Blocks + /run/respond + /run/synth_batch since cca295a). Uses the
PRO ZeroGPU quota of the surrogate1 account (25K min/mo); combined
with ashirato's quota, that's 50K min/mo total for synth-puller fan-out.

Same app.py:
• POST /call/respond — chat completion (used by zero-gpu-bridge.sh)
• POST /call/synth_batch — Magpie pair generator (used by synth-puller)
• Qwen2.5-Coder-7B-Instruct + axentx/surrogate-1-coder-7b-v1 LoRA
• bnb 4-bit NF4

starlette<0.40 + jinja2<3.2 pins prevent the gradio TemplateResponse
500 we hit before the cca295a fix on ashirato.
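
For reference, a minimal Python sketch of the two-step Gradio 4.x REST flow
behind POST /call/respond (the base URL is a placeholder; the real caller is
zero-gpu-bridge.sh, which wraps the same flow in shell):

```python
# Hedged sketch of Gradio 4.x's two-step REST protocol for /call/respond.
# The base URL is a placeholder; the real caller is zero-gpu-bridge.sh.
import json
import requests

BASE = "https://example-space.hf.space"  # placeholder Space URL

def call_respond(message: str) -> str:
    # step 1: enqueue the job, get an event id back
    r = requests.post(f"{BASE}/call/respond",
                      json={"data": [message]}, timeout=30)
    event_id = r.json()["event_id"]
    # step 2: read the SSE stream; the last "data:" line carries the output
    stream = requests.get(f"{BASE}/call/respond/{event_id}",
                          stream=True, timeout=600)
    result = []
    for raw in stream.iter_lines(decode_unicode=True):
        if raw and raw.startswith("data:"):
            result = json.loads(raw[len("data:"):])
    return result[0] if result else ""
```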

Files changed (3)
  1. README.md +9 -27
  2. app.py +168 -93
  3. requirements.txt +7 -0
README.md CHANGED
@@ -1,41 +1,23 @@
 ---
-title: Surrogate-1 ZeroGPU
 emoji: 🚀
-colorFrom: indigo
-colorTo: purple
 sdk: gradio
 sdk_version: 4.44.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description: Surrogate-1 v1 LoRA on Qwen2.5-Coder-7B (ZeroGPU A10G)
 suggested_hardware: zero-a10g
 hf_oauth: false
 models:
 - Qwen/Qwen2.5-Coder-7B-Instruct
-- axentx/surrogate-1-coder-7b-lora-v1
 ---

-# Surrogate-1 ZeroGPU

-DevSecOps + SRE + coding agent. Qwen2.5-Coder-7B-Instruct + Surrogate-1 v1
-LoRA, served via HF ZeroGPU (A10G, 60-120s per request).
-
-## Endpoints
-
-- Web UI: this Space (Gradio chat)
-- OpenAI-compatible: `/api/predict` (Gradio API auto-generated)
-- Use programmatically: `gradio_client.Client("ashirato/surrogate-1-zero-gpu")`
-
-## Why ZeroGPU
-
-PRO unlocks 25K minutes/mo of A10G time at $0/mo. Each request gets fresh
-GPU, so cold-start ~5-10s but no idle cost. Perfect for low-traffic
-agentic loops (self-improve, constitutional, validator-RLVR judge calls).
-
-## Connected to axentx/surrogate-1
-
-This Space serves inference. The orchestration Space at
-`axentx/surrogate-1` runs cron loops + bulk-mirror harvest + state DBs;
-those loops can call THIS endpoint for actual model output instead of
-free-tier API ladder.

 ---
+title: Surrogate-1 ZeroGPU PRO 2
 emoji: 🚀
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
 sdk_version: 4.44.0
 app_file: app.py
 pinned: true
 license: apache-2.0
+short_description: Surrogate-1 v1 LoRA on ZeroGPU (2nd PRO endpoint)
 suggested_hardware: zero-a10g
 hf_oauth: false
 models:
 - Qwen/Qwen2.5-Coder-7B-Instruct
+- axentx/surrogate-1-coder-7b-v1
 ---

+# Surrogate-1 ZeroGPU (surrogate1 PRO endpoint)

+Twin of ashirato/surrogate-1-zero-gpu: same code, different free PRO
+ZeroGPU quota (25K min/mo each, 50K combined). synth-puller hits both
+in round-robin to double the daily synthetic-pair throughput.
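The round-robin the new README mentions can be sketched client-side; a
minimal, hedged version (the second Space id is a placeholder, since only
ashirato/surrogate-1-zero-gpu is named in this repo):

```python
# Hedged sketch of the synth-puller round-robin the README describes.
# The second Space id is hypothetical.
import itertools

from gradio_client import Client

SPACES = ["ashirato/surrogate-1-zero-gpu",
          "surrogate1/surrogate-1-zero-gpu"]  # second id is a placeholder
_rr = itertools.cycle([Client(s) for s in SPACES])

def pull_batch(domain: str, count: int = 12) -> list[str]:
    """Hit the next PRO endpoint in rotation; return JSONL lines."""
    out = next(_rr).predict(domain, count, api_name="/synth_batch")
    return [ln for ln in out.splitlines() if ln.strip()]
```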
app.py CHANGED
@@ -1,110 +1,185 @@
-"""surrogate1 ZeroGPU synth+judge minimal gr.Interface tabs via Blocks.
-
-Same gradio-compat pattern as ashirato (gr.Interface only, no ChatInterface).
-3 endpoints exposed via separate Interfaces, mounted on TabbedInterface.
-"""
-import os, json, re, gradio as gr, spaces, torch
-
-BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-7B-Instruct")
-HF_TOKEN = os.environ.get("HF_TOKEN", "")
-
-DOMAINS = ["code-python", "code-typescript", "code-rust", "code-go",
-           "devops-tf", "devops-k8s", "devops-cdk", "ci-github",
-           "sec-iam", "sec-cve", "sre-runbook", "sre-slo",
-           "data-sql", "ai-eng", "api-rest", "test-pytest"]
-SEED_TPL = "a {} task"
-
-JUDGE_RULES = ("Score 0-10: correctness, security, idiomatic, completeness, "
-               "real-API citation. Return ONLY JSON: "
-               '{"score":float,"why":str}')
-
-_m = _t = None


 def _load():
-    global _m, _t
-    if _m is not None: return _m, _t
-    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-    _t = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN or None,
-                                       trust_remote_code=True)
-    if _t.pad_token_id is None: _t.pad_token_id = _t.eos_token_id
     bnb = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_compute_dtype=torch.bfloat16,
                              bnb_4bit_quant_type="nf4",
                              bnb_4bit_use_double_quant=True)
-    _m = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
         device_map="cuda", quantization_config=bnb)
-    return _m, _t
-
-
-def _gen(prompt, max_new=512, temp=0.7):
-    m, t = _load()
-    inputs = t(prompt, return_tensors="pt", truncation=True, max_length=8000).to("cuda")
-    out = m.generate(**inputs, max_new_tokens=int(max_new),
-                     temperature=float(temp), top_p=0.9, do_sample=temp > 0,
-                     pad_token_id=t.pad_token_id, eos_token_id=t.eos_token_id)
-    return t.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()


 @spaces.GPU(duration=300)
-def synth_batch(domain: str, count: int) -> str:
-    m, t = _load()
-    seed = SEED_TPL.format(domain.replace("-", " "))
-    out_lines = []
-    for i in range(int(count)):
-        instr_msgs = [
-            {"role": "system",
-             "content": f"Generate ONE realistic, diverse, high-quality user request about {seed}. "
-                        "Output ONLY the request as a single paragraph. No preamble."},
-            {"role": "user", "content": ""},
-        ]
-        ip = t.apply_chat_template(instr_msgs, tokenize=False, add_generation_prompt=True)
-        instruction = _gen(ip, max_new=180, temp=0.95).strip().split("\n")[0][:600]
-        if len(instruction) < 30: continue
-        resp_msgs = [
-            {"role": "system",
-             "content": "You are Surrogate-1, expert DevSecOps + coding agent. Real APIs only."},
-            {"role": "user", "content": instruction},
-        ]
-        rp = t.apply_chat_template(resp_msgs, tokenize=False, add_generation_prompt=True)
-        response = _gen(rp, max_new=512, temp=0.4)
-        if len(response) < 50: continue
-        out_lines.append(json.dumps({
-            "prompt": instruction, "response": response,
-            "source": "magpie-zerogpu",
-            "meta": {"domain": domain, "ix": i},
-        }, ensure_ascii=False))
-    return "\n".join(out_lines)
-
-
-@spaces.GPU(duration=120)
-def judge_pair(prompt: str, response: str) -> str:
-    if not prompt or not response:
-        return json.dumps({"score": 0.0, "why": "empty"})
-    m, t = _load()
-    j_msgs = [{"role": "system", "content": "You are a strict reviewer. " + JUDGE_RULES},
-              {"role": "user", "content": f"PROMPT:\n{prompt[:2000]}\n\nRESPONSE:\n{response[:4000]}\n\nScore. JSON only."}]
-    raw = _gen(t.apply_chat_template(j_msgs, tokenize=False, add_generation_prompt=True),
-               max_new=300, temp=0.1)
-    mm = re.search(r"\{[^{}]*\"score\"[^{}]*\}", raw, re.DOTALL)
-    if mm:
-        try: return json.dumps(json.loads(mm.group(0)), ensure_ascii=False)
-        except: pass
-    return json.dumps({"score": 5.0, "why": raw[:200], "raw": True})
-
-
-synth_iface = gr.Interface(synth_batch,
-                           [gr.Dropdown(DOMAINS, value="code-python", label="domain"),
-                            gr.Slider(1, 20, value=10, step=1, label="count")],
-                           gr.Textbox(label="JSONL", lines=20),
-                           title="Magpie synth_batch")
-
-judge_iface = gr.Interface(judge_pair,
-                           [gr.Textbox(label="prompt", lines=3),
-                            gr.Textbox(label="response", lines=8)],
-                           gr.Textbox(label="JSON", lines=6),
-                           title="LLM-as-judge")
-
-demo = gr.TabbedInterface([synth_iface, judge_iface], ["synth", "judge"])
-demo.queue(max_size=6).launch()

+"""Surrogate-1 ZeroGPU — chat + synth-batch endpoints (gr.Blocks 4.44).
+
+Two functions exposed via Gradio API:
+  • POST /run/respond     — single chat completion (also UI tab)
+  • POST /run/synth_batch — Magpie-style synthetic training pair batch
+
+synth_batch is hit by ~/.surrogate/bin/v2/synth-puller.sh every 5 min
+on the bulk Space and drains free PRO ZeroGPU budget into training data.
+Each call returns up to 20 JSONL pairs as a single string.
+
+Earlier ChatInterface attempts hit a starlette TemplateResponse failure
+during gradio's static-route init. gr.Blocks with explicit api_name on
+each click avoids that code path and exposes both endpoints cleanly.
+
+Backbone: Qwen2.5-Coder-7B-Instruct + Surrogate-1 v1 LoRA, bnb 4-bit NF4.
+"""
+import json
+import os
+
+import gradio as gr
+import spaces
+import torch
+
+
+BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-7B-Instruct")
+LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+
+SYSTEM = ("You are Surrogate-1, expert DevSecOps + SRE + coding agent. "
+          "Cite real APIs only. Say IDK rather than confabulate.")
+
+DOMAIN_HINTS = {
+    "code-python": "Python coding tasks, idiomatic, type-hinted",
+    "code-typescript": "TypeScript / React / Node tasks, strict types",
+    "code-rust": "Rust ownership, async, performance",
+    "code-go": "Go concurrency, stdlib, microservices",
+    "devops-tf": "Terraform AWS/GCP modules, best practices",
+    "devops-k8s": "Kubernetes manifests, helm, troubleshooting",
+    "devops-cdk": "AWS CDK constructs, TypeScript",
+    "ci-github": "GitHub Actions workflows, reusable, secure",
+    "sec-iam": "IAM least-privilege policies, AssumeRole",
+    "sec-cve": "CVE remediation, SCA, dependency hygiene",
+    "sre-runbook": "Incident runbooks, on-call, postmortems",
+    "sre-slo": "SLO/SLI/error budgets, observability",
+    "data-sql": "SQL queries, indexes, query plans, optimisation",
+    "ai-eng": "RAG, vLLM, fine-tuning, evals",
+    "api-rest": "REST API design, OpenAPI, idempotency",
+    "test-pytest": "pytest fixtures, parametrize, markers",
+}
+
+
+_model = None
+_tokenizer = None


 def _load():
+    global _model, _tokenizer
+    if _model is not None:
+        return _model, _tokenizer
+    from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                              BitsAndBytesConfig)
+    _tokenizer = AutoTokenizer.from_pretrained(
+        BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
+    if _tokenizer.pad_token_id is None:
+        _tokenizer.pad_token_id = _tokenizer.eos_token_id
     bnb = BitsAndBytesConfig(load_in_4bit=True,
                              bnb_4bit_compute_dtype=torch.bfloat16,
                              bnb_4bit_quant_type="nf4",
                              bnb_4bit_use_double_quant=True)
+    _model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
         device_map="cuda", quantization_config=bnb)
+    if LORA_REPO:
+        try:
+            from peft import PeftModel
+            _model = PeftModel.from_pretrained(
+                _model, LORA_REPO, token=HF_TOKEN or None)
+            print(f"[ok] LoRA: {LORA_REPO}")
+        except Exception as e:
+            print(f"[skip] LoRA: {e}")
+    return _model, _tokenizer
+
+
+def _generate(prompt: str, max_tokens: int = 768,
+              temperature: float = 0.7) -> str:
+    model, tokenizer = _load()
+    msgs = [{"role": "system", "content": SYSTEM},
+            {"role": "user", "content": prompt}]
+    chat = tokenizer.apply_chat_template(
+        msgs, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(chat, return_tensors="pt", truncation=True,
+                       max_length=8000).to("cuda")
+    out = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens, temperature=temperature, do_sample=True,
+        top_p=0.9, pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id)
+    return tokenizer.decode(
+        out[0][inputs["input_ids"].shape[1]:],
+        skip_special_tokens=True).strip()


 @spaces.GPU(duration=300)
+def respond(message: str) -> str:
+    if not message or not message.strip():
+        return "(empty)"
+    return _generate(message, max_tokens=768, temperature=0.4)
+
+
+@spaces.GPU(duration=600)
+def synth_batch(domain: str, count) -> str:
+    """Magpie-style synthetic pair generation. Returns N JSONL lines."""
+    domain = (domain or "code-python").strip()
+    try:
+        count = int(count or 12)
+    except (TypeError, ValueError):
+        count = 12
+    count = max(1, min(20, count))
+    hint = DOMAIN_HINTS.get(domain, domain)
+
+    seed = (f"Generate ONE realistic technical question a senior engineer "
+            f"would ask about {hint}. Output JUST the question text, no "
+            f"preamble or quotes. Make it specific and answerable in "
+            f"200-500 words with code/config examples.")
+
+    pairs = []
+    for _ in range(count):
+        try:
+            instruction = _generate(seed, max_tokens=200, temperature=0.95)
+            instruction = (instruction.split("\n")[0]
+                           .strip().strip('"').strip("'")[:600])
+            if len(instruction) < 30:
+                continue
+            response = _generate(instruction, max_tokens=900,
+                                 temperature=0.4)
+            if len(response) < 80:
+                continue
+            pairs.append(json.dumps({
+                "prompt": instruction,
+                "response": response,
+                "source": f"surrogate-1-zero-gpu/synth-{domain}",
+                "meta": {"domain": domain, "magpie": True},
+            }, ensure_ascii=False))
+        except Exception as e:
+            print(f"[synth_batch] err: {e}")
+            continue
+    return "\n".join(pairs)
+
+
+with gr.Blocks(title="Surrogate-1 ZeroGPU") as demo:
+    gr.Markdown("# Surrogate-1 (7B + v1 LoRA, ZeroGPU A10G)")
+    gr.Markdown(
+        "Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA on free PRO ZeroGPU. "
+        "Two API endpoints: `/run/respond` (chat) and `/run/synth_batch` "
+        "(synthetic training pair batch — used by synth-puller cron).")
+
+    with gr.Tab("chat"):
+        chat_in = gr.Textbox(
+            lines=4,
+            placeholder="ask Surrogate-1: code, devops, security…")
+        chat_out = gr.Textbox(lines=20, label="response")
+        gr.Button("send", variant="primary").click(
+            respond, chat_in, chat_out, api_name="respond")
+        gr.Examples(
+            [["Write a Terraform module for AWS S3 with KMS encryption "
+              "+ versioning."],
+             ["Implement Redis-based rate limit per-API-key in FastAPI."],
+             ["Diagnose: Lambda cold-start 3s on 256MB. "
+              "Architecture options?"]],
+            inputs=chat_in)
+
+    with gr.Tab("synth_batch"):
+        gr.Markdown(
+            "Magpie-style: model generates instructions per domain, then "
+            "responds. Output is JSONL (one pair per line). Domains: "
+            + ", ".join(sorted(DOMAIN_HINTS.keys())))
+        synth_dom = gr.Textbox(value="code-python", label="domain")
+        synth_cnt = gr.Number(value=12, precision=0, label="count (1-20)")
+        synth_out = gr.Textbox(lines=20, label="JSONL pairs")
+        gr.Button("generate", variant="primary").click(
+            synth_batch, [synth_dom, synth_cnt], synth_out,
+            api_name="synth_batch")
+
+
+demo.queue(max_size=8).launch()
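
Since synth_batch returns the whole batch as one JSONL string, a consumer has
to split and validate the lines itself; a hedged sketch of that step (the
file path is illustrative, not from this repo):

```python
# Hedged consumer sketch for /run/synth_batch output: split the JSONL
# blob, drop malformed lines, append the rest to a training file.
import json

def append_pairs(jsonl_blob: str, path: str = "pairs.jsonl") -> int:
    """Parse the string returned by synth_batch; keep well-formed pairs."""
    kept = 0
    with open(path, "a", encoding="utf-8") as fh:
        for line in jsonl_blob.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                pair = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip occasional malformed generations
            if pair.get("prompt") and pair.get("response"):
                fh.write(json.dumps(pair, ensure_ascii=False) + "\n")
                kept += 1
    return kept
```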
requirements.txt CHANGED
@@ -1,4 +1,11 @@
 # HF ZeroGPU template force-installs gradio[oauth]==4.44.0 + spaces==0.48.2.
 # bitsandbytes 4-bit (proven, no compile required vs autoawq).
 transformers>=4.46.0,<4.50.0
 peft>=0.13.0,<0.15.0

 # HF ZeroGPU template force-installs gradio[oauth]==4.44.0 + spaces==0.48.2.
+# Pin starlette + jinja2 to versions compatible with gradio 4.44 internals
+# (TemplateResponse path errored on starlette>=0.40; pin to <0.40 fixes
+# the "/" route 500 we hit on the previous deploy).
+starlette<0.40
+jinja2<3.2
+fastapi<0.111
+
 # bitsandbytes 4-bit (proven, no compile required vs autoawq).
 transformers>=4.46.0,<4.50.0
 peft>=0.13.0,<0.15.0
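
If a deploy still 500s, a quick hedged sanity check that the pins actually
landed in the Space runtime (not part of the committed app.py):

```python
# Print the resolved versions of the pinned deps at Space startup;
# purely informational.
from importlib.metadata import version

for pkg in ("starlette", "jinja2", "fastapi", "gradio"):
    print(f"[deps] {pkg}=={version(pkg)}")
```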