Spaces:
Running on Zero
Running on Zero
Ashira Pitchayapakayakul committed on
Commit ·
87b89b2
1
Parent(s): 99cf609
feat: clone working ashirato ZeroGPU app.py — 2nd PRO endpoint
Browse files
Mirror of ashirato/surrogate-1-zero-gpu (which has been RUNNING with
gr.Blocks + /run/respond + /run/synth_batch since cca295a). Uses the
PRO ZeroGPU quota of the surrogate1 account (25K min/mo) — combined
with ashirato's quota = 50K min/mo total for synth-puller fan-out.
Same app.py:
• POST /call/respond — chat completion (used by zero-gpu-bridge.sh)
• POST /call/synth_batch — Magpie pair generator (used by synth-puller)
• Qwen2.5-Coder-7B-Instruct + axentx/surrogate-1-coder-7b-v1 LoRA
• bnb 4-bit NF4
starlette<0.40 + jinja2<3.2 pins prevent the gradio TemplateResponse
500 we hit before the cca295a fix on ashirato.
- README.md +9 -27
- app.py +168 -93
- requirements.txt +7 -0
README.md
CHANGED
|
@@ -1,41 +1,23 @@
|
|
| 1 |
---
|
| 2 |
-
title: Surrogate-1 ZeroGPU
|
| 3 |
emoji: 🚀
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
license: apache-2.0
|
| 11 |
-
short_description: Surrogate-1 v1 LoRA on
|
| 12 |
suggested_hardware: zero-a10g
|
| 13 |
hf_oauth: false
|
| 14 |
models:
|
| 15 |
- Qwen/Qwen2.5-Coder-7B-Instruct
|
| 16 |
-
- axentx/surrogate-1-coder-7b-
|
| 17 |
---
|
| 18 |
|
| 19 |
-
# Surrogate-1 ZeroGPU
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
## Endpoints
|
| 25 |
-
|
| 26 |
-
- Web UI: this Space (Gradio chat)
|
| 27 |
-
- OpenAI-compatible: `/api/predict` (Gradio API auto-generated)
|
| 28 |
-
- Use programmatically: `gradio_client.Client("ashirato/surrogate-1-zero-gpu")`
|
| 29 |
-
|
| 30 |
-
## Why ZeroGPU
|
| 31 |
-
|
| 32 |
-
PRO unlocks 25K minutes/mo of A10G time at $0/mo. Each request gets fresh
|
| 33 |
-
GPU, so cold-start ~5-10s but no idle cost. Perfect for low-traffic
|
| 34 |
-
agentic loops (self-improve, constitutional, validator-RLVR judge calls).
|
| 35 |
-
|
| 36 |
-
## Connected to axentx/surrogate-1
|
| 37 |
-
|
| 38 |
-
This Space serves inference. The orchestration Space at
|
| 39 |
-
`axentx/surrogate-1` runs cron loops + bulk-mirror harvest + state DBs;
|
| 40 |
-
those loops can call THIS endpoint for actual model output instead of
|
| 41 |
-
free-tier API ladder.
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Surrogate-1 ZeroGPU PRO 2
|
| 3 |
emoji: 🚀
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
license: apache-2.0
|
| 11 |
+
short_description: Surrogate-1 v1 LoRA on ZeroGPU (2nd PRO endpoint)
|
| 12 |
suggested_hardware: zero-a10g
|
| 13 |
hf_oauth: false
|
| 14 |
models:
|
| 15 |
- Qwen/Qwen2.5-Coder-7B-Instruct
|
| 16 |
+
- axentx/surrogate-1-coder-7b-v1
|
| 17 |
---
|
| 18 |
|
| 19 |
+
# Surrogate-1 ZeroGPU (surrogate1 PRO endpoint)
|
| 20 |
|
| 21 |
+
Twin of ashirato/surrogate-1-zero-gpu — same code, different free PRO
|
| 22 |
+
ZeroGPU quota (25K min/mo each → 50K combined). synth-puller hits both
|
| 23 |
+
in round-robin to double the daily synthetic-pair throughput.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,110 +1,185 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
import os, json, re, gradio as gr, spaces, torch
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"data-sql", "ai-eng", "api-rest", "test-pytest"]
|
| 15 |
-
SEED_TPL = "a {} task"
|
| 16 |
|
| 17 |
-
JUDGE_RULES = ("Score 0-10: correctness, security, idiomatic, completeness, "
|
| 18 |
-
"real-API citation. Return ONLY JSON: "
|
| 19 |
-
'{"score":float,"why":str}')
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def _load():
|
| 25 |
-
global
|
| 26 |
-
if
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
bnb = BitsAndBytesConfig(load_in_4bit=True,
|
| 32 |
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 33 |
bnb_4bit_quant_type="nf4",
|
| 34 |
bnb_4bit_use_double_quant=True)
|
| 35 |
-
|
| 36 |
BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
|
| 37 |
device_map="cuda", quantization_config=bnb)
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
@spaces.GPU(duration=300)
|
| 51 |
-
def
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
return
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Surrogate-1 ZeroGPU — chat + synth-batch endpoints (gr.Blocks 4.44).
|
| 2 |
|
| 3 |
+
Two functions exposed via Gradio API:
|
| 4 |
+
• POST /run/respond — single chat completion (also UI tab)
|
| 5 |
+
• POST /run/synth_batch — Magpie-style synthetic training pair batch
|
|
|
|
| 6 |
|
| 7 |
+
synth_batch is hit by ~/.surrogate/bin/v2/synth-puller.sh every 5 min
|
| 8 |
+
on the bulk Space, drains free PRO ZeroGPU budget into training data.
|
| 9 |
+
Each call returns up to 20 JSONL pairs as a single string.
|
| 10 |
+
|
| 11 |
+
Earlier ChatInterface attempts hit a starlette TemplateResponse failure
|
| 12 |
+
during gradio's static-route init. gr.Blocks with explicit api_name on
|
| 13 |
+
each click avoids the same code path and exposes both endpoints cleanly.
|
| 14 |
+
|
| 15 |
+
Backbone: Qwen2.5-Coder-7B-Instruct + Surrogate-1 v1 LoRA, bnb int4.
|
| 16 |
+
"""
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
|
| 20 |
+
import gradio as gr
|
| 21 |
+
import spaces
|
| 22 |
+
import torch
|
|
|
|
|
|
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
+
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-7B-Instruct")
|
| 26 |
+
LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
|
| 27 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 28 |
+
|
| 29 |
+
SYSTEM = ("You are Surrogate-1, expert DevSecOps + SRE + coding agent. "
|
| 30 |
+
"Cite real APIs only. Say IDK rather than confabulate.")
|
| 31 |
+
|
| 32 |
+
DOMAIN_HINTS = {
|
| 33 |
+
"code-python": "Python coding tasks, idiomatic, type-hinted",
|
| 34 |
+
"code-typescript": "TypeScript / React / Node tasks, strict types",
|
| 35 |
+
"code-rust": "Rust ownership, async, performance",
|
| 36 |
+
"code-go": "Go concurrency, stdlib, microservices",
|
| 37 |
+
"devops-tf": "Terraform AWS/GCP modules, best practices",
|
| 38 |
+
"devops-k8s": "Kubernetes manifests, helm, troubleshooting",
|
| 39 |
+
"devops-cdk": "AWS CDK constructs, TypeScript",
|
| 40 |
+
"ci-github": "GitHub Actions workflows, reusable, secure",
|
| 41 |
+
"sec-iam": "IAM least-privilege policies, AssumeRole",
|
| 42 |
+
"sec-cve": "CVE remediation, SCA, dependency hygiene",
|
| 43 |
+
"sre-runbook": "Incident runbooks, on-call, postmortems",
|
| 44 |
+
"sre-slo": "SLO/SLI/error budgets, observability",
|
| 45 |
+
"data-sql": "SQL queries, indexes, query plans, optimisation",
|
| 46 |
+
"ai-eng": "RAG, vLLM, fine-tuning, evals",
|
| 47 |
+
"api-rest": "REST API design, OpenAPI, idempotency",
|
| 48 |
+
"test-pytest": "pytest fixtures, parametrize, markers",
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
_model = None
|
| 53 |
+
_tokenizer = None
|
| 54 |
|
| 55 |
|
| 56 |
def _load():
    """Load tokenizer + 4-bit base model (+ optional LoRA) once, cache globally.

    Returns ``(model, tokenizer)``. Subsequent calls return the cached pair
    without touching the Hub again.
    """
    global _model, _tokenizer
    if _model is not None:
        return _model, _tokenizer
    # Imported lazily so the Gradio app can boot before any GPU attach.
    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              BitsAndBytesConfig)
    _tokenizer = AutoTokenizer.from_pretrained(
        BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
    # Qwen tokenizers may ship without a pad token; reuse EOS for padding.
    if _tokenizer.pad_token_id is None:
        _tokenizer.pad_token_id = _tokenizer.eos_token_id
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    _model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True,
        device_map="cuda", quantization_config=quant_cfg)
    if LORA_REPO:
        # LoRA attach is best-effort: a missing or private adapter repo
        # must not take the whole endpoint down — fall back to base model.
        try:
            from peft import PeftModel
            _model = PeftModel.from_pretrained(
                _model, LORA_REPO, token=HF_TOKEN or None)
            print(f"[ok] LoRA: {LORA_REPO}")
        except Exception as e:
            print(f"[skip] LoRA: {e}")
    return _model, _tokenizer
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _generate(prompt: str, max_tokens: int = 768,
              temperature: float = 0.7) -> str:
    """Run one system+user chat completion; return only the new decoded text.

    Inputs are truncated at 8000 tokens; sampling uses top-p 0.9.
    """
    model, tokenizer = _load()
    conversation = [{"role": "system", "content": SYSTEM},
                    {"role": "user", "content": prompt}]
    rendered = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True)
    enc = tokenizer(rendered, return_tensors="pt", truncation=True,
                    max_length=8000).to("cuda")
    generated = model.generate(
        **enc,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id)
    # Slice off the prompt tokens so only the completion is decoded.
    prompt_len = enc["input_ids"].shape[1]
    completion = generated[0][prompt_len:]
    return tokenizer.decode(completion, skip_special_tokens=True).strip()
|
| 101 |
|
| 102 |
|
| 103 |
@spaces.GPU(duration=300)
def respond(message: str) -> str:
    """Chat endpoint (/run/respond): one low-temperature completion."""
    # Guard against empty/whitespace-only input before claiming GPU time.
    if not (message and message.strip()):
        return "(empty)"
    return _generate(message, max_tokens=768, temperature=0.4)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@spaces.GPU(duration=600)
def synth_batch(domain: str, count) -> str:
    """Magpie-style synthetic pair generation. Returns N JSONL lines.

    For each of up to ``count`` iterations: sample an instruction from a
    domain-hinted seed prompt (high temperature), then answer it (low
    temperature), and emit one JSON object per line with keys
    prompt/response/source/meta. Iterations whose instruction or response
    is too short, or that raise, are skipped, so fewer than ``count``
    lines may come back (possibly an empty string).

    Args:
        domain: key into DOMAIN_HINTS; unknown values are used verbatim
            as the hint. Blank/None falls back to "code-python".
        count:  desired pair count — gr.Number hands over a float/None and
            API callers may send strings, so any value is accepted.
            Missing or unparseable input defaults to 12; parsed values
            are clamped into [1, 20].
    """
    domain = (domain or "code-python").strip()
    # BUGFIX: the previous `int(count or 12)` treated numeric 0 as
    # "default to 12" but string "0" as 0 -> clamped to 1. Parse first,
    # then default only for missing/unparseable input, so every explicit
    # value goes through the same clamp.
    try:
        count = int(count)
    except (TypeError, ValueError):
        count = 12
    count = max(1, min(20, count))
    hint = DOMAIN_HINTS.get(domain, domain)

    # Seed prompt is loop-invariant; sampling temperature supplies variety.
    seed = (f"Generate ONE realistic technical question a senior engineer "
            f"would ask about {hint}. Output JUST the question text, no "
            f"preamble or quotes. Make it specific and answerable in "
            f"200-500 words with code/config examples.")

    pairs = []
    for _ in range(count):
        try:
            instruction = _generate(seed, max_tokens=200, temperature=0.95)
            # Keep only the first line, strip wrapping quotes, cap length.
            instruction = (instruction.split("\n")[0]
                           .strip().strip('"').strip("'")[:600])
            if len(instruction) < 30:
                continue  # degenerate/truncated question — skip
            response = _generate(instruction, max_tokens=900,
                                 temperature=0.4)
            if len(response) < 80:
                continue  # answer too thin to be a useful training pair
            pairs.append(json.dumps({
                "prompt": instruction,
                "response": response,
                "source": f"surrogate-1-zero-gpu/synth-{domain}",
                "meta": {"domain": domain, "magpie": True},
            }, ensure_ascii=False))
        except Exception as e:
            # Best-effort batch: one bad generation must not kill the rest.
            print(f"[synth_batch] err: {e}")
            continue
    return "\n".join(pairs)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
with gr.Blocks(title="Surrogate-1 ZeroGPU") as demo:
|
| 151 |
+
gr.Markdown("# Surrogate-1 (7B + v1 LoRA, ZeroGPU A10G)")
|
| 152 |
+
gr.Markdown(
|
| 153 |
+
"Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA on free PRO ZeroGPU. "
|
| 154 |
+
"Two API endpoints: `/run/respond` (chat) and `/run/synth_batch` "
|
| 155 |
+
"(synthetic training pair batch — used by synth-puller cron).")
|
| 156 |
+
|
| 157 |
+
with gr.Tab("chat"):
|
| 158 |
+
chat_in = gr.Textbox(
|
| 159 |
+
lines=4,
|
| 160 |
+
placeholder="ask Surrogate-1: code, devops, security…")
|
| 161 |
+
chat_out = gr.Textbox(lines=20, label="response")
|
| 162 |
+
gr.Button("send", variant="primary").click(
|
| 163 |
+
respond, chat_in, chat_out, api_name="respond")
|
| 164 |
+
gr.Examples(
|
| 165 |
+
[["Write a Terraform module for AWS S3 with KMS encryption "
|
| 166 |
+
"+ versioning."],
|
| 167 |
+
["Implement Redis-based rate limit per-API-key in FastAPI."],
|
| 168 |
+
["Diagnose: Lambda cold-start 3s on 256MB. "
|
| 169 |
+
"Architecture options?"]],
|
| 170 |
+
inputs=chat_in)
|
| 171 |
+
|
| 172 |
+
with gr.Tab("synth_batch"):
|
| 173 |
+
gr.Markdown(
|
| 174 |
+
"Magpie-style: model generates instructions per domain, then "
|
| 175 |
+
"responds. Output is JSONL (one pair per line). Domains: "
|
| 176 |
+
+ ", ".join(sorted(DOMAIN_HINTS.keys())))
|
| 177 |
+
synth_dom = gr.Textbox(value="code-python", label="domain")
|
| 178 |
+
synth_cnt = gr.Number(value=12, precision=0, label="count (1-20)")
|
| 179 |
+
synth_out = gr.Textbox(lines=20, label="JSONL pairs")
|
| 180 |
+
gr.Button("generate", variant="primary").click(
|
| 181 |
+
synth_batch, [synth_dom, synth_cnt], synth_out,
|
| 182 |
+
api_name="synth_batch")
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
demo.queue(max_size=8).launch()
|
requirements.txt
CHANGED
|
@@ -1,4 +1,11 @@
|
|
| 1 |
# HF ZeroGPU template force-installs gradio[oauth]==4.44.0 + spaces==0.48.2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
# bitsandbytes 4-bit (proven, no compile required vs autoawq).
|
| 3 |
transformers>=4.46.0,<4.50.0
|
| 4 |
peft>=0.13.0,<0.15.0
|
|
|
|
| 1 |
# HF ZeroGPU template force-installs gradio[oauth]==4.44.0 + spaces==0.48.2.
|
| 2 |
+
# Pin starlette + jinja2 to versions compatible with gradio 4.44 internals
|
| 3 |
+
# (TemplateResponse path errored on starlette>=0.40; pin to <0.40 fixes
|
| 4 |
+
# the "/" route 500 we hit on the previous deploy).
|
| 5 |
+
starlette<0.40
|
| 6 |
+
jinja2<3.2
|
| 7 |
+
fastapi<0.111
|
| 8 |
+
|
| 9 |
# bitsandbytes 4-bit (proven, no compile required vs autoawq).
|
| 10 |
transformers>=4.46.0,<4.50.0
|
| 11 |
peft>=0.13.0,<0.15.0
|