Spaces: Running on Zero
initial: Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA on ZeroGPU A10G
- README.md +35 -6
- app.py +157 -0
- requirements.txt +9 -0
README.md
CHANGED
@@ -1,12 +1,41 @@
 ---
-title: Surrogate
+title: Surrogate-1 ZeroGPU
 emoji: π
-colorFrom:
+colorFrom: indigo
-colorTo:
+colorTo: purple
 sdk: gradio
-sdk_version:
+sdk_version: 4.44.0
 app_file: app.py
-pinned:
+pinned: true
+license: apache-2.0
+short_description: Surrogate-1 v1 LoRA on Qwen2.5-Coder-7B (ZeroGPU A10G)
+suggested_hardware: zero-a10g
+hf_oauth: false
+models:
+- Qwen/Qwen2.5-Coder-7B-Instruct
+- axentx/surrogate-1-coder-7b-lora-v1
 ---
 
-
+# Surrogate-1 ZeroGPU
+
+A DevSecOps + SRE + coding agent: Qwen2.5-Coder-7B-Instruct with the
+Surrogate-1 v1 LoRA, served via HF ZeroGPU (A10G, 60-120 s per request).
+
+## Endpoints
+
+- Web UI: this Space (Gradio chat)
+- HTTP API: `/api/predict` (auto-generated Gradio API)
+- Programmatic: `gradio_client.Client("ashirato/surrogate-1-zero-gpu")`
+
+## Why ZeroGPU
+
+PRO unlocks 25K minutes/mo of A10G time at no extra cost. Each request gets a
+fresh GPU, so cold starts take ~5-10 s, but there is no idle cost. Well suited
+to low-traffic agentic loops (self-improve, constitutional, validator-RLVR
+judge calls).
+
+## Connected to axentx/surrogate-1
+
+This Space serves inference. The orchestration Space at `axentx/surrogate-1`
+runs cron loops, bulk-mirror harvest, and state DBs; those loops can call THIS
+endpoint for actual model output instead of the free-tier API ladder.
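A note on the programmatic endpoint named above: Gradio auto-generates the API
routes for this Blocks app, so the exact api_name is not fixed anywhere in the
code. The sketch below is a minimal, hypothetical call; `view_api()` is the
authoritative way to discover the real endpoint names and signatures.

# Minimal sketch of a programmatic call via gradio_client.
# The api_name below is an assumption -- run client.view_api() first to see
# the endpoint names Gradio actually generated for this Space.
from gradio_client import Client

client = Client("ashirato/surrogate-1-zero-gpu")
client.view_api()  # prints the auto-generated endpoints and argument order
# result = client.predict("Write a least-privilege IAM policy for S3 read",
#                         api_name="/chat")  # hypothetical endpoint name
# print(result)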
app.py
ADDED
@@ -0,0 +1,157 @@
"""Surrogate-1 ZeroGPU Space - Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA.

Loads on CPU, swaps to ZeroGPU (A10G) per request via @spaces.GPU.
PRO subscription on owner account = 25K GPU-min/mo at $0.
"""
import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

SYSTEM = (
    "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
    "You handle Terraform/CDK/CFN, Kubernetes, IAM least-privilege, CVE "
    "remediation, SLO/runbooks, and full-stack code in Python/TypeScript/"
    "Go/Rust. Cite real APIs only - no phantom imports. When uncertain, "
    "say 'I don't know' rather than confabulate."
)

print(f"[boot] loading tokenizer: {BASE_MODEL}")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)

print(f"[boot] loading base model on CPU (will move to GPU on call): {BASE_MODEL}")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16,
    token=HF_TOKEN or None, trust_remote_code=True,
    device_map="cpu")

# Try to apply LoRA - graceful fallback to base if LoRA repo unavailable
LORA_ACTIVE = False
try:
    from peft import PeftModel
    print(f"[boot] applying LoRA: {LORA_REPO}")
    model = PeftModel.from_pretrained(model, LORA_REPO, token=HF_TOKEN or None)
    LORA_ACTIVE = True
    print("[boot] LoRA applied OK")
except Exception as e:
    print(f"[boot] LoRA apply failed (using base only): {e}")

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


def render_messages(history: list[tuple[str, str]], user_msg: str) -> str:
    """Build the chat-template prompt from (user, assistant) history pairs."""
    msgs = [{"role": "system", "content": SYSTEM}]
    for u, a in history:
        if u:
            msgs.append({"role": "user", "content": u})
        if a:
            msgs.append({"role": "assistant", "content": a})
    msgs.append({"role": "user", "content": user_msg})
    return tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True)


@spaces.GPU(duration=120)
def chat(user_msg, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
    if not user_msg or not user_msg.strip():
        yield ""
        return

    prompt_text = render_messages(history or [], user_msg)
    inputs = tokenizer(prompt_text, return_tensors="pt", truncation=True,
                       max_length=24000).to("cuda")
    model.to("cuda")

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=60)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature) if temperature > 0 else 1e-5,
        top_p=float(top_p),
        do_sample=temperature > 0,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
        use_cache=True,
    )
    # Run generate on a worker thread; stream tokens back on this one.
    th = Thread(target=model.generate, kwargs=gen_kwargs)
    th.start()
    out = ""
    for chunk in streamer:
        out += chunk
        yield out
    th.join()


CSS = """
.gradio-container { max-width: 1100px !important; }
.contain { font-family: ui-monospace, SFMono-Regular, monospace; }
"""

with gr.Blocks(title="Surrogate-1 ZeroGPU", css=CSS,
               theme=gr.themes.Base()) as demo:
    gr.Markdown(
        f"# Surrogate-1 ZeroGPU\n"
        f"**Base**: `{BASE_MODEL}`  \n"
        f"**LoRA**: `{LORA_REPO}` {'✅ active' if LORA_ACTIVE else '⚠️ base only (LoRA load failed)'}  \n"
        f"**Hardware**: ZeroGPU A10G (PRO subscription, 25K min/mo @ $0)\n"
    )

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=560,
                show_label=False,
                avatar_images=(None, None),
                bubble_full_width=False,
            )
            msg = gr.Textbox(
                placeholder="ask Surrogate-1 anything: code, devops, security, sre...",
                show_label=False,
                lines=2,
            )
            with gr.Row():
                submit = gr.Button("send", variant="primary")
                clear = gr.Button("clear")
        with gr.Column(scale=1):
            max_new = gr.Slider(64, 2048, value=512, step=64, label="max new tokens")
            temp = gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature")
            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p")

    def _user(user_msg, hist):
        # Append the user turn with a pending assistant slot, clear the box.
        return "", (hist or []) + [(user_msg, None)]

    def _bot(hist, mn, t, tp):
        if not hist:
            return
        user_msg = hist[-1][0]
        h_for_render = hist[:-1]
        for chunk in chat(user_msg, h_for_render, mn, t, tp):
            hist[-1] = (user_msg, chunk)
            yield hist

    submit.click(_user, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
    msg.submit(_user, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

    gr.Markdown(
        "---\n"
        "**API**: any caller can hit `/api/predict` on this Space (Gradio API).  \n"
        "**Programmatic**: `from gradio_client import Client; "
        "Client('ashirato/surrogate-1-zero-gpu').predict(...)`.  \n"
        "**Source**: [github.com/axentx/surrogate-1](https://github.com/axentx) "
        "(orchestration on `axentx/surrogate-1`, inference here)."
    )


if __name__ == "__main__":
    demo.queue(max_size=20).launch()
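Because chat() is a generator, its Gradio endpoint streams partial completions
rather than returning once. An orchestration loop (e.g. the axentx/surrogate-1
cron loops) could consume that stream with gradio_client's submit(), as in the
sketch below; the api_name is an assumption, since the _user/_bot event chain
gets auto-named by Gradio, so confirm it with view_api() first.

# Sketch: consuming the streaming endpoint from an orchestration loop.
# The api_name is hypothetical -- confirm the real one with client.view_api().
from gradio_client import Client

client = Client("ashirato/surrogate-1-zero-gpu")
job = client.submit("Draft a runbook for elevated 5xx on the api gateway",
                    api_name="/chat")  # hypothetical endpoint name
final = ""
for partial in job:   # each item is the accumulated text yielded so far
    final = partial
print(final)          # full completion once the generator finishes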
requirements.txt
ADDED
@@ -0,0 +1,9 @@
torch==2.4.0
transformers>=4.46.0
peft>=0.13.0
accelerate>=1.0.0
bitsandbytes>=0.44.0
sentencepiece
gradio>=4.44.0
spaces
huggingface_hub>=0.26.0