ashirato committed on
Commit 0367d10 · verified · 1 Parent(s): 16a3ce3

fix: use gr.ChatInterface (simpler sig, avoids _json_schema bug)
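For context: gr.ChatInterface takes a single chat function with a fixed (message, history, *additional_inputs) signature, so gradio_client introspects a flat, well-known schema instead of the custom Blocks endpoint graph that tripped the _json_schema_to_python_type recursion. A minimal sketch of the pattern this commit adopts (toy echo function, not this Space's actual code; see the diff below):

import gradio as gr

# ChatInterface calls fn with the latest user message plus the chat
# history and renders the returned string as the bot reply.
def respond(message, history):
    return f"echo: {message}"

demo = gr.ChatInterface(fn=respond, title="minimal ChatInterface")

if __name__ == "__main__":
    demo.queue().launch()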

Files changed (1)
  app.py +46 -104
app.py CHANGED
@@ -1,14 +1,14 @@
-"""Surrogate-1 ZeroGPU Space — Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA.
+"""Surrogate-1 ZeroGPU Space — Qwen2.5-Coder-7B + v1 LoRA.

-Loads on CPU, swaps to ZeroGPU (A10G) per request via @spaces.GPU.
-PRO subscription on owner account = 25K GPU-min/mo at $0.
+Rewritten 2026-04-30 to use gr.ChatInterface (simpler signature, avoids
+the gradio_client._json_schema_to_python_type recursion bug that broke
+the previous custom-Blocks app.py).
 """
 import os
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from threading import Thread
+from transformers import AutoModelForCausalLM, AutoTokenizer

 BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
 LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
@@ -16,60 +16,48 @@ HF_TOKEN = os.environ.get("HF_TOKEN", "")

 SYSTEM = (
     "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
-    "You handle Terraform/CDK/CFN, Kubernetes, IAM least-privilege, CVE "
-    "remediation, SLO/runbooks, and full-stack code in Python/TypeScript/"
-    "Go/Rust. Cite real APIs only — no phantom imports. When uncertain, "
-    "say 'I don't know' rather than confabulate."
+    "Cite real APIs only — no phantom imports. When uncertain, say "
+    "'I don't know' rather than confabulate."
 )

-print(f"[boot] loading tokenizer: {BASE_MODEL}")
+print(f"[boot] tokenizer: {BASE_MODEL}")
 tokenizer = AutoTokenizer.from_pretrained(
     BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
+if tokenizer.pad_token_id is None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id

-print(f"[boot] loading base model on CPU (will move to GPU on call): {BASE_MODEL}")
+print(f"[boot] base model on CPU: {BASE_MODEL}")
 model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL, torch_dtype=torch.bfloat16,
     token=HF_TOKEN or None, trust_remote_code=True,
     device_map="cpu")

-# Try to apply LoRA — graceful fallback to base if LoRA repo unavailable
 LORA_ACTIVE = False
 try:
     from peft import PeftModel
-    print(f"[boot] applying LoRA: {LORA_REPO}")
+    print(f"[boot] LoRA: {LORA_REPO}")
     model = PeftModel.from_pretrained(model, LORA_REPO, token=HF_TOKEN or None)
     LORA_ACTIVE = True
-    print("[boot] LoRA applied OK")
+    print("[boot] LoRA applied")
 except Exception as e:
-    print(f"[boot] LoRA apply failed (using base only): {e}")
-
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
+    print(f"[boot] LoRA failed (using base only): {e}")


-def render_messages(history: list[tuple[str, str]], user_msg: str) -> str:
+@spaces.GPU(duration=120)
+def respond(message, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
     msgs = [{"role": "system", "content": SYSTEM}]
-    for u, a in history:
+    for u, a in (history or []):
         if u: msgs.append({"role": "user", "content": u})
         if a: msgs.append({"role": "assistant", "content": a})
-    msgs.append({"role": "user", "content": user_msg})
-    return tokenizer.apply_chat_template(
-        msgs, tokenize=False, add_generation_prompt=True)
-
+    msgs.append({"role": "user", "content": message})

-@spaces.GPU(duration=120)
-def chat(user_msg, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
-    if not user_msg or not user_msg.strip():
-        yield ""; return
-
-    prompt_text = render_messages(history or [], user_msg)
-    inputs = tokenizer(prompt_text, return_tensors="pt", truncation=True,
+    prompt = tokenizer.apply_chat_template(
+        msgs, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                        max_length=24000).to("cuda")
     model.to("cuda")

-    streamer = TextIteratorStreamer(
-        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=60)
-    gen_kwargs = dict(
+    out = model.generate(
         **inputs,
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature) if temperature > 0 else 1e-5,
@@ -77,81 +65,35 @@ def chat(user_msg, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
         do_sample=temperature > 0,
         pad_token_id=tokenizer.pad_token_id,
         eos_token_id=tokenizer.eos_token_id,
-        streamer=streamer,
         use_cache=True,
     )
-    th = Thread(target=model.generate, kwargs=gen_kwargs)
-    th.start()
-    out = ""
-    for chunk in streamer:
-        out += chunk
-        yield out
-    th.join()
+    new_tokens = out[0][inputs["input_ids"].shape[1]:]
+    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


-CSS = """
-.gradio-container { max-width: 1100px !important; }
-.contain { font-family: ui-monospace, SFMono-Regular, monospace; }
-"""
-
-with gr.Blocks(title="Surrogate-1 ZeroGPU", css=CSS,
-               theme=gr.themes.Base()) as demo:
-    gr.Markdown(
-        f"# Surrogate-1 ZeroGPU\n"
-        f"**Base**: `{BASE_MODEL}`  \n"
-        f"**LoRA**: `{LORA_REPO}` {'✅ active' if LORA_ACTIVE else '⚠️ base only (LoRA load failed)'}  \n"
-        f"**Hardware**: ZeroGPU A10G (PRO subscription, 25K min/mo @ $0)\n"
-    )
-
-    with gr.Row():
-        with gr.Column(scale=4):
-            chatbot = gr.Chatbot(
-                height=560,
-                show_label=False,
-                avatar_images=(None, None),
-                bubble_full_width=False,
-            )
-            msg = gr.Textbox(
-                placeholder="ask Surrogate-1 anything: code, devops, security, sre...",
-                show_label=False,
-                lines=2,
-            )
-            with gr.Row():
-                submit = gr.Button("send", variant="primary")
-                clear = gr.Button("clear")
-        with gr.Column(scale=1):
-            max_new = gr.Slider(64, 2048, value=512, step=64, label="max new tokens")
-            temp = gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature")
-            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p")
-
-    def _user(user_msg, hist):
-        return "", (hist or []) + [(user_msg, None)]
-
-    def _bot(hist, mn, t, tp):
-        if not hist: return
-        user_msg = hist[-1][0]
-        h_for_render = hist[:-1]
-        partial = ""
-        for chunk in chat(user_msg, h_for_render, mn, t, tp):
-            partial = chunk
-            hist[-1] = (user_msg, partial)
-            yield hist
-
-    submit.click(_user, [msg, chatbot], [msg, chatbot], queue=False) \
-        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
-    msg.submit(_user, [msg, chatbot], [msg, chatbot], queue=False) \
-        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
-    clear.click(lambda: None, None, chatbot, queue=False)
-
-    gr.Markdown(
-        "---\n"
-        "**API**: any caller can hit `/api/predict` on this Space (Gradio API).  \n"
-        "**Programmatic**: `from gradio_client import Client; "
-        "Client('ashirato/surrogate-1-zero-gpu').predict(...)`.  \n"
-        "**Source**: [github.com/axentx/surrogate-1](https://github.com/axentx) "
-        "(orchestration on `axentx/surrogate-1`, inference here)."
-    )
-
+desc = (
+    f"**Base**: `{BASE_MODEL}`     "
+    f"**LoRA**: `{LORA_REPO}` "
+    f"{'✅ active' if LORA_ACTIVE else '⚠️ base only'}<br>"
+    f"**Hardware**: ZeroGPU A10G (PRO subscription, 25K min/mo @ $0)"
+)
+
+demo = gr.ChatInterface(
+    fn=respond,
+    title="Surrogate-1 — DevSecOps + SRE + Code Agent",
+    description=desc,
+    additional_inputs=[
+        gr.Slider(64, 2048, value=512, step=64, label="max new tokens"),
+        gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature"),
+        gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p"),
+    ],
+    examples=[
+        "Write a Terraform module for an S3 bucket with KMS encryption + versioning.",
+        "Diagnose: AWS Lambda cold start latency 3s. Architecture suggestions?",
+        "Review this IAM policy for least-privilege violations: <paste here>",
+        "Implement rate-limit per-API-key in FastAPI with Redis.",
+    ],
+)

 if __name__ == "__main__":
     demo.queue(max_size=20).launch()
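
With ChatInterface in place, remote callers use the standard chat endpoint rather than the old custom Blocks routes. A sketch of programmatic access via gradio_client; "/chat" is Gradio's default api_name for ChatInterface and the trailing argument order mirrors additional_inputs above, but both are assumptions here, so verify with client.view_api() before depending on them:

from gradio_client import Client

client = Client("ashirato/surrogate-1-zero-gpu")

# Positional args: the user message first, then the additional_inputs
# sliders in the order declared in app.py (max new tokens, temperature,
# top_p). Check client.view_api() for the exact signature.
reply = client.predict(
    "Write a Terraform module for an S3 bucket with KMS encryption.",
    512,
    0.4,
    0.9,
    api_name="/chat",
)
print(reply)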