ashirato commited on
Commit
fe83bcf
·
verified ·
1 Parent(s): 8bdeff6

initial: Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA on ZeroGPU A10G

Browse files
Files changed (3) hide show
  1. README.md +35 -6
  2. app.py +157 -0
  3. requirements.txt +9 -0
README.md CHANGED
@@ -1,12 +1,41 @@
1
  ---
2
- title: Surrogate 1 Zero Gpu
3
  emoji: 🚀
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.13.0
8
  app_file: app.py
9
- pinned: false
 
 
 
 
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Surrogate-1 ZeroGPU
3
  emoji: 🚀
4
+ colorFrom: indigo
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ short_description: Surrogate-1 v1 LoRA on Qwen2.5-Coder-7B (ZeroGPU A10G)
12
+ suggested_hardware: zero-a10g
13
+ hf_oauth: false
14
+ models:
15
+ - Qwen/Qwen2.5-Coder-7B-Instruct
16
+ - axentx/surrogate-1-coder-7b-lora-v1
17
  ---
18
 
19
+ # Surrogate-1 ZeroGPU
20
+
21
+ DevSecOps + SRE + coding agent. Qwen2.5-Coder-7B-Instruct + Surrogate-1 v1
22
+ LoRA, served via HF ZeroGPU (A10G, 60-120s per request).
23
+
24
+ ## Endpoints
25
+
26
+ - Web UI: this Space (Gradio chat)
27
+ - HTTP API: `/api/predict` (auto-generated Gradio API; not an OpenAI-compatible endpoint)
28
+ - Use programmatically: `gradio_client.Client("ashirato/surrogate-1-zero-gpu")`
29
+
30
+ ## Why ZeroGPU
31
+
32
+ PRO unlocks 25K minutes/mo of A10G time at $0/mo. Each request gets fresh
33
+ GPU, so cold-start ~5-10s but no idle cost. Perfect for low-traffic
34
+ agentic loops (self-improve, constitutional, validator-RLVR judge calls).
35
+
36
+ ## Connected to axentx/surrogate-1
37
+
38
+ This Space serves inference. The orchestration Space at
39
+ `axentx/surrogate-1` runs cron loops + bulk-mirror harvest + state DBs;
40
+ those loops can call THIS endpoint for actual model output instead of
41
+ free-tier API ladder.
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Surrogate-1 ZeroGPU Space — Qwen2.5-Coder-7B + Surrogate-1 v1 LoRA.

Loads on CPU, swaps to ZeroGPU (A10G) per request via @spaces.GPU.
PRO subscription on owner account = 25K GPU-min/mo at $0.
"""
import os
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# Base model is fixed; the LoRA adapter repo is overridable via the
# LORA_REPO env var (e.g. to point at a newer adapter version).
BASE_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
# Empty string when unset; `HF_TOKEN or None` below turns that into None
# so anonymous access is used for public repos.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# System prompt prepended to every conversation by render_messages().
SYSTEM = (
    "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
    "You handle Terraform/CDK/CFN, Kubernetes, IAM least-privilege, CVE "
    "remediation, SLO/runbooks, and full-stack code in Python/TypeScript/"
    "Go/Rust. Cite real APIs only — no phantom imports. When uncertain, "
    "say 'I don't know' rather than confabulate."
)

print(f"[boot] loading tokenizer: {BASE_MODEL}")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)

# Weights stay on CPU at boot; chat() moves the model to CUDA inside the
# @spaces.GPU window, which is how ZeroGPU Spaces avoid idle GPU cost.
print(f"[boot] loading base model on CPU (will move to GPU on call): {BASE_MODEL}")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16,
    token=HF_TOKEN or None, trust_remote_code=True,
    device_map="cpu")

# Try to apply LoRA — graceful fallback to base if LoRA repo unavailable.
# LORA_ACTIVE is read later by the UI header to show adapter status.
LORA_ACTIVE = False
try:
    from peft import PeftModel
    print(f"[boot] applying LoRA: {LORA_REPO}")
    model = PeftModel.from_pretrained(model, LORA_REPO, token=HF_TOKEN or None)
    LORA_ACTIVE = True
    print("[boot] LoRA applied OK")
except Exception as e:
    # Deliberate broad catch: a missing/private adapter repo must not
    # prevent the Space from serving the base model.
    print(f"[boot] LoRA apply failed (using base only): {e}")

# Qwen tokenizers may ship without a pad token; generate() needs one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
48
+
49
+
50
def render_messages(history: list[tuple[str, str]], user_msg: str) -> str:
    """Serialize system prompt + prior turns + the new user turn.

    Returns the untokenized prompt string produced by the model's chat
    template, with the generation prompt appended so the model replies
    as the assistant.
    """
    conversation = [{"role": "system", "content": SYSTEM}]
    for user_turn, assistant_turn in history:
        # Skip empty halves (e.g. the pending turn has no assistant reply yet).
        if user_turn:
            conversation.append({"role": "user", "content": user_turn})
        if assistant_turn:
            conversation.append({"role": "assistant", "content": assistant_turn})
    conversation.append({"role": "user", "content": user_msg})
    return tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True)
58
+
59
+
60
@spaces.GPU(duration=120)
def chat(user_msg, history, max_new_tokens=512, temperature=0.4, top_p=0.9):
    """Generate a streamed reply on the ZeroGPU-allocated A10G.

    Args:
        user_msg: the new user message (blank input yields "" and returns).
        history: list of (user, assistant) tuples from the Gradio chatbot.
        max_new_tokens / temperature / top_p: generation controls from the
            UI sliders; temperature <= 0 means greedy decoding.

    Yields the cumulative response text after each decoded chunk.
    """
    if not user_msg or not user_msg.strip():
        yield ""
        return

    prompt_text = render_messages(history or [], user_msg)
    # 24000-token cap leaves headroom under the model context for the reply.
    inputs = tokenizer(prompt_text, return_tensors="pt", truncation=True,
                       max_length=24000).to("cuda")
    # Weights live on CPU between requests; move them inside the GPU window.
    model.to("cuda")

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=60)
    do_sample = float(temperature) > 0
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
        use_cache=True,
    )
    # Only pass sampling params when sampling: transformers warns (and the
    # values are ignored) if temperature/top_p accompany do_sample=False,
    # which the original triggered via the temperature=1e-5 workaround.
    if do_sample:
        gen_kwargs["temperature"] = float(temperature)
        gen_kwargs["top_p"] = float(top_p)

    # generate() runs in a worker thread and pushes tokens into `streamer`,
    # which this generator consumes. daemon=True so an abandoned stream
    # can never block interpreter shutdown.
    worker = Thread(target=model.generate, kwargs=gen_kwargs, daemon=True)
    worker.start()
    try:
        out = ""
        for chunk in streamer:
            out += chunk
            yield out
    finally:
        # Join even if the consumer raises/disconnects, so the GPU work
        # finishes inside the @spaces.GPU duration window.
        worker.join()
90
+
91
+
92
# Minimal CSS: widen the app container and use a monospace UI font.
CSS = """
.gradio-container { max-width: 1100px !important; }
.contain { font-family: ui-monospace, SFMono-Regular, monospace; }
"""

# UI layout: chat column (left) + generation-parameter sliders (right).
with gr.Blocks(title="Surrogate-1 ZeroGPU", css=CSS,
               theme=gr.themes.Base()) as demo:
    # Header reflects boot-time state (LORA_ACTIVE set during startup).
    gr.Markdown(
        f"# Surrogate-1 ZeroGPU\n"
        f"**Base**: `{BASE_MODEL}` \n"
        f"**LoRA**: `{LORA_REPO}` {'✅ active' if LORA_ACTIVE else '⚠️ base only (LoRA load failed)'} \n"
        f"**Hardware**: ZeroGPU A10G (PRO subscription, 25K min/mo @ $0)\n"
    )

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                height=560,
                show_label=False,
                avatar_images=(None, None),
                bubble_full_width=False,
            )
            msg = gr.Textbox(
                placeholder="ask Surrogate-1 anything: code, devops, security, sre...",
                show_label=False,
                lines=2,
            )
            with gr.Row():
                submit = gr.Button("send", variant="primary")
                clear = gr.Button("clear")
        with gr.Column(scale=1):
            # Defaults mirror chat()'s keyword defaults (512 / 0.4 / 0.9).
            max_new = gr.Slider(64, 2048, value=512, step=64, label="max new tokens")
            temp = gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature")
            top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="top_p")

    def _user(user_msg, hist):
        # Stage the user turn immediately (assistant side None until _bot fills it)
        # and clear the textbox.
        return "", (hist or []) + [(user_msg, None)]

    def _bot(hist, mn, t, tp):
        # Stream chat() output into the last history slot; each yield
        # re-renders the chatbot with the partial reply.
        if not hist: return
        user_msg = hist[-1][0]
        h_for_render = hist[:-1]
        partial = ""
        for chunk in chat(user_msg, h_for_render, mn, t, tp):
            partial = chunk
            hist[-1] = (user_msg, partial)
            yield hist

    # Both the button and Enter-in-textbox run the same two-step chain:
    # _user stages the turn synchronously (queue=False), then _bot streams.
    submit.click(_user, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
    msg.submit(_user, [msg, chatbot], [msg, chatbot], queue=False) \
        .then(_bot, [chatbot, max_new, temp, top_p], chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

    gr.Markdown(
        "---\n"
        "**API**: any caller can hit `/api/predict` on this Space (Gradio API). \n"
        "**Programmatic**: `from gradio_client import Client; "
        "Client('ashirato/surrogate-1-zero-gpu').predict(...)`. \n"
        "**Source**: [github.com/axentx/surrogate-1](https://github.com/axentx) "
        "(orchestration on `axentx/surrogate-1`, inference here)."
    )
154
+
155
+
156
if __name__ == "__main__":
    # Bounded request queue (20 waiting) before launching the web server.
    app = demo.queue(max_size=20)
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.4.0
2
+ transformers>=4.46.0
3
+ peft>=0.13.0
4
+ accelerate>=1.0.0
5
+ bitsandbytes>=0.44.0
6
+ sentencepiece
7
+ gradio>=4.44.0
8
+ spaces
9
+ huggingface_hub>=0.26.0