surrogate1 committed
Commit 6d6a848 · verified · 1 Parent(s): e42436a

init mirror from ashirato shard

Files changed (1)
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
+ """Surrogate-1 ZeroGPU Space — minimal, works.
+
+ Switched to Qwen2.5-Coder-3B (~6GB BF16 vs 14GB on 7B) for faster cold
+ load (≤60s on A10G). Same Surrogate-1 v1 LoRA applies — only base model
+ size differs. For long form / hard tasks, the chat ladder includes 7B
+ fallback via free APIs; this Space serves the fast path.
+ """
+ import os
+ import gradio as gr
+ import spaces
+ import torch
+
+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-3B-Instruct")
+ LORA_REPO = os.environ.get("LORA_REPO", "axentx/surrogate-1-coder-7b-lora-v1")
+ HF_TOKEN = os.environ.get("HF_TOKEN", "")
+
+ SYSTEM = (
+     "You are Surrogate-1, an expert DevSecOps + SRE + coding agent. "
+     "Cite real APIs only. Say 'I don't know' rather than confabulate."
+ )
+
+ # Module-level cache
+ _model = None
+ _tok = None
+
+
+ def _load_lazy():
+     """Load only inside the @spaces.GPU function (i.e., on the GPU worker)."""
+     global _model, _tok
+     if _model is not None:
+         return _model, _tok
+     from transformers import AutoModelForCausalLM, AutoTokenizer
+     _tok = AutoTokenizer.from_pretrained(
+         BASE_MODEL, token=HF_TOKEN or None, trust_remote_code=True)
+     if _tok.pad_token_id is None:
+         _tok.pad_token_id = _tok.eos_token_id
+     _model = AutoModelForCausalLM.from_pretrained(
+         BASE_MODEL, torch_dtype=torch.bfloat16,
+         token=HF_TOKEN or None, trust_remote_code=True,
+         device_map="cuda")
+     # LoRA optional — base model size mismatch (3B vs 7B) makes v1 LoRA
+     # incompatible. We serve the base 3B for now; on the 7B Space we apply LoRA.
+     if BASE_MODEL.endswith("7B-Instruct"):
+         try:
+             from peft import PeftModel
+             _model = PeftModel.from_pretrained(_model, LORA_REPO,
+                                                token=HF_TOKEN or None)
+         except Exception as e:
+             print(f"[load] LoRA skip: {e}")
+     return _model, _tok
+
+
+ @spaces.GPU(duration=300)
+ def respond(message, history, max_new_tokens=512, temperature=0.4):
+     if not message or not message.strip():
+         return ""
+     model, tok = _load_lazy()
+     msgs = [{"role": "system", "content": SYSTEM}]
+     for u, a in (history or []):
+         if u: msgs.append({"role": "user", "content": u})
+         if a: msgs.append({"role": "assistant", "content": a})
+     msgs.append({"role": "user", "content": message})
+
+     prompt = tok.apply_chat_template(msgs, tokenize=False,
+                                      add_generation_prompt=True)
+     inputs = tok(prompt, return_tensors="pt", truncation=True,
+                  max_length=8000).to("cuda")
+     out = model.generate(
+         **inputs,
+         max_new_tokens=int(max_new_tokens),
+         temperature=float(temperature) if temperature > 0 else 1e-5,
+         do_sample=temperature > 0,
+         pad_token_id=tok.pad_token_id,
+         eos_token_id=tok.eos_token_id,
+     )
+     new_tokens = out[0][inputs["input_ids"].shape[1]:]
+     return tok.decode(new_tokens, skip_special_tokens=True).strip()
+
+
+ demo = gr.ChatInterface(
+     fn=respond,
+     title=f"Surrogate-1 — {BASE_MODEL.split('/')[-1]}",
+     description=f"ZeroGPU A10G — {BASE_MODEL}. First request ~30-60s cold load.",
+     additional_inputs=[
+         gr.Slider(64, 2048, value=512, step=64, label="max new tokens"),
+         gr.Slider(0.0, 1.5, value=0.4, step=0.05, label="temperature"),
+     ],
+ )
+
+ if __name__ == "__main__":
+     demo.queue(max_size=10).launch()
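For reference, the Space this commit deploys can be queried programmatically once it is live. The snippet below is a minimal sketch using gradio_client; it assumes the Space is public, that its ID is axentx/surrogate-1 (a hypothetical placeholder, substitute the real owner/space), and that gr.ChatInterface exposes Gradio's default /chat endpoint. The two extra positional arguments mirror the additional_inputs sliders defined in app.py.

    from gradio_client import Client

    # Hypothetical Space ID: replace with the actual <owner>/<space> of this repo.
    client = Client("axentx/surrogate-1")

    reply = client.predict(
        "Write a systemd unit that restarts nginx on failure.",  # message
        512,   # max new tokens (slider default)
        0.4,   # temperature (slider default)
        api_name="/chat",  # ChatInterface's default endpoint name (assumed)
    )
    print(reply)

The first call pays the cold-load cost noted in the interface description (~30-60s); subsequent calls reuse the module-level model cache on the GPU worker.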