Spaces:
Running
Running
v3.5: top-level imports + @spaces.GPU (canonical GRM-2.6-Opus pattern)
Browse files
app.py
CHANGED
|
@@ -1,50 +1,87 @@
|
|
| 1 |
-
import os, json
|
| 2 |
-
import gradio as gr
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
m = gr.Dropdown(choices=list(SPECS.keys()), value="llama3.2-1b", label="Model")
|
| 39 |
-
p = gr.Textbox(label="Prompt", lines=3)
|
|
|
|
| 40 |
t = gr.Slider(0.0, 2.0, 0.7, step=0.05, label="Temperature")
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
gr.
|
| 47 |
-
gr.
|
| 48 |
-
gr.
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
demo.launch(mcp_server=True)
|
|
|
|
| 1 |
+
import os, time, json
|
| 2 |
+
import spaces, torch, gradio as gr
|
| 3 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
|
| 4 |
+
from threading import Thread
|
| 5 |
+
|
| 6 |
+
# Model registry: UI key -> (Hugging Face repo id, VRAM estimate).
# The second element is unpacked as `vram` by load_model; presumably GB,
# TODO confirm the unit.
SPECS = {
    "llama3.2-1b": (
        "meta-llama/Llama-3.2-1B-Instruct",
        1.5,
    ),
    "qwen2.5-0.5b": (
        "Qwen/Qwen2.5-0.5B-Instruct",
        1.0,
    ),
    "deepseek-r1-1.5b": (
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        2.2,
    ),
    "gemma3-4b": (
        "google/gemma-3-4b-it",
        6.0,
    ),
}
|
| 12 |
+
|
| 13 |
+
# Quantization settings passed to every from_pretrained call in load_model:
# 4-bit NF4 weights with double quantization, computing in bfloat16.
BNB = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
|
| 17 |
+
|
| 18 |
+
def load_model(spec_id):
    """Load the tokenizer and quantized model registered under ``spec_id``.

    Args:
        spec_id: Key into ``SPECS``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        KeyError: If ``spec_id`` is not a key of ``SPECS``.
    """
    # The VRAM figure is part of the registry entry but is not used here.
    repo_id, _vram = SPECS[spec_id]

    # NOTE(review): trust_remote_code=True lets the hub repo supply its own
    # Python code; confirm this is still wanted for the repos in SPECS.
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Fall back to EOS so generate() always has a pad token id to pass.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        trust_remote_code=True,
        quantization_config=BNB,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )
    return model, tokenizer
|
| 27 |
+
|
| 28 |
+
# Loaded (model, tokenizer) pairs, keyed by spec id.
_model_cache = {}


def get_model(spec_id):
    """Return the ``(model, tokenizer)`` pair for ``spec_id``, loading it once.

    The first request for a spec id calls ``load_model`` and stores the
    result in ``_model_cache``; later requests return the stored pair.
    """
    try:
        return _model_cache[spec_id]
    except KeyError:
        loaded = load_model(spec_id)
        _model_cache[spec_id] = loaded
        return _model_cache[spec_id]
|
| 34 |
+
|
| 35 |
+
@spaces.GPU(duration=lambda *a: 180, size="large")
def generate(spec_id, prompt, system, temp, top_p, max_tokens):
    """Stream a chat completion from the selected model.

    Args:
        spec_id: Key into SPECS selecting which model to run.
        prompt: The user message.
        system: Optional system message; omitted from the chat when empty.
        temp: Sampling temperature. Values of 0.01 or below switch to
            greedy decoding.
        top_p: Nucleus-sampling cutoff; only used when sampling.
        max_tokens: Maximum number of new tokens to generate.

    Yields:
        The text generated so far, growing with each streamed chunk.

    Raises:
        Exception: Any error raised by the background ``generate`` call is
            re-raised here once the stream has been drained.
    """
    mod, tok = get_model(spec_id)

    msgs = []
    if system:
        msgs.append({"role": "system", "content": system})
    msgs.append({"role": "user", "content": prompt})
    txt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    # The chat template has already written the special tokens (BOS etc.)
    # into `txt`; letting the tokenizer add them again would duplicate them.
    inp = tok(
        txt,
        return_tensors="pt",
        truncation=True,
        max_length=8192,
        add_special_tokens=False,
    ).to(next(mod.parameters()).device)

    s = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

    sampling = temp > 0.01
    kw = dict(
        **inp,
        streamer=s,
        # UI/API callers may deliver the slider value as a float.
        max_new_tokens=int(max_tokens),
        do_sample=sampling,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )
    if sampling:
        # Sampling knobs are meaningless for greedy decoding, so they are
        # only passed when sampling is actually enabled.
        kw.update(temperature=temp, top_p=top_p)

    failures = []

    def _run_generation():
        # Runs in the background thread. If generation fails, the streamer
        # never receives its stop signal on its own, so close it explicitly
        # to keep the consuming loop below from blocking forever.
        try:
            mod.generate(**kw)
        except Exception as exc:  # thread boundary: captured and re-raised below
            failures.append(exc)
            s.end()

    worker = Thread(target=_run_generation)
    worker.start()

    raw = ""
    for chunk in s:
        raw += chunk
        yield raw

    worker.join()
    if failures:
        raise failures[0]
|
| 54 |
+
|
| 55 |
+
@spaces.GPU(duration=lambda *a: 180, size="large")
def temp_sweep(spec_id, prompt, temp_range, top_p, max_tokens):
    """Run the same prompt at several temperatures and collect the outputs.

    Args:
        spec_id: Key into SPECS selecting which model to run.
        prompt: The user message sent at every temperature.
        temp_range: Comma-separated temperatures, e.g. "0.0,0.3,0.6";
            blank entries are ignored.
        top_p: Nucleus-sampling cutoff forwarded to ``generate``.
        max_tokens: Maximum number of new tokens per run.

    Returns:
        One "--- T=x.xx ---" section per temperature, joined by newlines.

    Raises:
        ValueError: If a non-blank entry of ``temp_range`` is not a number.
    """
    # Parse every temperature up front so a bad entry fails before any
    # generation work starts.
    temperatures = []
    for field in temp_range.split(","):
        cleaned = field.strip()
        if cleaned:
            temperatures.append(float(cleaned))

    sections = []
    for temperature in temperatures:
        # generate() yields the cumulative text; only the last value matters.
        final_text = ""
        for partial in generate(spec_id, prompt, "", temperature, top_p, max_tokens):
            final_text = partial
        sections.append(f"--- T={temperature:.2f} ---\n{final_text.strip()}\n")

    return "\n".join(sections)
|
| 65 |
+
|
| 66 |
+
# Gradio front end: one tab for single-prompt chat and one for the
# temperature sweep, followed by a footer line.
with gr.Blocks(title="NEXUS LAB v3.5") as demo:
    gr.Markdown("# NEXUS LAB v3.5 - ZeroGPU Team Edition")

    # Chat tab: inputs map one-to-one onto generate()'s parameters.
    with gr.Tab("Chat"):
        m = gr.Dropdown(
            choices=list(SPECS.keys()),
            value="llama3.2-1b",
            label="Model",
        )
        p = gr.Textbox(
            label="Prompt",
            lines=3,
            value="Explain the NEXUS OS TICOS guard system.",
        )
        sy = gr.Textbox(label="System", value="Think step by step.")
        t = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
        pp = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
        mt = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, label="Max Tokens")
        o = gr.Textbox(label="Output", lines=20)
        chat_button = gr.Button("Generate", variant="primary")
        chat_button.click(fn=generate, inputs=[m, p, sy, t, pp, mt], outputs=o)

    # Sweep tab: inputs map one-to-one onto temp_sweep()'s parameters.
    with gr.Tab("Temp Sweep"):
        m2 = gr.Dropdown(
            choices=list(SPECS.keys()),
            value="llama3.2-1b",
            label="Model",
        )
        p2 = gr.Textbox(label="Prompt")
        tr = gr.Textbox(label="Temps", value="0.0,0.3,0.6,0.9,1.2")
        pp2 = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
        mt2 = gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Max Tokens")
        o2 = gr.Textbox(label="Results", lines=20)
        sweep_button = gr.Button("Run Sweep", variant="primary")
        sweep_button.click(fn=temp_sweep, inputs=[m2, p2, tr, pp2, mt2], outputs=o2)

    gr.Markdown("---")
    gr.Markdown("MCP: /gradio_api/mcp/sse | ZeroGPU Team 40min/day | Bucket: specimba/nexus-vap-bucket")

# Start the app with the MCP server enabled.
demo.launch(mcp_server=True)
|