specimba committed on
Commit
7017e8e
·
verified ·
1 Parent(s): 521505b

v3.5: top-level imports + @spaces.GPU (canonical GRM-2.6-Opus pattern)

Browse files
Files changed (1) hide show
  1. app.py +84 -47
app.py CHANGED
@@ -1,50 +1,87 @@
1
- import os, json
2
- import gradio as gr
3
-
4
# Each entry maps a short UI key to (Hugging Face repo id, number).
# The number is unpacked as `vram` by the consumer -- presumably an
# approximate footprint in GB; TODO confirm the unit.
SAFE = {
    "qwen2.5-0.5b": ("Qwen/Qwen2.5-0.5B-Instruct", 1.0),
    "llama3.2-1b": ("meta-llama/Llama-3.2-1B-Instruct", 1.5),
}

# Full model table: every SAFE entry plus two larger checkpoints.
SPECS = {
    **SAFE,
    "deepseek-r1-1.5b": ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", 2.2),
    "gemma3-4b": ("google/gemma-3-4b-it", 6.0),
}
6
-
7
def gpu_ok():
    """Return True when torch reports an available CUDA device, else False.

    torch is imported inside the function, so a missing or broken torch
    install is reported as "no GPU" instead of failing at import time.
    """
    try:
        import torch

        return torch.cuda.is_available()
    except Exception:
        # Best-effort probe: any failure means "no usable GPU". Unlike a
        # bare `except:`, KeyboardInterrupt and SystemExit still propagate.
        return False
12
-
13
def gen(m, p, t):
    """Generate a completion for prompt *p* with the model keyed by *m*.

    Args:
        m: Key into SPECS selecting the checkpoint to load.
        p: Prompt text, sent as a single "user" chat message.
        t: Temperature; sampling is enabled only when t > 0.01.

    Returns:
        The decoded first output sequence with special tokens stripped, or
        an "ERROR: ..." string when no GPU is available.

    Raises:
        KeyError: If *m* is not a key of SPECS.
    """
    if not gpu_ok():
        return "ERROR: No GPU. Personal accounts need PRO for ZeroGPU. Use MCP client for 25min/day free, or run hf jobs for pay-as-you-go GPU. https://huggingface.co/mcp"

    # NOTE(review): `spaces` is not referenced below; the import is kept in
    # case it is needed for its side effects -- confirm before removing.
    import spaces
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    hid, _vram = SPECS[m]  # second tuple element is not used here

    # 4-bit quantization is best-effort: if the config cannot be built the
    # model is loaded unquantized instead of failing the request.
    try:
        bnb = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    except Exception:
        bnb = None

    # NOTE(security): trust_remote_code=True runs Python shipped in the model
    # repo; the repo ids come only from the hard-coded SPECS table.
    tok = AutoTokenizer.from_pretrained(hid, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    kw = {
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
        "torch_dtype": torch.bfloat16,
    }
    if bnb:
        kw["quantization_config"] = bnb
        kw["device_map"] = "auto"
    mod = AutoModelForCausalLM.from_pretrained(hid, **kw)

    msgs = [{"role": "user", "content": p}]
    txt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    # Inputs are moved to whichever device holds the model's first parameter.
    device = next(mod.parameters()).device
    inp = tok(txt, return_tensors="pt", truncation=True, max_length=16384).to(device)

    out = mod.generate(
        **inp,
        max_new_tokens=512,
        do_sample=t > 0.01,
        temperature=max(t, 1e-5),  # clamp so the value is never zero
        top_p=0.95,
        top_k=40,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )
    return tok.decode(out[0], skip_special_tokens=True)
34
-
35
# Gradio UI (v3.4): one model dropdown, a prompt box, a temperature slider
# and an output box wired to a single "Generate" button.
with gr.Blocks(title="NEXUS LAB v3.4") as demo:
    gr.Markdown("# NEXUS LAB v3.4")
    # The status line is computed once, when the UI is built.
    gr.Markdown("**Status:** " + ("GPU Ready" if gpu_ok() else "CPU Only — No GPU detected"))
    m = gr.Dropdown(choices=list(SPECS.keys()), value="llama3.2-1b", label="Model")
    p = gr.Textbox(label="Prompt", lines=3)
    t = gr.Slider(0.0, 2.0, 0.7, step=0.05, label="Temperature")
    o = gr.Textbox(label="Output", lines=15)

    def wrapped(m, p, t):
        """Run gen(m, p, t), or return an error string when no GPU is found."""
        if not gpu_ok():
            return "ERROR: No GPU. Enable ZeroGPU (PRO), use MCP for 25min/day free: https://huggingface.co/mcp, or run hf jobs."
        return gen(m, p, t)

    gr.Button("Generate", variant="primary").click(wrapped, [m, p, t], o)
    gr.Markdown("---")
    gr.Markdown("MCP: `/gradio_api/mcp/sse` — Use MCP client for free GPU via agents.md")
    gr.Markdown("Jobs: `hf jobs run --flavor gpu-t4-small python train.py` — Pay-as-you-go, no PRO needed")

demo.launch(mcp_server=True)
 
1
+ import os, time, json
2
+ import spaces, torch, gradio as gr
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
4
+ from threading import Thread
5
+
6
# Model table: short UI key -> (Hugging Face repo id, number).
# The number is unpacked as `vram` by load_model -- presumably an
# approximate footprint in GB; TODO confirm the unit.
SPECS = {
    "llama3.2-1b": ("meta-llama/Llama-3.2-1B-Instruct", 1.5),
    "qwen2.5-0.5b": ("Qwen/Qwen2.5-0.5B-Instruct", 1.0),
    "deepseek-r1-1.5b": ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", 2.2),
    "gemma3-4b": ("google/gemma-3-4b-it", 6.0),
}
12
+
13
# Shared quantization config, built once at import time and passed to every
# model load: 4-bit NF4 weights, double quantization, bfloat16 compute dtype.
BNB = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
17
+
18
def load_model(spec_id):
    """Load the model and tokenizer registered under *spec_id*.

    The model is loaded with the module-level BNB quantization config,
    bfloat16 dtype and automatic device placement.

    Args:
        spec_id: Key into SPECS.

    Returns:
        A (model, tokenizer) tuple.

    Raises:
        KeyError: If *spec_id* is not a key of SPECS.
    """
    hid, _vram = SPECS[spec_id]  # second tuple element is not used here

    # NOTE(security): trust_remote_code=True runs Python shipped in the model
    # repo; the repo ids come only from the hard-coded SPECS table.
    tok = AutoTokenizer.from_pretrained(hid, trust_remote_code=True)
    if tok.pad_token is None:
        # Fall back to the EOS token so a pad id is always defined.
        tok.pad_token = tok.eos_token

    mod = AutoModelForCausalLM.from_pretrained(
        hid,
        trust_remote_code=True,
        quantization_config=BNB,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )
    return mod, tok
27
+
28
# Process-wide cache: spec_id -> (model, tokenizer). Entries are never evicted.
_model_cache = {}


def get_model(spec_id):
    """Return the cached (model, tokenizer) pair, loading it on first use.

    Args:
        spec_id: Key into SPECS, forwarded to load_model on a cache miss.

    Returns:
        The (model, tokenizer) tuple produced by load_model.
    """
    cached = _model_cache.get(spec_id)
    if cached is None:
        cached = _model_cache[spec_id] = load_model(spec_id)
    return cached
34
+
35
@spaces.GPU(duration=lambda *a: 180, size="large")
def generate(spec_id, prompt, system, temp, top_p, max_tokens):
    """Stream a chat completion, yielding the accumulated text so far.

    Args:
        spec_id: Key into SPECS selecting the model.
        prompt: User message.
        system: Optional system message; omitted when empty.
        temp: Temperature; sampling is enabled only when temp > 0.01.
        top_p: Nucleus-sampling threshold passed to the model.
        max_tokens: Upper bound on newly generated tokens.

    Yields:
        The text generated so far (the streamer skips the prompt), growing
        with every chunk.

    Raises:
        Exception: Whatever the background generation call raised, re-raised
            here once the stream has been drained.
    """
    mod, tok = get_model(spec_id)

    msgs = []
    if system:
        msgs.append({"role": "system", "content": system})
    msgs.append({"role": "user", "content": prompt})
    txt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    # Inputs are moved to whichever device holds the model's first parameter.
    device = next(mod.parameters()).device
    inp = tok(txt, return_tensors="pt", truncation=True, max_length=8192).to(device)

    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    kw = dict(
        **inp,
        streamer=streamer,
        # UI sliders may deliver a float; the token budget must be an int.
        max_new_tokens=int(max_tokens),
        do_sample=temp > 0.01,
        temperature=max(temp, 1e-5),  # clamp so the value is never zero
        top_p=top_p,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )

    failure = []

    def _run_generate():
        # Generation runs in a worker thread so this generator can consume
        # the streamer concurrently. If it fails, the streamer would never
        # receive its stop signal and the loop below would block forever,
        # so record the error and close the stream explicitly.
        try:
            mod.generate(**kw)
        except Exception as exc:
            failure.append(exc)
            streamer.end()

    worker = Thread(target=_run_generate, daemon=True)
    worker.start()

    raw = ""
    for chunk in streamer:
        raw += chunk
        yield raw

    worker.join()
    if failure:
        raise failure[0]
54
+
55
@spaces.GPU(duration=lambda *a: 180, size="large")
def temp_sweep(spec_id, prompt, temp_range, top_p, max_tokens):
    """Run the same prompt at several temperatures and collect the outputs.

    Args:
        spec_id: Key into SPECS selecting the model.
        prompt: User message, sent without a system message.
        temp_range: Comma-separated temperatures, e.g. "0.0,0.3,0.6";
            blank entries are ignored.
        top_p: Nucleus-sampling threshold forwarded to generate.
        max_tokens: Token budget forwarded to generate.

    Returns:
        One "--- T=x.xx ---" section per temperature, joined by newlines.
        An empty string when *temp_range* contains no values.

    Raises:
        ValueError: If an entry of *temp_range* is not a number.
    """
    # NOTE(review): this calls generate(), which carries its own @spaces.GPU
    # decorator -- confirm that nesting GPU-decorated calls is supported.
    temps = []
    for field in temp_range.split(","):
        field = field.strip()
        if not field:
            continue
        try:
            temps.append(float(field))
        except ValueError as err:
            raise ValueError(
                f"Invalid temperature {field!r}: expected comma-separated numbers"
            ) from err

    sections = []
    for temp in temps:
        final = ""
        # generate() yields the accumulated text; only the last value matters.
        for final in generate(spec_id, prompt, "", temp, top_p, max_tokens):
            pass
        sections.append(f"--- T={temp:.2f} ---\n{final.strip()}\n")
    return "\n".join(sections)
65
+
66
# Gradio UI (v3.5): a streaming "Chat" tab and a "Temp Sweep" tab that runs
# one prompt at several temperatures.
with gr.Blocks(title="NEXUS LAB v3.5") as demo:
    gr.Markdown("# NEXUS LAB v3.5 - ZeroGPU Team Edition")

    with gr.Tab("Chat"):
        m = gr.Dropdown(choices=list(SPECS.keys()), value="llama3.2-1b", label="Model")
        p = gr.Textbox(label="Prompt", lines=3, value="Explain the NEXUS OS TICOS guard system.")
        sy = gr.Textbox(label="System", value="Think step by step.")
        t = gr.Slider(0.0, 2.0, 0.7, step=0.05, label="Temperature")
        pp = gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p")
        mt = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
        o = gr.Textbox(label="Output", lines=20)
        # generate is a generator, so the output box receives each partial text.
        gr.Button("Generate", variant="primary").click(generate, [m, p, sy, t, pp, mt], o)

    with gr.Tab("Temp Sweep"):
        m2 = gr.Dropdown(choices=list(SPECS.keys()), value="llama3.2-1b", label="Model")
        p2 = gr.Textbox(label="Prompt")
        tr = gr.Textbox(label="Temps", value="0.0,0.3,0.6,0.9,1.2")
        pp2 = gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p")
        mt2 = gr.Slider(64, 1024, 256, step=64, label="Max Tokens")
        o2 = gr.Textbox(label="Results", lines=20)
        gr.Button("Run Sweep", variant="primary").click(temp_sweep, [m2, p2, tr, pp2, mt2], o2)

    gr.Markdown("---")
    gr.Markdown("MCP: /gradio_api/mcp/sse | ZeroGPU Team 40min/day | Bucket: specimba/nexus-vap-bucket")

demo.launch(mcp_server=True)