Spaces:
Running
Running
v3.4 safe mode: lazy import transformers only when GPU available
Browse files
app.py
CHANGED
|
@@ -1,59 +1,50 @@
|
|
| 1 |
-
import os,
|
| 2 |
-
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
'llama3.2-1b': ('meta-llama/Llama-3.2-1B-Instruct', 1.5, 'public'),
|
| 7 |
-
'deepseek-r1-1.5b': ('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', 2.2, 'public'),
|
| 8 |
-
'gemma3-4b': ('google/gemma-3-4b-it', 6.0, 'public'),
|
| 9 |
-
}
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
)
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
|
| 32 |
-
def gen(m, p, t, top_p):
|
| 33 |
-
if not p or not p.strip():
|
| 34 |
-
return 'Empty prompt.'
|
| 35 |
-
hid, vram, status = SPECS[m]
|
| 36 |
-
mod, tok = load_kv(hid)
|
| 37 |
-
msgs = [{'role':'user','content':p.strip()}]
|
| 38 |
-
txt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
|
| 39 |
-
inp = tok(txt, return_tensors='pt', truncation=True, max_length=16384).to(next(mod.parameters()).device)
|
| 40 |
-
out = mod.generate(
|
| 41 |
-
**inp, max_new_tokens=512, do_sample=t>0.01,
|
| 42 |
-
temperature=max(t,1e-5), top_p=top_p, top_k=40,
|
| 43 |
-
repetition_penalty=1.0, use_cache=True,
|
| 44 |
-
pad_token_id=tok.pad_token_id, eos_token_id=tok.eos_token_id,
|
| 45 |
-
)
|
| 46 |
-
r = tok.decode(out[0], skip_special_tokens=True)
|
| 47 |
-
return r
|
| 48 |
-
|
| 49 |
-
with gr.Blocks(title='NEXUS LAB v3.3 - ZeroGPU') as demo:
|
| 50 |
-
gr.Markdown('## NEXUS LAB v3.3 - ZeroGPU Inference')
|
| 51 |
-
gr.Markdown('Enable ZeroGPU in Settings > Hardware. First load may take 1-2 minutes.')
|
| 52 |
-
m = gr.Dropdown(choices=list(SPECS.keys()), value='llama3.2-1b', label='Model')
|
| 53 |
-
p = gr.Textbox(label='Prompt', lines=3, value='Write a purple team strategy.')
|
| 54 |
-
t = gr.Slider(0.0, 2.0, 0.7, step=0.05, label='Temperature')
|
| 55 |
-
top_p = gr.Slider(0.1, 1.0, 0.95, step=0.05, label='Top-p')
|
| 56 |
-
o = gr.Textbox(label='Output', lines=15)
|
| 57 |
-
gr.Button('Generate', variant='primary').click(gen, [m, p, t, top_p], o)
|
| 58 |
-
gr.Markdown('MCP: /gradio_api/mcp/sse')
|
| 59 |
-
demo.queue()
|
|
|
|
| 1 |
+
import os, json
|
| 2 |
+
import gradio as gr
|
| 3 |
|
| 4 |
+
# Model tables: UI key -> (Hugging Face Hub repo id, size figure).
# The second tuple element is unpacked as `vram` by gen(); presumably an
# approximate memory footprint in GB — TODO confirm units.
SAFE = {
    "qwen2.5-0.5b": ("Qwen/Qwen2.5-0.5B-Instruct", 1.0),
    "llama3.2-1b": ("meta-llama/Llama-3.2-1B-Instruct", 1.5),
}

# Full catalogue: every SAFE entry first, then the additional checkpoints.
SPECS = dict(SAFE)
SPECS.update(
    {
        "deepseek-r1-1.5b": ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", 2.2),
        "gemma3-4b": ("google/gemma-3-4b-it", 6.0),
    }
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
def gpu_ok():
    """Return True when PyTorch can be imported and reports CUDA as available.

    torch is imported inside the function so that merely importing this
    module does not require (or pay for) a torch import.

    Returns:
        bool: the result of ``torch.cuda.is_available()``, or False when
        torch cannot be imported or the probe itself raises.
    """
    try:
        import torch

        return torch.cuda.is_available()
    except Exception:
        # Best-effort probe: any import or driver failure means "no GPU".
        # Deliberately not a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
|
| 12 |
|
| 13 |
+
def gen(m, p, t):
    """Generate a chat reply to prompt *p* with the model selected by *m*.

    Failures the user can act on are reported as returned strings rather
    than raised, matching the existing "ERROR: ..." convention.

    Args:
        m: Key into ``SPECS`` selecting the Hub checkpoint to load.
        p: User prompt text; surrounding whitespace is stripped.
        t: Sampling temperature. Values at or below 0.01 disable sampling.

    Returns:
        str: the decoded first output sequence, or a message when the prompt
        is empty, no GPU is available, or *m* is not a known model key.
    """
    if not p or not p.strip():
        return "Empty prompt."
    if not gpu_ok():
        return "ERROR: No GPU. Personal accounts need PRO for ZeroGPU. Use MCP client for 25min/day free, or run hf jobs for pay-as-you-go GPU. https://huggingface.co/mcp"
    if m not in SPECS:
        return f"ERROR: Unknown model {m!r}."

    # Heavy dependencies are imported only after the GPU check has passed.
    # `spaces` is not referenced below; it is kept (and kept ahead of torch)
    # because the original imported it here — presumably for ZeroGPU setup,
    # TODO confirm.
    import spaces  # noqa: F401
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    hid, _ = SPECS[m]

    # 4-bit NF4 quantization is best-effort: if the config cannot be built,
    # fall back to plain bfloat16 weights instead of failing the request.
    try:
        bnb = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    except Exception:
        bnb = None

    # SECURITY NOTE(review): trust_remote_code=True executes Python shipped in
    # the model repository. Tolerable only because `hid` comes from the fixed
    # SPECS table; revisit if model ids ever become user-supplied.
    tok = AutoTokenizer.from_pretrained(hid, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    kw = {
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
        "torch_dtype": torch.bfloat16,
        # Always place the model on the available accelerator. Previously this
        # was only set when quantization was configured, so the fallback path
        # left the model on CPU even though a GPU had been detected.
        "device_map": "auto",
    }
    if bnb is not None:
        kw["quantization_config"] = bnb
    mod = AutoModelForCausalLM.from_pretrained(hid, **kw)

    msgs = [{"role": "user", "content": p.strip()}]
    txt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    # Move the encoded prompt to whichever device holds the model's weights.
    device = next(mod.parameters()).device
    inp = tok(txt, return_tensors="pt", truncation=True, max_length=16384).to(device)
    out = mod.generate(
        **inp,
        max_new_tokens=512,
        do_sample=t > 0.01,
        temperature=max(t, 1e-5),  # clamped so a slider value of 0 stays positive
        top_p=0.95,
        top_k=40,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )
    # NOTE(review): the whole of out[0] is decoded; for causal LMs this likely
    # includes the prompt tokens as well as the completion — confirm intended.
    return tok.decode(out[0], skip_special_tokens=True)
|
| 34 |
|
| 35 |
+
# Build the single-page Gradio UI: a status banner, the model/prompt/temperature
# inputs, an output box, and footer notes about MCP and `hf jobs`.
with gr.Blocks(title="NEXUS LAB v3.4") as demo:
    gr.Markdown("# NEXUS LAB v3.4")

    # GPU availability is probed once, while the page is being constructed.
    if gpu_ok():
        _status_label = "GPU Ready"
    else:
        _status_label = "CPU Only — No GPU detected"
    gr.Markdown("**Status:** " + _status_label)

    m = gr.Dropdown(choices=list(SPECS), value="llama3.2-1b", label="Model")
    p = gr.Textbox(label="Prompt", lines=3)
    t = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
    o = gr.Textbox(label="Output", lines=15)

    def wrapped(m, p, t):
        """Click handler: run gen() when a GPU is present, else return a hint."""
        if gpu_ok():
            return gen(m, p, t)
        return "ERROR: No GPU. Enable ZeroGPU (PRO), use MCP for 25min/day free: https://huggingface.co/mcp, or run hf jobs."

    gr.Button("Generate", variant="primary").click(
        fn=wrapped,
        inputs=[m, p, t],
        outputs=o,
    )

    gr.Markdown("---")
    gr.Markdown("MCP: `/gradio_api/mcp/sse` — Use MCP client for free GPU via agents.md")
    gr.Markdown("Jobs: `hf jobs run --flavor gpu-t4-small python train.py` — Pay-as-you-go, no PRO needed")

# Start the app with Gradio's MCP server enabled.
demo.launch(mcp_server=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|