specimba committed on
Commit
8eb9365
·
verified ·
1 Parent(s): a98156b

v3.4 safe mode: lazy import transformers only when GPU available

Browse files
Files changed (1) hide show
  1. app.py +45 -54
app.py CHANGED
@@ -1,59 +1,50 @@
1
- import os, spaces, torch, gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
3
 
4
# Model registry: dropdown key -> (Hugging Face repo id, size figure, access label).
# NOTE(review): the float is unpacked as `vram` by gen(); presumably an
# approximate memory footprint in GB — confirm.
SPECS = {
    'qwen2.5-0.5b': (
        'Qwen/Qwen2.5-0.5B-Instruct',
        1.0,
        'public',
    ),
    'llama3.2-1b': (
        'meta-llama/Llama-3.2-1B-Instruct',
        1.5,
        'public',
    ),
    'deepseek-r1-1.5b': (
        'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
        2.2,
        'public',
    ),
    'gemma3-4b': (
        'google/gemma-3-4b-it',
        6.0,
        'public',
    ),
}
10
 
11
# Quantization settings shared by every model load: 4-bit NF4 weights with
# double quantization, computing in bfloat16.
BNB = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Loaded (model, tokenizer) pairs keyed by repo id; filled in by load_kv().
_cache = dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
def load_kv(hid):
    """Return the ``(model, tokenizer)`` pair for repo id *hid*.

    The pair is loaded on first request and stored in the module-level
    ``_cache`` so later calls with the same *hid* reuse it.

    Args:
        hid: Hugging Face repo id passed to ``from_pretrained``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    cached_pair = _cache.get(hid)
    if cached_pair is not None:
        return cached_pair

    tokenizer = AutoTokenizer.from_pretrained(hid, trust_remote_code=True)
    # generate() is later given pad_token_id, so make sure one exists.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    load_options = dict(
        quantization_config=BNB,
        device_map='auto',
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        attn_implementation='sdpa',
    )
    model = AutoModelForCausalLM.from_pretrained(hid, **load_options)

    pair = (model, tokenizer)
    _cache[hid] = pair
    return pair
 
 
30
 
31
@spaces.GPU(duration=lambda *a: 180, size='large')
def gen(m, p, t, top_p):
    """Generate text for prompt *p* with the model registered under key *m*.

    Args:
        m: Key into ``SPECS`` selecting the model.
        p: User prompt; blank or whitespace-only prompts are rejected.
        t: Temperature; sampling is enabled only when ``t > 0.01``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.

    Returns:
        str: The decoded first output sequence with special tokens skipped,
        or ``'Empty prompt.'`` when there is nothing to send.
    """
    prompt = p.strip() if p else ''
    if not prompt:
        return 'Empty prompt.'

    repo_id, _vram, _status = SPECS[m]
    model, tokenizer = load_kv(repo_id)

    messages = [{'role': 'user', 'content': prompt}]
    chat_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer(
        chat_text, return_tensors='pt', truncation=True, max_length=16384
    )
    # Inputs must live on the same device as the model weights.
    model_device = next(model.parameters()).device
    inputs = encoded.to(model_device)

    generation_options = {
        'max_new_tokens': 512,
        'do_sample': t > 0.01,
        # Clamp so a temperature of exactly 0 is never passed through.
        'temperature': max(t, 1e-5),
        'top_p': top_p,
        'top_k': 40,
        'repetition_penalty': 1.0,
        'use_cache': True,
        'pad_token_id': tokenizer.pad_token_id,
        'eos_token_id': tokenizer.eos_token_id,
    }
    sequences = model.generate(**inputs, **generation_options)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)
48
-
49
# Gradio UI: model picker, prompt box and sampling sliders wired to gen().
with gr.Blocks(title='NEXUS LAB v3.3 - ZeroGPU') as demo:
    gr.Markdown('## NEXUS LAB v3.3 - ZeroGPU Inference')
    gr.Markdown('Enable ZeroGPU in Settings > Hardware. First load may take 1-2 minutes.')

    # Inputs, in the order gen() expects them.
    m = gr.Dropdown(choices=list(SPECS.keys()), value='llama3.2-1b', label='Model')
    p = gr.Textbox(label='Prompt', lines=3, value='Write a purple team strategy.')
    t = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label='Temperature')
    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label='Top-p')

    o = gr.Textbox(label='Output', lines=15)

    generate_button = gr.Button('Generate', variant='primary')
    generate_button.click(gen, [m, p, t, top_p], o)

    gr.Markdown('MCP: /gradio_api/mcp/sse')

demo.queue()
 
1
+ import os, json
2
+ import gradio as gr
3
 
4
# Model registry: dropdown key -> (Hugging Face repo id, size figure).
# NOTE(review): the float is unpacked as `vram` by gen(); presumably an
# approximate memory footprint in GB — confirm.
SAFE = {
    "qwen2.5-0.5b": ("Qwen/Qwen2.5-0.5B-Instruct", 1.0),
    "llama3.2-1b": ("meta-llama/Llama-3.2-1B-Instruct", 1.5),
}

# Full registry: the SAFE entries first, then the larger models.
SPECS = dict(SAFE)
SPECS.update(
    {
        "deepseek-r1-1.5b": ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", 2.2),
        "gemma3-4b": ("google/gemma-3-4b-it", 6.0),
    }
)
 
 
 
 
6
 
7
def gpu_ok():
    """Report whether torch can see a CUDA device.

    torch is imported inside the function so that merely importing this
    module does not require torch to be installed.

    Returns:
        bool: The result of ``torch.cuda.is_available()``, or ``False`` when
        torch cannot be imported or the probe itself raises.
    """
    try:
        import torch

        return torch.cuda.is_available()
    except Exception:
        # Deliberately broad (a broken install can raise more than
        # ImportError), but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
12
 
13
def gen(m, p, t):
    """Generate text for prompt *p* with the model registered under key *m*.

    torch, transformers and spaces are imported here, after the GPU check,
    so the app can start on hosts where no GPU is available.

    Args:
        m: Key into ``SPECS`` selecting the model.
        p: User prompt; blank or whitespace-only prompts are rejected.
        t: Temperature; sampling is enabled only when ``t > 0.01``.

    Returns:
        str: The decoded first output sequence with special tokens skipped,
        an ``ERROR: ...`` message when no GPU is available, or
        ``"Empty prompt."`` when there is nothing to send.

    Raises:
        KeyError: If *m* is not a key of ``SPECS``.
    """
    if not gpu_ok():
        return "ERROR: No GPU. Personal accounts need PRO for ZeroGPU. Use MCP client for 25min/day free, or run hf jobs for pay-as-you-go GPU. https://huggingface.co/mcp"

    # Reject blank prompts before paying for a model load.
    prompt = p.strip() if p else ""
    if not prompt:
        return "Empty prompt."

    # NOTE(review): `spaces` is not referenced below; presumably imported for
    # its import-time side effects — confirm before removing.
    import spaces  # noqa: F401
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    hid, _vram = SPECS[m]

    # Best effort: use 4-bit NF4 quantization when the config can be built,
    # otherwise fall back to plain bfloat16 weights.
    try:
        bnb = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    except Exception:
        bnb = None

    tok = AutoTokenizer.from_pretrained(hid, trust_remote_code=True)
    # generate() is given pad_token_id below, so make sure one exists.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # device_map is set unconditionally: this point is only reached when a GPU
    # is available, so the unquantized fallback must not stay on the CPU.
    kw = {
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    if bnb is not None:
        kw["quantization_config"] = bnb
    # NOTE(review): the model is reloaded on every call; consider caching the
    # (model, tokenizer) pair per repo id if repeated loads are too slow.
    mod = AutoModelForCausalLM.from_pretrained(hid, **kw)

    msgs = [{"role": "user", "content": prompt}]
    txt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    # Inputs must live on the same device as the model weights.
    inp = tok(txt, return_tensors="pt", truncation=True, max_length=16384).to(
        next(mod.parameters()).device
    )
    out = mod.generate(
        **inp,
        max_new_tokens=512,
        do_sample=t > 0.01,
        # Clamp so a temperature of exactly 0 is never passed through.
        temperature=max(t, 1e-5),
        top_p=0.95,
        top_k=40,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )
    return tok.decode(out[0], skip_special_tokens=True)
34
 
35
# Gradio UI: status banner, model picker, prompt box and temperature slider
# wired to gen() through a GPU-availability guard.
with gr.Blocks(title="NEXUS LAB v3.4") as demo:
    gr.Markdown("# NEXUS LAB v3.4")
    # The status line is computed once, while the UI is being built.
    status_text = "GPU Ready" if gpu_ok() else "CPU Only — No GPU detected"
    gr.Markdown("**Status:** " + status_text)

    m = gr.Dropdown(choices=list(SPECS.keys()), value="llama3.2-1b", label="Model")
    p = gr.Textbox(label="Prompt", lines=3)
    t = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
    o = gr.Textbox(label="Output", lines=15)

    # Click handler: answer with a help message when no GPU is available,
    # otherwise delegate to gen().
    # NOTE(review): documented with comments rather than a docstring because
    # the app launches with mcp_server=True and Gradio may expose a handler's
    # docstring as its tool description — confirm before adding one.
    def wrapped(m, p, t):
        if gpu_ok():
            return gen(m, p, t)
        return "ERROR: No GPU. Enable ZeroGPU (PRO), use MCP for 25min/day free: https://huggingface.co/mcp, or run hf jobs."

    generate_button = gr.Button("Generate", variant="primary")
    generate_button.click(wrapped, [m, p, t], o)

    gr.Markdown("---")
    gr.Markdown("MCP: `/gradio_api/mcp/sse` — Use MCP client for free GPU via agents.md")
    gr.Markdown("Jobs: `hf jobs run --flavor gpu-t4-small python train.py` — Pay-as-you-go, no PRO needed")

demo.launch(mcp_server=True)