Spaces:
Running
Running
| import os, time, json | |
| import spaces, torch, gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer | |
| from threading import Thread | |
# Selectable models, one row per entry: (UI key, Hugging Face repo id, size figure).
# The third field is unpacked under the name `vram` by load_model but is not
# otherwise used in this file - presumably an approximate VRAM footprint in GB;
# TODO confirm the unit.
_SPEC_ROWS = (
    ("llama3.2-1b", "meta-llama/Llama-3.2-1B-Instruct", 1.5),
    ("qwen2.5-0.5b", "Qwen/Qwen2.5-0.5B-Instruct", 1.0),
    ("deepseek-r1-1.5b", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", 2.2),
    ("gemma3-4b", "google/gemma-3-4b-it", 6.0),
)

# Mapping of UI key -> (repo id, size figure); insertion order drives the dropdowns.
SPECS = {key: (repo_id, size) for key, repo_id, size in _SPEC_ROWS}
# Quantization settings passed to every model load in load_model.
BNB = BitsAndBytesConfig(
    load_in_4bit=True,                      # load weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # NF4 quantization data type
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the matmuls in bfloat16
)
def load_model(spec_id):
    """Load the quantized model and tokenizer registered under ``spec_id``.

    Args:
        spec_id: Key into ``SPECS``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        KeyError: If ``spec_id`` is not a key of ``SPECS``.
    """
    # The second field of the spec entry is not needed for loading.
    repo_id, _vram = SPECS[spec_id]

    # NOTE(review): trust_remote_code=True lets the hub repo ship executable
    # code; confirm it is actually required for the models listed in SPECS.
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # generate() forwards tok.pad_token_id, so fall back to EOS when unset.
        tokenizer.pad_token = tokenizer.eos_token

    model_options = {
        "trust_remote_code": True,
        "quantization_config": BNB,
        "device_map": "auto",
        "torch_dtype": torch.bfloat16,
        "low_cpu_mem_usage": True,
    }
    model = AutoModelForCausalLM.from_pretrained(repo_id, **model_options)
    return model, tokenizer
# Module-level cache: spec id -> (model, tokenizer), filled on demand by get_model.
_model_cache = {}


def get_model(spec_id):
    """Return the ``(model, tokenizer)`` pair for ``spec_id``, loading it on first use.

    Args:
        spec_id: Key identifying the model; forwarded to ``load_model`` on a miss.

    Returns:
        The cached ``(model, tokenizer)`` tuple.
    """
    cached = _model_cache.get(spec_id)
    if cached is None:
        # First request for this spec: load once and remember the result.
        cached = _model_cache[spec_id] = load_model(spec_id)
    return cached
# NOTE(review): this file imports `spaces` but no @spaces.GPU decorator is
# visible; confirm whether this handler needs one on ZeroGPU hardware.
def generate(spec_id, prompt, system, temp, top_p, max_tokens):
    """Stream a chat completion, yielding the reply text accumulated so far.

    Args:
        spec_id: Key into ``SPECS`` selecting which model to use.
        prompt: The user message.
        system: Optional system message; omitted from the chat when empty.
        temp: Sampling temperature. Values at or below 0.01 disable sampling.
        top_p: Nucleus-sampling threshold forwarded to the model.
        max_tokens: Maximum number of new tokens to generate.

    Yields:
        The reply decoded so far; each yielded string extends the previous one.

    Raises:
        Exception: Any error raised by ``model.generate`` in the worker thread
            is re-raised here once the stream has been drained.
    """
    mod, tok = get_model(spec_id)

    msgs = []
    if system:
        msgs.append({"role": "system", "content": system})
    msgs.append({"role": "user", "content": prompt})
    txt = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    # The rendered template already contains the special tokens the model
    # expects, so do not let the tokenizer prepend a second BOS.
    device = next(mod.parameters()).device
    inp = tok(
        txt,
        return_tensors="pt",
        truncation=True,
        max_length=8192,
        add_special_tokens=False,
    ).to(device)

    s = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    kw = dict(
        **inp,
        streamer=s,
        # UI sliders / API clients may deliver a float; generate needs an int.
        max_new_tokens=int(max_tokens),
        do_sample=temp > 0.01,
        temperature=max(temp, 1e-5),
        top_p=top_p,
        pad_token_id=tok.pad_token_id,
        eos_token_id=tok.eos_token_id,
    )

    failures = []

    def _run_generation():
        """Run model.generate; on failure record the error and close the stream."""
        try:
            mod.generate(**kw)
        except Exception as exc:  # thread boundary: re-raised by the consumer below
            failures.append(exc)
            # Without this the consumer loop would wait on the streamer forever.
            s.end()

    worker = Thread(target=_run_generation)
    worker.start()

    raw = ""
    for chunk in s:
        raw += chunk
        yield raw

    worker.join()
    if failures:
        raise failures[0]
def temp_sweep(spec_id, prompt, temp_range, top_p, max_tokens):
    """Run the same prompt at several temperatures and return a combined report.

    Args:
        spec_id: Key into ``SPECS`` selecting which model to use.
        prompt: The user message; no system message is sent.
        temp_range: Comma-separated temperatures, e.g. ``"0.0,0.3,0.6"``.
            Blank entries are ignored.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        max_tokens: Maximum number of new tokens per completion.

    Returns:
        One ``--- T=x.xx ---`` section per temperature, joined by newlines.

    Raises:
        ValueError: If an entry of ``temp_range`` is not a valid float.
    """
    # Parse everything up front so a malformed entry fails before any generation.
    fields = (field.strip() for field in temp_range.split(","))
    temperatures = [float(field) for field in fields if field]

    sections = []
    for temperature in temperatures:
        final_text = ""
        # generate() yields cumulative text; only the last value is needed.
        for final_text in generate(spec_id, prompt, "", temperature, top_p, max_tokens):
            pass
        sections.append(f"--- T={temperature:.2f} ---\n{final_text.strip()}\n")
    return "\n".join(sections)
# Gradio front end: a "Chat" tab wired to generate and a "Temp Sweep" tab wired
# to temp_sweep, followed by a footer line.
# NOTE(review): block nesting was reconstructed from a flattened paste - confirm
# that the footer Markdown belongs outside the tabs and launch() at top level.
with gr.Blocks(title="NEXUS LAB v3.5") as demo:
    gr.Markdown("# NEXUS LAB v3.5 - ZeroGPU Team Edition")

    with gr.Tab("Chat"):
        m = gr.Dropdown(label="Model", choices=list(SPECS), value="llama3.2-1b")
        p = gr.Textbox(
            label="Prompt",
            lines=3,
            value="Explain the NEXUS OS TICOS guard system.",
        )
        sy = gr.Textbox(label="System", value="Think step by step.")
        t = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
        pp = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
        mt = gr.Slider(minimum=64, maximum=2048, value=1024, step=64, label="Max Tokens")
        o = gr.Textbox(label="Output", lines=20)
        chat_button = gr.Button("Generate", variant="primary")
        # generate is a generator, so the output box updates as chunks arrive.
        chat_button.click(fn=generate, inputs=[m, p, sy, t, pp, mt], outputs=o)

    with gr.Tab("Temp Sweep"):
        m2 = gr.Dropdown(label="Model", choices=list(SPECS), value="llama3.2-1b")
        p2 = gr.Textbox(label="Prompt")
        tr = gr.Textbox(label="Temps", value="0.0,0.3,0.6,0.9,1.2")
        pp2 = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
        mt2 = gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Max Tokens")
        o2 = gr.Textbox(label="Results", lines=20)
        sweep_button = gr.Button("Run Sweep", variant="primary")
        sweep_button.click(fn=temp_sweep, inputs=[m2, p2, tr, pp2, mt2], outputs=o2)

    gr.Markdown("---")
    gr.Markdown("MCP: /gradio_api/mcp/sse | ZeroGPU Team 40min/day | Bucket: specimba/nexus-vap-bucket")

# Start the app with the MCP server enabled.
demo.launch(mcp_server=True)