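# ZeroGPU Gradio chat demo for Qwen3.6-27B: NF4 4-bit loading, optional
# thinking mode, and token-by-token streaming output.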
import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
MODEL_ID = "Qwen/Qwen3.6-27B"
TITLE = "Qwen3.6-27B Zero"
SUBTITLE = "Text-only Qwen3.6 deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
DESCRIPTION = (
    "Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
    "and shorter default generation lengths for better queue behavior."
)
SYSTEM_PROMPT = (
    "You are Qwen3.6-27B, a highly capable assistant for coding, research, and long-form reasoning. "
    "Be clear, accurate, and useful."
)
PLACEHOLDER = (
    "Ask for code, debugging, planning, long-form answers, or agentic workflows. "
    "Thinking mode is enabled by default."
)

MAX_INPUT_TOKENS = 16384
DEFAULT_MAX_NEW_TOKENS = 4096
MAX_NEW_TOKENS = 8192

HF_TOKEN = os.environ.get("HF_TOKEN")
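# Reduce CUDA allocator fragmentation across requests and allow TF32 matmuls
# for extra throughput on Ampere-class and newer GPUs.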
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
torch.backends.cuda.matmul.allow_tf32 = True
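# NF4 4-bit quantization with double quantization shrinks the weights to
# roughly 4 bits per parameter so a 27B-class model fits in ZeroGPU memory;
# compute still runs in bfloat16.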
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
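# Place all shards on GPU 0. SDPA attention avoids a flash-attn build
# dependency, and low_cpu_mem_usage limits host-RAM spikes while loading.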
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
    device_map={"": 0},
    dtype=torch.bfloat16,
    quantization_config=BNB_CONFIG,
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)
model.eval()
def model_input_device():
    return next(model.parameters()).device
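# ZeroGPU supports dynamic durations: spaces.GPU accepts a callable with the
# same signature as the wrapped function and uses its return value, in
# seconds, as the requested GPU reservation.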
def estimate_duration(
    message,
    history,
    system_prompt,
    enable_thinking,
    preserve_thinking,
    temperature,
    max_new_tokens,
    top_p,
    top_k,
    repetition_penalty,
):
    del message, history, system_prompt, enable_thinking, preserve_thinking, temperature, top_p, top_k, repetition_penalty
    return min(240, max(90, 60 + int(max_new_tokens / 64)))
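# History arrives as (user, assistant) pairs (tuple-format Chatbot); only the
# last 8 exchanges are kept so the rendered prompt stays short and truncation
# against MAX_INPUT_TOKENS rarely triggers.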
def build_messages(history, message, system_prompt):
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    trimmed_history = history[-8:]
    for user_text, assistant_text in trimmed_history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    return messages
@spaces.GPU(duration=estimate_duration)
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    enable_thinking: bool,
    preserve_thinking: bool,
    temperature: float,
    max_new_tokens: int,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    messages = build_messages(history, message, system_prompt)
    # apply_chat_template forwards extra keyword arguments to the chat
    # template, so the thinking flags are passed directly rather than wrapped
    # in a chat_template_kwargs dict, which the tokenizer would not unpack.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
        preserve_thinking=preserve_thinking,
    )
    inputs = tokenizer(
        rendered_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    ).to(model_input_device())
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=120.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
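    # model.generate blocks until completion, so it runs on a worker thread
    # while TextIteratorStreamer yields decoded chunks back to the UI.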
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=max(temperature, 1e-5),
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        use_cache=True,
    )
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()
    output = ""
    for chunk in streamer:
        output += chunk
        yield output
CSS = """
.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
.title h1 { text-align: center; margin-bottom: 0.2rem !important; }
.subtitle p, .meta p { text-align: center; }
.meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
.duplicate-button { margin: 0 auto 14px auto !important; }
"""
chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.Markdown(f"# {TITLE}", elem_classes="title")
    gr.Markdown(SUBTITLE, elem_classes="subtitle")
    gr.Markdown(
        f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
        elem_classes="meta",
    )
    gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
            gr.Checkbox(value=True, label="Enable thinking", render=False),
            gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
            gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
            gr.Slider(
                minimum=1024,
                maximum=MAX_NEW_TOKENS,
                step=512,
                value=DEFAULT_MAX_NEW_TOKENS,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
            gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
            gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
        ],
        examples=[
            ["Design a production-ready architecture for a SaaS analytics platform with clear tradeoffs."],
            ["Write a detailed debugging plan for a flaky async Python test suite."],
            ["Build a responsive landing page in React and Tailwind for a premium AI coding product."],
            ["Refactor this idea into a clear engineering plan: multi-tenant background job processing with retries and observability."],
        ],
        cache_examples=False,
    )
if __name__ == "__main__":
    demo.launch()