"""ZeroGPU Gradio chat app: 4-bit NF4 loading, thinking controls, and streamed generation."""

import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

MODEL_ID = "Qwen/Qwen3.6-27B"
TITLE = "Qwen3.6-27B Zero"
SUBTITLE = "Text-only Qwen3.6 deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
DESCRIPTION = (
    "Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
    "and shorter default generation lengths for better queue behavior."
)
SYSTEM_PROMPT = (
    "You are Qwen3.6-27B, a highly capable assistant for coding, research, and long-form reasoning. "
    "Be clear, accurate, and useful."
)
PLACEHOLDER = (
    "Ask for code, debugging, planning, long-form answers, or agentic workflows. "
    "Thinking mode is enabled by default."
)

# Context and generation budgets, kept modest for better ZeroGPU queue behavior.
MAX_INPUT_TOKENS = 16384
DEFAULT_MAX_NEW_TOKENS = 4096
MAX_NEW_TOKENS = 8192

HF_TOKEN = os.environ.get("HF_TOKEN")

# Allocator and matmul settings that reduce fragmentation and improve throughput on the shared GPU.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
torch.backends.cuda.matmul.allow_tf32 = True

# NF4 4-bit quantization with bf16 compute keeps the 27B model within ZeroGPU memory limits.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
    device_map={"": 0},
    dtype=torch.bfloat16,
    quantization_config=BNB_CONFIG,
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)
model.eval()


def model_input_device():
    """Return the device that model inputs should be moved to."""
    return next(model.parameters()).device


def estimate_duration(
    message,
    history,
    system_prompt,
    enable_thinking,
    preserve_thinking,
    temperature,
    max_new_tokens,
    top_p,
    top_k,
    repetition_penalty,
):
    """Estimate the ZeroGPU allocation (in seconds) from the requested generation length."""
    del message, history, system_prompt, enable_thinking, preserve_thinking
    del temperature, top_p, top_k, repetition_penalty
    return min(240, max(90, 60 + int(max_new_tokens / 64)))


def build_messages(history, message, system_prompt):
    """Convert the tuple-style Gradio history into a chat-format message list."""
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    # Keep only the most recent turns so the rendered prompt stays within the bounded context.
    trimmed_history = history[-8:]
    for user_text, assistant_text in trimmed_history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU(duration=estimate_duration, size="large")
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    enable_thinking: bool,
    preserve_thinking: bool,
    temperature: float,
    max_new_tokens: int,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    messages = build_messages(history, message, system_prompt)
    # Thinking-mode switches are passed straight through to the chat template,
    # which is how Qwen templates expect flags such as `enable_thinking`.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
        preserve_thinking=preserve_thinking,
    )
    inputs = tokenizer(
        rendered_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    ).to(model_input_device())
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=120.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=max(temperature, 1e-5),
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    # Run generation in a background thread and stream partial output to the UI.
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()
    output = ""
    for chunk in streamer:
        output += chunk
        yield output


CSS = """
.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
.title h1 { text-align: center; margin-bottom: 0.2rem !important; }
.subtitle p, .meta p { text-align: center; }
.meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
.duplicate-button { margin: 0 auto 14px auto !important; }
"""

chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.Markdown(f"# {TITLE}", elem_classes="title")
    gr.Markdown(SUBTITLE, elem_classes="subtitle")
    gr.Markdown(
        f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
        elem_classes="meta",
    )
    gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
            gr.Checkbox(value=True, label="Enable thinking", render=False),
            gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
            gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
            gr.Slider(
                minimum=1024,
                maximum=MAX_NEW_TOKENS,
                step=512,
                value=DEFAULT_MAX_NEW_TOKENS,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
            gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
            gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
        ],
        examples=[
            ["Design a production-ready architecture for a SaaS analytics platform with clear tradeoffs."],
            ["Write a detailed debugging plan for a flaky async Python test suite."],
            ["Build a responsive landing page in React and Tailwind for a premium AI coding product."],
            ["Refactor this idea into a clear engineering plan: multi-tenant background job processing with retries and observability."],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()