import os
from threading import Thread
import gradio as gr
import spaces
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TextIteratorStreamer,
)
MODEL_ID = "Qwen/Qwen3.6-27B"
TITLE = "Qwen3.6-27B Zero"
SUBTITLE = "Text-only Qwen3.6 deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
DESCRIPTION = (
"Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
"and shorter default generation lengths for better queue behavior."
)
SYSTEM_PROMPT = (
"You are Qwen3.6-27B, a highly capable assistant for coding, research, and long-form reasoning. "
"Be clear, accurate, and useful."
)
PLACEHOLDER = (
"Ask for code, debugging, planning, long-form answers, or agentic workflows. "
"Thinking mode is enabled by default."
)
MAX_INPUT_TOKENS = 16384
DEFAULT_MAX_NEW_TOKENS = 4096
MAX_NEW_TOKENS = 8192
HF_TOKEN = os.environ.get("HF_TOKEN")
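# Reduce CUDA allocator fragmentation via expandable segments and allow TF32 matmuls on Ampere+ GPUs.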
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
torch.backends.cuda.matmul.allow_tf32 = True
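# NF4 4-bit weight quantization with nested (double) quantization; compute runs in bfloat16.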
BNB_CONFIG = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
)
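# Load the tokenizer; fall back to the EOS token as pad token when the checkpoint defines none.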
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
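# Load the causal LM in 4-bit onto GPU 0 with SDPA attention; low_cpu_mem_usage avoids
# materializing the full checkpoint in host RAM during loading.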
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
trust_remote_code=True,
token=HF_TOKEN,
device_map={"": 0},
dtype=torch.bfloat16,
quantization_config=BNB_CONFIG,
attn_implementation="sdpa",
low_cpu_mem_usage=True,
)
model.eval()
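# With device_map-based loading, inputs must be moved to the same device as the model's parameters.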
def model_input_device():
return next(model.parameters()).device
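# Rough per-request ZeroGPU duration estimate in seconds, scaled by max_new_tokens and clamped to 90-240 s.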
def estimate_duration(
message,
history,
system_prompt,
enable_thinking,
preserve_thinking,
temperature,
max_new_tokens,
top_p,
top_k,
repetition_penalty,
):
del message, history, system_prompt, enable_thinking, preserve_thinking, temperature, top_p, top_k, repetition_penalty
return min(240, max(90, 60 + int(max_new_tokens / 64)))
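# Convert Gradio tuple-style (user, assistant) history into a chat message list,
# prepending the system prompt and keeping only the last 8 exchanges to bound context.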
def build_messages(history, message, system_prompt):
messages = []
if system_prompt.strip():
messages.append({"role": "system", "content": system_prompt.strip()})
trimmed_history = history[-8:]
for user_text, assistant_text in trimmed_history:
if user_text:
messages.append({"role": "user", "content": user_text})
if assistant_text:
messages.append({"role": "assistant", "content": assistant_text})
messages.append({"role": "user", "content": message})
return messages
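# Streaming chat handler: runs on ZeroGPU, renders the chat template with thinking controls,
# and yields the growing assistant reply as tokens arrive.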
@spaces.GPU(duration=estimate_duration, size="large")
def stream_chat(
message: str,
history: list,
system_prompt: str,
enable_thinking: bool,
preserve_thinking: bool,
temperature: float,
max_new_tokens: int,
top_p: float,
top_k: int,
repetition_penalty: float,
):
messages = build_messages(history, message, system_prompt)
    # Thinking controls are forwarded directly to the chat template as template kwargs.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
        preserve_thinking=preserve_thinking,
    )
inputs = tokenizer(
rendered_prompt,
return_tensors="pt",
truncation=True,
max_length=MAX_INPUT_TOKENS,
).to(model_input_device())
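    # Decode tokens incrementally, skipping the prompt and special tokens;
    # the iterator times out if no new token arrives within 120 s.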
streamer = TextIteratorStreamer(
tokenizer,
timeout=120.0,
skip_prompt=True,
skip_special_tokens=True,
)
generation_kwargs = dict(
**inputs,
streamer=streamer,
max_new_tokens=max_new_tokens,
do_sample=temperature > 0,
temperature=max(temperature, 1e-5),
top_p=top_p,
top_k=top_k,
repetition_penalty=repetition_penalty,
use_cache=True,
)
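    # model.generate blocks until completion, so run it in a worker thread
    # and stream partial text back to the UI from this generator.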
worker = Thread(target=model.generate, kwargs=generation_kwargs)
worker.start()
output = ""
for chunk in streamer:
output += chunk
yield output
CSS = """
.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
.title h1 { text-align: center; margin-bottom: 0.2rem !important; }
.subtitle p, .meta p { text-align: center; }
.meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
.duplicate-button { margin: 0 auto 14px auto !important; }
"""
chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)
with gr.Blocks(css=CSS, theme="soft") as demo:
gr.Markdown(f"# {TITLE}", elem_classes="title")
gr.Markdown(SUBTITLE, elem_classes="subtitle")
gr.Markdown(
f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
elem_classes="meta",
)
gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
gr.ChatInterface(
fn=stream_chat,
chatbot=chatbot,
fill_height=True,
additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
additional_inputs=[
gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
gr.Checkbox(value=True, label="Enable thinking", render=False),
gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
gr.Slider(
minimum=1024,
maximum=MAX_NEW_TOKENS,
step=512,
value=DEFAULT_MAX_NEW_TOKENS,
label="Max new tokens",
render=False,
),
gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
],
examples=[
["Design a production-ready architecture for a SaaS analytics platform with clear tradeoffs."],
["Write a detailed debugging plan for a flaky async Python test suite."],
["Build a responsive landing page in React and Tailwind for a premium AI coding product."],
["Refactor this idea into a clear engineering plan: multi-tenant background job processing with retries and observability."],
],
cache_examples=False,
)
if __name__ == "__main__":
demo.launch()