Initial ZeroGPU Qwen3.6-27B Space
- README.md +13 -6
- __pycache__/app.cpython-314.pyc +0 -0
- app.py +196 -0
- requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,19 @@
 ---
-title: Qwen3.6
-emoji:
-colorFrom:
-colorTo:
+title: Qwen3.6-27B Zero
+emoji: 🧠
+colorFrom: gray
+colorTo: purple
 sdk: gradio
-sdk_version: 6.
+sdk_version: 6.11.0
 app_file: app.py
 pinned: false
 ---
 
-
+Text-only ZeroGPU Space for `Qwen/Qwen3.6-27B`.
+
+Notes:
+- Built for ZeroGPU with `@spaces.GPU`
+- Uses 4-bit NF4 quantization to reduce memory pressure
+- Keeps the UI text-only, since the Qwen model card explicitly recommends text-only deployment to save memory and leave more room for the KV cache
+- Exposes the Qwen3.6 thinking controls through `enable_thinking` and `preserve_thinking`
+- Defaults to shorter generation lengths than the model card recommends, so the Space behaves better in shared ZeroGPU queues
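The quantization note above is easy to sanity-check with back-of-envelope arithmetic. This is a rough weight-only sketch (the ~27B parameter count is read off the model name, and it ignores KV cache, activations, and quantization overhead):

params = 27e9                                 # ~27B parameters, illustrative only
print(f"bf16 ≈ {params * 2 / 1e9:.0f} GB")    # 2 bytes/param  -> ~54 GB of weights
print(f"NF4  ≈ {params * 0.5 / 1e9:.1f} GB")  # 4 bits/param   -> ~13.5 GB before overhead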
__pycache__/app.cpython-314.pyc ADDED
Binary file (7.92 kB).
app.py ADDED
import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForImageTextToText,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

MODEL_ID = "Qwen/Qwen3.6-27B"
TITLE = "Qwen3.6-27B Zero"
SUBTITLE = "Text-only Qwen3.6 deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
DESCRIPTION = (
    "Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
    "and shorter default generation lengths for better queue behavior."
)
SYSTEM_PROMPT = (
    "You are Qwen3.6-27B, a highly capable assistant for coding, research, and long-form reasoning. "
    "Be clear, accurate, and useful."
)
PLACEHOLDER = (
    "Ask for code, debugging, planning, long-form answers, or agentic workflows. "
    "Thinking mode is enabled by default."
)
MAX_INPUT_TOKENS = 16384
DEFAULT_MAX_NEW_TOKENS = 4096
MAX_NEW_TOKENS = 8192

# Reduce allocator fragmentation on the shared ZeroGPU pool and allow TF32 matmuls.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
torch.backends.cuda.matmul.allow_tf32 = True

# NF4 4-bit weights with double quantization; compute runs in bfloat16.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The checkpoint registers as image-text-to-text, so it is loaded through that
# Auto class, but this Space only ever feeds it text (see README).
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=BNB_CONFIG,
    attn_implementation="sdpa",
)
model.eval()


def estimate_duration(
    message,
    history,
    system_prompt,
    enable_thinking,
    preserve_thinking,
    temperature,
    max_new_tokens,
    top_p,
    top_k,
    repetition_penalty,
):
    """Request ZeroGPU time proportional to the generation budget, clamped to 90-240 s."""
    del message, history, system_prompt, enable_thinking, preserve_thinking, temperature, top_p, top_k, repetition_penalty
    return min(240, max(90, 60 + int(max_new_tokens / 64)))


def build_messages(history, message, system_prompt):
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    # Keep only the most recent turns so the rendered prompt stays bounded.
    for turn in history[-8:]:
        # Gradio may deliver history either as role/content dicts (messages
        # format) or as (user, assistant) pairs, depending on version and config.
        if isinstance(turn, dict):
            if turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
            continue
        user_text, assistant_text = turn
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU(duration=estimate_duration, size="large")
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    enable_thinking: bool,
    preserve_thinking: bool,
    temperature: float,
    max_new_tokens: int,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    messages = build_messages(history, message, system_prompt)
    # Extra keyword arguments to apply_chat_template are forwarded into the chat
    # template itself; that is how the Qwen templates receive their thinking switches.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
        preserve_thinking=preserve_thinking,
    )
    inputs = tokenizer(
        rendered_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=120.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=max(temperature, 1e-5),
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        use_cache=True,
    )

    # Generate on a worker thread; the streamer yields decoded text as it arrives.
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()

    output = ""
    for chunk in streamer:
        output += chunk
        yield output
    worker.join()


CSS = """
.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
.title h1 { text-align: center; margin-bottom: 0.2rem !important; }
.subtitle p, .meta p { text-align: center; }
.meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
.duplicate-button { margin: 0 auto 14px auto !important; }
"""

chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.Markdown(f"# {TITLE}", elem_classes="title")
    gr.Markdown(SUBTITLE, elem_classes="subtitle")
    gr.Markdown(
        f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
        elem_classes="meta",
    )
    gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
            gr.Checkbox(value=True, label="Enable thinking", render=False),
            gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
            gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
            gr.Slider(
                minimum=1024,
                maximum=MAX_NEW_TOKENS,
                step=512,
                value=DEFAULT_MAX_NEW_TOKENS,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
            gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
            gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
        ],
        examples=[
            ["Design a production-ready architecture for a SaaS analytics platform with clear tradeoffs."],
            ["Write a detailed debugging plan for a flaky async Python test suite."],
            ["Build a responsive landing page in React and Tailwind for a premium AI coding product."],
            ["Refactor this idea into a clear engineering plan: multi-tenant background job processing with retries and observability."],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()
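For a feel of what the dynamic duration hook above actually requests, here are its worked values at a few slider settings (a straight evaluation of the formula in `estimate_duration`, nothing Space-specific):

for n in (1024, 4096, 8192):
    print(n, min(240, max(90, 60 + int(n / 64))))
# 1024 -> 90 (the 90 s floor applies, since 60 + 16 = 76), 4096 -> 124, 8192 -> 188.
# The 240 s ceiling is never reached within the slider's 1024-8192 range.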
requirements.txt ADDED
gradio>=6.11.0
spaces>=0.41.0
torch==2.8.0
transformers>=4.57.1
accelerate>=1.10.0
bitsandbytes>=0.48.1
sentencepiece>=0.2.0
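For local testing, `pip install -r requirements.txt` followed by `python app.py` should bring up the UI on a CUDA machine; the `spaces` decorators are designed to be inert outside a ZeroGPU Space, though that behavior is worth verifying against the pinned `spaces` version.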