artificialguybr committed
Commit e8e5451 · verified · Parent: be95cba

Initial ZeroGPU Qwen3.6-27B Space

Files changed (4)
  1. README.md +13 -6
  2. __pycache__/app.cpython-314.pyc +0 -0
  3. app.py +196 -0
  4. requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,19 @@
 ---
-title: Qwen3.6 27B Zero
-emoji: 🐨
-colorFrom: purple
-colorTo: red
+title: Qwen3.6-27B Zero
+emoji: 🧠
+colorFrom: gray
+colorTo: purple
 sdk: gradio
-sdk_version: 6.13.0
+sdk_version: 6.11.0
 app_file: app.py
 pinned: false
 ---

-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Text-only ZeroGPU Space for `Qwen/Qwen3.6-27B`.
+
+Notes:
+- Built for ZeroGPU with `@spaces.GPU`
+- Uses 4-bit NF4 quantization to reduce memory pressure
+- Keeps the UI text-only because the Qwen model card explicitly recommends text-only deployment to save memory and free more KV cache
+- Exposes Qwen3.6 thinking controls through `enable_thinking` and `preserve_thinking`
+- Uses shorter default generation lengths than the model card recommends, so the Space behaves better in shared ZeroGPU queues
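The thinking-control bullets rely on `transformers` forwarding extra keyword arguments from `apply_chat_template` into the chat template. A minimal sketch of that mechanism, assuming the `Qwen/Qwen3.6-27B` repo and its `preserve_thinking` template flag exist as this commit implies:

```python
from transformers import AutoTokenizer

# Hedged sketch: standalone check of the template flags app.py uses below.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3.6-27B", trust_remote_code=True)
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,     # consumed by the Jinja template, not the tokenizer
    preserve_thinking=False,   # assumed flag, per the model card referenced above
)
print(prompt)  # rendered prompt text; no thinking block when thinking is off
```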
__pycache__/app.cpython-314.pyc ADDED
Binary file (7.92 kB).
 
app.py ADDED
@@ -0,0 +1,196 @@
import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForImageTextToText,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

MODEL_ID = "Qwen/Qwen3.6-27B"
TITLE = "Qwen3.6-27B Zero"
SUBTITLE = "Text-only Qwen3.6 deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
DESCRIPTION = (
    "Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
    "and shorter default generation lengths for better queue behavior."
)
SYSTEM_PROMPT = (
    "You are Qwen3.6-27B, a highly capable assistant for coding, research, and long-form reasoning. "
    "Be clear, accurate, and useful."
)
PLACEHOLDER = (
    "Ask for code, debugging, planning, long-form answers, or agentic workflows. "
    "Thinking mode is enabled by default."
)
MAX_INPUT_TOKENS = 16384
DEFAULT_MAX_NEW_TOKENS = 4096
MAX_NEW_TOKENS = 8192

os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
torch.backends.cuda.matmul.allow_tf32 = True

# NF4 4-bit weights with double quantization; compute runs in bfloat16.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Qwen3.6 loads through the multimodal auto class, but this Space drives it
# text-only (see the README note on the model card recommendation).
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=BNB_CONFIG,
    attn_implementation="sdpa",
)
model.eval()


def estimate_duration(
    message,
    history,
    system_prompt,
    enable_thinking,
    preserve_thinking,
    temperature,
    max_new_tokens,
    top_p,
    top_k,
    repetition_penalty,
):
    # Scale the ZeroGPU reservation with the requested output length,
    # clamped to 90-240 s (e.g. max_new_tokens=4096 -> 60 + 64 = 124 s).
    del message, history, system_prompt, enable_thinking, preserve_thinking, temperature, top_p, top_k, repetition_penalty
    return min(240, max(90, 60 + int(max_new_tokens / 64)))


def build_messages(history, message, system_prompt):
    # Gradio 6 ChatInterface passes history as {"role": ..., "content": ...}
    # message dicts; keep only the last 16 messages (8 turns) to bound the prompt.
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})
    for entry in history[-16:]:
        if entry.get("content"):
            messages.append({"role": entry["role"], "content": entry["content"]})
    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU(duration=estimate_duration, size="large")
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    enable_thinking: bool,
    preserve_thinking: bool,
    temperature: float,
    max_new_tokens: int,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    messages = build_messages(history, message, system_prompt)
    # Extra keyword arguments to apply_chat_template are forwarded to the
    # chat template itself; this is how the Qwen template receives its
    # thinking flags.
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
        preserve_thinking=preserve_thinking,
    )
    inputs = tokenizer(
        rendered_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=120.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=max(temperature, 1e-5),
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        use_cache=True,
    )

    # Generate on a worker thread so tokens can be streamed as they arrive.
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()

    output = ""
    for chunk in streamer:
        output += chunk
        yield output
    worker.join()


CSS = """
.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
.title h1 { text-align: center; margin-bottom: 0.2rem !important; }
.subtitle p, .meta p { text-align: center; }
.meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
.duplicate-button { margin: 0 auto 14px auto !important; }
"""

chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.Markdown(f"# {TITLE}", elem_classes="title")
    gr.Markdown(SUBTITLE, elem_classes="subtitle")
    gr.Markdown(
        f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
        elem_classes="meta",
    )
    gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
            gr.Checkbox(value=True, label="Enable thinking", render=False),
            gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
            gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
            gr.Slider(
                minimum=1024,
                maximum=MAX_NEW_TOKENS,
                step=512,
                value=DEFAULT_MAX_NEW_TOKENS,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
            gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
            gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
        ],
        examples=[
            ["Design a production-ready architecture for a SaaS analytics platform with clear tradeoffs."],
            ["Write a detailed debugging plan for a flaky async Python test suite."],
            ["Build a responsive landing page in React and Tailwind for a premium AI coding product."],
            ["Refactor this idea into a clear engineering plan: multi-tenant background job processing with retries and observability."],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()
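Once the Space is running, the `gr.ChatInterface` endpoint can be exercised from Python. A hedged sketch with `gradio_client`, assuming a hypothetical Space id and the default `/chat` API name, with arguments ordered to mirror the additional inputs above:

```python
from gradio_client import Client

# Hypothetical Space id; replace with the real owner/name after duplicating.
client = Client("artificialguybr/qwen3.6-27b-zero")

reply = client.predict(
    "Explain KV-cache pressure in one paragraph.",  # message
    "You are a concise assistant.",                 # system prompt
    True,    # enable thinking
    False,   # preserve thinking across turns
    1.0,     # temperature
    2048,    # max new tokens
    0.95,    # top-p
    20,      # top-k
    1.0,     # repetition penalty
    api_name="/chat",
)
print(reply)
```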
requirements.txt ADDED
@@ -0,0 +1,7 @@
gradio>=6.11.0
spaces>=0.41.0
torch==2.8.0
transformers>=4.57.1
accelerate>=1.10.0
bitsandbytes>=0.48.1
sentencepiece>=0.2.0