DedeProGames committed on
Commit
ece3e79
·
verified ·
1 Parent(s): ef0c1da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +245 -60
app.py CHANGED
@@ -1,4 +1,6 @@
1
  import os
 
 
2
  from threading import Thread
3
 
4
  import gradio as gr
@@ -12,24 +14,20 @@ from transformers import (
12
  )
13
 
14
  MODEL_ID = "OrionLLM/GRM-2.6-Opus"
15
- TITLE = "GRM-2.6-Opus Zero"
16
- SUBTITLE = "Text-only GRM-2.6-Opus deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
17
  DESCRIPTION = (
18
- "Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
19
- "and shorter default generation lengths for better queue behavior."
20
- )
21
- SYSTEM_PROMPT = (
22
- "You are GRM-2.6-Opus, an advanced reasoning assistant by OrionLLM for coding, research, "
23
- "agentic workflows, terminal tasks, and long-form problem solving. Be clear, accurate, useful, "
24
- "and think carefully before answering."
25
  )
 
26
  PLACEHOLDER = (
27
  "Ask GRM-2.6-Opus for code, debugging, planning, research, long-form reasoning, "
28
- "terminal-agent tasks, or complex multi-step workflows. Thinking mode is enabled by default."
29
  )
 
30
  MAX_INPUT_TOKENS = 16384
31
- DEFAULT_MAX_NEW_TOKENS = 4096
32
- MAX_NEW_TOKENS = 8192
33
  HF_TOKEN = os.environ.get("HF_TOKEN")
34
 
35
  os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
@@ -42,7 +40,12 @@ BNB_CONFIG = BitsAndBytesConfig(
42
  bnb_4bit_compute_dtype=torch.bfloat16,
43
  )
44
 
45
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
 
 
 
 
 
46
  if tokenizer.pad_token is None:
47
  tokenizer.pad_token = tokenizer.eos_token
48
 
@@ -56,6 +59,7 @@ model = AutoModelForCausalLM.from_pretrained(
56
  attn_implementation="sdpa",
57
  low_cpu_mem_usage=True,
58
  )
 
59
  model.eval()
60
 
61
 
@@ -63,59 +67,162 @@ def model_input_device():
63
  return next(model.parameters()).device
64
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  def estimate_duration(
67
  message,
68
  history,
69
- system_prompt,
70
  enable_thinking,
71
  preserve_thinking,
72
  temperature,
73
- max_new_tokens,
74
  top_p,
75
  top_k,
76
  repetition_penalty,
77
  ):
78
- del message, history, system_prompt, enable_thinking, preserve_thinking, temperature, top_p, top_k, repetition_penalty
79
- return min(240, max(90, 60 + int(max_new_tokens / 64)))
80
 
81
-
82
- def build_messages(history, message, system_prompt):
83
- messages = []
84
- if system_prompt.strip():
85
- messages.append({"role": "system", "content": system_prompt.strip()})
86
- trimmed_history = history[-8:]
87
- for user_text, assistant_text in trimmed_history:
88
- if user_text:
89
- messages.append({"role": "user", "content": user_text})
90
- if assistant_text:
91
- messages.append({"role": "assistant", "content": assistant_text})
92
- messages.append({"role": "user", "content": message})
93
- return messages
94
 
95
 
96
  @spaces.GPU(duration=estimate_duration, size="large")
97
  def stream_chat(
98
  message: str,
99
  history: list,
100
- system_prompt: str,
101
  enable_thinking: bool,
102
  preserve_thinking: bool,
103
  temperature: float,
104
- max_new_tokens: int,
105
  top_p: float,
106
  top_k: int,
107
  repetition_penalty: float,
108
  ):
109
- messages = build_messages(history, message, system_prompt)
 
 
 
 
 
110
  rendered_prompt = tokenizer.apply_chat_template(
111
  messages,
112
  tokenize=False,
113
  add_generation_prompt=True,
114
- chat_template_kwargs={
115
- "enable_thinking": enable_thinking,
116
- "preserve_thinking": preserve_thinking,
117
- },
118
  )
 
119
  inputs = tokenizer(
120
  rendered_prompt,
121
  return_tensors="pt",
@@ -133,33 +240,78 @@ def stream_chat(
133
  generation_kwargs = dict(
134
  **inputs,
135
  streamer=streamer,
136
- max_new_tokens=max_new_tokens,
137
  do_sample=temperature > 0,
138
  temperature=max(temperature, 1e-5),
139
  top_p=top_p,
140
  top_k=top_k,
141
  repetition_penalty=repetition_penalty,
142
  use_cache=True,
 
 
143
  )
144
 
145
  worker = Thread(target=model.generate, kwargs=generation_kwargs)
146
  worker.start()
147
 
148
- output = ""
 
149
  for chunk in streamer:
150
- output += chunk
151
- yield output
152
 
153
 
154
  CSS = """
155
- .gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
156
- .title h1 { text-align: center; margin-bottom: 0.2rem !important; }
157
- .subtitle p, .meta p { text-align: center; }
158
- .meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
159
- .duplicate-button { margin: 0 auto 14px auto !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  """
161
 
162
- chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)
 
 
 
 
163
 
164
  with gr.Blocks(css=CSS, theme="soft") as demo:
165
  gr.Markdown(f"# {TITLE}", elem_classes="title")
@@ -168,28 +320,61 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
168
  f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
169
  elem_classes="meta",
170
  )
 
171
  gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
 
172
  gr.ChatInterface(
173
  fn=stream_chat,
174
  chatbot=chatbot,
175
  fill_height=True,
176
- additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
 
 
 
 
177
  additional_inputs=[
178
- gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
179
- gr.Checkbox(value=True, label="Enable thinking", render=False),
180
- gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
181
- gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  gr.Slider(
183
- minimum=1024,
184
- maximum=MAX_NEW_TOKENS,
185
- step=512,
186
- value=DEFAULT_MAX_NEW_TOKENS,
187
- label="Max new tokens",
188
  render=False,
189
  ),
190
- gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
191
- gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
192
- gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
193
  ],
194
  examples=[
195
  ["Design a production-ready architecture for a local AI terminal-agent platform using GRM-2.6-Opus."],
 
1
  import os
2
+ import re
3
+ import html
4
  from threading import Thread
5
 
6
  import gradio as gr
 
14
  )
15
 
16
  MODEL_ID = "OrionLLM/GRM-2.6-Opus"
17
+ TITLE = "GRM-2.6-Opus"
18
+ SUBTITLE = "Chat with GRM-2.6-Opus on ZeroGPU"
19
  DESCRIPTION = (
20
+ "Chat with GRM-2.6-Opus in a ZeroGPU Space, optimized with text-only chat, "
21
+ "NF4 4-bit loading, bounded context, streaming output, and thinking parsing."
 
 
 
 
 
22
  )
23
+
24
  PLACEHOLDER = (
25
  "Ask GRM-2.6-Opus for code, debugging, planning, research, long-form reasoning, "
26
+ "terminal-agent tasks, or complex multi-step workflows."
27
  )
28
+
29
  MAX_INPUT_TOKENS = 16384
30
+ INTERNAL_MAX_NEW_TOKENS = 4096
 
31
  HF_TOKEN = os.environ.get("HF_TOKEN")
32
 
33
  os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
 
40
  bnb_4bit_compute_dtype=torch.bfloat16,
41
  )
42
 
43
+ tokenizer = AutoTokenizer.from_pretrained(
44
+ MODEL_ID,
45
+ trust_remote_code=True,
46
+ token=HF_TOKEN,
47
+ )
48
+
49
  if tokenizer.pad_token is None:
50
  tokenizer.pad_token = tokenizer.eos_token
51
 
 
59
  attn_implementation="sdpa",
60
  low_cpu_mem_usage=True,
61
  )
62
+
63
  model.eval()
64
 
65
 
 
67
  return next(model.parameters()).device
68
 
69
 
70
# Patterns applied, in order, to remove reasoning from an assistant message.
# Compiled once at import time so they are not re-resolved on every call.
_THINKING_PATTERNS = (
    # A collapsible block: <details ...><summary>...</summary>...</details>.
    # NOTE: this matches any such block, not only ones labelled "Thinking".
    re.compile(
        r"<details[^>]*>\s*<summary>.*?</summary>.*?</details>",
        re.IGNORECASE | re.DOTALL,
    ),
    # A complete raw <think>...</think> span.
    re.compile(r"<think>.*?</think>", re.IGNORECASE | re.DOTALL),
    # An unterminated <think> span: everything from the tag to the end of text.
    re.compile(r"<think>.*$", re.IGNORECASE | re.DOTALL),
)


def strip_thinking(text: str) -> str:
    """Return *text* with reasoning blocks removed and outer whitespace trimmed.

    Three things are removed, matched case-insensitively and across newlines:
    ``<details>`` blocks that contain a ``<summary>``, complete
    ``<think>...</think>`` spans, and a trailing ``<think>`` span that was
    never closed.

    Args:
        text: Assistant text that may contain reasoning markup. Any falsy
            value (``""`` or ``None``) is accepted.

    Returns:
        The remaining text, stripped; ``""`` when *text* is falsy or nothing
        is left after removal.
    """
    if not text:
        return ""

    for pattern in _THINKING_PATTERNS:
        text = pattern.sub("", text)

    return text.strip()
84
+
85
+
86
# Tag matchers run against the ORIGINAL text. Searching a lowercased copy and
# reusing its offsets is unsafe because str.lower() can change the string's
# length (e.g. "İ" lowercases to two code points), shifting every offset.
_THINK_OPEN_RE = re.compile(r"<think>", re.IGNORECASE)
_THINK_CLOSE_RE = re.compile(r"</think>", re.IGNORECASE)


def _thinking_block(thinking: str, *, expanded: bool) -> str:
    """Build the collapsible HTML block for one reasoning span.

    The reasoning text is stripped and HTML-escaped before being placed in a
    ``<pre>`` element. ``expanded`` selects ``<details open>`` over
    ``<details>``.
    """
    opening_tag = "<details open>" if expanded else "<details>"
    return (
        f"\n\n{opening_tag}"
        "<summary>🧠 Thinking</summary>\n\n"
        f"<pre>{html.escape(thinking.strip())}</pre>\n\n"
        "</details>\n\n"
    )


def render_thinking(raw_text: str) -> str:
    """Convert ``<think>`` spans in model output into collapsible HTML blocks.

    Input such as::

        <think>
        reasoning here
        </think>
        final answer here

    becomes a ``<details>`` block holding the escaped reasoning, followed by
    the answer text. Text outside ``<think>`` spans is passed through
    unchanged. Tags are matched case-insensitively.

    A ``<think>`` with no closing tag (as seen while output is still
    streaming) is rendered as an expanded ``<details open>`` block containing
    everything after the tag.

    Args:
        raw_text: Raw model output accumulated so far; may be empty or ``None``.

    Returns:
        The rendered text with outer whitespace stripped, or ``""`` when
        *raw_text* is falsy.
    """
    if not raw_text:
        return ""

    parts = []
    pos = 0

    while True:
        opening = _THINK_OPEN_RE.search(raw_text, pos)
        if opening is None:
            # No further reasoning spans: the remainder is plain answer text.
            parts.append(raw_text[pos:])
            break

        parts.append(raw_text[pos:opening.start()])

        closing = _THINK_CLOSE_RE.search(raw_text, opening.end())
        if closing is None:
            # Unterminated span: show what has arrived so far, expanded.
            parts.append(
                _thinking_block(raw_text[opening.end():], expanded=True)
            )
            break

        parts.append(
            _thinking_block(
                raw_text[opening.end():closing.start()],
                expanded=False,
            )
        )
        pos = closing.end()

    return "".join(parts).strip()
149
+
150
+
151
# Number of most recent conversation turns replayed into the prompt.
_MAX_HISTORY_TURNS = 8


def build_messages(history, message):
    """Build the chat-template message list from prior turns plus *message*.

    Two history layouts are accepted:

    * pair style: a sequence of ``(user_text, assistant_text)`` items; the
      last ``_MAX_HISTORY_TURNS`` pairs are used;
    * dict style: a sequence of ``{"role": ..., "content": ...}`` items; the
      last ``2 * _MAX_HISTORY_TURNS`` entries are used, and only the ``user``
      and ``assistant`` roles are kept.

    The layout is chosen from the first history entry. Empty content is
    skipped. User text is coerced with ``str()`` and stripped. Assistant text
    is passed through ``strip_thinking`` and dropped if nothing remains.

    Args:
        history: Prior conversation, in either layout; ``None`` or empty means
            no history.
        message: The new user message; coerced to ``str`` and stripped
            (``None`` becomes ``""``).

    Returns:
        A list of ``{"role", "content"}`` dicts ending with the new user
        message.
    """
    entries = list(history or [])

    if entries and isinstance(entries[0], dict):
        # One dict per message, so a turn spans two entries.
        recent = entries[-2 * _MAX_HISTORY_TURNS:]
        turns = [
            (entry.get("role"), entry.get("content"))
            for entry in recent
            if isinstance(entry, dict)
        ]
    else:
        turns = []
        for user_text, assistant_text in entries[-_MAX_HISTORY_TURNS:]:
            turns.append(("user", user_text))
            turns.append(("assistant", assistant_text))

    messages = []
    for role, content in turns:
        if not content:
            continue

        if role == "user":
            messages.append({"role": "user", "content": str(content).strip()})
        elif role == "assistant":
            # Reasoning is removed from earlier answers before they are
            # replayed; an answer that was only reasoning is omitted.
            clean_answer = strip_thinking(str(content))
            if clean_answer:
                messages.append({"role": "assistant", "content": clean_answer})

    messages.append({"role": "user", "content": str(message or "").strip()})
    return messages
183
+
184
+
185
  def estimate_duration(
186
  message,
187
  history,
 
188
  enable_thinking,
189
  preserve_thinking,
190
  temperature,
 
191
  top_p,
192
  top_k,
193
  repetition_penalty,
194
  ):
195
+ del message, history, enable_thinking, preserve_thinking
196
+ del temperature, top_p, top_k, repetition_penalty
197
 
198
+ return 180
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
 
201
  @spaces.GPU(duration=estimate_duration, size="large")
202
  def stream_chat(
203
  message: str,
204
  history: list,
 
205
  enable_thinking: bool,
206
  preserve_thinking: bool,
207
  temperature: float,
 
208
  top_p: float,
209
  top_k: int,
210
  repetition_penalty: float,
211
  ):
212
+ if not message or not message.strip():
213
+ yield ""
214
+ return
215
+
216
+ messages = build_messages(history, message)
217
+
218
  rendered_prompt = tokenizer.apply_chat_template(
219
  messages,
220
  tokenize=False,
221
  add_generation_prompt=True,
222
+ enable_thinking=enable_thinking,
223
+ preserve_thinking=preserve_thinking,
 
 
224
  )
225
+
226
  inputs = tokenizer(
227
  rendered_prompt,
228
  return_tensors="pt",
 
240
  generation_kwargs = dict(
241
  **inputs,
242
  streamer=streamer,
243
+ max_new_tokens=INTERNAL_MAX_NEW_TOKENS,
244
  do_sample=temperature > 0,
245
  temperature=max(temperature, 1e-5),
246
  top_p=top_p,
247
  top_k=top_k,
248
  repetition_penalty=repetition_penalty,
249
  use_cache=True,
250
+ pad_token_id=tokenizer.pad_token_id,
251
+ eos_token_id=tokenizer.eos_token_id,
252
  )
253
 
254
  worker = Thread(target=model.generate, kwargs=generation_kwargs)
255
  worker.start()
256
 
257
+ raw_output = ""
258
+
259
  for chunk in streamer:
260
+ raw_output += chunk
261
+ yield render_thinking(raw_output)
262
 
263
 
264
  CSS = """
265
+ .gradio-container {
266
+ max-width: 1180px !important;
267
+ margin: 0 auto !important;
268
+ }
269
+
270
+ .title h1 {
271
+ text-align: center;
272
+ margin-bottom: 0.2rem !important;
273
+ }
274
+
275
+ .subtitle p,
276
+ .meta p {
277
+ text-align: center;
278
+ }
279
+
280
+ .meta p {
281
+ font-size: 0.95rem;
282
+ color: #6b7280;
283
+ margin-top: 0.35rem !important;
284
+ }
285
+
286
+ .duplicate-button {
287
+ margin: 0 auto 14px auto !important;
288
+ }
289
+
290
+ details {
291
+ border: 1px solid #37415133;
292
+ border-radius: 12px;
293
+ padding: 0.75rem 1rem;
294
+ margin: 0.5rem 0 1rem 0;
295
+ background: rgba(127, 127, 127, 0.08);
296
+ }
297
+
298
+ summary {
299
+ cursor: pointer;
300
+ font-weight: 600;
301
+ }
302
+
303
+ pre {
304
+ white-space: pre-wrap;
305
+ word-break: break-word;
306
+ margin: 0.75rem 0 0 0;
307
+ }
308
  """
309
 
310
+ chatbot = gr.Chatbot(
311
+ height=680,
312
+ placeholder=PLACEHOLDER,
313
+ sanitize_html=False,
314
+ )
315
 
316
  with gr.Blocks(css=CSS, theme="soft") as demo:
317
  gr.Markdown(f"# {TITLE}", elem_classes="title")
 
320
  f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
321
  elem_classes="meta",
322
  )
323
+
324
  gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
325
+
326
  gr.ChatInterface(
327
  fn=stream_chat,
328
  chatbot=chatbot,
329
  fill_height=True,
330
+ additional_inputs_accordion=gr.Accordion(
331
+ "⚙️ Parameters",
332
+ open=False,
333
+ render=False,
334
+ ),
335
  additional_inputs=[
336
+ gr.Checkbox(
337
+ value=True,
338
+ label="Enable thinking",
339
+ render=False,
340
+ ),
341
+ gr.Checkbox(
342
+ value=False,
343
+ label="Preserve thinking across turns",
344
+ render=False,
345
+ ),
346
+ gr.Slider(
347
+ minimum=0.0,
348
+ maximum=1.2,
349
+ step=0.05,
350
+ value=1.0,
351
+ label="Temperature",
352
+ render=False,
353
+ ),
354
+ gr.Slider(
355
+ minimum=0.1,
356
+ maximum=1.0,
357
+ step=0.05,
358
+ value=0.95,
359
+ label="Top-p",
360
+ render=False,
361
+ ),
362
+ gr.Slider(
363
+ minimum=1,
364
+ maximum=100,
365
+ step=1,
366
+ value=20,
367
+ label="Top-k",
368
+ render=False,
369
+ ),
370
  gr.Slider(
371
+ minimum=1.0,
372
+ maximum=1.5,
373
+ step=0.05,
374
+ value=1.0,
375
+ label="Repetition penalty",
376
  render=False,
377
  ),
 
 
 
378
  ],
379
  examples=[
380
  ["Design a production-ready architecture for a local AI terminal-agent platform using GRM-2.6-Opus."],