GRM-2.6-Opus

Running on Zero

App Files Files Community

DedeProGames committed 8 days ago

Commit

ece3e79

verified ·

1 Parent(s): ef0c1da

Update app.py

Browse files

Files changed (1) hide show

app.py +245 -60

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import os
 from threading import Thread
 import gradio as gr
@@ -12,24 +14,20 @@ from transformers import (
 )
 MODEL_ID = "OrionLLM/GRM-2.6-Opus"
-TITLE = "GRM-2.6-Opus Zero"
-SUBTITLE = "Text-only GRM-2.6-Opus deployment for ZeroGPU with 4-bit loading, thinking controls, and streaming chat."
 DESCRIPTION = (
-    "Optimized for ZeroGPU usage: text-only chat, NF4 4-bit quantization, bounded context, "
-    "and shorter default generation lengths for better queue behavior."
-)
-SYSTEM_PROMPT = (
-    "You are GRM-2.6-Opus, an advanced reasoning assistant by OrionLLM for coding, research, "
-    "agentic workflows, terminal tasks, and long-form problem solving. Be clear, accurate, useful, "
-    "and think carefully before answering."
 )
 PLACEHOLDER = (
     "Ask GRM-2.6-Opus for code, debugging, planning, research, long-form reasoning, "
-    "terminal-agent tasks, or complex multi-step workflows. Thinking mode is enabled by default."
 )
 MAX_INPUT_TOKENS = 16384
-DEFAULT_MAX_NEW_TOKENS = 4096
-MAX_NEW_TOKENS = 8192
 HF_TOKEN = os.environ.get("HF_TOKEN")
 os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
@@ -42,7 +40,12 @@ BNB_CONFIG = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16,
 )
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
@@ -56,6 +59,7 @@ model = AutoModelForCausalLM.from_pretrained(
     attn_implementation="sdpa",
     low_cpu_mem_usage=True,
 )
 model.eval()
@@ -63,59 +67,162 @@ def model_input_device():
     return next(model.parameters()).device
 def estimate_duration(
     message,
     history,
-    system_prompt,
     enable_thinking,
     preserve_thinking,
     temperature,
-    max_new_tokens,
     top_p,
     top_k,
     repetition_penalty,
 ):
-    del message, history, system_prompt, enable_thinking, preserve_thinking, temperature, top_p, top_k, repetition_penalty
-    return min(240, max(90, 60 + int(max_new_tokens / 64)))
-def build_messages(history, message, system_prompt):
-    messages = []
-    if system_prompt.strip():
-        messages.append({"role": "system", "content": system_prompt.strip()})
-    trimmed_history = history[-8:]
-    for user_text, assistant_text in trimmed_history:
-        if user_text:
-            messages.append({"role": "user", "content": user_text})
-        if assistant_text:
-            messages.append({"role": "assistant", "content": assistant_text})
-    messages.append({"role": "user", "content": message})
-    return messages
 @spaces.GPU(duration=estimate_duration, size="large")
 def stream_chat(
     message: str,
     history: list,
-    system_prompt: str,
     enable_thinking: bool,
     preserve_thinking: bool,
     temperature: float,
-    max_new_tokens: int,
     top_p: float,
     top_k: int,
     repetition_penalty: float,
 ):
-    messages = build_messages(history, message, system_prompt)
     rendered_prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True,
-        chat_template_kwargs={
-            "enable_thinking": enable_thinking,
-            "preserve_thinking": preserve_thinking,
-        },
     )
     inputs = tokenizer(
         rendered_prompt,
         return_tensors="pt",
@@ -133,33 +240,78 @@ def stream_chat(
     generation_kwargs = dict(
         **inputs,
         streamer=streamer,
-        max_new_tokens=max_new_tokens,
         do_sample=temperature > 0,
         temperature=max(temperature, 1e-5),
         top_p=top_p,
         top_k=top_k,
         repetition_penalty=repetition_penalty,
         use_cache=True,
     )
     worker = Thread(target=model.generate, kwargs=generation_kwargs)
     worker.start()
-    output = ""
     for chunk in streamer:
-        output += chunk
-        yield output
 CSS = """
-.gradio-container { max-width: 1180px !important; margin: 0 auto !important; }
-.title h1 { text-align: center; margin-bottom: 0.2rem !important; }
-.subtitle p, .meta p { text-align: center; }
-.meta p { font-size: 0.95rem; color: #6b7280; margin-top: 0.35rem !important; }
-.duplicate-button { margin: 0 auto 14px auto !important; }
 """
-chatbot = gr.Chatbot(height=680, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.Markdown(f"# {TITLE}", elem_classes="title")
@@ -168,28 +320,61 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
         f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
         elem_classes="meta",
     )
     gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
     gr.ChatInterface(
         fn=stream_chat,
         chatbot=chatbot,
         fill_height=True,
-        additional_inputs_accordion=gr.Accordion("⚙️ Parameters", open=False, render=False),
         additional_inputs=[
-            gr.Textbox(value=SYSTEM_PROMPT, label="System prompt", lines=3, render=False),
-            gr.Checkbox(value=True, label="Enable thinking", render=False),
-            gr.Checkbox(value=False, label="Preserve thinking across turns", render=False),
-            gr.Slider(minimum=0.0, maximum=1.2, step=0.05, value=1.0, label="Temperature", render=False),
             gr.Slider(
-                minimum=1024,
-                maximum=MAX_NEW_TOKENS,
-                step=512,
-                value=DEFAULT_MAX_NEW_TOKENS,
-                label="Max new tokens",
                 render=False,
             ),
-            gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-p", render=False),
-            gr.Slider(minimum=1, maximum=100, step=1, value=20, label="Top-k", render=False),
-            gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.0, label="Repetition penalty", render=False),
         ],
         examples=[
             ["Design a production-ready architecture for a local AI terminal-agent platform using GRM-2.6-Opus."],

 import os
+import re
+import html
 from threading import Thread
 import gradio as gr
 )
 MODEL_ID = "OrionLLM/GRM-2.6-Opus"
+TITLE = "GRM-2.6-Opus"
+SUBTITLE = "Chat with GRM-2.6-Opus on ZeroGPU"
 DESCRIPTION = (
+    "Chat with GRM-2.6-Opus in a ZeroGPU Space, optimized with text-only chat, "
+    "NF4 4-bit loading, bounded context, streaming output, and thinking parsing."
 )
 PLACEHOLDER = (
     "Ask GRM-2.6-Opus for code, debugging, planning, research, long-form reasoning, "
+    "terminal-agent tasks, or complex multi-step workflows."
 )
 MAX_INPUT_TOKENS = 16384
+INTERNAL_MAX_NEW_TOKENS = 4096
 HF_TOKEN = os.environ.get("HF_TOKEN")
 os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
     bnb_4bit_compute_dtype=torch.bfloat16,
 )
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    token=HF_TOKEN,
+)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
     attn_implementation="sdpa",
     low_cpu_mem_usage=True,
 )
 model.eval()
     return next(model.parameters()).device
def strip_thinking(text: str) -> str:
    """Return ``text`` with all reasoning markup removed.

    Three kinds of spans are deleted, in this order:

    1. ``<details>…<summary>…</summary>…</details>`` blocks,
    2. complete ``<think>…</think>`` pairs,
    3. an unterminated ``<think>`` that runs to the end of the text.

    Matching is case-insensitive and spans newlines. The result is
    stripped of surrounding whitespace; falsy input yields ``""``.
    """
    if not text:
        return ""

    # Order matters: closed blocks are removed before the open-ended
    # ``<think>`` pattern, which would otherwise swallow everything after
    # the first opening tag.
    removal_patterns = (
        r"(?is)<details[^>]*>\s*<summary>.*?</summary>.*?</details>",
        r"(?is)<think>.*?</think>",
        r"(?is)<think>.*$",
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()
# Folds only ASCII letters, so the folded copy always has exactly the same
# length as the source text. ``str.lower()`` does not guarantee that (for
# example "İ" lowers to two code points), which would shift every offset.
_ASCII_LOWER = str.maketrans(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    "abcdefghijklmnopqrstuvwxyz",
)


def _thinking_block(thinking: str, expanded: bool) -> str:
    """Wrap reasoning text in a ``<details>`` element, HTML-escaping it."""
    opening = "<details open>" if expanded else "<details>"
    return (
        f"\n\n{opening}"
        "<summary>🧠 Thinking</summary>\n\n"
        f"<pre>{html.escape(thinking.strip())}</pre>\n\n"
        "</details>\n\n"
    )


def render_thinking(raw_text: str) -> str:
    """Convert ``<think>`` spans in model output into collapsible blocks.

    Input such as::

        <think>
        reasoning here
        </think>
        final answer here

    becomes a ``<details>`` element titled "🧠 Thinking" followed by the
    answer text. Text outside ``<think>`` spans is passed through unchanged.

    Args:
        raw_text: Raw (possibly partial) model output.

    Returns:
        The rendered text with surrounding whitespace stripped, or ``""``
        for falsy input. A ``<think>`` without a closing tag (as seen while
        streaming) is rendered as an expanded ``<details open>`` block that
        contains everything after the opening tag.
    """
    if not raw_text:
        return ""

    text = raw_text
    # Tags are located in a case-folded copy, then the same offsets are used
    # to slice ``text``; the fold must therefore preserve length.
    folded = text.translate(_ASCII_LOWER)
    open_tag, close_tag = "<think>", "</think>"

    parts = []
    pos = 0
    while True:
        start = folded.find(open_tag, pos)
        if start == -1:
            # No further reasoning spans: the remainder is answer text.
            parts.append(text[pos:])
            break

        parts.append(text[pos:start])
        body_start = start + len(open_tag)
        end = folded.find(close_tag, body_start)

        if end == -1:
            # Still streaming the reasoning: show it expanded.
            parts.append(_thinking_block(text[body_start:], expanded=True))
            break

        parts.append(_thinking_block(text[body_start:end], expanded=False))
        pos = end + len(close_tag)

    return "".join(parts).strip()
def build_messages(history, message):
    """Build the chat-template message list for the next generation call.

    Args:
        history: Previous conversation. Accepts either pair-format entries
            (``(user_text, assistant_text)``) or messages-format entries
            (dicts with ``"role"`` and ``"content"``). ``None`` is treated
            as an empty history.
        message: The new user message; it is stripped and appended last.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts containing at most
        the last 8 exchanges (16 role/content entries) of history followed
        by the new user message. Assistant entries have their reasoning
        markup removed via ``strip_thinking`` and are dropped if nothing
        remains; entries with falsy content or a role other than
        ``"user"``/``"assistant"`` are skipped.
    """
    # Normalise both history formats into a flat (role, content) sequence.
    # Unpacking a messages-format dict as a pair would yield its *keys*,
    # so dict entries have to be detected explicitly.
    flat = []
    for entry in history or []:
        if isinstance(entry, dict):
            flat.append((entry.get("role"), entry.get("content")))
        else:
            user_text, assistant_text = entry
            flat.append(("user", user_text))
            flat.append(("assistant", assistant_text))

    messages = []
    # 16 flat entries == the last 8 user/assistant pairs.
    for role, content in flat[-16:]:
        if not content:
            continue
        if role == "user":
            text = str(content).strip()
        elif role == "assistant":
            # Keep only the final answer so earlier reasoning is not fed
            # back into the prompt.
            text = strip_thinking(str(content))
            if not text:
                continue
        else:
            continue
        messages.append({"role": role, "content": text})

    messages.append({"role": "user", "content": message.strip()})
    return messages
 def estimate_duration(
     message,
     history,
     enable_thinking,
     preserve_thinking,
     temperature,
     top_p,
     top_k,
     repetition_penalty,
 ):
+    del message, history, enable_thinking, preserve_thinking
+    del temperature, top_p, top_k, repetition_penalty
+    return 180
 @spaces.GPU(duration=estimate_duration, size="large")
 def stream_chat(
     message: str,
     history: list,
     enable_thinking: bool,
     preserve_thinking: bool,
     temperature: float,
     top_p: float,
     top_k: int,
     repetition_penalty: float,
 ):
+    if not message or not message.strip():
+        yield ""
+        return
+    messages = build_messages(history, message)
     rendered_prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
         add_generation_prompt=True,
+        enable_thinking=enable_thinking,
+        preserve_thinking=preserve_thinking,
     )
     inputs = tokenizer(
         rendered_prompt,
         return_tensors="pt",
     generation_kwargs = dict(
         **inputs,
         streamer=streamer,
+        max_new_tokens=INTERNAL_MAX_NEW_TOKENS,
         do_sample=temperature > 0,
         temperature=max(temperature, 1e-5),
         top_p=top_p,
         top_k=top_k,
         repetition_penalty=repetition_penalty,
         use_cache=True,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
     )
     worker = Thread(target=model.generate, kwargs=generation_kwargs)
     worker.start()
+    raw_output = ""
     for chunk in streamer:
+        raw_output += chunk
+        yield render_thinking(raw_output)
 CSS = """
+.gradio-container {
+    max-width: 1180px !important;
+    margin: 0 auto !important;
+}
+.title h1 {
+    text-align: center;
+    margin-bottom: 0.2rem !important;
+}
+.subtitle p,
+.meta p {
+    text-align: center;
+}
+.meta p {
+    font-size: 0.95rem;
+    color: #6b7280;
+    margin-top: 0.35rem !important;
+}
+.duplicate-button {
+    margin: 0 auto 14px auto !important;
+}
+details {
+    border: 1px solid #37415133;
+    border-radius: 12px;
+    padding: 0.75rem 1rem;
+    margin: 0.5rem 0 1rem 0;
+    background: rgba(127, 127, 127, 0.08);
+}
+summary {
+    cursor: pointer;
+    font-weight: 600;
+}
+pre {
+    white-space: pre-wrap;
+    word-break: break-word;
+    margin: 0.75rem 0 0 0;
+}
 """
+chatbot = gr.Chatbot(
+    height=680,
+    placeholder=PLACEHOLDER,
+    sanitize_html=False,
+)
 with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.Markdown(f"# {TITLE}", elem_classes="title")
         f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
         elem_classes="meta",
     )
     gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")
     gr.ChatInterface(
         fn=stream_chat,
         chatbot=chatbot,
         fill_height=True,
+        additional_inputs_accordion=gr.Accordion(
+            "⚙️ Parameters",
+            open=False,
+            render=False,
+        ),
         additional_inputs=[
+            gr.Checkbox(
+                value=True,
+                label="Enable thinking",
+                render=False,
+            ),
+            gr.Checkbox(
+                value=False,
+                label="Preserve thinking across turns",
+                render=False,
+            ),
+            gr.Slider(
+                minimum=0.0,
+                maximum=1.2,
+                step=0.05,
+                value=1.0,
+                label="Temperature",
+                render=False,
+            ),
+            gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                step=0.05,
+                value=0.95,
+                label="Top-p",
+                render=False,
+            ),
+            gr.Slider(
+                minimum=1,
+                maximum=100,
+                step=1,
+                value=20,
+                label="Top-k",
+                render=False,
+            ),
             gr.Slider(
+                minimum=1.0,
+                maximum=1.5,
+                step=0.05,
+                value=1.0,
+                label="Repetition penalty",
                 render=False,
             ),
         ],
         examples=[
             ["Design a production-ready architecture for a local AI terminal-agent platform using GRM-2.6-Opus."],