Spaces:
Running on Zero
Running on Zero
| import os | |
| import re | |
| import html | |
| from threading import Thread | |
| import gradio as gr | |
| import spaces | |
| import torch | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| BitsAndBytesConfig, | |
| TextIteratorStreamer, | |
| ) | |
# --- Model identity and UI copy ---------------------------------------------
MODEL_ID = "OrionLLM/GRM-2.6-Opus"
TITLE = "GRM-2.6-Opus"
SUBTITLE = "Chat with GRM-2.6-Opus on ZeroGPU"
DESCRIPTION = (
    "Chat with GRM-2.6-Opus in a ZeroGPU Space, optimized with text-only chat, "
    "NF4 4-bit loading, bounded context, streaming output, and thinking parsing."
)
PLACEHOLDER = (
    "Ask GRM-2.6-Opus for code, debugging, planning, research, long-form reasoning, "
    "terminal-agent tasks, or complex multi-step workflows."
)

# --- Token budgets ----------------------------------------------------------
# Upper bound on prompt tokens fed to the model.
MAX_INPUT_TOKENS = 16 * 1024
# Upper bound on tokens generated per reply.
INTERNAL_MAX_NEW_TOKENS = 4 * 1024

# --- Process-level settings -------------------------------------------------
# Hub access token; None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
# setdefault: an allocator config already present in the environment wins.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# Permit TF32 for CUDA matmuls.
torch.backends.cuda.matmul.allow_tf32 = True
# 4-bit NF4 quantization settings (double quantization, bfloat16 compute)
# handed to `from_pretrained` below.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and model are loaded once at import time and shared by all requests.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    trust_remote_code=True,
)
# Generation below passes `pad_token_id`, so make sure one exists by
# falling back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    trust_remote_code=True,
    quantization_config=BNB_CONFIG,
    dtype=torch.bfloat16,
    device_map={"": 0},  # place the whole model on device index 0
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)
model.eval()
def model_input_device():
    """Return the device that holds the model's first parameter."""
    first_param = next(iter(model.parameters()))
    return first_param.device
def strip_thinking(text: str) -> str:
    """Remove reasoning blocks from *text* and return the trimmed remainder.

    Three kinds of spans are deleted, in this order (all case-insensitive,
    spanning newlines):

    1. ``<details ...><summary>...</summary>...</details>`` blocks,
    2. closed ``<think>...</think>`` blocks,
    3. an unclosed ``<think>`` together with everything after it.

    Falsy input yields an empty string.
    """
    cleaned = text or ""
    removal_patterns = (
        r"(?is)<details[^>]*>\s*<summary>.*?</summary>.*?</details>",
        r"(?is)<think>.*?</think>",
        r"(?is)<think>.*$",
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()
# One `<think>` block: the opening tag, the lazily matched reasoning text, and
# either the closing tag (group 2) or the end of the string when the block is
# still being streamed. Matching runs on the original text (ASCII-only case
# folding), so match offsets always line up with the text being sliced.
_THINK_BLOCK_RE = re.compile(
    r"<think>(.*?)(?:(</think>)|\Z)",
    re.IGNORECASE | re.DOTALL | re.ASCII,
)


def _thinking_details(thinking: str, *, unfinished: bool) -> str:
    """Wrap reasoning text in a collapsible ``<details>`` block.

    The text is trimmed and HTML-escaped. An unfinished (still streaming)
    block is emitted expanded (``<details open>``); a finished one collapsed.
    """
    opening_tag = "<details open>" if unfinished else "<details>"
    return (
        f"\n\n{opening_tag}"
        "<summary>🧠 Thinking</summary>\n\n"
        f"<pre>{html.escape(thinking.strip())}</pre>\n\n"
        "</details>\n\n"
    )


def render_thinking(raw_text: str) -> str:
    """Turn ``<think>...</think>`` spans into collapsible "Thinking" blocks.

    Converts model output like::

        <think>
        reasoning here
        </think>
        final answer here

    into a ``<details>`` block followed by the answer text. Text outside the
    tags is passed through unchanged. A ``<think>`` without a closing tag
    (partial streaming output) consumes the rest of the text and is rendered
    as an expanded block.

    Args:
        raw_text: Raw (possibly partial) model output; falsy values give "".

    Returns:
        The rendered text with leading/trailing whitespace stripped.
    """
    if not raw_text:
        return ""

    output_parts = []
    cursor = 0
    for match in _THINK_BLOCK_RE.finditer(raw_text):
        leading_text = raw_text[cursor:match.start()]
        if leading_text:
            output_parts.append(leading_text)
        # Group 2 is only set when the closing tag was actually seen.
        is_closed = match.group(2) is not None
        output_parts.append(
            _thinking_details(match.group(1), unfinished=not is_closed)
        )
        cursor = match.end()

    trailing_text = raw_text[cursor:]
    if trailing_text:
        output_parts.append(trailing_text)
    return "".join(output_parts).strip()
def _content_text(content) -> str:
    """Flatten a dict-style message ``content`` value to plain text.

    Strings are returned as-is. For a list/tuple of parts, plain strings and
    dict parts carrying a string ``"text"`` field are joined with newlines and
    every other part is ignored. ``None`` gives "".
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): assumes multi-part content looks like
        # [{"type": "text", "text": ...}, ...] — confirm against the Gradio
        # version in use.
        texts = []
        for part in content:
            if isinstance(part, str):
                texts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                texts.append(part["text"])
        return "\n".join(texts)
    return str(content)


def build_messages(history, message, max_turns=8):
    """Build the chat-template message list for the next model call.

    Args:
        history: Previous conversation, or ``None``/empty. Entries may be
            ``(user_text, assistant_text)`` pairs or ``{"role", "content"}``
            dicts.
        message: The new user message; it is stripped and appended last.
        max_turns: Number of most recent turns to keep (default 8). For pair
            history that is ``max_turns`` entries; for dict history it is
            ``2 * max_turns`` entries (one user plus one assistant message
            per turn). Zero or negative keeps no history.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts. Assistant replies
        go through ``strip_thinking`` and are dropped if nothing remains.
        Dict entries with roles other than "user"/"assistant" are skipped.
    """
    entries = list(history or [])
    dict_style = any(isinstance(entry, dict) for entry in entries)
    window = max_turns * 2 if dict_style else max_turns
    # Guard the slice: entries[-0:] would keep everything instead of nothing.
    recent = entries[-window:] if window > 0 else []

    messages = []
    for entry in recent:
        if isinstance(entry, dict):
            role = entry.get("role")
            text = _content_text(entry.get("content"))
            if role == "user":
                text = text.strip()
            elif role == "assistant":
                text = strip_thinking(text)
            else:
                continue
            if text:
                messages.append({"role": role, "content": text})
            continue

        # Pair format: (user_text, assistant_text); either side may be empty.
        user_text, assistant_text = entry
        if user_text:
            messages.append({"role": "user", "content": str(user_text).strip()})
        if assistant_text:
            clean_answer = strip_thinking(str(assistant_text))
            if clean_answer:
                messages.append({"role": "assistant", "content": clean_answer})

    messages.append({"role": "user", "content": message.strip()})
    return messages
def estimate_duration(
    message,
    history,
    enable_thinking,
    preserve_thinking,
    temperature,
    top_p,
    top_k,
    repetition_penalty,
):
    """Return a constant duration of 180, whatever the arguments are.

    The parameter list mirrors ``stream_chat`` so both can be called with the
    same arguments; none of the values influence the result.
    NOTE(review): presumably the unit is seconds — confirm against the caller.
    """
    return 180
# ZeroGPU only attaches a GPU to functions wrapped in `spaces.GPU`; the
# duration callable receives the same arguments as the wrapped function.
@spaces.GPU(duration=estimate_duration)
def stream_chat(
    message: str,
    history: list,
    enable_thinking: bool,
    preserve_thinking: bool,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """Stream the model's reply to *message* as progressively longer text.

    Args:
        message: New user message. Blank input yields a single "" and stops.
        history: Earlier conversation, forwarded to ``build_messages``.
        enable_thinking: Forwarded to the chat template.
        preserve_thinking: Forwarded to the chat template.
        temperature: Sampling temperature; ``<= 0`` selects greedy decoding.
        top_p: Nucleus sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        repetition_penalty: Repetition penalty passed to ``generate``.

    Yields:
        The whole reply so far, passed through ``render_thinking``.

    Raises:
        gr.Error: If generation fails in the worker thread.
    """
    if not message or not message.strip():
        yield ""
        return

    messages = build_messages(history, message)
    rendered_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
        preserve_thinking=preserve_thinking,
    )

    # The rendered template already contains the special tokens, so do not
    # let the tokenizer add them a second time.
    encoded = tokenizer(
        rendered_prompt,
        return_tensors="pt",
        add_special_tokens=False,
    )
    # Keep the LAST MAX_INPUT_TOKENS tokens: the newest message and the
    # generation prompt sit at the end, and cutting on the right would
    # discard exactly those.
    device = model_input_device()
    inputs = {
        name: tensor[:, -MAX_INPUT_TOKENS:].to(device)
        for name, tensor in encoded.items()
    }

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=120.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=INTERNAL_MAX_NEW_TOKENS,
        repetition_penalty=repetition_penalty,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Only hand over sampling knobs when actually sampling; greedy decoding
    # ignores them.
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
        )
    else:
        generation_kwargs["do_sample"] = False

    failures = []

    def _generate():
        """Run generation; on failure record the error and unblock the reader."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # re-raised on the consumer side below
            failures.append(exc)
            # Signal end-of-stream so the loop below does not wait for the
            # streamer timeout.
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    raw_output = ""
    for chunk in streamer:
        raw_output += chunk
        yield render_thinking(raw_output)

    worker.join()
    if failures:
        raise gr.Error(f"Generation failed: {failures[0]}") from failures[0]
# Stylesheet handed to `gr.Blocks(css=...)`: page width, centered headings,
# and the look of <details>/<summary>/<pre> elements.
CSS = """
.gradio-container {
max-width: 1180px !important;
margin: 0 auto !important;
}
.title h1 {
text-align: center;
margin-bottom: 0.2rem !important;
}
.subtitle p,
.meta p {
text-align: center;
}
.meta p {
font-size: 0.95rem;
color: #6b7280;
margin-top: 0.35rem !important;
}
.duplicate-button {
margin: 0 auto 14px auto !important;
}
details {
border: 1px solid #37415133;
border-radius: 12px;
padding: 0.75rem 1rem;
margin: 0.5rem 0 1rem 0;
background: rgba(127, 127, 127, 0.08);
}
summary {
cursor: pointer;
font-weight: 600;
}
pre {
white-space: pre-wrap;
word-break: break-word;
margin: 0.75rem 0 0 0;
}
"""
# Chat display. HTML sanitizing is off — presumably so the <details>/<pre>
# markup produced by `render_thinking` is shown as HTML; verify before changing.
chatbot = gr.Chatbot(
    placeholder=PLACEHOLDER,
    height=680,
    sanitize_html=False,
)


def _checkbox(label, value):
    """Create an unrendered checkbox for the parameters accordion."""
    return gr.Checkbox(value=value, label=label, render=False)


def _slider(label, minimum, maximum, step, value):
    """Create an unrendered slider for the parameters accordion."""
    return gr.Slider(
        minimum=minimum,
        maximum=maximum,
        step=step,
        value=value,
        label=label,
        render=False,
    )


with gr.Blocks(css=CSS, theme="soft") as demo:
    # Page header: title, subtitle, description with a link to the model.
    gr.Markdown(f"# {TITLE}", elem_classes="title")
    gr.Markdown(SUBTITLE, elem_classes="subtitle")
    gr.Markdown(
        f"{DESCRIPTION} Model: [{MODEL_ID}](https://huggingface.co/{MODEL_ID})",
        elem_classes="meta",
    )
    gr.DuplicateButton("Duplicate Space", elem_classes="duplicate-button")

    # The additional inputs are passed to `stream_chat` after (message,
    # history), in the order listed here.
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(
            "⚙️ Parameters",
            open=False,
            render=False,
        ),
        additional_inputs=[
            _checkbox("Enable thinking", True),
            _checkbox("Preserve thinking across turns", False),
            _slider("Temperature", 0.0, 1.2, 0.05, 1.0),
            _slider("Top-p", 0.1, 1.0, 0.05, 0.95),
            _slider("Top-k", 1, 100, 1, 20),
            _slider("Repetition penalty", 1.0, 1.5, 0.05, 1.0),
        ],
        examples=[
            [prompt]
            for prompt in (
                "Design a production-ready architecture for a local AI terminal-agent platform using GRM-2.6-Opus.",
                "Write a detailed debugging plan for a flaky async Python test suite.",
                "Build a responsive landing page in React and Tailwind for a premium AI coding product.",
                "Create an agentic workflow plan for solving a Terminal-Bench style task from scratch.",
            )
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()