zhsh17 committed on
Commit eb7a0c7 · 1 Parent(s): 5be876f

v0.0.1: Add EdgeRazor Playground

Files changed (5)
  1. README.md +17 -7
  2. app.py +325 -0
  3. config.py +61 -0
  4. requirements.txt +3 -0
  5. style.css +31 -0
README.md CHANGED
@@ -1,14 +1,24 @@
  ---
- title: EdgeRazor PlayGround
- emoji: 🦀
- colorFrom: red
+ title: EdgeRazor Playground
+ emoji: 🚀
+ colorFrom: blue
  colorTo: gray
  sdk: gradio
- sdk_version: 6.13.0
+ sdk_version: 6.5.1
+ python_version: 3.12.2
  app_file: app.py
- pinned: false
+ pinned: true
  license: apache-2.0
- short_description: EdgeRazor Playground for low-bit LLMs! CPU-friendly!🚀
+ short_description: EdgeRazor Playground for low-bit LLMs! CPU-friendly! 🚀
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## EdgeRazor Playground
+
+ A CPU-friendly chatbot powered by **[Qwen3-EdgeRazor-nbit](https://huggingface.co/collections/zhangsq-nju/edgerazor-nbit)**, running locally via [llama.cpp](https://github.com/ggerganov/llama.cpp). Displays real-time efficiency metrics (output tokens, elapsed time, decoding throughput) for each turn.
+
+ ## Dependencies
+
+ - [llama-cpp-python](https://abetlen.github.io/llama-cpp-python/whl/cpu/llama-cpp-python)
+ - Qwen3-EdgeRazor-nbit GGUF files:
+   - [Qwen3-0.6B-EdgeRazor-GGUF](https://huggingface.co/zhangsq-nju/Qwen3-0.6B-EdgeRazor-GGUF)
+   - [Qwen3-1.7B-EdgeRazor-GGUF](https://huggingface.co/zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF)
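
For reference, everything the Space does in a turn boils down to a short llama-cpp-python loop: fetch a GGUF from the Hub, load it CPU-only, stream a chat completion, and divide streamed tokens by wall-clock time. A minimal sketch (the repo and file name match the entries `config.py` registers below; counting one token per streamed chunk is an approximation):

```python
# Minimal CPU-only loop mirroring what the Space does for one turn:
# fetch a GGUF from the Hub, load it, stream a reply, report throughput.
import time

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

path = hf_hub_download(
    repo_id="zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF",
    filename="Qwen3-1.7B-EdgeRazor-TQ2_0.gguf",
)
llm = Llama(model_path=path, n_ctx=1024, verbose=False)

t0 = time.perf_counter()
n_tokens = 0
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    max_tokens=128,
    stream=True,
):
    delta = chunk["choices"][0]["delta"].get("content") or ""
    if delta:
        n_tokens += 1  # one streamed chunk ~ one decoded token
        print(delta, end="", flush=True)

dt = time.perf_counter() - t0
print(f"\n{n_tokens} tokens in {dt:.1f}s ({n_tokens / dt:.1f} t/s)")
```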
app.py ADDED
@@ -0,0 +1,332 @@
+ import os
+ import time
+
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+
+ from config import (
+     FLASH_ATTN,
+     KV_CACHE_TYPE,
+     MAX_TOKENS,
+     MIN_P,
+     N_CTX,
+     PRESENCE_PENALTY,
+     REPEAT_PENALTY,
+     TEMPERATURE,
+     TOP_K,
+     TOP_P,
+     header_info,
+     model_zoo,
+     system_prompt,
+ )
+
+ # ──────────────────────────── Constants ───────────────────────────────
+
+ # Values mirror llama.cpp's ggml_type enum (slots 4-5 were the removed q4_2/q4_3).
+ _KV_TYPE: dict[str, int] = {
+     "f32": 0,
+     "f16": 1,
+     "q4_0": 2,
+     "q4_1": 3,
+     "q5_0": 6,
+     "q5_1": 7,
+     "q8_0": 8,
+ }
+
+ _THINK_OPEN = "<think>"
+ _THINK_CLOSE = "</think>"
+ _METRICS_SEP = "\n"
+
+ N_CPU = os.cpu_count() or 4
+ N_PHYS = max(1, N_CPU // 2)
+
+ _DEFAULT_MODEL = next(iter(model_zoo))
+ _loaded: dict[str, Llama] = {}
+
+
+ # ──────────────────────────── Think stripping ─────────────────────────
+
+
+ class ThinkStripper:
+     """Streaming filter that removes <think>…</think> blocks."""
+
+     def __init__(self) -> None:
+         self.in_think = False
+         self.buf = ""
+
+     def feed(self, text: str) -> str:
+         self.buf += text
+         out: list[str] = []
+
+         while self.buf:
+             if self.in_think:
+                 end = self.buf.find(_THINK_CLOSE)
+                 if end == -1:
+                     self.buf = ""
+                     break
+                 self.buf = self.buf[end + len(_THINK_CLOSE) :]
+                 self.in_think = False
+                 continue
+
+             start = self.buf.find(_THINK_OPEN)
+             end = self.buf.find(_THINK_CLOSE)
+
+             if start == -1 and end == -1:
+                 out.append(self.buf)
+                 self.buf = ""
+             elif start == -1:
+                 out.append(self.buf[:end])
+                 self.buf = self.buf[end + len(_THINK_CLOSE) :]
+             else:
+                 out.append(self.buf[:start])
+                 self.buf = self.buf[start + len(_THINK_OPEN) :]
+                 self.in_think = True
+
+         return "".join(out)
+
+
+ # ──────────────────────────── Model loading ───────────────────────────
+
+
+ def _load_model(name: str) -> Llama:
+     cfg = model_zoo[name]
+     path = hf_hub_download(repo_id=cfg["repo_id"], filename=cfg["model_file"])
+
+     base = dict(
+         model_path=path,
+         n_ctx=N_CTX,
+         n_batch=1024,
+         n_ubatch=1024,
+         n_threads=N_PHYS,
+         n_threads_batch=N_CPU,
+         flash_attn=bool(FLASH_ATTN),
+         use_mmap=True,
+         use_mlock=False,
+         verbose=False,
+     )
+
+     kv = _KV_TYPE.get(KV_CACHE_TYPE)
+     try:
+         model = Llama(**base, type_k=kv, type_v=kv)
+         print(f"KV cache type: {KV_CACHE_TYPE}")
+     except ValueError:
+         print(f"KV cache '{KV_CACHE_TYPE}' unsupported on this backend, using default.")
+         model = Llama(**base)
+     return model
+
+
+ print(f"Loading {_DEFAULT_MODEL} …")
+ _loaded[_DEFAULT_MODEL] = _load_model(_DEFAULT_MODEL)
+ print("Model ready.")
+
+
+ # ──────────────────────────── History helpers ─────────────────────────
+
+
+ def _to_str(content) -> str:
+     if isinstance(content, str):
+         return content
+     if isinstance(content, list):
+         return " ".join(b.get("text", "") for b in content if isinstance(b, dict))
+     return str(content)
+
+
+ def _strip_think(text: str) -> str:
+     # Callers always pass the cumulative raw text, so a fresh stripper per
+     # call is correct and avoids state leaking between calls or turns.
+     return ThinkStripper().feed(text)
+
+
+ def _strip_metrics(text: str) -> str:
+     """Drop the trailing metrics line we appended to assistant messages."""
+     # Drop only the *last* line, and only when it is the metrics line;
+     # splitting on the first newline would truncate multi-line replies.
+     head, sep, tail = text.rpartition(_METRICS_SEP)
+     if sep and tail.startswith("`✏️"):
+         return head
+     return text
+
+
+ def _display_content(turn: dict) -> str:
+     """User-visible content (without metrics line) of a history turn."""
+     return _strip_metrics(_to_str(turn.get("content", "")))
+
+
+ def _pick_feed_content(disp_turn: dict, raw_turn: dict | None) -> str:
+     """
+     Choose the content to feed back into the model for a given turn.
+
+     Prefer the raw version (which keeps <think>…</think>) so the KV-cache
+     prefix can be reused; if the user clearly edited the message via
+     `editable=True`, fall back to the displayed version instead.
+     """
+     disp = _display_content(disp_turn)
+
+     if not (
+         isinstance(raw_turn, dict) and raw_turn.get("role") == disp_turn.get("role")
+     ):
+         return disp
+
+     raw = _to_str(raw_turn.get("content", ""))
+
+     if disp_turn.get("role") == "assistant":
+         # Displayed ≈ _strip_think(raw); if they match, message wasn't edited.
+         if _strip_think(raw).strip() == disp.strip():
+             return raw
+         return disp
+
+     # User / system messages: raw and displayed should be identical.
+     return raw if raw.strip() == disp.strip() else disp
+
+
+ # ──────────────────────────── Inference ───────────────────────────────
+
+
+ def respond(
+     message: str, history: list[dict], model_name: str, raw_history: list[dict]
+ ):
+     # Lazy-load the requested model.
+     if model_name not in _loaded:
+         print(f"Switching to {model_name} …")
+         _loaded[model_name] = _load_model(model_name)
+         print(f"{model_name} ready.")
+     llm = _loaded[model_name]
+
+     if not isinstance(history, list):
+         history = []
+     if not isinstance(raw_history, list):
+         raw_history = []
+
+     # Build messages from raw history (so the KV prefix can be reused).
+     messages: list[dict] = [{"role": "system", "content": system_prompt}]
+     aligned_raw: list[dict] = []
+     for i, turn in enumerate(history):
+         if not isinstance(turn, dict) or "role" not in turn or "content" not in turn:
+             continue
+         raw_turn = raw_history[i] if i < len(raw_history) else None
+         feed = _pick_feed_content(turn, raw_turn)
+         messages.append({"role": turn["role"], "content": feed})
+         aligned_raw.append({"role": turn["role"], "content": feed})
+     messages.append({"role": "user", "content": message})
+
+     # Stream generation.
+     t_start = time.perf_counter()
+     n_gen = 0
+     raw = ""  # full text incl. <think>
+     prev_visible = ""
+
+     for chunk in llm.create_chat_completion(
+         messages=messages,
+         max_tokens=MAX_TOKENS,
+         temperature=TEMPERATURE,
+         top_p=TOP_P,
+         top_k=TOP_K,
+         repeat_penalty=REPEAT_PENALTY,
+         presence_penalty=PRESENCE_PENALTY,
+         min_p=MIN_P,
+         stream=True,
+     ):
+         delta = chunk["choices"][0]["delta"].get("content") or ""
+         if not delta:
+             continue
+
+         raw += delta
+         n_gen += 1  # one streamed chunk ≈ one decoded token
+         visible = _strip_think(raw)
+         if visible != prev_visible:
+             # raw_history stays unchanged during streaming.
+             yield visible, raw_history
+             prev_visible = visible
+
+     total_time = time.perf_counter() - t_start
+     overall_tps = n_gen / total_time if total_time > 0 else 0.0
+     metrics_line = f"✏️ {n_gen}t | ⏱️ {total_time:.1f}s | 🚀 {overall_tps:.1f}t/s"
+
+     # Rebuild raw_history to match what Gradio will store after this turn.
+     new_raw_history = [
+         *aligned_raw,
+         {"role": "user", "content": message},
+         {"role": "assistant", "content": raw},
+     ]
+
+     response = _strip_think(raw)
+     yield f"{response}{_METRICS_SEP}`{metrics_line}`", new_raw_history
+
+
+ # ──────────────────────────── UI ──────────────────────────────────────
+
+ with open("./style.css") as f:
+     CSS = f.read()
+
+ with gr.Blocks(title="EdgeRazor Playground") as demo:
+     gr.Image(
+         value="https://raw.githubusercontent.com/zhangsq-nju/EdgeRazor/main/asset/Logo-full.png",
+         show_label=False,
+         container=False,
+         interactive=False,
+         elem_classes=["logo-wrap"],
+     )
+     gr.Markdown(header_info, elem_classes=["header-md"])
+
+     current_model = gr.State(_DEFAULT_MODEL)
+     raw_history_state = gr.State([])  # raw history with <think> blocks
+
+     with gr.Row():
+         model_dd = gr.Dropdown(
+             choices=list(model_zoo.keys()),
+             value=_DEFAULT_MODEL,
+             label="Model",
+             interactive=True,
+             elem_id="model-selector",
+         )
+
+     chat_iface = gr.ChatInterface(
+         fn=respond,
+         additional_inputs=[current_model, raw_history_state],
+         additional_outputs=[raw_history_state],
+         additional_inputs_accordion=gr.Accordion(label="", open=False, visible=False),
+         editable=True,
+         chatbot=gr.Chatbot(label="", height=480),
+     )
+
+     def _on_model_change(new_model, cur_model, history):
+         # Switching model invalidates raw history; reset chat alongside it.
+         # Re-selecting the same model keeps the conversation intact.
+         if new_model == cur_model:
+             safe_history = history if isinstance(history, list) else []
+             return (
+                 cur_model,
+                 gr.update(value=cur_model),
+                 safe_history,
+                 safe_history,
+                 [],
+             )
+         return (
+             new_model,
+             gr.update(value=new_model),
+             [],
+             [],
+             [],
+         )
+
+     model_dd.change(
+         fn=_on_model_change,
+         inputs=[model_dd, current_model, chat_iface.chatbot_state],
+         outputs=[
+             current_model,
+             model_dd,
+             chat_iface.chatbot,
+             chat_iface.chatbot_state,
+             raw_history_state,
+         ],
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch(
+         css=CSS,
+         server_name="0.0.0.0",
+         server_port=7860,
+         ssr_mode=False,
+     )
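
The subtlest piece of `app.py` is `ThinkStripper`, which must hide `<think>…</think>` reasoning even when a tag arrives split across streamed chunks; `respond` always feeds the cumulative raw text through a fresh instance. A quick standalone trace of that behavior (a sketch: it assumes the `ThinkStripper` class above has been copied into a scratch script, since importing `app.py` itself would trigger the model download):

```python
# ThinkStripper as defined in app.py above, pasted into this scratch file.
chunks = ["<thi", "nk>secret reason", "ing</think>Paris", " is the answer."]
raw = ""
for chunk in chunks:
    raw += chunk  # respond() accumulates deltas the same way
    print(repr(ThinkStripper().feed(raw)))
# Prints: '<thi', '', 'Paris', 'Paris is the answer.'
# A dangling partial tag ("<thi") is emitted once, then replaced by the next
# streaming update, so at worst the UI flashes it briefly.
```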
config.py ADDED
@@ -0,0 +1,61 @@
+ from collections import OrderedDict
+
+ # Header information to present on the page
+ header_info = "Lightweight LLMs on CPU. Check our [Hugging Face Collection](https://huggingface.co/collections/zhangsq-nju/edgerazor-nbit) and [GitHub](https://github.com/zhangsq-nju/EdgeRazor) for more details."
+
+ # Model zoo
+ model_zoo = OrderedDict(
+     [
+         (
+             "Qwen3-1.7B-EdgeRazor-1.58bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-1.7B-EdgeRazor-TQ2_0.gguf",
+             },
+         ),
+         (
+             "Qwen3-1.7B-EdgeRazor-4bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-1.7B-EdgeRazor-Q4_0.gguf",
+             },
+         ),
+         (
+             "Qwen3-0.6B-EdgeRazor-1.58bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-0.6B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-0.6B-EdgeRazor-TQ2_0.gguf",
+             },
+         ),
+         (
+             "Qwen3-0.6B-EdgeRazor-4bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-0.6B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-0.6B-EdgeRazor-Q4_0.gguf",
+             },
+         ),
+     ]
+ )
+
+ # System prompt for the model
+ system_prompt = (
+     "You are a helpful, precise, and concise assistant. "
+     "Always respond in English. "
+     "If you are unsure or lack sufficient information, say so honestly instead of guessing. "
+     "Be concise while still fully addressing the user's request. "
+     "If asked about your identity or background, explain that you are Qwen3-EdgeRazor, a low-bit LLM trained using the EdgeRazor framework, supported by LAMDA and Assistant Professor Shao-Qun Zhang, and led by core developer Shu-Hao Zhang.\n\n"
+ )
+
+ # Generation configurations
+ KV_CACHE_TYPE = "q8_0"
+ TEMPERATURE = 0.6
+ MIN_P = 0.00
+ REPEAT_PENALTY = 1.0
+ PRESENCE_PENALTY = 1.5
+ TOP_K = 20
+ TOP_P = 0.95
+
+ # Context parameters
+ N_CTX = 1024
+ MAX_TOKENS = 384
+ FLASH_ATTN = True
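
Because `model_zoo` spans two Hub repos with near-identical entries, copy-paste slips between them are easy to make. The sketch below (assuming `config.py` is on the import path) verifies that every registered `model_file` actually exists in its `repo_id` without downloading anything; it would also confirm the `TQ2_0`/`Q4_0` file naming assumed for the 0.6B repo.

```python
# Verify each model_zoo entry resolves on the Hugging Face Hub.
from huggingface_hub import HfApi

from config import model_zoo

api = HfApi()
for name, cfg in model_zoo.items():
    files = api.list_repo_files(cfg["repo_id"])  # metadata only, no download
    status = "ok" if cfg["model_file"] in files else "MISSING"
    print(f"{name}: {cfg['repo_id']}/{cfg['model_file']} [{status}]")
```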
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ # llama_cpp_python==0.3.16  # For local development
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.21/llama_cpp_python-0.3.21-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl  # For Hugging Face Space
+ huggingface_hub>=0.20.0
style.css ADDED
@@ -0,0 +1,31 @@
+ /* Logo: natural size, left-aligned, no border/background */
+ .logo-wrap {
+   display: flex !important;
+   justify-content: flex-start !important;
+   padding: 8px 0 4px 0 !important;
+   background: none !important;
+   border: none !important;
+   box-shadow: none !important;
+ }
+ .logo-wrap img {
+   height: 64px !important;
+   width: auto !important;
+   object-fit: contain !important;
+   border-radius: 0 !important;
+ }
+ /* Hide Gradio image toolbar buttons */
+ .logo-wrap .icon-button-wrapper,
+ .logo-wrap .download-button {
+   display: none !important;
+ }
+ /* Header text: left-aligned */
+ .header-md {
+   text-align: left !important;
+   margin-bottom: 12px !important;
+ }
+ /* Efficiency metrics line: render inline code without padding/background */
+ code {
+   padding: 0 !important;
+   background: none !important;
+   border: none !important;
+ }