zhsh17 committed on
Commit eb7a0c7 · 1 Parent(s): 5be876f

v0.0.1: Add EdgeRazor Playground

Files changed (5)
  1. README.md +17 -7
  2. app.py +325 -0
  3. config.py +61 -0
  4. requirements.txt +3 -0
  5. style.css +31 -0
README.md CHANGED
@@ -1,14 +1,24 @@
  ---
- title: EdgeRazor PlayGround
- emoji: 🦀
- colorFrom: red
+ title: EdgeRazor Playground
+ emoji: 🚀
+ colorFrom: blue
  colorTo: gray
  sdk: gradio
- sdk_version: 6.13.0
+ sdk_version: 6.5.1
+ python_version: 3.12.2
  app_file: app.py
- pinned: false
+ pinned: true
  license: apache-2.0
- short_description: EdgeRazor Playground for low-bit LLMs! CPU-friendly!🚀
+ short_description: EdgeRazor Playground for low-bit LLMs! CPU-friendly! 🚀
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## EdgeRazor Playground
+
+ A CPU-friendly chatbot powered by **[Qwen3-EdgeRazor-nbit](https://huggingface.co/collections/zhangsq-nju/edgerazor-nbit)**, running locally via [llama.cpp](https://github.com/ggerganov/llama.cpp). Displays real-time efficiency metrics (output tokens, elapsed time, decoding throughput) for each turn.
+
+ ## Dependencies
+
+ - [llama-cpp-python](https://abetlen.github.io/llama-cpp-python/whl/cpu/llama-cpp-python)
+ - Qwen3-EdgeRazor-nbit GGUF files:
+   - [Qwen3-0.6B-EdgeRazor-GGUF](https://huggingface.co/zhangsq-nju/Qwen3-0.6B-EdgeRazor-GGUF)
+   - [Qwen3-1.7B-EdgeRazor-GGUF](https://huggingface.co/zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF)
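
For reference, everything the Space does in a turn boils down to a short llama-cpp-python loop: fetch a GGUF from the Hub, load it CPU-only, stream a chat completion, and divide streamed tokens by wall-clock time. A minimal sketch (the repo and file name match the entries `config.py` registers below; counting one token per streamed chunk is an approximation):

```python
# Minimal CPU-only loop mirroring what the Space does for one turn:
# fetch a GGUF from the Hub, load it, stream a reply, report throughput.
import time

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

path = hf_hub_download(
    repo_id="zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF",
    filename="Qwen3-1.7B-EdgeRazor-TQ2_0.gguf",
)
llm = Llama(model_path=path, n_ctx=1024, verbose=False)

t0 = time.perf_counter()
n_tokens = 0
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    max_tokens=128,
    stream=True,
):
    delta = chunk["choices"][0]["delta"].get("content") or ""
    if delta:
        n_tokens += 1  # one streamed chunk ~ one decoded token
        print(delta, end="", flush=True)

dt = time.perf_counter() - t0
print(f"\n{n_tokens} tokens in {dt:.1f}s ({n_tokens / dt:.1f} t/s)")
```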
app.py ADDED
@@ -0,0 +1,332 @@
+ import os
+ import time
+
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+
+ from config import (
+     FLASH_ATTN,
+     KV_CACHE_TYPE,
+     MAX_TOKENS,
+     MIN_P,
+     N_CTX,
+     PRESENCE_PENALTY,
+     REPEAT_PENALTY,
+     TEMPERATURE,
+     TOP_K,
+     TOP_P,
+     header_info,
+     model_zoo,
+     system_prompt,
+ )
+
+ # ──────────────────────────── Constants ───────────────────────────────
+
+ # Values mirror llama.cpp's ggml_type enum (slots 4-5 were the removed q4_2/q4_3).
+ _KV_TYPE: dict[str, int] = {
+     "f32": 0,
+     "f16": 1,
+     "q4_0": 2,
+     "q4_1": 3,
+     "q5_0": 6,
+     "q5_1": 7,
+     "q8_0": 8,
+ }
+
+ _THINK_OPEN = "<think>"
+ _THINK_CLOSE = "</think>"
+ _METRICS_SEP = "\n"
+
+ N_CPU = os.cpu_count() or 4
+ N_PHYS = max(1, N_CPU // 2)
+
+ _DEFAULT_MODEL = next(iter(model_zoo))
+ _loaded: dict[str, Llama] = {}
+
+
+ # ──────────────────────────── Think stripping ─────────────────────────
+
+
+ class ThinkStripper:
+     """Streaming filter that removes <think>…</think> blocks."""
+
+     def __init__(self) -> None:
+         self.in_think = False
+         self.buf = ""
+
+     def feed(self, text: str) -> str:
+         self.buf += text
+         out: list[str] = []
+
+         while self.buf:
+             if self.in_think:
+                 end = self.buf.find(_THINK_CLOSE)
+                 if end == -1:
+                     self.buf = ""
+                     break
+                 self.buf = self.buf[end + len(_THINK_CLOSE) :]
+                 self.in_think = False
+                 continue
+
+             start = self.buf.find(_THINK_OPEN)
+             end = self.buf.find(_THINK_CLOSE)
+
+             if start == -1 and end == -1:
+                 out.append(self.buf)
+                 self.buf = ""
+             elif start == -1:
+                 out.append(self.buf[:end])
+                 self.buf = self.buf[end + len(_THINK_CLOSE) :]
+             else:
+                 out.append(self.buf[:start])
+                 self.buf = self.buf[start + len(_THINK_OPEN) :]
+                 self.in_think = True
+
+         return "".join(out)
+
+
+ # ──────────────────────────── Model loading ───────────────────────────
+
+
+ def _load_model(name: str) -> Llama:
+     cfg = model_zoo[name]
+     path = hf_hub_download(repo_id=cfg["repo_id"], filename=cfg["model_file"])
+
+     base = dict(
+         model_path=path,
+         n_ctx=N_CTX,
+         n_batch=1024,
+         n_ubatch=1024,
+         n_threads=N_PHYS,
+         n_threads_batch=N_CPU,
+         flash_attn=bool(FLASH_ATTN),
+         use_mmap=True,
+         use_mlock=False,
+         verbose=False,
+     )
+
+     kv = _KV_TYPE.get(KV_CACHE_TYPE)
+     try:
+         model = Llama(**base, type_k=kv, type_v=kv)
+         print(f"KV cache type: {KV_CACHE_TYPE}")
+     except ValueError:
+         print(f"KV cache '{KV_CACHE_TYPE}' unsupported on this backend, using default.")
+         model = Llama(**base)
+     return model
+
+
+ print(f"Loading {_DEFAULT_MODEL} …")
+ _loaded[_DEFAULT_MODEL] = _load_model(_DEFAULT_MODEL)
+ print("Model ready.")
+
+
+ # ──────────────────────────── History helpers ─────────────────────────
+
+
+ def _to_str(content) -> str:
+     if isinstance(content, str):
+         return content
+     if isinstance(content, list):
+         return " ".join(b.get("text", "") for b in content if isinstance(b, dict))
+     return str(content)
+
+
+ def _strip_think(text: str) -> str:
+     # Callers always pass the cumulative raw text, so a fresh stripper per
+     # call is correct and avoids state leaking between calls or turns.
+     return ThinkStripper().feed(text)
+
+
+ def _strip_metrics(text: str) -> str:
+     """Drop the trailing metrics line we appended to assistant messages."""
+     # Drop only the *last* line, and only when it is the metrics line;
+     # splitting on the first newline would truncate multi-line replies.
+     head, sep, tail = text.rpartition(_METRICS_SEP)
+     if sep and tail.startswith("`✏️"):
+         return head
+     return text
+
+
+ def _display_content(turn: dict) -> str:
+     """User-visible content (without metrics line) of a history turn."""
+     return _strip_metrics(_to_str(turn.get("content", "")))
+
+
+ def _pick_feed_content(disp_turn: dict, raw_turn: dict | None) -> str:
+     """
+     Choose the content to feed back into the model for a given turn.
+
+     Prefer the raw version (which keeps <think>…</think>) so the KV-cache
+     prefix can be reused; if the user clearly edited the message via
+     `editable=True`, fall back to the displayed version instead.
+     """
+     disp = _display_content(disp_turn)
+
+     if not (
+         isinstance(raw_turn, dict) and raw_turn.get("role") == disp_turn.get("role")
+     ):
+         return disp
+
+     raw = _to_str(raw_turn.get("content", ""))
+
+     if disp_turn.get("role") == "assistant":
+         # Displayed ≈ _strip_think(raw); if they match, message wasn't edited.
+         if _strip_think(raw).strip() == disp.strip():
+             return raw
+         return disp
+
+     # User / system messages: raw and displayed should be identical.
+     return raw if raw.strip() == disp.strip() else disp
+
+
+ # ──────────────────────────── Inference ───────────────────────────────
+
+
+ def respond(
+     message: str, history: list[dict], model_name: str, raw_history: list[dict]
+ ):
+     # Lazy-load the requested model.
+     if model_name not in _loaded:
+         print(f"Switching to {model_name} …")
+         _loaded[model_name] = _load_model(model_name)
+         print(f"{model_name} ready.")
+     llm = _loaded[model_name]
+
+     if not isinstance(history, list):
+         history = []
+     if not isinstance(raw_history, list):
+         raw_history = []
+
+     # Build messages from raw history (so the KV prefix can be reused).
+     messages: list[dict] = [{"role": "system", "content": system_prompt}]
+     aligned_raw: list[dict] = []
+     for i, turn in enumerate(history):
+         if not isinstance(turn, dict) or "role" not in turn or "content" not in turn:
+             continue
+         raw_turn = raw_history[i] if i < len(raw_history) else None
+         feed = _pick_feed_content(turn, raw_turn)
+         messages.append({"role": turn["role"], "content": feed})
+         aligned_raw.append({"role": turn["role"], "content": feed})
+     messages.append({"role": "user", "content": message})
+
+     # Stream generation.
+     t_start = time.perf_counter()
+     n_gen = 0
+     raw = ""  # full text incl. <think>
+     prev_visible = ""
+
+     for chunk in llm.create_chat_completion(
+         messages=messages,
+         max_tokens=MAX_TOKENS,
+         temperature=TEMPERATURE,
+         top_p=TOP_P,
+         top_k=TOP_K,
+         repeat_penalty=REPEAT_PENALTY,
+         presence_penalty=PRESENCE_PENALTY,
+         min_p=MIN_P,
+         stream=True,
+     ):
+         delta = chunk["choices"][0]["delta"].get("content") or ""
+         if not delta:
+             continue
+
+         raw += delta
+         n_gen += 1  # one streamed chunk ≈ one decoded token
+         visible = _strip_think(raw)
+         if visible != prev_visible:
+             # raw_history stays unchanged during streaming.
+             yield visible, raw_history
+             prev_visible = visible
+
+     total_time = time.perf_counter() - t_start
+     overall_tps = n_gen / total_time if total_time > 0 else 0.0
+     metrics_line = f"✏️ {n_gen}t | ⏱️ {total_time:.1f}s | 🚀 {overall_tps:.1f}t/s"
+
+     # Rebuild raw_history to match what Gradio will store after this turn.
+     new_raw_history = [
+         *aligned_raw,
+         {"role": "user", "content": message},
+         {"role": "assistant", "content": raw},
+     ]
+
+     response = _strip_think(raw)
+     yield f"{response}{_METRICS_SEP}`{metrics_line}`", new_raw_history
+
+
+ # ──────────────────────────── UI ──────────────────────────────────────
+
+ with open("./style.css") as f:
+     CSS = f.read()
+
+ with gr.Blocks(title="EdgeRazor Playground") as demo:
+     gr.Image(
+         value="https://raw.githubusercontent.com/zhangsq-nju/EdgeRazor/main/asset/Logo-full.png",
+         show_label=False,
+         container=False,
+         interactive=False,
+         elem_classes=["logo-wrap"],
+     )
+     gr.Markdown(header_info, elem_classes=["header-md"])
+
+     current_model = gr.State(_DEFAULT_MODEL)
+     raw_history_state = gr.State([])  # raw history with <think> blocks
+
+     with gr.Row():
+         model_dd = gr.Dropdown(
+             choices=list(model_zoo.keys()),
+             value=_DEFAULT_MODEL,
+             label="Model",
+             interactive=True,
+             elem_id="model-selector",
+         )
+
+     chat_iface = gr.ChatInterface(
+         fn=respond,
+         additional_inputs=[current_model, raw_history_state],
+         additional_outputs=[raw_history_state],
+         additional_inputs_accordion=gr.Accordion(label="", open=False, visible=False),
+         editable=True,
+         chatbot=gr.Chatbot(label="", height=480),
+     )
+
+     def _on_model_change(new_model, cur_model, history):
+         # Switching model invalidates raw history; reset chat alongside it.
+         # Re-selecting the same model keeps the conversation intact.
+         if new_model == cur_model:
+             safe_history = history if isinstance(history, list) else []
+             return (
+                 cur_model,
+                 gr.update(value=cur_model),
+                 safe_history,
+                 safe_history,
+                 [],
+             )
+         return (
+             new_model,
+             gr.update(value=new_model),
+             [],
+             [],
+             [],
+         )
+
+     model_dd.change(
+         fn=_on_model_change,
+         inputs=[model_dd, current_model, chat_iface.chatbot_state],
+         outputs=[
+             current_model,
+             model_dd,
+             chat_iface.chatbot,
+             chat_iface.chatbot_state,
+             raw_history_state,
+         ],
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch(
+         css=CSS,
+         server_name="0.0.0.0",
+         server_port=7860,
+         ssr_mode=False,
+     )
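
The subtlest piece of `app.py` is `ThinkStripper`, which must hide `<think>…</think>` reasoning even when a tag arrives split across streamed chunks; `respond` always feeds the cumulative raw text through a fresh instance. A quick standalone trace of that behavior (a sketch: it assumes the `ThinkStripper` class above has been copied into a scratch script, since importing `app.py` itself would trigger the model download):

```python
# ThinkStripper as defined in app.py above, pasted into this scratch file.
chunks = ["<thi", "nk>secret reason", "ing</think>Paris", " is the answer."]
raw = ""
for chunk in chunks:
    raw += chunk  # respond() accumulates deltas the same way
    print(repr(ThinkStripper().feed(raw)))
# Prints: '<thi', '', 'Paris', 'Paris is the answer.'
# A dangling partial tag ("<thi") is emitted once, then replaced by the next
# streaming update, so at worst the UI flashes it briefly.
```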
config.py ADDED
@@ -0,0 +1,61 @@
+ from collections import OrderedDict
+
+ # Header information to present on the page
+ header_info = "Lightweight LLMs on CPU. Check our [Hugging Face Collection](https://huggingface.co/collections/zhangsq-nju/edgerazor-nbit) and [GitHub](https://github.com/zhangsq-nju/EdgeRazor) for more details."
+
+ # Model zoo
+ model_zoo = OrderedDict(
+     [
+         (
+             "Qwen3-1.7B-EdgeRazor-1.58bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-1.7B-EdgeRazor-TQ2_0.gguf",
+             },
+         ),
+         (
+             "Qwen3-1.7B-EdgeRazor-4bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-1.7B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-1.7B-EdgeRazor-Q4_0.gguf",
+             },
+         ),
+         (
+             "Qwen3-0.6B-EdgeRazor-1.58bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-0.6B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-0.6B-EdgeRazor-TQ2_0.gguf",
+             },
+         ),
+         (
+             "Qwen3-0.6B-EdgeRazor-4bit",
+             {
+                 "repo_id": "zhangsq-nju/Qwen3-0.6B-EdgeRazor-GGUF",
+                 "model_file": "Qwen3-0.6B-EdgeRazor-Q4_0.gguf",
+             },
+         ),
+     ]
+ )
+
+ # System prompt for the model
+ system_prompt = (
+     "You are a helpful, precise, and concise assistant. "
+     "Always respond in English. "
+     "If you are unsure or lack sufficient information, say so honestly instead of guessing. "
+     "Be concise while still fully addressing the user's request. "
+     "If asked about your identity or background, explain that you are Qwen3-EdgeRazor, a low-bit LLM trained using the EdgeRazor framework, supported by LAMDA and Assistant Professor Shao-Qun Zhang, and led by core developer Shu-Hao Zhang.\n\n"
+ )
+
+ # Generation configurations
+ KV_CACHE_TYPE = "q8_0"
+ TEMPERATURE = 0.6
+ MIN_P = 0.00
+ REPEAT_PENALTY = 1.0
+ PRESENCE_PENALTY = 1.5
+ TOP_K = 20
+ TOP_P = 0.95
+
+ # Context parameters
+ N_CTX = 1024
+ MAX_TOKENS = 384
+ FLASH_ATTN = True
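
Because `model_zoo` spans two Hub repos with near-identical entries, copy-paste slips between them are easy to make. The sketch below (assuming `config.py` is on the import path) verifies that every registered `model_file` actually exists in its `repo_id` without downloading anything; it would also confirm the `TQ2_0`/`Q4_0` file naming assumed for the 0.6B repo.

```python
# Verify each model_zoo entry resolves on the Hugging Face Hub.
from huggingface_hub import HfApi

from config import model_zoo

api = HfApi()
for name, cfg in model_zoo.items():
    files = api.list_repo_files(cfg["repo_id"])  # metadata only, no download
    status = "ok" if cfg["model_file"] in files else "MISSING"
    print(f"{name}: {cfg['repo_id']}/{cfg['model_file']} [{status}]")
```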
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ # llama_cpp_python==0.3.16  # For local development
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.21/llama_cpp_python-0.3.21-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl  # For Hugging Face Space
+ huggingface_hub>=0.20.0
style.css ADDED
@@ -0,0 +1,31 @@
+ /* Logo: natural size, left-aligned, no border/background */
+ .logo-wrap {
+   display: flex !important;
+   justify-content: flex-start !important;
+   padding: 8px 0 4px 0 !important;
+   background: none !important;
+   border: none !important;
+   box-shadow: none !important;
+ }
+ .logo-wrap img {
+   height: 64px !important;
+   width: auto !important;
+   object-fit: contain !important;
+   border-radius: 0 !important;
+ }
+ /* Hide Gradio image toolbar buttons */
+ .logo-wrap .icon-button-wrapper,
+ .logo-wrap .download-button {
+   display: none !important;
+ }
+ /* Header text: left-aligned */
+ .header-md {
+   text-align: left !important;
+   margin-bottom: 12px !important;
+ }
+ /* Efficiency metrics line: render inline code without padding/background */
+ code {
+   padding: 0 !important;
+   background: none !important;
+   border: none !important;
+ }