knoxel committed
Commit e17101b · verified · 1 Parent(s): 032479b

Upload app.py

Files changed (1)
app.py +258 -0
app.py ADDED
@@ -0,0 +1,258 @@
+ """
+ BitNet b1.58 2B4T - CPU-Only Inference Explorer (bitnet.cpp Edition)
+ ====================================================================
+ Powered by bitnet.cpp's optimized ternary kernels for 4-10x faster inference.
+ Uses llama-server with an OpenAI-compatible API for streaming generation.
+
+ Paper: https://arxiv.org/abs/2504.12285
+ Model: https://huggingface.co/microsoft/bitnet-b1.58-2B-4T
+ """
+
+ import os
+ import time
+ import psutil
+ import gradio as gr
+ from openai import OpenAI
+
+ # ─── Configuration ───────────────────────────────────────────────────────────
+ SERVER_URL = "http://127.0.0.1:8080/v1"
+ MODEL_NAME = "bitnet-b1.58-2B-4T"
+
+ # Connect to local llama-server
+ client = OpenAI(base_url=SERVER_URL, api_key="bitnet")
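+ # NOTE (illustrative, not part of the app's logic): the client above assumes a
+ # bitnet.cpp llama-server is already listening on port 8080 with the GGUF model
+ # loaded. A typical launch from a bitnet.cpp checkout looks roughly like the
+ # following; the binary location, model path, and thread count are placeholders
+ # to adapt to your build:
+ #
+ #   ./build/bin/llama-server -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf \
+ #       --port 8080 -c 4096 -t 4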
+
+ # ─── System Info ─────────────────────────────────────────────────────────────
+ cpu_count = psutil.cpu_count(logical=True)
+ total_ram = psutil.virtual_memory().total / 1024**3
+ proc = psutil.Process(os.getpid())
+
+
+ def get_system_info():
+     mem = proc.memory_info().rss / 1024**3
+     return f"""### System
+ | Metric | Value |
+ |---|---|
+ | CPU cores | {cpu_count} |
+ | Total RAM | {total_ram:.1f} GB |
+ | Process RSS | {mem:.2f} GB |
+ | Inference engine | bitnet.cpp (I2_S kernel) |
+ | Weights | 1.58-bit ternary ({{-1, 0, +1}}) |
+ | Activations | 8-bit integer |
+ | Context | 4096 tokens |
+ | Backend | llama-server (OpenAI API) |
+ """
+
+
+ # ─── Paper benchmark table ───────────────────────────────────────────────────
+ PAPER_TABLE = """### Published Benchmarks (from the paper)
+
+ | Benchmark | LLaMA 3.2 1B | Gemma-3 1B | Qwen2.5 1.5B | SmolLM2 1.7B | **BitNet 2B** |
+ |---|---|---|---|---|---|
+ | **Memory** | 2 GB | 1.4 GB | 2.6 GB | 3.2 GB | **0.4 GB** |
+ | **CPU Latency** | 48ms | 41ms | 65ms | 67ms | **29ms** |
+ | **Energy/token** | 0.258J | 0.186J | 0.347J | 0.425J | **0.028J** |
+ | ARC-Challenge | 37.8 | 38.4 | 46.7 | 43.5 | **49.9** |
+ | WinoGrande | 59.5 | 58.5 | 62.8 | 69.0 | **71.9** |
+ | GSM8K | 38.2 | 31.2 | 56.8 | 45.1 | **58.4** |
+ | MMLU | 45.6 | 39.9 | **60.3** | 49.2 | 53.2 |
+ | HumanEval+ | 31.1 | 37.2 | **50.6** | 28.0 | 38.4 |
+ | **Average** | 44.9 | 43.7 | **55.2** | 48.7 | 54.2 |
+
+ *BitNet uses 5-13× less memory and 6-9× less energy than comparable models.*
+
+ > ✅ This demo uses **bitnet.cpp** with the optimized I2_S kernel - the same
+ > engine that achieves the 29ms/token latency shown above.
+ """
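+
+ # Back-of-envelope check on the memory row above (illustrative comment only):
+ # ~2e9 ternary weights at 1.58 bits each is about 2e9 * 1.58 / 8 bytes,
+ # i.e. roughly 0.4 GB, which lines up with the paper's figure for BitNet 2B.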
+
+ # ─── Architecture explainer ──────────────────────────────────────────────────
+ ARCHITECTURE_MD = """### How BitNet b1.58 Works
+
+ ```
+ Standard Transformer            →   BitNet b1.58
+ ─────────────────────               ─────────────────
+ FP16/BF16 weights (16 bits)     →   Ternary weights: {-1, 0, +1} (1.58 bits)
+ FP16 activations                →   INT8 activations (absmax per-token)
+ nn.Linear                       →   BitLinear (absmean quantization)
+ SwiGLU activation               →   Squared ReLU (ReLU²)
+ LayerNorm                       →   SubLN normalization
+ Standard MatMul                 →   Additions only (no multiplications!)
+ ```
+
+ **Key Insight:** Since weights are only -1, 0, or +1, matrix multiplication
+ becomes pure addition/subtraction. This is why CPUs can run BitNet models
+ so efficiently - you don't need floating-point multiply hardware at all.
+
+ **bitnet.cpp Kernels:**
+ - **I2_S** (Int2 with Scale): MAD-based, lossless, 2 bits/weight storage
+ - **TL1/TL2** (Ternary Lookup): LUT-based, lossless, sub-2-bit storage
+ - Both achieve **4-6× speedup** over FP16 llama.cpp on the same CPU
+
+ **Training:** The model was trained **from scratch** with this quantization,
+ not post-training quantized. This is crucial - native 1-bit training preserves
+ quality far better than quantizing a pre-trained FP16 model down to 1-bit.
+
+ **3-Stage Training Pipeline:**
+ 1. **Pre-training** on 4T tokens (text, code, synthetic math)
+ 2. **SFT** on instruction-following datasets
+ 3. **DPO** for alignment with human preferences
+ """
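+
+
+ # Illustrative sketch (not used anywhere in this app): a plain-Python picture of
+ # the "Key Insight" above. With weights restricted to {-1, 0, +1}, each output of
+ # a matrix-vector product is just a signed sum of activations, so no multiplies
+ # are needed; absmean_ternarize mirrors the absmean quantization described for
+ # BitLinear, and pack_i2 sketches the 2-bits/weight storage idea behind I2_S
+ # (the actual bitnet.cpp layout may differ). Function names here are
+ # hypothetical, not bitnet.cpp APIs.
+ def ternary_matvec(weights, activations):
+     """y[i] = sum_j w[i][j] * x[j] with w in {-1, 0, +1}: add/subtract only."""
+     out = []
+     for row in weights:
+         acc = 0.0
+         for w, x in zip(row, activations):
+             if w == 1:
+                 acc += x      # +1 weight: add the activation
+             elif w == -1:
+                 acc -= x      # -1 weight: subtract it
+             # 0 weight: contributes nothing (free sparsity)
+         out.append(acc)
+     return out
+
+
+ def absmean_ternarize(row, eps=1e-8):
+     """Round weights to {-1, 0, +1} using the row's mean absolute value as scale."""
+     scale = sum(abs(w) for w in row) / max(len(row), 1) + eps
+     return [max(-1, min(1, round(w / scale))) for w in row], scale
+
+
+ def pack_i2(ternary_weights):
+     """Pack 4 ternary weights per byte (2 bits each) to show the storage cost."""
+     packed = bytearray()
+     for i in range(0, len(ternary_weights), 4):
+         b = 0
+         for j, w in enumerate(ternary_weights[i:i + 4]):
+             b |= ((w + 1) & 0b11) << (2 * j)   # map {-1, 0, +1} -> {0, 1, 2}
+         packed.append(b)
+     return bytes(packed)
+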
+
+ # ─── Generation functions ────────────────────────────────────────────────────
+
+ def chat_respond(message, history, system_prompt, max_new_tokens, temperature, top_p):
+     """Streaming chat via bitnet.cpp llama-server."""
+     messages = [{"role": "system", "content": system_prompt}]
+     for item in history:
+         # History arrives as openai-style dicts (ChatInterface type="messages");
+         # forward only the role/content keys the server expects.
+         messages.append({"role": item["role"], "content": item["content"]})
+     messages.append({"role": "user", "content": message})
+
+     t0 = time.perf_counter()
+     tok_count = 0
+     response = ""
+
+     try:
+         stream = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=messages,
+             max_tokens=int(max_new_tokens),
+             temperature=float(temperature) if temperature > 0 else 0.0,
+             top_p=float(top_p),
+             stream=True,
+         )
+
+         for chunk in stream:
+             if chunk.choices[0].delta.content:
+                 token_text = chunk.choices[0].delta.content
+                 response += token_text
+                 tok_count += 1
+                 elapsed = time.perf_counter() - t0
+                 tps = tok_count / elapsed if elapsed > 0 else 0
+                 stats = f"\n\n---\n*⚡ {tok_count} tokens · {tps:.1f} tok/s · {elapsed:.1f}s · bitnet.cpp I2_S*"
+                 yield response + stats
+
+     except Exception as e:
+         yield f"**Error:** {str(e)}\n\nIs the llama-server running on port 8080?"
+
+
+ def single_benchmark(prompt, max_new_tokens):
+     """Run a single non-streaming generation with detailed stats."""
+     messages = [
+         {"role": "system", "content": "You are a helpful AI assistant."},
+         {"role": "user", "content": prompt},
+     ]
+
+     t0 = time.perf_counter()
+     try:
+         completion = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=messages,
+             max_tokens=int(max_new_tokens),
+             temperature=0.0,
+             stream=False,
+         )
+         elapsed = time.perf_counter() - t0
+
+         response = completion.choices[0].message.content
+         n_generated = completion.usage.completion_tokens if completion.usage else len(response.split())
+         n_input = completion.usage.prompt_tokens if completion.usage else 0
+         tps = n_generated / elapsed if elapsed > 0 else 0
+
+         stats_md = f"""### ⚡ Benchmark Results (bitnet.cpp I2_S kernel)
+
+ | Metric | Value |
+ |---|---|
+ | Input tokens | {n_input} |
+ | Output tokens | {n_generated} |
+ | Total time | {elapsed:.2f}s |
+ | **Tokens/sec** | **{tps:.2f}** |
+ | Avg ms/token | {(elapsed / max(n_generated, 1) * 1000):.1f}ms |
+ | Engine | bitnet.cpp (lossless) |
+ | Kernel | I2_S (MAD-based) |
+ """
+         return response, stats_md
+
+     except Exception as e:
+         return f"Error: {str(e)}", "Server not responding"
+
+
+ # ─── Build Gradio UI ─────────────────────────────────────────────────────────
+
+ HEADER = """# 🧬 BitNet b1.58 2B4T - CPU-Only Inference Explorer
+
+ **The first open-source native 1-bit LLM at the 2B-parameter scale**, from Microsoft Research - powered by **bitnet.cpp** optimized kernels.
+
+ | | |
+ |---|---|
+ | 📄 [Paper](https://arxiv.org/abs/2504.12285) | 🤗 [Model](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T) |
+ | 💻 [bitnet.cpp](https://github.com/microsoft/BitNet) (38K+ ⭐) | ⚡ Ternary I2_S kernel · ~10 tok/s on CPU |
+ """
+
+ with gr.Blocks(
+     title="BitNet b1.58 2B4T - CPU Inference Explorer",
+     theme=gr.themes.Soft(),  # theme belongs on Blocks; launch() has no theme argument
+ ) as demo:
+
+     gr.Markdown(HEADER)
+
+     with gr.Tabs():
+         # ── Tab 1: Chat ──────────────────────────────────────────────────
+         with gr.Tab("💬 Chat", id="chat"):
+             chat = gr.ChatInterface(
+                 fn=chat_respond,
+                 type="messages",  # pass history as role/content dicts, as chat_respond expects
+                 description="Chat with BitNet b1.58 via bitnet.cpp on CPU. Live token/sec stats shown after each response.",
+                 additional_inputs=[
+                     gr.Textbox(
+                         value="You are a helpful, concise AI assistant.",
+                         label="System Prompt",
+                     ),
+                     gr.Slider(1, 2048, value=256, step=1, label="Max New Tokens"),
+                     gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature (0 = greedy)"),
+                     gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
+                 ],
+                 examples=[
+                     ["Explain what a 1-bit LLM is in 3 sentences."],
+                     ["Write a Python function to find the nth Fibonacci number."],
+                     ["What are the pros and cons of running AI on CPUs vs GPUs?"],
+                     ["Solve: If 3x + 7 = 22, what is x?"],
+                 ],
+                 cache_examples=False,
+             )
+
+         # ── Tab 2: Benchmark ─────────────────────────────────────────────
+         with gr.Tab("📊 Benchmark", id="bench"):
+             gr.Markdown("### Run a single-shot benchmark (greedy decoding, bitnet.cpp)")
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     bench_prompt = gr.Textbox(
+                         value="Write a detailed explanation of how transformer neural networks work, covering attention mechanisms, positional encoding, and the training process.",
+                         label="Prompt",
+                         lines=3,
+                     )
+                     bench_tokens = gr.Slider(16, 512, value=128, step=16, label="Max New Tokens")
+                     bench_btn = gr.Button("🚀 Run Benchmark", variant="primary")
+                 with gr.Column(scale=1):
+                     bench_stats = gr.Markdown("*Click 'Run Benchmark' to start*")
+
+             bench_output = gr.Textbox(label="Generated Text", lines=10, interactive=False)
+             bench_btn.click(
+                 fn=single_benchmark,
+                 inputs=[bench_prompt, bench_tokens],
+                 outputs=[bench_output, bench_stats],
+             )
+
+         # ── Tab 3: Paper Results ─────────────────────────────────────────
+         with gr.Tab("📈 Paper Results", id="paper"):
+             gr.Markdown(PAPER_TABLE)
+
+         # ── Tab 4: Architecture ──────────────────────────────────────────
+         with gr.Tab("🏗️ Architecture", id="arch"):
+             gr.Markdown(ARCHITECTURE_MD)
+
+         # ── Tab 5: System Info ───────────────────────────────────────────
+         with gr.Tab("⚙️ System", id="sys"):
+             sys_info = gr.Markdown(get_system_info())
+             refresh_btn = gr.Button("🔄 Refresh")
+             refresh_btn.click(fn=get_system_info, outputs=sys_info)
+
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)