"""
BitNet b1.58 2B4T — CPU-Only Inference Explorer (bitnet.cpp Edition)
====================================================================
Powered by bitnet.cpp's optimized ternary kernels (reported roughly 1.4-6x
faster CPU inference than FP16 llama.cpp, depending on the chip).
Uses llama-server with an OpenAI-compatible API for streaming generation.
Paper: https://arxiv.org/abs/2504.12285
Model: https://huggingface.co/microsoft/bitnet-b1.58-2B-4T
"""
import os
import time
import psutil
import gradio as gr
from openai import OpenAI
# ─── Configuration ───────────────────────────────────────────────────────────
SERVER_URL = "http://127.0.0.1:8080/v1"
MODEL_NAME = "bitnet-b1.58-2B-4T"
# Connect to local llama-server
client = OpenAI(base_url=SERVER_URL, api_key="bitnet")
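# The app assumes a bitnet.cpp llama-server is already listening on port 8080,
# launched with something like (exact path/flags depend on your build):
#   llama-server -m ggml-model-i2_s.gguf --port 8080 -c 4096
# Below is a minimal, best-effort reachability probe. It is a sketch, not part
# of the upstream app: it assumes the /health route that llama.cpp's server
# exposes; adjust if your server build differs.
import urllib.request

def server_is_up(timeout: float = 2.0) -> bool:
    """Return True if the local llama-server answers its /health endpoint."""
    try:
        with urllib.request.urlopen("http://127.0.0.1:8080/health", timeout=timeout) as resp:
            return resp.status == 200
    except OSError:
        return False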
# ─── System Info ─────────────────────────────────────────────────────────────
cpu_count = psutil.cpu_count(logical=True)
total_ram = psutil.virtual_memory().total / 1024**3
proc = psutil.Process(os.getpid())
def get_system_info():
mem = proc.memory_info().rss / 1024**3
return f"""### System
| Metric | Value |
|---|---|
| CPU cores | {cpu_count} |
| Total RAM | {total_ram:.1f} GB |
| Process RSS | {mem:.2f} GB |
| Inference engine | bitnet.cpp (I2_S kernel) |
| Weights | 1.58-bit ternary ({{-1, 0, +1}}) |
| Activations | 8-bit integer |
| Context | 4096 tokens |
| Backend | llama-server (OpenAI API) |
"""
# ─── Paper benchmark table ───────────────────────────────────────────────────
PAPER_TABLE = """### Published Benchmarks (from the paper)
| Benchmark | LLaMA 3.2 1B | Gemma-3 1B | Qwen2.5 1.5B | SmolLM2 1.7B | **BitNet 2B** |
|---|---|---|---|---|---|
| **Memory (non-emb.)** | 2 GB | 1.4 GB | 2.6 GB | 3.2 GB | **0.4 GB** |
| **CPU latency/token** | 48ms | 41ms | 65ms | 67ms | **29ms** |
| **Energy/token** | 0.258J | 0.186J | 0.347J | 0.425J | **0.028J** |
| ARC-Challenge | 37.8 | 38.4 | 46.7 | 43.5 | **49.9** |
| WinoGrande | 59.5 | 58.5 | 62.8 | 69.0 | **71.9** |
| GSM8K | 38.2 | 31.2 | 56.8 | 45.1 | **58.4** |
| MMLU | 45.6 | 39.9 | **60.3** | 49.2 | 53.2 |
| HumanEval+ | 31.1 | 37.2 | **50.6** | 28.0 | 38.4 |
| **Average** | 44.9 | 43.7 | **55.2** | 48.7 | 54.2 |
*Per the table above, BitNet uses roughly 3.5-8× less memory and 7-15× less energy per token than these comparably sized models.*
> ✅ This demo uses **bitnet.cpp** with the optimized I2_S kernel — the same
> engine behind the 29ms/token paper result (actual speed depends on the CPU;
> shared virtual CPUs will be slower).
"""
# ─── Architecture explainer ──────────────────────────────────────────────────
ARCHITECTURE_MD = """### How BitNet b1.58 Works
```
Standard Transformer                 →   BitNet b1.58
─────────────────────                    ─────────────
FP16/BF16 weights (16 bits)          →   Ternary weights: {-1, 0, +1} (1.58 bits)
FP16 activations                     →   INT8 activations (absmax per-token)
nn.Linear                            →   BitLinear (absmean quantization)
SwiGLU activation                    →   Squared ReLU (ReLU²)
LayerNorm                            →   SubLN normalization
Standard MatMul                      →   Additions only (no multiplications!)
```
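The two quantizers named above follow the paper's recipe: *absmean* scales
weights by their mean absolute value before rounding to {-1, 0, +1}, and
*absmax* scales each token's activations so the largest magnitude maps to 127.
A minimal NumPy sketch (function names here are illustrative, not from the
released code):
```python
import numpy as np

def absmean_ternarize(W, eps=1e-5):
    # Weights: scale by mean |W|, then round-and-clip into {-1, 0, +1}.
    gamma = np.abs(W).mean() + eps
    return np.clip(np.rint(W / gamma), -1, 1), gamma

def absmax_int8(x, eps=1e-5):
    # Activations, per token: largest |value| maps to 127, round to int8 range.
    scale = 127.0 / (np.abs(x).max(axis=-1, keepdims=True) + eps)
    return np.clip(np.rint(x * scale), -128, 127), scale
```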
**Key Insight:** Since weights are only -1, 0, or +1, matrix multiplication
reduces to additions and subtractions. This is why CPUs can run BitNet models
so efficiently — the dominant matmul work needs no multiply hardware at all,
as the sketch below illustrates.
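A toy pure-Python version of that idea (illustrative only; the real kernels
pack weights at 2 bits apiece and vectorize heavily):
```python
def ternary_matvec(W, x):
    """y = W @ x where every W[i][j] is -1, 0, or +1: no multiplies needed."""
    y = []
    for row in W:
        acc = 0.0
        for w, a in zip(row, x):
            if w == 1:
                acc += a       # +1 contributes the activation itself
            elif w == -1:
                acc -= a       # -1 contributes its negation
            # w == 0 contributes nothing and is skipped
        y.append(acc)
    return y

assert ternary_matvec([[1, -1, 0]], [2.0, 0.5, 9.0]) == [1.5]
```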
**bitnet.cpp Kernels:**
- **I2_S** (Int2 with Scale): MAD-based, lossless, 2 bits/weight storage
- **TL1/TL2** (Ternary Lookup): LUT-based, lossless, sub-2-bit storage (toy sketch below)
- Reported speedups over FP16 llama.cpp on the same CPU: roughly 1.4-5.1× on ARM and 2.4-6.2× on x86
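The lookup-table idea behind TL1/TL2, in miniature (a sketch only; names are
hypothetical, and the real kernels use packed indices plus SIMD gathers):
```python
TERNARY = (-1, 0, +1)

def pair_lut(a0, a1):
    # All 9 partial sums one ternary weight pair can produce for (a0, a1);
    # built once per activation pair, then reused across every output row.
    return [w0 * a0 + w1 * a1 for w0 in TERNARY for w1 in TERNARY]

def lut_dot(codes, x):
    """Dot product where codes[k] in 0..8 encodes the k-th ternary weight pair."""
    return sum(pair_lut(x[2 * k], x[2 * k + 1])[code]
               for k, code in enumerate(codes))

# Weight pairs (+1, -1) and (0, +1) encode to 3*2+0 = 6 and 3*1+2 = 5.
assert lut_dot([6, 5], [2, 3, 4, 5]) == (2 - 3) + 5
```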
**Training:** The model was trained **from scratch** with this quantization,
not post-training quantized. This is crucial — native 1-bit training preserves
quality far better than quantizing a pre-trained FP16 model down to 1-bit.
**3-Stage Training Pipeline:**
1. **Pre-training** on 4T tokens (text, code, synthetic math)
2. **SFT** on instruction-following datasets
3. **DPO** for alignment with human preferences
"""
# ─── Generation functions ────────────────────────────────────────────────────
def chat_respond(message, history, system_prompt, max_new_tokens, temperature, top_p):
"""Streaming chat via bitnet.cpp llama-server."""
messages = [{"role": "system", "content": system_prompt}]
    for item in history:
        # With type="messages", each history item is a {"role", "content"} dict;
        # forward only those keys (Gradio may attach extras such as metadata).
        messages.append({"role": item["role"], "content": item["content"]})
messages.append({"role": "user", "content": message})
t0 = time.perf_counter()
tok_count = 0
response = ""
try:
stream = client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=int(max_new_tokens),
            temperature=float(temperature),  # 0.0 gives greedy decoding
top_p=float(top_p),
stream=True,
)
        for chunk in stream:
            # Skip keep-alive/final chunks that carry no delta content;
            # one streamed chunk is roughly one token.
            if chunk.choices and chunk.choices[0].delta.content:
                token_text = chunk.choices[0].delta.content
                response += token_text
                tok_count += 1
elapsed = time.perf_counter() - t0
tps = tok_count / elapsed if elapsed > 0 else 0
            stats = f"\n\n---\n*⚡ {tok_count} tokens · {tps:.1f} tok/s · {elapsed:.1f}s · bitnet.cpp I2_S*"
yield response + stats
except Exception as e:
yield f"**Error:** {str(e)}\n\nIs the llama-server running on port 8080?"
def single_benchmark(prompt, max_new_tokens):
"""Run a single non-streaming generation with detailed stats."""
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": prompt},
]
t0 = time.perf_counter()
try:
completion = client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=int(max_new_tokens),
temperature=0.0,
stream=False,
)
elapsed = time.perf_counter() - t0
        response = completion.choices[0].message.content or ""
        # Prefer server-reported token counts; fall back to a rough word count.
        n_generated = completion.usage.completion_tokens if completion.usage else len(response.split())
n_input = completion.usage.prompt_tokens if completion.usage else 0
tps = n_generated / elapsed if elapsed > 0 else 0
        stats_md = f"""### ⚡ Benchmark Results (bitnet.cpp I2_S kernel)
| Metric | Value |
|---|---|
| Input tokens | {n_input} |
| Output tokens | {n_generated} |
| Total time | {elapsed:.2f}s |
| **Tokens/sec** | **{tps:.2f}** |
| Avg ms/token | {(elapsed/max(n_generated,1)*1000):.1f}ms |
| Engine | bitnet.cpp (lossless) |
| Kernel | I2_S (MAD-based) |
"""
return response, stats_md
except Exception as e:
return f"Error: {str(e)}", "Server not responding"
# ─── Build Gradio UI ─────────────────────────────────────────────────────────
HEADER = """# 🧬 BitNet b1.58 2B4T — CPU-Only Inference Explorer
**The first open-source native 1-bit LLM at the 2B-parameter scale** by Microsoft Research — powered by **bitnet.cpp** optimized kernels.
| | |
|---|---|
| 📄 [Paper](https://arxiv.org/abs/2504.12285) | 🤗 [Model](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T) |
| 💻 [bitnet.cpp](https://github.com/microsoft/BitNet) (38K+ ⭐) | ⚡ Ternary I2_S kernel · ~10 tok/s on CPU |
"""
with gr.Blocks(
    title="BitNet b1.58 2B4T — CPU Inference Explorer",
    theme=gr.themes.Soft(),  # theme is a Blocks argument, not a launch() one
) as demo:
gr.Markdown(HEADER)
with gr.Tabs():
# ── Tab 1: Chat ──────────────────────────────────────────────────
        with gr.Tab("💬 Chat", id="chat"):
            chat = gr.ChatInterface(
                fn=chat_respond,
                type="messages",  # history arrives as OpenAI-style role/content dicts
description="Chat with BitNet b1.58 via bitnet.cpp on CPU. Live token/sec stats shown after each response.",
additional_inputs=[
gr.Textbox(
value="You are a helpful, concise AI assistant.",
label="System Prompt",
),
gr.Slider(1, 2048, value=256, step=1, label="Max New Tokens"),
gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature (0 = greedy)"),
gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
],
examples=[
["Explain what a 1-bit LLM is in 3 sentences."],
["Write a Python function to find the nth Fibonacci number."],
["What are the pros and cons of running AI on CPUs vs GPUs?"],
["Solve: If 3x + 7 = 22, what is x?"],
],
cache_examples=False,
)
# ── Tab 2: Benchmark ─────────────────────────────────────────────
        with gr.Tab("📊 Benchmark", id="bench"):
gr.Markdown("### Run a single-shot benchmark (greedy decoding, bitnet.cpp)")
with gr.Row():
with gr.Column(scale=2):
bench_prompt = gr.Textbox(
value="Write a detailed explanation of how transformer neural networks work, covering attention mechanisms, positional encoding, and the training process.",
label="Prompt",
lines=3,
)
bench_tokens = gr.Slider(16, 512, value=128, step=16, label="Max New Tokens")
                    bench_btn = gr.Button("🚀 Run Benchmark", variant="primary")
with gr.Column(scale=1):
bench_stats = gr.Markdown("*Click 'Run Benchmark' to start*")
bench_output = gr.Textbox(label="Generated Text", lines=10, interactive=False)
bench_btn.click(
fn=single_benchmark,
inputs=[bench_prompt, bench_tokens],
outputs=[bench_output, bench_stats],
)
# ── Tab 3: Paper Results ─────────────────────────────────────────
        with gr.Tab("📈 Paper Results", id="paper"):
gr.Markdown(PAPER_TABLE)
# ── Tab 4: Architecture ──────────────────────────────────────────
        with gr.Tab("🏗️ Architecture", id="arch"):
gr.Markdown(ARCHITECTURE_MD)
# ── Tab 5: System Info ───────────────────────────────────────────
        with gr.Tab("⚙️ System", id="sys"):
sys_info = gr.Markdown(get_system_info())
            refresh_btn = gr.Button("🔄 Refresh")
refresh_btn.click(fn=get_system_info, outputs=sys_info)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)