"""
BitNet b1.58 2B4T — CPU-Only Inference Explorer (bitnet.cpp Edition)
====================================================================
Powered by bitnet.cpp's optimized ternary kernels (reported roughly 1.4-6x
faster CPU inference than FP16 llama.cpp, depending on the chip).
Uses llama-server with an OpenAI-compatible API for streaming generation.
Paper: https://arxiv.org/abs/2504.12285
Model: https://huggingface.co/microsoft/bitnet-b1.58-2B-4T
"""
import os
import time
import psutil
import gradio as gr
from openai import OpenAI
# ─── Configuration ───────────────────────────────────────────────────────────
SERVER_URL = "http://127.0.0.1:8080/v1"
MODEL_NAME = "bitnet-b1.58-2B-4T"
# Connect to local llama-server
client = OpenAI(base_url=SERVER_URL, api_key="bitnet")
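# The app assumes a bitnet.cpp llama-server is already listening on port 8080,
# launched with something like (exact path/flags depend on your build):
#   llama-server -m ggml-model-i2_s.gguf --port 8080 -c 4096
# Below is a minimal, best-effort reachability probe. It is a sketch, not part
# of the upstream app: it assumes the /health route that llama.cpp's server
# exposes; adjust if your server build differs.
import urllib.request

def server_is_up(timeout: float = 2.0) -> bool:
    """Return True if the local llama-server answers its /health endpoint."""
    try:
        with urllib.request.urlopen("http://127.0.0.1:8080/health", timeout=timeout) as resp:
            return resp.status == 200
    except OSError:
        return False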
# ─── System Info ─────────────────────────────────────────────────────────────
cpu_count = psutil.cpu_count(logical=True)
total_ram = psutil.virtual_memory().total / 1024**3
proc = psutil.Process(os.getpid())
def get_system_info():
mem = proc.memory_info().rss / 1024**3
return f"""### System
| Metric | Value |
|---|---|
| CPU cores | {cpu_count} |
| Total RAM | {total_ram:.1f} GB |
| Process RSS | {mem:.2f} GB |
| Inference engine | bitnet.cpp (I2_S kernel) |
| Weights | 1.58-bit ternary ({{-1, 0, +1}}) |
| Activations | 8-bit integer |
| Context | 4096 tokens |
| Backend | llama-server (OpenAI API) |
"""
# ─── Paper benchmark table ───────────────────────────────────────────────────
PAPER_TABLE = """### Published Benchmarks (from the paper)
| Benchmark | LLaMA 3.2 1B | Gemma-3 1B | Qwen2.5 1.5B | SmolLM2 1.7B | **BitNet 2B** |
|---|---|---|---|---|---|
| **Memory (non-emb.)** | 2 GB | 1.4 GB | 2.6 GB | 3.2 GB | **0.4 GB** |
| **CPU latency/token** | 48ms | 41ms | 65ms | 67ms | **29ms** |
| **Energy/token** | 0.258J | 0.186J | 0.347J | 0.425J | **0.028J** |
| ARC-Challenge | 37.8 | 38.4 | 46.7 | 43.5 | **49.9** |
| WinoGrande | 59.5 | 58.5 | 62.8 | 69.0 | **71.9** |
| GSM8K | 38.2 | 31.2 | 56.8 | 45.1 | **58.4** |
| MMLU | 45.6 | 39.9 | **60.3** | 49.2 | 53.2 |
| HumanEval+ | 31.1 | 37.2 | **50.6** | 28.0 | 38.4 |
| **Average** | 44.9 | 43.7 | **55.2** | 48.7 | 54.2 |
*Per the table above, BitNet uses roughly 3.5-8× less memory and 7-15× less energy per token than these comparably sized models.*
> ✅ This demo uses **bitnet.cpp** with the optimized I2_S kernel — the same
> engine behind the 29ms/token paper result (actual speed depends on the CPU;
> shared virtual CPUs will be slower).
"""
# ─── Architecture explainer ──────────────────────────────────────────────────
ARCHITECTURE_MD = """### How BitNet b1.58 Works
```
Standard Transformer                 →   BitNet b1.58
─────────────────────                    ─────────────
FP16/BF16 weights (16 bits)          →   Ternary weights: {-1, 0, +1} (1.58 bits)
FP16 activations                     →   INT8 activations (absmax per-token)
nn.Linear                            →   BitLinear (absmean quantization)
SwiGLU activation                    →   Squared ReLU (ReLU²)
LayerNorm                            →   SubLN normalization
Standard MatMul                      →   Additions only (no multiplications!)
```
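The two quantizers named above follow the paper's recipe: *absmean* scales
weights by their mean absolute value before rounding to {-1, 0, +1}, and
*absmax* scales each token's activations so the largest magnitude maps to 127.
A minimal NumPy sketch (function names here are illustrative, not from the
released code):
```python
import numpy as np

def absmean_ternarize(W, eps=1e-5):
    # Weights: scale by mean |W|, then round-and-clip into {-1, 0, +1}.
    gamma = np.abs(W).mean() + eps
    return np.clip(np.rint(W / gamma), -1, 1), gamma

def absmax_int8(x, eps=1e-5):
    # Activations, per token: largest |value| maps to 127, round to int8 range.
    scale = 127.0 / (np.abs(x).max(axis=-1, keepdims=True) + eps)
    return np.clip(np.rint(x * scale), -128, 127), scale
```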
**Key Insight:** Since weights are only -1, 0, or +1, matrix multiplication
reduces to additions and subtractions. This is why CPUs can run BitNet models
so efficiently — the dominant matmul work needs no multiply hardware at all,
as the sketch below illustrates.
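A toy pure-Python version of that idea (illustrative only; the real kernels
pack weights at 2 bits apiece and vectorize heavily):
```python
def ternary_matvec(W, x):
    """y = W @ x where every W[i][j] is -1, 0, or +1: no multiplies needed."""
    y = []
    for row in W:
        acc = 0.0
        for w, a in zip(row, x):
            if w == 1:
                acc += a       # +1 contributes the activation itself
            elif w == -1:
                acc -= a       # -1 contributes its negation
            # w == 0 contributes nothing and is skipped
        y.append(acc)
    return y

assert ternary_matvec([[1, -1, 0]], [2.0, 0.5, 9.0]) == [1.5]
```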
**bitnet.cpp Kernels:**
- **I2_S** (Int2 with Scale): MAD-based, lossless, 2 bits/weight storage
- **TL1/TL2** (Ternary Lookup): LUT-based, lossless, sub-2-bit storage (toy sketch below)
- Reported speedups over FP16 llama.cpp on the same CPU: roughly 1.4-5.1× on ARM and 2.4-6.2× on x86
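The lookup-table idea behind TL1/TL2, in miniature (a sketch only; names are
hypothetical, and the real kernels use packed indices plus SIMD gathers):
```python
TERNARY = (-1, 0, +1)

def pair_lut(a0, a1):
    # All 9 partial sums one ternary weight pair can produce for (a0, a1);
    # built once per activation pair, then reused across every output row.
    return [w0 * a0 + w1 * a1 for w0 in TERNARY for w1 in TERNARY]

def lut_dot(codes, x):
    """Dot product where codes[k] in 0..8 encodes the k-th ternary weight pair."""
    return sum(pair_lut(x[2 * k], x[2 * k + 1])[code]
               for k, code in enumerate(codes))

# Weight pairs (+1, -1) and (0, +1) encode to 3*2+0 = 6 and 3*1+2 = 5.
assert lut_dot([6, 5], [2, 3, 4, 5]) == (2 - 3) + 5
```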
**Training:** The model was trained **from scratch** with this quantization,
not post-training quantized. This is crucial — native 1-bit training preserves
quality far better than quantizing a pre-trained FP16 model down to 1-bit.
**3-Stage Training Pipeline:**
1. **Pre-training** on 4T tokens (text, code, synthetic math)
2. **SFT** on instruction-following datasets
3. **DPO** for alignment with human preferences
"""
# ─── Generation functions ────────────────────────────────────────────────────
def chat_respond(message, history, system_prompt, max_new_tokens, temperature, top_p):
"""Streaming chat via bitnet.cpp llama-server."""
messages = [{"role": "system", "content": system_prompt}]
    for item in history:
        # With type="messages", each history item is a {"role", "content"} dict;
        # forward only those keys (Gradio may attach extras such as metadata).
        messages.append({"role": item["role"], "content": item["content"]})
messages.append({"role": "user", "content": message})
t0 = time.perf_counter()
tok_count = 0
response = ""
try:
stream = client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=int(max_new_tokens),
            temperature=float(temperature),  # 0.0 gives greedy decoding
top_p=float(top_p),
stream=True,
)
        for chunk in stream:
            # Skip keep-alive/final chunks that carry no delta content;
            # one streamed chunk is roughly one token.
            if chunk.choices and chunk.choices[0].delta.content:
                token_text = chunk.choices[0].delta.content
                response += token_text
                tok_count += 1
elapsed = time.perf_counter() - t0
tps = tok_count / elapsed if elapsed > 0 else 0
            stats = f"\n\n---\n*⚡ {tok_count} tokens · {tps:.1f} tok/s · {elapsed:.1f}s · bitnet.cpp I2_S*"
yield response + stats
except Exception as e:
yield f"**Error:** {str(e)}\n\nIs the llama-server running on port 8080?"
def single_benchmark(prompt, max_new_tokens):
"""Run a single non-streaming generation with detailed stats."""
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": prompt},
]
t0 = time.perf_counter()
try:
completion = client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=int(max_new_tokens),
temperature=0.0,
stream=False,
)
elapsed = time.perf_counter() - t0
        response = completion.choices[0].message.content or ""
        # Prefer server-reported token counts; fall back to a rough word count.
        n_generated = completion.usage.completion_tokens if completion.usage else len(response.split())
n_input = completion.usage.prompt_tokens if completion.usage else 0
tps = n_generated / elapsed if elapsed > 0 else 0
        stats_md = f"""### ⚡ Benchmark Results (bitnet.cpp I2_S kernel)
| Metric | Value |
|---|---|
| Input tokens | {n_input} |
| Output tokens | {n_generated} |
| Total time | {elapsed:.2f}s |
| **Tokens/sec** | **{tps:.2f}** |
| Avg ms/token | {(elapsed/max(n_generated,1)*1000):.1f}ms |
| Engine | bitnet.cpp (lossless) |
| Kernel | I2_S (MAD-based) |
"""
return response, stats_md
except Exception as e:
return f"Error: {str(e)}", "Server not responding"
# ─── Build Gradio UI ─────────────────────────────────────────────────────────
HEADER = """# 🧬 BitNet b1.58 2B4T — CPU-Only Inference Explorer
**The first open-source native 1-bit LLM at the 2B-parameter scale** by Microsoft Research — powered by **bitnet.cpp** optimized kernels.
| | |
|---|---|
| 📄 [Paper](https://arxiv.org/abs/2504.12285) | 🤗 [Model](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T) |
| 💻 [bitnet.cpp](https://github.com/microsoft/BitNet) (38K+ ⭐) | ⚡ Ternary I2_S kernel · ~10 tok/s on CPU |
"""
with gr.Blocks(
    title="BitNet b1.58 2B4T — CPU Inference Explorer",
    theme=gr.themes.Soft(),  # theme is a Blocks argument, not a launch() one
) as demo:
gr.Markdown(HEADER)
with gr.Tabs():
# ── Tab 1: Chat ──────────────────────────────────────────────────
        with gr.Tab("💬 Chat", id="chat"):
            chat = gr.ChatInterface(
                fn=chat_respond,
                type="messages",  # history arrives as OpenAI-style role/content dicts
description="Chat with BitNet b1.58 via bitnet.cpp on CPU. Live token/sec stats shown after each response.",
additional_inputs=[
gr.Textbox(
value="You are a helpful, concise AI assistant.",
label="System Prompt",
),
gr.Slider(1, 2048, value=256, step=1, label="Max New Tokens"),
gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature (0 = greedy)"),
gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
],
examples=[
["Explain what a 1-bit LLM is in 3 sentences."],
["Write a Python function to find the nth Fibonacci number."],
["What are the pros and cons of running AI on CPUs vs GPUs?"],
["Solve: If 3x + 7 = 22, what is x?"],
],
cache_examples=False,
)
# ── Tab 2: Benchmark ─────────────────────────────────────────────
        with gr.Tab("📊 Benchmark", id="bench"):
gr.Markdown("### Run a single-shot benchmark (greedy decoding, bitnet.cpp)")
with gr.Row():
with gr.Column(scale=2):
bench_prompt = gr.Textbox(
value="Write a detailed explanation of how transformer neural networks work, covering attention mechanisms, positional encoding, and the training process.",
label="Prompt",
lines=3,
)
bench_tokens = gr.Slider(16, 512, value=128, step=16, label="Max New Tokens")
                    bench_btn = gr.Button("🚀 Run Benchmark", variant="primary")
with gr.Column(scale=1):
bench_stats = gr.Markdown("*Click 'Run Benchmark' to start*")
bench_output = gr.Textbox(label="Generated Text", lines=10, interactive=False)
bench_btn.click(
fn=single_benchmark,
inputs=[bench_prompt, bench_tokens],
outputs=[bench_output, bench_stats],
)
# ── Tab 3: Paper Results ─────────────────────────────────────────
        with gr.Tab("📈 Paper Results", id="paper"):
gr.Markdown(PAPER_TABLE)
# ── Tab 4: Architecture ──────────────────────────────────────────
        with gr.Tab("🏗️ Architecture", id="arch"):
gr.Markdown(ARCHITECTURE_MD)
# ── Tab 5: System Info ───────────────────────────────────────────
        with gr.Tab("⚙️ System", id="sys"):
sys_info = gr.Markdown(get_system_info())
            refresh_btn = gr.Button("🔄 Refresh")
refresh_btn.click(fn=get_system_info, outputs=sys_info)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)