ARBS / testing /benchmarks /benchmark.py
CLIWorks's picture
Upload folder using huggingface_hub
d8bc908 verified
"""
Benchmark harness for training throughput and peak GPU memory.
Measures tokens/sec and peak memory MB, saves results as JSON.
Follows benchmark_phase2.py pattern for CUDA synchronization and memory tracking.
"""
import sys
import os
import json
import time
import torch
sys.path.insert(0, os.path.dirname(__file__))
from arbitor.main import MORPHTernaryModel, VOCAB, CTX
def run_benchmark(model, train_data, device, n_steps=100, warmup_steps=10,
batch_size=64, ctx=CTX):
"""
Measure training throughput (tokens/sec) and peak GPU memory (MB).
Resets peak memory stats, runs warmup steps (no timing), then timed steps.
Uses torch.cuda.synchronize() before first and after last timed step for
accurate wall-clock timing.
Args:
model: MORPHTernaryModel instance
train_data: 1D byte tensor of training data
device: 'cuda' or 'cpu'
n_steps: Number of timed steps
warmup_steps: Steps before timing begins
batch_size: Batch size for each step
ctx: Context window length
Returns:
dict with keys: tokens_per_sec, peak_memory_mb, n_steps, batch_size,
ctx, device
"""
model.train()
# Reset memory tracking
if device == "cuda":
torch.cuda.reset_peak_memory_stats(device)
torch.cuda.empty_cache()
torch.cuda.synchronize()
# Generate data once (avoid IO jitter)
all_ix = torch.randint(0, len(train_data) - ctx - 1, (warmup_steps + n_steps, batch_size))
all_x = torch.stack([
torch.stack([train_data[ix[i]: ix[i] + ctx] for i in range(batch_size)])
for ix in all_ix
])
all_targets = all_x[:, :, 3:]
# Warmup steps
for step_idx in range(warmup_steps):
x = all_x[step_idx].to(device, non_blocking=True)
targets = all_targets[step_idx].to(device, non_blocking=True)
with torch.no_grad():
model(x, targets=targets)
if device == "cuda":
torch.cuda.synchronize()
# Timed steps
t_start = time.perf_counter()
for step_idx in range(warmup_steps, warmup_steps + n_steps):
x = all_x[step_idx].to(device, non_blocking=True)
targets = all_targets[step_idx].to(device, non_blocking=True)
with torch.no_grad():
model(x, targets=targets)
if device == "cuda":
torch.cuda.synchronize()
t_end = time.perf_counter()
elapsed = t_end - t_start
tokens_total = n_steps * batch_size * ctx
tokens_per_sec = tokens_total / elapsed if elapsed > 0 else 0.0
# Peak memory
peak_memory_mb = 0.0
if device == "cuda":
peak_memory_mb = torch.cuda.max_memory_allocated(device) / (1024 * 1024)
result = {
"tokens_per_sec": round(tokens_per_sec, 2),
"peak_memory_mb": round(peak_memory_mb, 2),
"n_steps": n_steps,
"batch_size": batch_size,
"ctx": ctx,
"device": device,
}
return result
def compare_benchmarks(before_path, after_path):
"""
Compare two benchmark result JSON files and compute deltas.
Args:
before_path: Path to baseline benchmark JSON
after_path: Path to optimized benchmark JSON
Returns:
dict with keys: before, after, delta, pct_change
delta[tokens_per_sec] = after - before
pct_change[tokens_per_sec] = (after - before) / before * 100
"""
with open(before_path, "r") as f:
before = json.load(f)
with open(after_path, "r") as f:
after = json.load(f)
metrics = ["tokens_per_sec", "peak_memory_mb"]
delta = {}
pct_change = {}
for key in metrics:
b = before.get(key, 0.0)
a = after.get(key, 0.0)
delta[key] = round(a - b, 2)
if b != 0:
pct_change[key] = round(((a - b) / abs(b)) * 100.0, 2)
else:
pct_change[key] = 0.0
# Print comparison table
print("\n=== Benchmark Comparison ===")
print(f"{'Metric':<20} {'Before':>12} {'After':>12} {'Delta':>12} {'Change':>10}")
print("-" * 66)
for key in metrics:
b = before.get(key, 0.0)
a = after.get(key, 0.0)
print(f"{key:<20} {b:>12.2f} {a:>12.2f} {delta[key]:>+12.2f} {pct_change[key]:>+9.2f}%")
print("=" * 66)
return {
"before": before,
"after": after,
"delta": delta,
"pct_change": pct_change,
}