""" Benchmark harness for training throughput and peak GPU memory. Measures tokens/sec and peak memory MB, saves results as JSON. Follows benchmark_phase2.py pattern for CUDA synchronization and memory tracking. """ import sys import os import json import time import torch sys.path.insert(0, os.path.dirname(__file__)) from arbitor.main import MORPHTernaryModel, VOCAB, CTX def run_benchmark(model, train_data, device, n_steps=100, warmup_steps=10, batch_size=64, ctx=CTX): """ Measure training throughput (tokens/sec) and peak GPU memory (MB). Resets peak memory stats, runs warmup steps (no timing), then timed steps. Uses torch.cuda.synchronize() before first and after last timed step for accurate wall-clock timing. Args: model: MORPHTernaryModel instance train_data: 1D byte tensor of training data device: 'cuda' or 'cpu' n_steps: Number of timed steps warmup_steps: Steps before timing begins batch_size: Batch size for each step ctx: Context window length Returns: dict with keys: tokens_per_sec, peak_memory_mb, n_steps, batch_size, ctx, device """ model.train() # Reset memory tracking if device == "cuda": torch.cuda.reset_peak_memory_stats(device) torch.cuda.empty_cache() torch.cuda.synchronize() # Generate data once (avoid IO jitter) all_ix = torch.randint(0, len(train_data) - ctx - 1, (warmup_steps + n_steps, batch_size)) all_x = torch.stack([ torch.stack([train_data[ix[i]: ix[i] + ctx] for i in range(batch_size)]) for ix in all_ix ]) all_targets = all_x[:, :, 3:] # Warmup steps for step_idx in range(warmup_steps): x = all_x[step_idx].to(device, non_blocking=True) targets = all_targets[step_idx].to(device, non_blocking=True) with torch.no_grad(): model(x, targets=targets) if device == "cuda": torch.cuda.synchronize() # Timed steps t_start = time.perf_counter() for step_idx in range(warmup_steps, warmup_steps + n_steps): x = all_x[step_idx].to(device, non_blocking=True) targets = all_targets[step_idx].to(device, non_blocking=True) with torch.no_grad(): model(x, targets=targets) if device == "cuda": torch.cuda.synchronize() t_end = time.perf_counter() elapsed = t_end - t_start tokens_total = n_steps * batch_size * ctx tokens_per_sec = tokens_total / elapsed if elapsed > 0 else 0.0 # Peak memory peak_memory_mb = 0.0 if device == "cuda": peak_memory_mb = torch.cuda.max_memory_allocated(device) / (1024 * 1024) result = { "tokens_per_sec": round(tokens_per_sec, 2), "peak_memory_mb": round(peak_memory_mb, 2), "n_steps": n_steps, "batch_size": batch_size, "ctx": ctx, "device": device, } return result def compare_benchmarks(before_path, after_path): """ Compare two benchmark result JSON files and compute deltas. Args: before_path: Path to baseline benchmark JSON after_path: Path to optimized benchmark JSON Returns: dict with keys: before, after, delta, pct_change delta[tokens_per_sec] = after - before pct_change[tokens_per_sec] = (after - before) / before * 100 """ with open(before_path, "r") as f: before = json.load(f) with open(after_path, "r") as f: after = json.load(f) metrics = ["tokens_per_sec", "peak_memory_mb"] delta = {} pct_change = {} for key in metrics: b = before.get(key, 0.0) a = after.get(key, 0.0) delta[key] = round(a - b, 2) if b != 0: pct_change[key] = round(((a - b) / abs(b)) * 100.0, 2) else: pct_change[key] = 0.0 # Print comparison table print("\n=== Benchmark Comparison ===") print(f"{'Metric':<20} {'Before':>12} {'After':>12} {'Delta':>12} {'Change':>10}") print("-" * 66) for key in metrics: b = before.get(key, 0.0) a = after.get(key, 0.0) print(f"{key:<20} {b:>12.2f} {a:>12.2f} {delta[key]:>+12.2f} {pct_change[key]:>+9.2f}%") print("=" * 66) return { "before": before, "after": after, "delta": delta, "pct_change": pct_change, }