ARBS / testing /benchmarks /benchmark.py

Upload folder using huggingface_hub

d8bc908 verified 1 day ago

4.42 kB

	"""
	Benchmark harness for training throughput and peak GPU memory.

	Measures tokens/sec and peak memory MB, saves results as JSON.
	Follows benchmark_phase2.py pattern for CUDA synchronization and memory tracking.
	"""

	import sys
	import os
	import json
	import time
	import torch

	sys.path.insert(0, os.path.dirname(__file__))

	from arbitor.main import MORPHTernaryModel, VOCAB, CTX


	def run_benchmark(model, train_data, device, n_steps=100, warmup_steps=10,
	                  batch_size=64, ctx=CTX):
	    """
	    Measure forward-pass throughput (tokens/sec) and peak GPU memory (MB).

	    The model is put in train mode, but every step runs under
	    torch.no_grad(): only the forward pass is executed, with no backward
	    pass and no optimizer update. Peak memory stats are reset first, then
	    warmup steps run untimed, then the timed steps run. On CUDA the device
	    is synchronized before the timer starts and before it stops so that
	    queued asynchronous kernels are included in the wall-clock time.

	    Args:
	        model: Callable module invoked as ``model(x, targets=targets)``;
	            must provide ``train()``.
	        train_data: 1D tensor the input windows are sliced from. It must
	            be longer than ``ctx + 1`` elements.
	        device: Target device: a string such as 'cpu', 'cuda' or 'cuda:0',
	            or a torch.device.
	        n_steps: Number of timed steps.
	        warmup_steps: Number of untimed steps run before timing begins.
	        batch_size: Number of windows per step.
	        ctx: Length of each input window.

	    Returns:
	        dict with keys: tokens_per_sec, peak_memory_mb, n_steps, batch_size,
	        ctx, device. peak_memory_mb is 0.0 on non-CUDA devices and covers
	        both warmup and timed steps.
	    """
	    model.train()

	    # Normalise the device so "cuda", "cuda:0" and torch.device objects all
	    # take the CUDA path; a plain string comparison against "cuda" would
	    # silently skip synchronization and memory tracking for the others.
	    torch_device = torch.device(device)
	    on_cuda = torch_device.type == "cuda"

	    # Reset memory tracking
	    if on_cuda:
	        torch.cuda.reset_peak_memory_stats(torch_device)
	        torch.cuda.empty_cache()
	        torch.cuda.synchronize(torch_device)

	    # Build every batch up front so the timed loop contains no sampling work.
	    total_steps = warmup_steps + n_steps
	    all_ix = torch.randint(0, len(train_data) - ctx - 1, (total_steps, batch_size))
	    all_x = torch.stack([
	        torch.stack([train_data[start:start + ctx] for start in row.tolist()])
	        for row in all_ix
	    ])
	    # Targets are each window with its first three positions dropped.
	    # NOTE(review): presumably this matches the model's target alignment;
	    # confirm against MORPHTernaryModel.
	    all_targets = all_x[:, :, 3:]

	    def _forward_steps(first, last):
	        # Run forward-only steps [first, last) on the pre-built batches.
	        for step_idx in range(first, last):
	            x = all_x[step_idx].to(torch_device, non_blocking=True)
	            targets = all_targets[step_idx].to(torch_device, non_blocking=True)
	            with torch.no_grad():
	                model(x, targets=targets)

	    # Warmup steps (untimed)
	    _forward_steps(0, warmup_steps)

	    if on_cuda:
	        torch.cuda.synchronize(torch_device)

	    # Timed steps
	    t_start = time.perf_counter()
	    _forward_steps(warmup_steps, total_steps)

	    if on_cuda:
	        # Wait for all queued kernels before stopping the clock.
	        torch.cuda.synchronize(torch_device)
	    t_end = time.perf_counter()

	    elapsed = t_end - t_start
	    tokens_total = n_steps * batch_size * ctx
	    tokens_per_sec = tokens_total / elapsed if elapsed > 0 else 0.0

	    # Peak memory
	    peak_memory_mb = 0.0
	    if on_cuda:
	        peak_memory_mb = torch.cuda.max_memory_allocated(torch_device) / (1024 * 1024)

	    return {
	        "tokens_per_sec": round(tokens_per_sec, 2),
	        "peak_memory_mb": round(peak_memory_mb, 2),
	        "n_steps": n_steps,
	        "batch_size": batch_size,
	        "ctx": ctx,
	        # Strings are reported unchanged; other device objects are
	        # stringified so the result stays JSON-serializable.
	        "device": device if isinstance(device, str) else str(torch_device),
	    }


	def compare_benchmarks(before_path, after_path):
	    """
	    Compare two benchmark result JSON files and compute deltas.

	    Reads both files, computes the absolute and percentage change for
	    ``tokens_per_sec`` and ``peak_memory_mb``, prints a comparison table
	    to stdout and returns the numbers.

	    Args:
	        before_path: Path to baseline benchmark JSON
	        after_path: Path to optimized benchmark JSON

	    Returns:
	        dict with keys: before, after, delta, pct_change
	        delta[metric] = round(after - before, 2)
	        pct_change[metric] = round((after - before) / abs(before) * 100, 2),
	        or 0.0 when the baseline value is 0 (percentage is undefined there).
	        A metric missing from a file is treated as 0.0.
	    """
	    # JSON is UTF-8 by specification; do not depend on the platform locale.
	    with open(before_path, "r", encoding="utf-8") as f:
	        before = json.load(f)
	    with open(after_path, "r", encoding="utf-8") as f:
	        after = json.load(f)

	    metrics = ("tokens_per_sec", "peak_memory_mb")
	    delta = {}
	    pct_change = {}
	    rows = []

	    # Compute each metric once and reuse the values for the printed table.
	    for key in metrics:
	        b = before.get(key, 0.0)
	        a = after.get(key, 0.0)
	        delta[key] = round(a - b, 2)
	        pct_change[key] = round(((a - b) / abs(b)) * 100.0, 2) if b != 0 else 0.0
	        rows.append((key, b, a))

	    # Print comparison table
	    print("\n=== Benchmark Comparison ===")
	    print(f"{'Metric':<20} {'Before':>12} {'After':>12} {'Delta':>12} {'Change':>10}")
	    print("-" * 66)
	    for key, b, a in rows:
	        print(f"{key:<20} {b:>12.2f} {a:>12.2f} {delta[key]:>+12.2f} {pct_change[key]:>+9.2f}%")
	    print("=" * 66)

	    return {
	        "before": before,
	        "after": after,
	        "delta": delta,
	        "pct_change": pct_change,
	    }