ARBS / testing /benchmarks /benchmark_true_ternary.py

Upload folder using huggingface_hub

d8bc908 verified 1 day ago

22.2 kB

	"""
	True Ternary Benchmark: Compare training methods on ARBModel.

	Configs:
	1. Adam_FP32 — standard FP32 Adam (full model, float params)
	2. SignSGD_Old — SignSGD optimizer (full model, float params)
	3. TrueTernary — pure ternary training (0 float params, T flips + E_accum)

	Metrics: loss curve, step time, peak VRAM, model/optimizer memory, convergence

	After REFACTOR6 (architecture ternarization), the internal model has 0 trainable
	float params. Adam_FP32 and SignSGD_Old use the pre-ternarization float weights.
	TrueTernary uses the post-REFACTOR6 strict ternary-only path.
	"""
	import os, sys, time, json, math, gc, argparse
	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
	from arbitor.main import ARBModel, VOCAB, CTX, LossComponents
	from arbitor.kernel.ternary_scale import TScaleType
	from arbitor.kernel.ternary_scale import _triton_ternary_grad_sign, _triton_update_e, _triton_ternary_step
	from arbitor.optim.sign_sgd import SignSGD
	from arbitor.kernel.ternary_audit import audit_model, format_audit, freeze_float_parameters, trainable_parameters

	# Default run shape; the command-line entry point may overwrite these.
	STEPS, WARMUP = 50, 10
	BATCH, CTX_LEN = 8, 66
	SEED = 42

	# Training corpus: downloaded on first use and cached beside this script.
	DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
	DATA_PATH = os.path.join(os.path.dirname(__file__), "tinyshakespeare.txt")

	# Benchmark configurations, in the order they are run.
	CONFIGS = ["Adam_FP32", "SignSGD_Old", "TrueTernary"]


	class NoTrainableParametersOptimizer:
		"""Do-nothing stand-in used when a model has no trainable parameters.

		It exposes the pieces of the optimizer interface the benchmark loop
		touches (``param_groups``, ``state``, ``zero_grad``, ``step``) so the same
		loop can run when there is nothing to optimize.
		"""

		def __init__(self):
			# Empty containers: loops over groups and state lookups become no-ops.
			self.param_groups = []
			self.state = {}

		def zero_grad(self, *args, **kwargs):
			"""Accept and ignore any arguments; there are no gradients to clear."""
			return None

		def step(self, *args, **kwargs):
			"""Accept and ignore any arguments; there is nothing to update."""
			return None


	def download_data():
		"""Load the tinyshakespeare corpus as bytes and split it 90/10.

		If DATA_PATH does not exist yet, the corpus is fetched from DATA_URL first.

		Returns:
			A ``(train, val)`` pair of 1-D ``torch.long`` tensors holding the UTF-8
			byte values of the first 90% and the remaining 10% of the text.
		"""
		if not os.path.exists(DATA_PATH):
			import urllib.request
			print(" Downloading tinyshakespeare...")
			# Fetch under a temporary name and rename on success, so an interrupted
			# transfer never leaves a truncated file that later runs would trust.
			partial_path = DATA_PATH + ".part"
			try:
				urllib.request.urlretrieve(DATA_URL, partial_path)
				os.replace(partial_path, DATA_PATH)
			finally:
				if os.path.exists(partial_path):
					os.remove(partial_path)
		with open(DATA_PATH, "r", encoding="utf-8") as f:
			text = f.read()
		byte_data = torch.tensor(list(text.encode("utf-8")), dtype=torch.long)
		n = int(0.9 * len(byte_data))
		return byte_data[:n], byte_data[n:]


	def get_batch(data, device):
		"""Sample BATCH random windows of CTX_LEN entries from ``data``.

		Returns ``(x, targets)`` moved to ``device``; ``targets`` is ``x`` with the
		first three positions of every window dropped.
		"""
		starts = torch.randint(0, len(data) - CTX_LEN - 1, (BATCH,)).tolist()
		windows = [data[begin: begin + CTX_LEN] for begin in starts]
		x = torch.stack(windows)
		targets = x[:, 3:]
		x_dev = x.to(device, non_blocking=True)
		targets_dev = targets.to(device, non_blocking=True)
		return x_dev, targets_dev


	def get_lr(step, max_lr=1e-4, min_lr=1e-6):
		"""Learning rate for ``step``: linear warmup, then cosine decay.

		The rate ramps up to ``max_lr`` over the first WARMUP steps and then
		follows a half cosine from ``max_lr`` towards ``min_lr`` over the
		remaining ``STEPS - WARMUP`` steps.
		"""
		if step >= WARMUP:
			decay_steps = max(1, STEPS - WARMUP)
			progress = (step - WARMUP) / decay_steps
			cosine = 1 + math.cos(math.pi * progress)
			return min_lr + 0.5 * (max_lr - min_lr) * cosine
		return max_lr * (step + 1) / WARMUP


	def cpu_update_memory(model, accum_threshold=3, loss_signal=None):
		"""CPU-based update that avoids the Triton compilation bug (14s/step).

		Walks every submodule that exposes ``update_E`` and/or ``ternary_step`` and
		still carries gradient hooks, then, using plain tensor ops:

		1. derives a gradient sign, from ``_hook_grad_2d`` / ``_hook_x_2d`` when
		   both are present, otherwise from ``_hook_grad_T_sign``;
		2. if the module has ``update_E``: folds a per-group sign into ``E_accum``
		   and moves ``E`` by one where the accumulator reaches the threshold;
		3. if the module has ``ternary_step``: adds the sign (times ``t_step``) to
		   ``T_accum``, sets T to +1 / -1 where the accumulator passes
		   ``accum_threshold``, repacks ``T_packed`` and zeroes those accumulators;
		4. deletes the hooks it consumed.

		Args:
			model: module tree to update in place.
			accum_threshold: magnitude ``T_accum`` must exceed before T is flipped.
			loss_signal: optional scalar loss tensor. When given, the T step size is
				``loss // 2 + 1`` limited to the range 1..4; otherwise it is 1.
		"""
		import torch.nn.functional as F
		from arbitor.converters.convert_to_ternary8 import pack_ternary

		t_step = 1
		if loss_signal is not None:
			loss_val = float(loss_signal.detach().clamp(min=0, max=32).item())
			t_step = max(1, min(4, int(loss_val // 2) + 1))

		for module in model.modules():
			if not hasattr(module, 'update_E') and not hasattr(module, 'ternary_step'):
				continue
			has_grad = hasattr(module, '_hook_grad_T_sign')
			has_direct = hasattr(module, '_hook_grad_2d') and hasattr(module, '_hook_x_2d')
			if not has_grad and not has_direct:
				continue

			device = module.T_accum.device
			N, K = tuple(module._T_shape.tolist())
			if has_direct:
				grads = module._hook_grad_2d
				xs = module._hook_x_2d
				grad_W = torch.matmul(grads.float().t(), xs.float())
				# Keep the sign on the accumulator's device, like the hook path below.
				grad_sign = grad_W.sign().to(device=device, dtype=torch.int8)
			else:
				grad_sign = module._hook_grad_T_sign.to(device=device)

			# --- update_E (CPU fixed-point residual path) ---
			if hasattr(module, 'update_E'):
				T_source = module._hook_T if hasattr(module, '_hook_T') else module._get_T()
				T = T_source.to(device=device)
				grad_T = grad_sign.float() * T.float()
				# Pad the input dimension up to a whole number of groups.
				gpr = (K + module.group_size - 1) // module.group_size
				total_in = gpr * module.group_size
				padded = F.pad(grad_T, (0, total_in - K))
				grouped = padded.view(N, gpr, module.group_size)
				group_score = grouped.sum(dim=2)
				delta = -group_score.sign().to(torch.int8).flatten()
				if not hasattr(module, "E_accum"):
					module.register_buffer("E_accum", torch.zeros_like(module.E, dtype=torch.int8))
				e_accum_threshold = int(getattr(module, "_e_accum_threshold", 4))
				# Widen before adding: an int8 + int8 sum would wrap around before
				# the clamp could saturate it (same widening as the E update below).
				new_accum = torch.clamp(
					module.E_accum.to(torch.int16) + delta.to(torch.int16), -128, 127
				).to(torch.int8)
				step_up = new_accum >= e_accum_threshold
				step_down = new_accum <= -e_accum_threshold
				e_step = torch.where(
					step_up,
					torch.ones_like(new_accum),
					torch.where(step_down, -torch.ones_like(new_accum), torch.zeros_like(new_accum)),
				)
				module.E = torch.clamp(module.E.to(torch.int16) + e_step.to(torch.int16), -128, 127).to(torch.int8)
				# Remove the consumed threshold from accumulators that stepped.
				module.E_accum = (new_accum.to(torch.int16) - e_step.to(torch.int16) * e_accum_threshold).to(torch.int8)

			# --- ternary_step (CPU T flip) ---
			if hasattr(module, 'ternary_step'):
				# Widened for the same reason as E_accum above.
				widened = module.T_accum.to(torch.int16) + grad_sign.to(device=device, dtype=torch.int16) * t_step
				module.T_accum = torch.clamp(widened, -128, 127).to(torch.int8)
				fu = module.T_accum > accum_threshold
				fd = module.T_accum < -accum_threshold
				if fu.any() or fd.any():
					T = module._get_T().to(device)
					T[fu] = 1
					T[fd] = -1
					# Only synchronize when CUDA exists; the call fails on CPU-only hosts.
					if torch.cuda.is_available():
						torch.cuda.synchronize()
					module.T_packed = pack_ternary(T.cpu())[0].to(device=device)
					module.T_accum = torch.where(fu | fd, torch.zeros_like(module.T_accum), module.T_accum)

			# Clean up hooks
			if has_direct:
				del module._hook_grad_2d, module._hook_x_2d
			else:
				del module._hook_grad_T_sign


	def gpu_signcache_update_memory(model, accum_threshold=3, update_scales=True, loss_signal=None):
		"""GPU update that builds one temporary int8 grad_sign per module and then frees it.

		The sign cache sidesteps the very slow per-packed-byte direct reduction
		path for benchmark shapes with large M = batch * sequence. Persistent
		model state stays ternary-first: packed T, int8 E, int8 accumulators, no
		FP master weights.
		"""
		if loss_signal is None:
			t_step = 1
		else:
			loss_val = float(loss_signal.detach().clamp(min=0, max=32).item())
			t_step = max(1, min(4, int(loss_val // 2) + 1))

		for module in model.modules():
			direct = hasattr(module, '_hook_grad_2d') and hasattr(module, '_hook_x_2d')
			if not (direct or hasattr(module, '_hook_grad_T_sign')):
				continue

			if direct:
				# Turn the raw hooks into a cached sign tensor, then drop them.
				rows, cols = tuple(module._T_shape.tolist())
				module._hook_grad_T_sign = _triton_ternary_grad_sign(
					module._hook_grad_2d, module._hook_x_2d, rows, cols
				)
				del module._hook_grad_2d, module._hook_x_2d

			if update_scales and hasattr(module, 'update_E'):
				scale = getattr(module, "E", None)
				if scale is not None and scale.is_cuda and hasattr(module, "_hook_grad_T_sign"):
					rows, cols = tuple(module._T_shape.tolist())
					if not hasattr(module, "E_accum"):
						module.register_buffer("E_accum", torch.zeros_like(module.E, dtype=torch.int8))
					_triton_update_e(
						module.T_packed.contiguous(),
						module._hook_grad_T_sign.contiguous(),
						module.E,
						module.E_accum,
						rows,
						cols,
						module.group_size,
						int(getattr(module, "_e_accum_threshold", 4)),
					)
				else:
					# No CUDA-resident E: defer to the module's own implementation.
					module.update_E(loss_signal=loss_signal)

			if hasattr(module, 'ternary_step'):
				packed = getattr(module, "T_packed", None)
				if packed is not None and packed.is_cuda and hasattr(module, "_hook_grad_T_sign"):
					total = int(module._T_shape[0].item() * module._T_shape[1].item())
					_triton_ternary_step(
						module.T_packed,
						module._hook_grad_T_sign.contiguous(),
						module.T_accum,
						total,
						accum_threshold,
						t_step,
					)
					# The cached sign is only released on this path.
					del module._hook_grad_T_sign
				else:
					module.ternary_step(accum_threshold=accum_threshold)


	def build_model(strict_ternary):
		"""Construct an ARBModel with T32 scales and MoE enabled.

		The image, audio, VQ, graph and memory-module switches are all turned on
		for the full model and all turned off when ``strict_ternary`` is true.
		"""
		optional_parts = ("image", "audio", "vq", "graph", "memory_modules")
		switches = {f"enable_{part}": not strict_ternary for part in optional_parts}
		return ARBModel(tscale_type=TScaleType.T32, **switches, enable_moe=True)


	def _optimizer_mem_mb(optimizer):
		"""Return the MiB held by the optimizer's parameters plus its tensor state so far."""
		total_bytes = 0
		for group in optimizer.param_groups:
			for p in group["params"]:
				total_bytes += p.numel() * p.element_size()
				for value in optimizer.state.get(p, {}).values():
					if isinstance(value, torch.Tensor):
						total_bytes += value.numel() * value.element_size()
		return total_bytes / (1024 * 1024)


	def _make_optimizer(name, opt_params, use_bf16, is_signsgd):
		"""Build the optimizer for config ``name``, or a no-op one when nothing is trainable."""
		if use_bf16:
			import bitsandbytes as bnb
			print(" Creating Adam8bit optimizer...", flush=True)
			return bnb.optim.Adam8bit(opt_params, lr=1e-4, weight_decay=0.01) if opt_params else NoTrainableParametersOptimizer()
		if name == "Adam_FP32":
			print(" Creating Adam FP32 optimizer...", flush=True)
			return torch.optim.Adam(opt_params, lr=1e-4, weight_decay=0.01) if opt_params else NoTrainableParametersOptimizer()
		if is_signsgd:
			print(" Creating SignSGD optimizer...", flush=True)
			return SignSGD(opt_params, lr=0.001, weight_decay=0.01) if opt_params else NoTrainableParametersOptimizer()
		raise ValueError(f"Unknown config: {name}")


	def _apply_ternary_update(model, update_backend, accum_threshold, update_scales, loss_signal):
		"""Run one TrueTernary state update through the selected backend."""
		if update_backend == "gpu":
			model._ternary_update_memory(
				accum_threshold=accum_threshold,
				update_scales=update_scales,
				loss_signal=loss_signal,
			)
		elif update_backend == "gpu-signcache":
			gpu_signcache_update_memory(
				model,
				accum_threshold=accum_threshold,
				update_scales=update_scales,
				loss_signal=loss_signal,
			)
		elif update_backend == "dense-fallback":
			# The CPU path always updates E, so it is only used on scale-update steps.
			if update_scales:
				cpu_update_memory(model, accum_threshold=accum_threshold, loss_signal=loss_signal)
			else:
				model._ternary_update_memory(
					accum_threshold=accum_threshold,
					update_scales=False,
					loss_signal=loss_signal,
				)
		elif update_backend != "none":
			raise ValueError(f"Unknown update backend: {update_backend}")


	def run_config(
		name,
		device,
		base_state=None,
		strict_true_ternary=True,
		update_backend="gpu",
		scale_update_interval=4,
		accum_threshold=3,
		print_every=1,
	):
		"""Train one benchmark configuration for STEPS steps and return its metrics.

		Reads the module-level ``train_data`` set by the script entry point.

		Args:
			name: config name. ``"TrueTernary"`` in the name selects the strict
				ternary model and the ternary state updates; ``"BF16"`` selects
				bitsandbytes Adam8bit under bf16 autocast; ``"Adam_FP32"`` selects
				torch Adam; ``"SignSGD"`` selects SignSGD.
			device: device to run on (e.g. ``"cuda"`` or ``"cpu"``).
			base_state: optional state dict loaded (non-strictly) into non-strict
				models so the full-model configs share one initialization.
			strict_true_ternary: accepted for interface compatibility but not
				consulted here; TrueTernary configs always run in strict mode.
			update_backend: ``"gpu"``, ``"gpu-signcache"``, ``"dense-fallback"`` or
				``"none"``; how TrueTernary state is updated after each step.
			scale_update_interval: update E every N TrueTernary steps; 0 disables.
			accum_threshold: T_accum threshold passed to the ternary update.
			print_every: print a progress line every N steps (values below 1 are
				treated as 1).

		Returns:
			dict with the config name, parameter counts, model / optimizer / buffer
			memory in MiB, peak VRAM in MiB (0.0 off CUDA), the average loss over
			the last 20 steps, the minimum loss, the average step time and
			tokens/s after warmup, and the per-step loss history.

		Raises:
			ValueError: for an unknown config name or update backend.
		"""
		device_type = torch.device(device).type
		on_cuda = device_type == "cuda"
		print_every = max(1, print_every)

		torch.manual_seed(SEED)
		# CUDA memory statistics are only queried when running on a CUDA device.
		if on_cuda:
			torch.cuda.reset_peak_memory_stats(device)
			torch.cuda.empty_cache()
		gc.collect()

		is_true_ternary = "TrueTernary" in name
		is_signsgd = "SignSGD" in name or is_true_ternary
		use_bf16 = "BF16" in name

		# TrueTernary always uses strict mode (0 float params, no encoders)
		strict_model = is_true_ternary

		model = build_model(strict_ternary=strict_model).to(device)
		if strict_model:
			freeze_float_parameters(model)
		elif base_state is not None:
			model.load_state_dict(base_state, strict=False)
			# Re-freeze ViT/audio params that load_state_dict may have unfrozen
			for param_name, p in model.named_parameters():
				if param_name.split('.')[0] in ('vit', 'image_sequencer', 'audio_sequencer'):
					p.requires_grad = False

		opt_params = trainable_parameters(model)
		optimizer = _make_optimizer(name, opt_params, use_bf16, is_signsgd)

		n_params = sum(p.numel() for p in model.parameters())
		trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

		# Compute persistent ternary memory
		ternary_bytes = sum(buf.numel() for buf_name, buf in model.named_buffers() if 'T_packed' in buf_name)
		e_count = sum(b.numel() for n, b in model.named_buffers() if n.endswith('.E'))  # int8 E
		e_accum_bytes = sum(b.numel() for n, b in model.named_buffers() if n.endswith('.E_accum'))
		ternary_p_unique = ternary_bytes * 5  # 5 trits per byte

		# Memory accounting
		model_mem = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024)
		opt_mem = _optimizer_mem_mb(optimizer)
		buf_mem = sum(b.numel() * b.element_size() for n, b in model.named_buffers()) / (1024 * 1024)

		print(f"\n [{name}]", flush=True)
		print(f" Params: {n_params:,} total, {trainable:,} trainable", flush=True)
		print(f" Model mode: {'strict ternary text-only' if strict_model else 'full multimodal'}")
		print(format_audit(audit_model(model), limit=5), flush=True)
		print(f" Ternary: ~{ternary_p_unique/1e6:.1f}M packed trits, {e_count:,} int8 E values, {e_accum_bytes:,} int8 E_accum values")
		print(f" Model weights: {model_mem:.1f}MB | Buffers: {buf_mem:.1f}MB | Optimizer: {opt_mem:.1f}MB")
		print(" Compiling warmup...", end=" ", flush=True)

		# Warmup forward pass to trigger JIT compilation
		x_warm, t_warm = get_batch(train_data, device)
		with torch.no_grad(), torch.autocast(device_type, dtype=torch.bfloat16, enabled=use_bf16):
			_ = model(x_warm, targets=t_warm)
		if on_cuda:
			torch.cuda.synchronize()
		print("done.", flush=True)
		if on_cuda:
			# Exclude the warmup pass from the reported peak.
			torch.cuda.reset_peak_memory_stats(device)

		loss_history = []
		step_times = []
		tokens_per_step = BATCH * (CTX_LEN - 3)

		for step in range(STEPS):
			lr = get_lr(step)
			for pg in optimizer.param_groups:
				pg["lr"] = lr

			# Batch sampling is deliberately outside the timed region.
			x, targets = get_batch(train_data, device)
			t0 = time.perf_counter()

			optimizer.zero_grad()
			with torch.autocast(device_type, dtype=torch.bfloat16, enabled=use_bf16):
				_, losses, _, _ = model(x, targets=targets)

			losses.total.backward()
			if opt_params:
				torch.nn.utils.clip_grad_norm_(opt_params, 1.0)
			optimizer.step()

			if is_true_ternary:
				update_scales = scale_update_interval > 0 and step % scale_update_interval == 0
				_apply_ternary_update(model, update_backend, accum_threshold, update_scales, losses.total)

			# Wait for queued GPU work so the timer covers the whole step.
			if on_cuda:
				torch.cuda.synchronize()
			t1 = time.perf_counter()

			loss = losses.total.item()
			loss_history.append(loss)
			step_ms = (t1 - t0) * 1000
			step_times.append(step_ms)

			if step % print_every == 0 or step == STEPS - 1:
				if on_cuda:
					peak = torch.cuda.max_memory_allocated(device) / (1024 * 1024)
					allocated = torch.cuda.memory_allocated(device) / (1024 * 1024)
					reserved = torch.cuda.memory_reserved(device) / (1024 * 1024)
				else:
					peak = allocated = reserved = 0.0
				toks_sec = tokens_per_step / (step_ms / 1000) if step_ms > 0 else float("inf")
				print(
					f" step {step:>4d}/{STEPS} | loss={loss:.4f} | {step_ms:.0f}ms | "
					f"{toks_sec:.0f} tok/s | alloc={allocated:.0f}MB reserved={reserved:.0f}MB peak={peak:.0f}MB",
					flush=True,
				)

		if loss_history:
			final_window = loss_history[-20:]
			final_avg = sum(final_window) / len(final_window)
			min_loss = min(loss_history)
		else:
			# STEPS == 0: nothing was measured.
			final_avg = min_loss = float("nan")
		# Skip warmup steps when timing; if the run is shorter than the warmup,
		# fall back to all steps instead of dividing by zero.
		timed = step_times[WARMUP:] or step_times
		avg_step_ms = sum(timed) / len(timed) if timed else 0.0
		avg_toks_sec = tokens_per_step / (avg_step_ms / 1000) if avg_step_ms > 0 else 0.0
		peak_vram = torch.cuda.max_memory_allocated(device) / (1024 * 1024) if on_cuda else 0.0
		# Re-measure after training: torch.optim optimizers allocate their state
		# lazily on the first step, so the pre-training figure omits it.
		opt_mem = _optimizer_mem_mb(optimizer)

		del model, optimizer
		gc.collect()
		if on_cuda:
			torch.cuda.empty_cache()

		return {
			"config": name,
			"n_params": n_params,
			"trainable_params": trainable,
			"model_mem_mb": round(model_mem, 1),
			"optimizer_mem_mb": round(opt_mem, 1),
			"buffer_mem_mb": round(buf_mem, 1),
			"peak_vram_mb": round(peak_vram, 1),
			"final_loss_avg20": round(final_avg, 4),
			"min_loss": round(min_loss, 4),
			"avg_step_ms": round(avg_step_ms, 1),
			"avg_toks_sec": round(avg_toks_sec, 1),
			"loss_history": [round(l, 4) for l in loss_history],
		}


	if __name__ == "__main__":
		# Command-line entry point: parse options, load data, run every requested
		# config, print a summary table and save the results as JSON.
		parser = argparse.ArgumentParser(description="Benchmark full or strict true-ternary MORPH configs.")
		parser.add_argument("--steps", type=int, default=STEPS)
		parser.add_argument("--warmup", type=int, default=WARMUP)
		parser.add_argument("--batch", type=int, default=BATCH)
		parser.add_argument("--ctx", type=int, default=CTX_LEN)
		parser.add_argument("--configs", type=str, default=",".join(CONFIGS),
			help="Comma-separated configs: Adam_FP32,SignSGD_Old,TrueTernary")
		parser.add_argument("--strict-true-ternary", action=argparse.BooleanOptionalAction, default=True,
			help="Run TrueTernary as text-only strict ternary with frozen float params.")
		parser.add_argument("--update-backend", choices=["gpu", "gpu-signcache", "dense-fallback", "none"], default="gpu-signcache",
			help="TrueTernary state update implementation.")
		parser.add_argument("--scale-update-interval", type=int, default=4,
			help="Update int8 E every N TrueTernary steps. 0 disables E updates.")
		parser.add_argument("--accum-threshold", type=int, default=3,
			help="T_accum threshold for ternary sign flips.")
		parser.add_argument("--print-every", type=int, default=1)
		parser.add_argument("--reuse-base", action=argparse.BooleanOptionalAction, default=False,
			help="Create one full base model on CPU and load it into full-model configs.")
		args = parser.parse_args()

		# The functions above read these module-level settings, so override them here.
		STEPS = args.steps
		WARMUP = args.warmup
		BATCH = args.batch
		CTX_LEN = args.ctx
		CONFIGS = [item.strip() for item in args.configs.split(",") if item.strip()]

		device = "cuda" if torch.cuda.is_available() else "cpu"
		print(f"Device: {device}")
		if device == "cuda":
			print(f" GPU: {torch.cuda.get_device_name(0)}")
			print(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

		print("\nDownloading data...")
		# Assigned at module scope, so run_config sees train_data as a global.
		train_data, val_data = download_data()
		print(f" Train: {len(train_data):,} bytes, Val: {len(val_data):,} bytes")
		print(f" Batch={BATCH}, CTX={CTX_LEN}, Steps={STEPS}, Warmup={WARMUP}")

		results = []
		t_all_0 = time.perf_counter()

		base_state = None
		if args.reuse_base and any(cfg != "TrueTernary" or not args.strict_true_ternary for cfg in CONFIGS):
			# Keep reusable initialization on CPU so it does not inflate per-config VRAM.
			print("\nCreating base model (CPU state reuse)...", flush=True)
			base_model = build_model(strict_ternary=False)
			base_state = {k: v.detach().cpu().clone() for k, v in base_model.state_dict().items()}
			del base_model
			gc.collect()
			if device == "cuda":
				torch.cuda.empty_cache()
			print(" Done.", flush=True)

		for cfg in CONFIGS:
			r = run_config(
				cfg,
				device,
				base_state=base_state,
				strict_true_ternary=args.strict_true_ternary,
				update_backend=args.update_backend,
				scale_update_interval=args.scale_update_interval,
				accum_threshold=args.accum_threshold,
				print_every=max(1, args.print_every),
			)
			results.append(r)

			# Release the finished config's memory before the next one starts.
			gc.collect()
			if device == "cuda":
				torch.cuda.empty_cache()
		t_all = time.perf_counter() - t_all_0

		# Summary table
		print(f"\n{'='*90}")
		print(f" BENCHMARK RESULTS — {STEPS} steps, {BATCH}x{CTX_LEN} batch")
		print(f"{'='*90}")
		print(f" {'Config':<20} {'Loss(avg20)':<12} {'Loss(min)':<10} {'Step(ms)':<10} {'tok/s':<10} {'PeakMB':<8} {'ModelMB':<8} {'OptMB':<8}")
		print(f" {'-'*86}")
		for r in results:
			print(f" {r['config']:<20} {r['final_loss_avg20']:<12} {r['min_loss']:<10} {r['avg_step_ms']:<10} {r['avg_toks_sec']:<10} {r['peak_vram_mb']:<8} {r['model_mem_mb']:<8} {r['optimizer_mem_mb']:<8}")

		def _ratio(numerator, denominator):
			# A zero (or non-positive / NaN) denominator yields inf instead of raising.
			return numerator / denominator if denominator > 0 else float('inf')

		# Compare to baseline
		baseline = next((r for r in results if r['config'] == 'Adam_FP32'), None)
		if baseline:
			print(f"\n {'─'*86}")
			print(f" {'Relative to Adam_FP32':<50}")
			print(f" {'─'*86}")
			for r in results:
				if r['config'] == 'Adam_FP32':
					continue
				loss_ratio = _ratio(r['final_loss_avg20'], baseline['final_loss_avg20'])
				# Baseline throughput divided by this config's throughput.
				speed_ratio = _ratio(baseline['avg_toks_sec'], r['avg_toks_sec'])
				vram_ratio = _ratio(r['peak_vram_mb'], baseline['peak_vram_mb'])
				print(f" {r['config']:<20} loss={loss_ratio:.2f}x speed={speed_ratio:.2f}x vram={vram_ratio:.2f}x")

		# Save results
		out = {
			"config": "True Ternary vs Baselines",
			"steps": STEPS,
			"batch": BATCH,
			"context": CTX_LEN,
			"total_time_s": round(t_all, 1),
			"results": results,
		}
		path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "results", "benchmark", "benchmark_results.json")
		# Create the output directory first so a long run is not lost at the final write.
		os.makedirs(os.path.dirname(path), exist_ok=True)
		with open(path, "w", encoding="utf-8") as f:
			json.dump(out, f, indent=2)
		print(f"\n Results saved to {path}")
		print(f" Total benchmark time: {t_all:.0f}s ({t_all/60:.1f}min)")