Upload folder using huggingface_hub

d8bc908 verified 2 days ago

19.1 kB

	"""
	import os
	import sys
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
	FlashVQ Correctness Tests — CPU path, GPU path, and CPU vs GPU equivalence.

	Test structure follows testing/test_tscale.py pattern:
	- Each test is a standalone function
	- Manual runner at bottom for direct execution
	- CUDA/Triton tests skip gracefully when unavailable

	Tests 1-7: CPU path correctness (Task 1)
	Tests 8-11: GPU path correctness + CPU vs GPU equivalence (Task 2)
	"""

	import torch
	import torch.nn.functional as F
	import sys
	import os


	import flash_vq
	from arbitor.kernel.flash_vq import FlashVQCodebook, _HAS_TRITON

	try:
	from arbitor.main import VQAdapter, MultimodalVQBridge, HIDDEN_DIM, CODEBOOK_DIM
	from arbitor.kernel.ternary_scale import TScaleType
	_HAS_TRIGRAM = True
	except ImportError:
	_HAS_TRIGRAM = False


	# ─── Test Helpers ───

	def _make_cpu_vq(codebook_size=8192, codebook_dim=32, seed=42, rotation_trick=True):
	    """
	    Build a FlashVQCodebook after seeding torch's global RNG with ``seed``.

	    Only the codebook size, codebook dimension and the rotation-trick switch
	    are configurable; every other constructor argument is pinned to the
	    fixed value used throughout this test module. No device transfer is
	    performed here.
	    """
	    # Seed first so anything random in the constructor is reproducible.
	    torch.manual_seed(seed)
	    settings = dict(
	        codebook_size=codebook_size,
	        codebook_dim=codebook_dim,
	        decay=0.99,
	        commitment_weight=1.0,
	        threshold_ema_dead_code=2,
	        kmeans_init=False,
	        kmeans_iters=10,
	        rotation_trick=rotation_trick,
	    )
	    return FlashVQCodebook(**settings)


	# ─── Task 1: CPU Path Tests (Tests 1-7) ───

	def test_flash_vq_cpu_forward_shapes():
	    """
	    Test 1: the CPU forward pass on random input returns the triple
	    (quantized, indices, commitment_loss) with the expected shapes.
	    """
	    codebook = _make_cpu_vq()
	    # A (4, 16, 32) batch flattened to (64, 32) rows for the CPU forward path.
	    flat_input = torch.randn(4, 16, 32).reshape(-1, 32)
	    quantized, indices, loss = codebook._cpu_forward(flat_input)

	    # One quantized row of width 32 per input row.
	    assert quantized.shape == (64, 32), f"quantized shape: {quantized.shape}"
	    # One code index per input row.
	    assert indices.shape == (64,), f"indices shape: {indices.shape}"
	    # The loss must be a genuine 0-dim scalar holding a single element.
	    assert loss.numel() == 1, f"loss shape: {loss.shape}"
	    assert loss.dim() == 0, f"loss dim: {loss.dim()}"
	    # Every index must address an existing codebook row.
	    assert indices.min() >= 0, f"negative index: {indices.min()}"
	    assert indices.max() < codebook.codebook_size, f"index too large: {indices.max()}"
	    # Trailing dimension equals the codebook dimension.
	    assert quantized.shape[-1] == 32, f"quantized last dim: {quantized.shape[-1]}"

	    print(" PASS test_flash_vq_cpu_forward_shapes")


	def test_flash_vq_cpu_quantized_matches_codebook():
	    """
	    Test 2: the CPU quantized output equals the codebook rows selected by
	    the returned indices (straight-through estimator form).
	    """
	    codebook = _make_cpu_vq()
	    flat_input = torch.randn(4, 16, 32).reshape(-1, 32)

	    # Copy the codebook up front: the forward pass is expected to update
	    # ``embed`` (EMA), so the comparison must use the pre-forward values.
	    codebook_before = codebook.embed.clone()

	    quantized, indices, _ = codebook._cpu_forward(flat_input)

	    # STE form: quantized = x + (embed[indices] - x).detach(), hence the two
	    # residuals below must agree.
	    selected_rows = codebook_before[indices]
	    residual_actual = quantized - flat_input
	    residual_expected = (selected_rows - flat_input).detach()
	    assert torch.allclose(residual_actual, residual_expected, atol=1e-6), \
	        "STE: quantized - x should equal (embed[indices] - x).detach()"

	    print(" PASS test_flash_vq_cpu_quantized_matches_codebook")


	def test_flash_vq_cpu_cosine_sim():
	    """
	    Test 3: indices chosen by the CPU forward pass equal the argmax of
	    F.normalize(x) @ F.normalize(codebook).T computed by hand.
	    """
	    codebook = _make_cpu_vq()
	    flat_input = torch.randn(4, 16, 32).reshape(-1, 32)

	    # Reference codebook taken before the forward pass, which is expected
	    # to modify ``embed`` through its EMA update.
	    codebook_before = codebook.embed.clone()

	    _, indices, _ = codebook._cpu_forward(flat_input)

	    # Hand-rolled cosine similarity: unit-normalise both sides, take the
	    # inner product, then pick the best-matching code for every row.
	    unit_input = F.normalize(flat_input, dim=-1)
	    unit_codes = F.normalize(codebook_before, dim=-1)
	    manual_indices = torch.matmul(unit_input, unit_codes.T).argmax(dim=-1)

	    assert torch.equal(indices, manual_indices), \
	        f"Indices differ! First 10 indices: {indices[:10]} vs {manual_indices[:10]}"

	    print(" PASS test_flash_vq_cpu_cosine_sim")


	def test_flash_vq_cpu_ema_update():
	    """
	    Test 4: the EMA update changes ``embed`` and ``cluster_size``.

	    Part one calls ``_ema_update`` directly with hand-picked indices
	    (rotation_trick=False) and checks that only the assigned entries gain
	    cluster mass. Part two runs a full ``_cpu_forward`` on a fresh codebook
	    and checks that ``embed`` changed.
	    """
	    codebook = _make_cpu_vq(rotation_trick=False)
	    embed_snapshot = codebook.embed.clone()
	    cluster_snapshot = codebook.cluster_size.clone()

	    flat_input = torch.randn(2, 8, 32).reshape(-1, 32)
	    # Round-robin the 16 rows over codes 0..3 so the assignment is known.
	    forced_indices = torch.arange(16, dtype=torch.long) % 4

	    # Direct call: keeps the EMA step isolated from the dead-code reset.
	    codebook._ema_update(flat_input, forced_indices)

	    assert not torch.equal(embed_snapshot, codebook.embed), \
	        "Embed did not change after EMA update"
	    assert not torch.equal(cluster_snapshot, codebook.cluster_size), \
	        "cluster_size did not change after EMA update"
	    # Assumes cluster_size starts at zero: only codes 0..3 received inputs,
	    # so only they may carry non-zero mass afterwards.
	    assert (codebook.cluster_size[:4] > 0).all(), \
	        "Assigned entries should have non-zero cluster_size"
	    assert (codebook.cluster_size[4:] == 0).all(), \
	        "Unassigned entries should have zero cluster_size"

	    # Smoke-check the complete forward path on an untouched codebook.
	    fresh_codebook = _make_cpu_vq(rotation_trick=False)
	    fresh_snapshot = fresh_codebook.embed.clone()
	    fresh_codebook._cpu_forward(torch.randn(4, 16, 32).reshape(-1, 32))
	    assert not torch.equal(fresh_snapshot, fresh_codebook.embed), \
	        "Embed did not change after full forward pass"

	    print(" PASS test_flash_vq_cpu_ema_update")


	def test_flash_vq_cpu_dead_code_reset():
	    """
	    Test 5: FlashVQCodebook CPU dead code reset replaces inactive codebook entries.

	    Setup marks every entry dead (cluster_size 0) except the first ten
	    (cluster_size 5.0), then calls ``_dead_code_reset`` and verifies that
	    the first dead entry's embedding was overwritten.

	    The dead indices are captured BEFORE the reset. Previously they were
	    recomputed afterwards from ``cluster_size == 0`` and the check was
	    guarded by an ``if``, so the test passed without checking anything
	    whenever the reset left no zero-sized entries behind.
	    """
	    vq = _make_cpu_vq()
	    # Everything dead ...
	    vq.cluster_size[:] = 0.0
	    # ... except the first ten entries.
	    vq.cluster_size[:10] = 5.0

	    x_flat = torch.randn(2, 8, 32).reshape(-1, 32)

	    # Pre-reset state: embeddings and the set of dead entries.
	    embed_before = vq.embed.clone()
	    dead_indices = torch.where(vq.cluster_size == 0)[0]

	    n_dead_before = vq.get_dead_code_count()
	    assert n_dead_before == vq.codebook_size - 10, \
	        f"Expected {vq.codebook_size - 10} dead entries, got {n_dead_before}"
	    # Guard against a vacuous test: there must be something to reset.
	    assert len(dead_indices) > 0, "Test setup produced no dead entries"

	    # Run dead code reset
	    vq._dead_code_reset(x_flat)

	    # An entry that was dead before the reset must have a new embedding.
	    idx = dead_indices[0]
	    assert not torch.equal(embed_before[idx], vq.embed[idx]), \
	        f"Dead entry {idx} embed was not replaced"

	    print(" PASS test_flash_vq_cpu_dead_code_reset")


	def test_flash_vq_cpu_rotation_trick_grad():
	    """
	    Test 6: FlashVQCodebook CPU rotation trick gradient flows correctly.
	    Gradient should not be zero, and should differ from STE gradient.

	    Two codebooks built from the same seed are run on the same input, one
	    with rotation_trick=True and one with rotation_trick=False. The
	    gradients w.r.t. the input are compared only when both runs selected
	    identical codebook indices.

	    The ``grad is not None`` checks run before ``.clone()``; previously
	    the clone came first, so a missing gradient surfaced as an
	    AttributeError and the assertion message could never be shown.
	    """
	    torch.manual_seed(42)

	    # With rotation trick
	    vq_rot = _make_cpu_vq(rotation_trick=True, seed=42)
	    x = torch.randn(2, 4, 32, requires_grad=True)
	    # Fresh leaf tensor so .grad is populated on the flattened input itself.
	    x_flat = x.reshape(-1, 32).detach().clone().requires_grad_(True)

	    quantized_rot, indices_rot, loss_rot = vq_rot._cpu_forward(x_flat)

	    # Backpropagate a simple scalar through the quantized output.
	    quantized_rot.sum().backward()

	    assert x_flat.grad is not None, "Rotation trick gradient is None"
	    rot_grad = x_flat.grad.clone()
	    assert rot_grad.abs().sum().item() > 0, "Rotation trick gradient is all zeros"

	    # Compare with STE gradient (no rotation), same seed and same input.
	    torch.manual_seed(42)
	    vq_ste = _make_cpu_vq(rotation_trick=False, seed=42)
	    x_flat2 = x.reshape(-1, 32).detach().clone().requires_grad_(True)

	    quantized_ste, indices_ste, loss_ste = vq_ste._cpu_forward(x_flat2)
	    quantized_ste.sum().backward()

	    assert x_flat2.grad is not None, "STE gradient is None"
	    ste_grad = x_flat2.grad.clone()

	    # The gradients are only comparable when both runs picked the same codes.
	    if torch.equal(indices_rot, indices_ste):
	        grad_diff = (rot_grad - ste_grad).abs().max().item()
	        assert grad_diff > 1e-8, \
	            f"Rotation trick gradient equals STE gradient (diff={grad_diff})"

	    print(" PASS test_flash_vq_cpu_rotation_trick_grad")


	def test_flash_vq_cpu_commitment_loss():
	    """
	    Test 7: the CPU commitment loss is a non-negative 0-dim scalar equal to
	    the MSE between the input and the detached quantized output.
	    """
	    codebook = _make_cpu_vq(rotation_trick=False)
	    flat_input = torch.randn(4, 16, 32).reshape(-1, 32)

	    quantized, _, loss = codebook._cpu_forward(flat_input)

	    assert loss.item() >= 0.0, f"Commitment loss is negative: {loss.item()}"
	    assert loss.dim() == 0, f"Loss is not scalar: {loss.shape}"

	    # The helper builds the codebook with commitment_weight=1.0, so the
	    # loss is compared against the unscaled MSE.
	    expected_loss = F.mse_loss(flat_input, quantized.detach())
	    assert torch.allclose(loss, expected_loss, atol=1e-6), \
	        f"Loss mismatch: {loss.item()} vs {expected_loss.item()}"

	    print(" PASS test_flash_vq_cpu_commitment_loss")


	# ─── Task 2: GPU Path Tests (Tests 8-11) ───

	def _make_gpu_vq(codebook_size=8192, codebook_dim=32, seed=42, rotation_trick=True):
	    """
	    Build the same seeded codebook as ``_make_cpu_vq`` and move it to CUDA.

	    Callers are responsible for checking that CUDA is available first.
	    """
	    return _make_cpu_vq(
	        codebook_size=codebook_size,
	        codebook_dim=codebook_dim,
	        seed=seed,
	        rotation_trick=rotation_trick,
	    ).cuda()


	def test_flash_vq_gpu_vs_cpu_forward():
	    """
	    Test 8: the Triton forward output agrees with the CPU forward output —
	    quantized values and loss within 1e-3, indices exactly.

	    Prints a SKIP line and returns when CUDA or Triton is unavailable.
	    """
	    if not (torch.cuda.is_available() and _HAS_TRITON):
	        print(" SKIP test_flash_vq_gpu_vs_cpu_forward (CUDA/Triton unavailable)")
	        return

	    torch.manual_seed(42)
	    # Both helpers reseed with the same default seed before construction.
	    cpu_codebook = _make_cpu_vq(rotation_trick=False)
	    gpu_codebook = _make_gpu_vq(rotation_trick=False)

	    flat_input = torch.randn(2, 8, 32).reshape(-1, 32)

	    quantized_cpu, indices_cpu, loss_cpu = cpu_codebook._cpu_forward(flat_input)
	    # Independent device copy of the very same input rows.
	    device_input = flat_input.detach().clone().cuda()
	    quantized_gpu, indices_gpu, loss_gpu = gpu_codebook._triton_forward(device_input)

	    # Quantized vectors: worst-case element-wise deviation.
	    fwd_diff = (quantized_cpu - quantized_gpu.cpu()).abs().max().item()
	    assert fwd_diff < 1e-3, \
	        f"CPU vs GPU quantized max diff: {fwd_diff} (exceeds 1e-3)"

	    # Code assignments must be identical, not merely close.
	    assert torch.equal(indices_cpu, indices_gpu.cpu()), \
	        "CPU vs GPU indices differ"

	    # Scalar loss comparison.
	    loss_diff = abs(loss_cpu.item() - loss_gpu.cpu().item())
	    assert loss_diff < 1e-3, \
	        f"CPU vs GPU loss diff: {loss_diff}"

	    print(f" PASS test_flash_vq_gpu_vs_cpu_forward (fwd_diff={fwd_diff:.6f})")


	def test_flash_vq_gpu_vs_cpu_gradients():
	    """
	    Test 9: with the rotation trick enabled, the input gradient from the
	    Triton path agrees with the CPU path within 1e-3.

	    Prints a SKIP line and returns when CUDA or Triton is unavailable.
	    """
	    if not (torch.cuda.is_available() and _HAS_TRITON):
	        print(" SKIP test_flash_vq_gpu_vs_cpu_gradients (CUDA/Triton unavailable)")
	        return

	    torch.manual_seed(42)
	    cpu_codebook = _make_cpu_vq(rotation_trick=True, seed=42)
	    gpu_codebook = _make_gpu_vq(rotation_trick=True, seed=42)

	    # Leaf tensor on CPU so its .grad is populated by backward().
	    cpu_input = torch.randn(2, 4, 32).reshape(-1, 32).detach().clone().requires_grad_(True)

	    # CPU: forward, then backprop the sum of the quantized output.
	    cpu_quantized, _, _ = cpu_codebook._cpu_forward(cpu_input)
	    cpu_quantized.sum().backward()
	    cpu_grad = cpu_input.grad.clone()

	    # GPU: same values on a separate leaf tensor living on the device.
	    gpu_input = cpu_input.detach().clone().cuda().requires_grad_(True)
	    gpu_quantized, _, _ = gpu_codebook._triton_forward(gpu_input)
	    gpu_quantized.sum().backward()
	    gpu_grad = gpu_input.grad.clone()

	    bwd_diff = (cpu_grad - gpu_grad.cpu()).abs().max().item()
	    assert bwd_diff < 1e-3, \
	        f"CPU vs GPU gradient max diff: {bwd_diff} (exceeds 1e-3)"

	    print(f" PASS test_flash_vq_gpu_vs_cpu_gradients (bwd_diff={bwd_diff:.6f})")


	def test_flash_vq_gpu_small_codebook():
	    """
	    Test 10: with codebook_size=4096 the Triton path still agrees with the
	    CPU path (multi-codebook support per D-102).

	    Prints a SKIP line and returns when CUDA or Triton is unavailable.
	    """
	    if not (torch.cuda.is_available() and _HAS_TRITON):
	        print(" SKIP test_flash_vq_gpu_small_codebook (CUDA/Triton unavailable)")
	        return

	    torch.manual_seed(42)
	    cpu_codebook = _make_cpu_vq(codebook_size=4096, rotation_trick=False)
	    gpu_codebook = _make_gpu_vq(codebook_size=4096, rotation_trick=False)

	    flat_input = torch.randn(2, 8, 32).reshape(-1, 32)

	    cpu_quantized, cpu_indices, _ = cpu_codebook._cpu_forward(flat_input)
	    device_input = flat_input.detach().clone().cuda()
	    gpu_quantized, gpu_indices, _ = gpu_codebook._triton_forward(device_input)

	    # Values within tolerance, assignments exactly equal.
	    fwd_diff = (cpu_quantized - gpu_quantized.cpu()).abs().max().item()
	    assert fwd_diff < 1e-3, \
	        f"CPU vs GPU (4096) quantized max diff: {fwd_diff}"
	    assert torch.equal(cpu_indices, gpu_indices.cpu()), \
	        "CPU vs GPU (4096) indices differ"

	    print(f" PASS test_flash_vq_gpu_small_codebook (fwd_diff={fwd_diff:.6f})")


	# ─── Task 3: VQAdapter Integration Tests ───

	def test_flash_vq_in_vqadapter():
	    """
	    Test 11: a VQAdapter forward pass yields correctly shaped outputs, and
	    its helper methods (get_codebook_utilization, get_dead_code_count,
	    l2_distance_matching) return values of the expected type and range.

	    Prints a SKIP line and returns when the optional project imports at
	    the top of the file failed.
	    """
	    if not _HAS_TRIGRAM:
	        print(" SKIP test_flash_vq_in_vqadapter (trigram.py not importable)")
	        return

	    adapter = VQAdapter(codebook_size=128, codebook_dim=32, tscale_type=TScaleType.T4)
	    # Overwrite the inner codebook state with small random embeddings and
	    # zeroed cluster sizes, then switch to eval mode.
	    adapter.vq.embed.data = torch.randn(128, 32) * 0.02
	    adapter.vq.cluster_size.data.zero_()
	    adapter.eval()

	    features = torch.randn(2, 8, 512)  # [B, T, trigram_dim]

	    with torch.no_grad():
	        output, vq_loss, indices = adapter(features)

	    # Output keeps the input layout [B, T, 512].
	    assert output.shape == (2, 8, 512), f"output shape: {output.shape}"
	    # Single-element loss.
	    assert vq_loss.numel() == 1, f"vq_loss shape: {vq_loss.shape}"
	    # One code index per (batch, time) position, inside the codebook.
	    assert indices.shape == (2, 8), f"indices shape: {indices.shape}"
	    assert indices.min() >= 0, f"negative index: {indices.min()}"
	    assert indices.max() < 128, f"index too large: {indices.max()}"

	    # Utilization is a float fraction in [0, 1].
	    util = adapter.get_codebook_utilization()
	    assert isinstance(util, float), f"util type: {type(util)}"
	    assert 0.0 <= util <= 1.0, f"util out of range: {util}"

	    # Dead-code count is a non-negative Python int.
	    dead = adapter.get_dead_code_count()
	    assert isinstance(dead, int), f"dead type: {type(dead)}"
	    dead_val = int(dead)
	    assert dead_val >= 0, f"dead count negative: {dead_val}"

	    # l2_distance_matching takes codebook_dim-wide input, so pass only the
	    # first 32 feature channels.
	    codebook_dim_input = features[..., :32]
	    with torch.no_grad():
	        l2_idx, l2_dist = adapter.l2_distance_matching(codebook_dim_input)
	    assert l2_idx.shape == (2, 8), f"l2 indices shape: {l2_idx.shape}"
	    assert l2_dist.shape == (2, 8), f"l2 distances shape: {l2_dist.shape}"
	    assert l2_dist.min() >= 0.0, "l2 distance should be non-negative"

	    print(" PASS test_flash_vq_in_vqadapter")


	def test_flash_vq_multimodal_bridge():
	    """
	    Test 12: each of the three adapters of a MultimodalVQBridge (text,
	    image, audio) produces correctly shaped output and in-range indices,
	    and the bridge reports a utilization in [0, 1] for every modality.

	    Prints a SKIP line and returns when the optional project imports at
	    the top of the file failed.
	    """
	    if not _HAS_TRIGRAM:
	        print(" SKIP test_flash_vq_multimodal_bridge (trigram.py not importable)")
	        return

	    bridge = MultimodalVQBridge(
	        text_codebook_size=256,
	        image_codebook_size=128,
	        audio_codebook_size=128,
	        codebook_dim=32,
	        enable_image=True,
	        enable_audio=True,
	    )
	    bridge.eval()

	    features = torch.randn(2, 8, 512)
	    # The same features are fed to every modality adapter.
	    with torch.no_grad():
	        text_out, text_loss, text_idx = bridge.text_vq(features)
	        image_out, image_loss, image_idx = bridge.image_vq(features)
	        audio_out, audio_loss, audio_idx = bridge.audio_vq(features)

	    # Each adapter preserves the input layout.
	    assert text_out.shape == (2, 8, 512), f"text output shape: {text_out.shape}"
	    assert image_out.shape == (2, 8, 512), f"image output shape: {image_out.shape}"
	    assert audio_out.shape == (2, 8, 512), f"audio output shape: {audio_out.shape}"

	    # Indices stay below the respective codebook size.
	    assert text_idx.max() < 256, f"text index too large: {text_idx.max()}"
	    assert image_idx.max() < 128, f"image index too large: {image_idx.max()}"
	    assert audio_idx.max() < 128, f"audio index too large: {audio_idx.max()}"

	    # Bridge-level report: one entry per modality, each a valid fraction.
	    all_util = bridge.get_codebook_utilization()
	    for modality in ('text', 'image', 'audio'):
	        assert modality in all_util
	    for mod, u in all_util.items():
	        assert 0.0 <= u <= 1.0, f"{mod} utilization out of range: {u}"

	    print(" PASS test_flash_vq_multimodal_bridge")


	# ─── Manual Test Runner ───

	if __name__ == "__main__":
	    # Manual runner: executes every test in order, prints a summary, and
	    # exits with status 1 if any test failed.
	    #
	    # Tests in this file signal a skip by printing a line starting with
	    # " SKIP" and returning normally; they do not raise. The runner captures
	    # each test's stdout and looks for that line, so skipped tests are
	    # counted as skipped rather than passed. The exception-based skip check
	    # is kept for any test that raises with a " SKIP" message.
	    import contextlib
	    import io
	    import traceback

	    cpu_tests = [
	        test_flash_vq_cpu_forward_shapes,
	        test_flash_vq_cpu_quantized_matches_codebook,
	        test_flash_vq_cpu_cosine_sim,
	        test_flash_vq_cpu_ema_update,
	        test_flash_vq_cpu_dead_code_reset,
	        test_flash_vq_cpu_rotation_trick_grad,
	        test_flash_vq_cpu_commitment_loss,
	    ]
	    gpu_tests = [
	        test_flash_vq_gpu_vs_cpu_forward,
	        test_flash_vq_gpu_vs_cpu_gradients,
	        test_flash_vq_gpu_small_codebook,
	    ]
	    integration_tests = [
	        test_flash_vq_in_vqadapter,
	        test_flash_vq_multimodal_bridge,
	    ]
	    all_tests = cpu_tests + gpu_tests + integration_tests

	    print("Running FlashVQ tests...\n")
	    passed = 0
	    failed = 0
	    skipped = 0
	    for test in all_tests:
	        captured = io.StringIO()
	        try:
	            with contextlib.redirect_stdout(captured):
	                test()
	        except Exception as e:
	            # Replay whatever the test printed before it raised.
	            sys.stdout.write(captured.getvalue())
	            msg = str(e)
	            if msg.startswith(" SKIP"):
	                print(msg)
	                skipped += 1
	            else:
	                print(f" FAIL {test.__name__}: {e}")
	                traceback.print_exc()
	                failed += 1
	        else:
	            output = captured.getvalue()
	            sys.stdout.write(output)
	            # A test that returned normally counts as skipped if it
	            # announced a skip, otherwise as passed.
	            if any(line.lstrip().startswith("SKIP") for line in output.splitlines()):
	                skipped += 1
	            else:
	                passed += 1
	    total_run = passed + failed
	    print(f"\n{passed} passed, {failed} failed, {skipped} skipped out of {len(all_tests)} tests (attempted {total_run})")
	    sys.exit(1 if failed > 0 else 0)