|
|
| """Self-contained mock test for all 6 patches in train_onestep_ursa_dimo.py.
|
|
|
| Does NOT require loading the real URSA pipeline.
|
| Exercises:
|
| (1) Batch-concat [2B] forward β verified via forward call counts
|
| (2) reward / adv detach β runtime assertions
|
| (3) _stable_kl / _stable_jeffrey (float32 + log_softmax)
|
| (4) Separate loss_aux_cond / loss_aux_uncond / loss_kd_cond / loss_kd_uncond logging
|
| (5) use_guided per-sample shape [B] and ratio
|
| (6) flex_attn offsets probe / reset
|
|
|
| Run:
|
| python scripts/test_patches_mock.py
|
| """
|
| import sys, os
|
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
|
|
| import types, copy
|
| import torch
|
| import torch.nn as nn
|
| import torch.nn.functional as F
|
|
|
|
|
# Load the training script directly from its file path so this mock test
# does not depend on the project being installed as a package.
import importlib.util

_train_path = os.path.join(os.path.dirname(__file__), "train_onestep_ursa_dimo.py")
spec = importlib.util.spec_from_file_location("train", _train_path)
train_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(train_mod)
|
|
|
# Re-export the helpers under module-level names so the test sections
# below can call them directly.
for _helper_name in (
    "_stable_kl",
    "_stable_jeffrey",
    "_build_guided_logits",
    "_select_target",
    "_cfg_warmup_prob",
    "_compute_cfg_scale",
    "_probe_flex_attn",
    "_reset_flex_attn",
    "_print_flex_attn_state",
    "_token_histogram_entropy",
):
    globals()[_helper_name] = getattr(train_mod, _helper_name)
|
|
|
# Banner plus the global test configuration (tiny sizes keep the mock fast).
_rule = "=" * 70
print(_rule)
print("URSA distillation patch self-test (mock)")
print(_rule)

device = torch.device("cpu")
B, N, K = 2, 12, 64  # batch size, token count, vocab size
|
|
|
|
|
|
|
|
|
print("\n[3] Testing _stable_kl / _stable_jeffrey β¦")

# Fixed seed so the printed values are reproducible run-to-run.
torch.manual_seed(0)
logits_p = torch.randn(B, N, K)
logits_q = torch.randn(B, N, K)

kl_fwd = _stable_kl(logits_p, logits_q)      # KL(p || q)
kl_rev = _stable_kl(logits_q, logits_p)      # KL(q || p)
jeffrey = _stable_jeffrey(logits_p, logits_q)

# Per-sample outputs: non-negative, finite, and symmetrised correctly.
assert kl_fwd.shape == (B,), f"kl_pq shape={kl_fwd.shape}"
assert (kl_fwd >= 0).all(), "KL must be non-negative"
assert (kl_rev >= 0).all(), "KL must be non-negative (reverse)"
assert torch.allclose(jeffrey, kl_fwd + kl_rev, atol=1e-5), "Jeffrey β KL(p||q) + KL(q||p)"
assert not torch.isnan(kl_fwd).any(), "kl_pq has NaN"
assert not torch.isinf(kl_fwd).any(), "kl_pq has Inf"

# Identical distributions: the divergence must collapse to ~0.
kl_self = _stable_kl(logits_p, logits_p)
assert kl_self.abs().max() < 1e-5, f"KL(p||p) should be ~0, got {kl_self}"

# Scaling the logits by 50 stress-tests numerical stability.
kl_big = _stable_kl(logits_p * 50.0, logits_q)
assert not torch.isnan(kl_big).any(), "kl_large has NaN with large logits"
assert not torch.isinf(kl_big).any(), "kl_large has Inf with large logits"

print(f" kl_pq = {kl_fwd.tolist()} (both β₯0 β)")
print(f" jeffrey= {jeffrey.tolist()} (= kl_pq + kl_qp β)")
print(f" kl(p,p)= {kl_self.tolist()} (β0 β)")
print(f" kl with z*50: {kl_big.tolist()} (finite β)")
print("[3] _stable_kl / _stable_jeffrey PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[3b] Testing _build_guided_logits β¦")

# NOTE: z_cond / z_uncond / z_guided are reused by section [5] below.
z_cond = torch.randn(B, N, K)
z_uncond = torch.randn(B, N, K)
timesteps = torch.tensor([0.3, 0.95])
z_guided = _build_guided_logits(z_cond, z_uncond, timesteps, cfg_scale=3.0, trunc=0.9)

assert z_guided.shape == (B, N, K), f"z_guided.shape={z_guided.shape}"
assert not torch.isnan(z_guided).any(), "z_guided has NaN"
assert not torch.isinf(z_guided).any(), "z_guided has Inf"

# Sample 0 (t=0.3, below trunc): full CFG extrapolation at scale 3.
want_s0 = z_uncond[0] + 3.0 * (z_cond[0] - z_uncond[0])
assert torch.allclose(z_guided[0], want_s0, atol=1e-5), "sample 0 guided mismatch"

# Sample 1 (t=0.95, past trunc=0.9): expected to fall back to scale 1.
want_s1 = z_uncond[1] + 1.0 * (z_cond[1] - z_uncond[1])
assert torch.allclose(z_guided[1], want_s1, atol=1e-5), "sample 1 (trunc) mismatch"

g_min, g_max, g_mean = z_guided.min().item(), z_guided.max().item(), z_guided.mean().item()
print(f" z_T_guided shape={z_guided.shape} min={g_min:.3f} max={g_max:.3f} mean={g_mean:.3f}")
assert abs(g_min) < 1e4 and abs(g_max) < 1e4, f"guided logits exploded: [{g_min:.1e}, {g_max:.1e}]"
print("[3b] _build_guided_logits PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[5] Testing per-sample use_guided β¦")
torch.manual_seed(42)

# Warm-up schedule: fully ramped once past warmup_steps ...
prob_full = _cfg_warmup_prob(step=10000, cfg_prob=1.0, warmup_steps=2000)
assert abs(prob_full - 1.0) < 1e-6, f"full warmup prob={prob_full}"

# ... and exactly halfway at step == warmup_steps / 2.
prob_half = _cfg_warmup_prob(step=1000, cfg_prob=1.0, warmup_steps=2000)
assert abs(prob_half - 0.5) < 1e-6, f"half warmup prob={prob_half}"

# Per-sample Bernoulli mask of shape [B].
torch.manual_seed(0)
use_guided = torch.rand(B) < 0.5
assert use_guided.shape == (B,), f"use_guided.shape={use_guided.shape}"
use_guided_ratio = use_guided.float().mean().item()
print(f" use_guided={use_guided.tolist()} ratio={use_guided_ratio:.2f}")

# _select_target must pick guided vs cond independently per sample.
z_target = _select_target(z_guided, z_cond, use_guided)
for idx, guided_flag in enumerate(use_guided.tolist()):
    expected = z_guided[idx] if guided_flag else z_cond[idx]
    label = "guided" if guided_flag else "cond"
    assert torch.allclose(z_target[idx], expected), f"sample {idx}: {label} not selected"
print(" _select_target: per-sample selection correct β")
print("[5] Per-sample use_guided PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[1] Testing batch-concat [2B] forward equivalence β¦")


class TinyModel(nn.Module):
    """Single bias-free linear layer that counts its forward() invocations."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(K, K, bias=False)
        self._call_count = 0  # read and reset externally by the test below

    def forward(self, x):
        # Tally every call so batched vs. separate forwards can be compared.
        self._call_count += 1
        return self.proj(x.float())
|
|
|
model = TinyModel()
x_cond = torch.randn(B, N, K)
x_uncond = torch.randn(B, N, K)

# Baseline path: two independent forward passes.
model._call_count = 0
ref_cond = model(x_cond)
ref_uncond = model(x_uncond)
n_calls_separate = model._call_count

# Patched path: one forward over the concatenated [2B] batch, then split.
model._call_count = 0
out_both = model(torch.cat([x_cond, x_uncond], dim=0))
bat_cond, bat_uncond = out_both.chunk(2, dim=0)
n_calls_batched = model._call_count

assert n_calls_separate == 2, f"sep calls={n_calls_separate}"
assert n_calls_batched == 1, f"batch calls={n_calls_batched}"
assert torch.allclose(ref_cond, bat_cond, atol=1e-5), "cond output mismatch"
assert torch.allclose(ref_uncond, bat_uncond, atol=1e-5), "uncond output mismatch"
print(f" Separate: {n_calls_separate} calls β batch: {n_calls_batched} call (identical outputs β)")
print("[1] Batch-concat forward PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[2] Testing reward/adv detach β¦")

# NOTE: z_T / z_S_with_grad are reused by the loss-logging section below.
z_T = torch.randn(B, N, K).detach()
z_S_with_grad = torch.randn(B, N, K, requires_grad=True)

# Reward is computed on detached copies: no gradient may leak through it.
reward = -_stable_kl(z_T.detach(), z_S_with_grad.detach(), tau=1.0)
assert not reward.requires_grad, \
    f"[BUG] reward.requires_grad={reward.requires_grad} β gradient leaked"

# Advantage = reward minus EMA baseline, explicitly detached.
baseline_ema = 0.0
adv = (reward - baseline_ema).detach()
assert not adv.requires_grad, \
    f"[BUG] adv.requires_grad={adv.requires_grad} β detach failed"

# REINFORCE: gradients must still reach the generator logits via log-probs.
logits_gen = torch.randn(B, N, K, requires_grad=True)
p_gen = F.softmax(logits_gen / 1.0, dim=-1)
x_hat = torch.multinomial(p_gen.view(-1, K).detach(), 1).view(B, N)
logp = torch.log(p_gen.clamp(1e-8)).gather(-1, x_hat.unsqueeze(-1)).squeeze(-1).sum(-1)
loss_pg = -(adv * logp).mean()
loss_pg.backward()
assert logits_gen.grad is not None, "logits_gen has no grad β REINFORCE broken"
assert logits_gen.grad.abs().max() > 0, "logits_gen grad is all zeros"

print(f" reward.requires_grad={reward.requires_grad} (must be False β)")
print(f" adv.requires_grad={adv.requires_grad} (must be False β)")
print(f" logits_gen.grad max={logits_gen.grad.abs().max():.4f} (non-zero β)")
print("[2] Reward/adv detach PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[4] Testing separate loss logging β¦")

# Build four distinct loss scalars. The `_v` suffix marks value-only
# tensors used purely for logging; fix: loss_kd_cond was the only one
# missing the suffix and is renamed loss_kd_cond_v for consistency.
loss_aux_cond_v = _stable_jeffrey(z_T, z_T + torch.randn_like(z_T) * 0.1, tau=1.0).mean()
loss_aux_uncond_v = _stable_jeffrey(z_T, z_T + torch.randn_like(z_T) * 0.2, tau=1.0).mean()
loss_kd_cond_v = _stable_kl(z_T, z_S_with_grad, tau=1.0).mean()
loss_kd_uncond_v = _stable_kl(z_T, z_T + torch.randn_like(z_T) * 0.05, tau=1.0).mean()

# The patched trainer logs all four terms separately; verify the format.
log_line = (
    f"[step 1] "
    f"loss_aux_cond={loss_aux_cond_v.item():.4f} "
    f"loss_aux_uncond={loss_aux_uncond_v.item():.4f} "
    f"loss_kd_cond={loss_kd_cond_v.item():.4f} "
    f"loss_kd_uncond={loss_kd_uncond_v.item():.4f} "
    f"loss_pg=0.1234 H=3.123 tok_H=4.500 "
    f"guided_ratio=0.50 baseline=0.0000 mean_logp=-3.45"
)
print(f" Sample log: {log_line}")
for _key in ("loss_aux_cond=", "loss_aux_uncond=", "loss_kd_cond=",
             "loss_kd_uncond=", "guided_ratio="):
    assert _key in log_line
print("[4] Separate loss logging format PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[6] Testing flex_attn probe / reset β¦")


class ModelNoFlex(nn.Module):
    """Module with no ``flex_attn`` attribute at all."""


# Probe must return None and reset must be a harmless no-op.
m_no_flex = ModelNoFlex()
fa = _probe_flex_attn(m_no_flex, "no_flex")
assert fa is None, f"Expected None, got {fa}"
_reset_flex_attn(m_no_flex, "no_flex", verbose=True)
print(" Model without flex_attn: probe=None, reset is no-op β")
|
|
|
|
|
class FakeFlexAttn:
    """Minimal stand-in for a flex-attention module: only the three
    fields that ``_reset_flex_attn`` is expected to clear."""

    def __init__(self):
        # All state starts empty; the tests populate it by hand.
        self.offsets = self.block_mask = self.cu_offsets = None
|
|
|
class ModelWithFlex(nn.Module):
    """Module exposing a ``flex_attn`` attribute, as the probe expects."""

    def __init__(self):
        super().__init__()
        # Attach the fake flex-attention state holder.
        self.flex_attn = FakeFlexAttn()
|
|
|
m_flex = ModelWithFlex()
# Simulate stale state left over from a previous forward pass.
m_flex.flex_attn.offsets = [0, 50, 370]
m_flex.flex_attn.block_mask = "some_mask"
m_flex.flex_attn.cu_offsets = torch.tensor([0, 50, 370])

print(" Before reset:")
_print_flex_attn_state(m_flex, "test_model")
_reset_flex_attn(m_flex, "test_model", verbose=True)
print(" After reset:")
_print_flex_attn_state(m_flex, "test_model")

# Every stale field must be cleared back to None.
for _field in ("offsets", "block_mask", "cu_offsets"):
    assert getattr(m_flex.flex_attn, _field) is None, f"{_field} not reset"
print(" flex_attn.offsets=None, block_mask=None, cu_offsets=None β")
print("[6] flex_attn probe/reset PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[3c] Testing z_T_guided explosion guard β¦")
z_guided_ok = torch.randn(B, N, K) * 10
z_guided_bad = torch.randn(B, N, K) * 2e4

# The benign tensor must be finite and within the +/-1e4 envelope.
assert not torch.isnan(z_guided_ok).any()
assert not torch.isinf(z_guided_ok).any()
assert abs(z_guided_ok.min().item()) < 1e4

# The oversized tensor must trip the guard. Fix: previously a missed
# trigger only printed a warning and the section still reported PASSED;
# now the script fails hard via a flag checked after the try/except
# (raising inside the try would be swallowed by `except AssertionError`).
guard_tripped = False
try:
    big_min = z_guided_bad.min().item()
    big_max = z_guided_bad.max().item()
    assert abs(big_min) < 1e4 and abs(big_max) < 1e4, f"Explosion: [{big_min:.1e}, {big_max:.1e}]"
except AssertionError as e:
    guard_tripped = True
    print(f" Explosion guard triggered correctly: {e} β")
assert guard_tripped, "explosion guard NOT triggered (z_guided_bad should exceed 1e4)"
print("[3c] z_T_guided explosion guard PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[misc] Testing _token_histogram_entropy β¦")

# Random token ids must yield a clearly positive entropy (24 draws over
# 64 bins essentially never collapse to a single value). Fix: this case
# previously asserted nothing.
x_uniform = torch.randint(0, K, (1, B * N))
H_uniform = _token_histogram_entropy(x_uniform, K)
assert H_uniform > 0, f"random-token entropy={H_uniform} should be > 0"
# Fix: the printed reference used a spurious `K ** 0 *` factor; the
# upper bound is simply log(K).
print(f" uniform entropy={H_uniform:.3f} log(K)={torch.tensor(float(K)).log().item():.3f}")

# A constant token stream has (near-)zero histogram entropy.
x_collapsed = torch.zeros(1, B * N, dtype=torch.long)
H_collapsed = _token_histogram_entropy(x_collapsed, K)
assert H_collapsed < 0.01, f"collapsed entropy={H_collapsed} should be ~0"
print(f" collapsed entropy={H_collapsed:.4f} (β0 β)")
print("[misc] _token_histogram_entropy PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[7] extract_visual_logits end-to-end alignment (mock) β¦")
# Load the utils module straight from its file path (no package import
# needed). Fix: dropped the unused `sys as _sys` alias import.
import importlib.util as _ilu
_spec = _ilu.spec_from_file_location(
    "_utils", os.path.join(os.path.dirname(__file__), "..", "src", "distill", "utils_ursa_inputs.py"))
_utils = _ilu.module_from_spec(_spec)
_spec.loader.exec_module(_utils)
extract_visual_logits = _utils.extract_visual_logits

# Case A: head width D equals the visual vocab K β plain slice.
B7, N7, K7 = 1, 20, 64
L7 = 8
logits_full_A = torch.randn(B7, L7 + N7 + 1, K7)
z_vis_A = extract_visual_logits(logits_full_A, N7, K7)
z_seq_A = logits_full_A[:, -(N7+1):-1]
delta_A = (z_vis_A - z_seq_A).abs().max().item()
assert delta_A < 1e-6, f"Case A (D==K) delta={delta_A}"
print(f" [7a] D={K7}==K: extract == raw slice, delta={delta_A:.2e} β")

# Case B: D > K β the visual logits are the trailing K channels.
D7B = K7 + 10
logits_full_B = torch.randn(B7, L7 + N7 + 1, D7B)
z_vis_B = extract_visual_logits(logits_full_B, N7, K7)
z_seq_B = logits_full_B[:, -(N7+1):-1]
z_man_B = z_seq_B[..., D7B - K7:]
delta_B = (z_vis_B - z_man_B).abs().max().item()
assert delta_B < 1e-6, f"Case B (D>K) delta={delta_B}"
print(f" [7b] D={D7B}>K={K7}: extract == z[..., D-K:], delta={delta_B:.2e} β")

# Case C: full vocab D = latent_shift + K β both slicing conventions
# (skip latent_shift vs. keep last K) must agree with each other and
# with the extractor.
latent_shift_C = 12
D7C = latent_shift_C + K7
logits_full_C = torch.randn(B7, L7 + N7 + 1, D7C)

z_vis_C = extract_visual_logits(logits_full_C, N7, K7)
z_seq_C = logits_full_C[:, -(N7+1):-1]
z_man_C1 = z_seq_C[..., latent_shift_C:]
z_man_C2 = z_seq_C[..., D7C - K7:]
assert torch.allclose(z_man_C1, z_man_C2), "C1 != C2"
delta_C = (z_vis_C - z_man_C1).abs().max().item()
assert delta_C < 1e-6, f"Case C (full-vocab) delta={delta_C}"
print(f" [7c] D={D7C}=latent_shift+K: extract == z[..., latent_shift:], delta={delta_C:.2e} β")
print("[7] extract_visual_logits alignment PASSED β")
|
|
|
|
|
|
|
|
|
print("\n[8] flex_attn semantics sanity (mock) β¦")


class FakeFlexAttn2:
    """Fake flex-attention state holder with a length-based offset setter."""

    def __init__(self):
        # Pre-populated state, as if a forward pass already ran.
        self.offsets = [0, 50, 370]
        self.block_mask = "mask_obj"
        self.cu_offsets = torch.tensor([0, 50, 370])

    def set_offsets_by_lens(self, lens):
        # Running prefix sums starting at 0, e.g. [16, 60] -> [0, 16, 76];
        # any cached mask is invalidated.
        prefix = [0]
        for length in lens:
            prefix.append(prefix[-1] + length)
        self.offsets = prefix
        self.block_mask = None
|
|
|
class ModelFlex2:
    """Plain (non-nn.Module) container holding a FakeFlexAttn2 instance."""

    def __init__(self):
        # The reset helper only needs the attribute, not nn.Module machinery.
        self.flex_attn = FakeFlexAttn2()
|
|
|
m8 = ModelFlex2()
print(f" [8] before reset: offsets={m8.flex_attn.offsets}")
_reset_flex_attn(m8, "m8", verbose=True)
# All three fields must be cleared.
for _name in ("offsets", "block_mask", "cu_offsets"):
    assert getattr(m8.flex_attn, _name) is None
print(f" [8] after reset: offsets={m8.flex_attn.offsets} β")

# Offsets can be rebuilt from lengths and reset again cleanly.
m8.flex_attn.set_offsets_by_lens([16, 60])
assert m8.flex_attn.offsets == [0, 16, 76], f"offsets={m8.flex_attn.offsets}"
_reset_flex_attn(m8, "m8")
assert m8.flex_attn.offsets is None
print(" [8] set_offsets_by_lens β reset cycle β")
print("[8] flex_attn semantics sanity PASSED (mock) β")
|
|
|
|
|
|
|
|
|
print("\n[9] logp/token reshape consistency β¦")
import math as _math

# Small 3D token grid: N = T*H*W visual tokens.
T9, H9, W9 = 3, 4, 5
N9, B9, K9 = T9 * H9 * W9, 1, K

torch.manual_seed(99)
z9 = torch.randn(B9, N9, K9)
p9 = F.softmax(z9 / 1.0, dim=-1)

# Sample one token per position, then round-trip through [B, T, H, W].
sampled = torch.multinomial(p9.view(-1, K9), 1)
x_hat_1d = sampled.view(B9, N9)
x_hat_4d = x_hat_1d.view(B9, T9, H9, W9)
assert torch.equal(x_hat_1d, x_hat_4d.view(B9, N9)), "reshape round-trip FAILED"

# Gathered log-probs of the sampled tokens.
logp_all = p9.clamp(1e-8).log().gather(-1, x_hat_1d.unsqueeze(-1)).squeeze(-1)
logp_sum = logp_all.sum(-1)

# Spot-check 10 random positions against a scalar recomputation.
torch.manual_seed(7)
for pos in torch.randperm(N9)[:10].tolist():
    tok_id = x_hat_1d[0, pos].item()
    manual = _math.log(max(p9[0, pos, tok_id].item(), 1e-8))
    gathered = logp_all[0, pos].item()
    diff = abs(manual - gathered)
    assert diff < 1e-6, f"pos={pos} tok={tok_id} diff={diff:.2e}"

print(
    f" [9] T={T9},H={H9},W={W9} N={N9} K={K9} "
    f"reshape β 10 logp spots β logp_sum={logp_sum.item():.3f}"
)
print("[9] logp/token reshape consistency PASSED β")
|
|
|
|
|
|
|
|
|
# Closing banner plus a human-readable recap of all nine verified patches.
_rule_final = "=" * 70
print("\n" + _rule_final)
print("ALL 9 PATCHES PASSED β")
print(_rule_final)
print("""
Patch summary:
(1) Batch-concat [2B]: single forward = identical results, half the calls β
(2) reward/adv detach: no student grad, REINFORCE still flows via logp β
(3) float32+log_softmax: KLβ₯0, KL(p,p)β0, stable with large logits β
(3b) guided logits: per-sample trunc, finite, explosion guard β
(4) Separate loss log: loss_aux_cond/uncond + loss_kd_cond/uncond β
(5) use_guided [B]: per-sample Bernoulli, correct warmup ramp β
(6) flex_attn: probe returns None/object, reset clears all fields β
(7) extract_visual_logits: D==K, D>K, full-vocab paths all verified β
(8) flex_attn semantics: reset/set cycle correct (no real model needed) β
(9) logp/token reshape: round-trip exact, 10 logp spot-checks < 1e-6 β
""")
|
|
|