import torch
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import torch.nn as nn
import sys
import os


from arbitor.config import (
VOCAB, EMBEDDING_DIM, HIDDEN_DIM, FFN_HIDDEN, CTX, THRESHOLD,
CODEBOOK_DIM, CODEBOOK_SIZE,
SPECIAL_VOCAB,
StickyZoneSTE,
ByteEmbedding, Sequencer, TextSequencer, ImageSequencer, AudioSequencer,
MultimodalSequencer,
TernaryGNNLayer, TernaryGraph, GraphMoEGate, SharedProjectionMoE,
ByteHead, ARBModel, VQAdapter, MultimodalVQBridge, ModalityGate,
LossComponents, LossWeights, GNNLoRAAdapter,
HaltingUnit, GraphACTCell, MoEACTCell,
MemGram, ConvVQCodebook,
FocusGate, ConversationStack, ConversationLSTM,
_BOUNDARY_TOKEN_MAP, _extract_boundary_from_input,
)
from arbitor.kernel.ternary_scale import TernaryScaleTensor, TernaryRMSNorm, TScaleType

# Module classes whose directly-owned parameters _is_ternary_param() reports as ternary.
TERNARY_MODULES = (TernaryScaleTensor, TernaryRMSNorm, ByteEmbedding, TernaryGraph, GraphMoEGate, SharedProjectionMoE, GNNLoRAAdapter, HaltingUnit, GraphACTCell, MoEACTCell, Sequencer, TextSequencer, ImageSequencer, AudioSequencer, MultimodalVQBridge, ModalityGate, MemGram, ConvVQCodebook, ConversationLSTM)


def _is_ternary_param(model, name):
    """Return True when the module directly owning parameter ``name`` is a TERNARY_MODULES instance.

    ``name`` is a dotted parameter path. Everything before its final dot is looked
    up among ``model.named_modules()``; a name without a dot maps to the key ``""``.
    An owner that cannot be found yields False.
    """
    owner_path, _, _ = name.rpartition(".")
    modules_by_name = dict(model.named_modules())
    owner = modules_by_name.get(owner_path)
    return isinstance(owner, TERNARY_MODULES)


# ===== Phase 1: Foundation Tests =====

def test_sticky_zone_ste():
    """Check StickyZoneSTE emits only -1/0/+1 and produces gradients on both sides of the threshold."""
    thr = 0.05
    w = torch.randn(8, 8, requires_grad=True)
    t = StickyZoneSTE.apply(w, thr)

    # Forward pass: every output value must be ternary.
    unique = set(t.detach().flatten().tolist())
    assert unique <= {-1.0, 0.0, 1.0}, f"Non-ternary values: {unique}"

    t.sum().backward()
    assert w.grad is not None

    magnitude = w.abs()
    outside = magnitude > thr
    dead = magnitude <= thr
    # Each region is only checked when the random draw actually populated it.
    if outside.any():
        assert (w.grad[outside] != 0).any(), "Outside threshold should have non-zero gradient"
    if dead.any():
        assert (w.grad[dead] >= 0).all(), "Sticky zone gradient should be non-negative"
    print(" PASS test_sticky_zone_ste")


def test_sticky_zone_ste_dtype_preservation():
    """Check that a bfloat16 input keeps bfloat16 for both the output and the gradient."""
    want = torch.bfloat16
    w_bf16 = torch.randn(8, 8, dtype=want, requires_grad=True)
    t = StickyZoneSTE.apply(w_bf16, 0.05)
    assert t.dtype == want, f"Expected bfloat16, got {t.dtype}"

    t.sum().backward()
    assert w_bf16.grad.dtype == want, f"Expected bfloat16 grad, got {w_bf16.grad.dtype}"
    print(" PASS test_sticky_zone_ste_dtype_preservation")


def test_scaled_ternary_linear():
    """Check a bias-free TernaryScaleTensor(32, 16) maps (2, 10, 32) to (2, 10, 16)."""
    layer = TernaryScaleTensor(32, 16, bias=False)
    out = layer(torch.randn(2, 10, 32))
    assert out.shape == (2, 10, 16), f"Shape: {out.shape}"
    assert layer.bias is None, "TernaryScaleTensor bias should be None"
    print(" PASS test_scaled_ternary_linear")


def test_rmsnorm():
    """Check that TernaryRMSNorm keeps the shape of its input.

    NOTE(review): a reference RMS normalisation is computed below, but only its
    shape is compared with the module output, never its values. Confirm whether
    a value comparison was intended.
    """
    norm = TernaryRMSNorm(32)
    x = torch.randn(2, 10, 32)
    out = norm(x)
    assert out.shape == x.shape, f"Shape: {out.shape}"

    mean_square = torch.mean(x ** 2, dim=-1, keepdim=True)
    rms = torch.sqrt(mean_square + 1e-8)
    unit_gain = torch.ones(32, device=x.device)
    expected = unit_gain * (x / rms)
    assert out.shape == expected.shape, "RMSNorm mismatch"
    print(" PASS test_rmsnorm")


def test_byte_embedding():
    """Check ByteEmbedding turns (4, 20) token ids into (4, 20, EMBEDDING_DIM)."""
    embedder = ByteEmbedding()
    token_ids = torch.randint(0, VOCAB, (4, 20))
    out = embedder(token_ids)
    assert out.shape == (4, 20, EMBEDDING_DIM), f"Shape: {out.shape}"
    print(" PASS test_byte_embedding")


def test_text_sequencer():
    """Check TextSequencer maps a length-10 embedded sequence to 8 positions of HIDDEN_DIM."""
    sequencer = TextSequencer()
    embedded = torch.randn(2, 10, EMBEDDING_DIM)
    out = sequencer(embedded)
    assert out.shape == (2, 8, HIDDEN_DIM), f"Shape: {out.shape}, expected (2, 8, {HIDDEN_DIM})"
    print(" PASS test_text_sequencer")


def test_trigram_window():
    """Check Tensor.unfold(size=3, step=1) over the sequence axis yields overlapping trigrams."""
    x = torch.zeros(1, 5, EMBEDDING_DIM)
    # Position p is filled with the constant p + 1 so window contents are easy to read off.
    for pos in range(5):
        x[0, pos].fill_(pos + 1)

    windows = x.unfold(dimension=1, size=3, step=1)
    assert windows.shape == (1, 3, EMBEDDING_DIM, 3), f"Unfold shape: {windows.shape}"

    # The first window must hold positions 0, 1, 2 in order.
    for offset, want in enumerate((1.0, 2.0, 3.0)):
        assert windows[0, 0, 0, offset].item() == want
    print(" PASS test_trigram_window")


def test_image_sequencer():
    """Check ImageSequencer maps one 3x224x224 image to a (1, 194, HIDDEN_DIM) sequence."""
    sequencer = ImageSequencer()
    image = torch.randn(1, 3, 224, 224)
    encoded = sequencer(image)
    assert encoded.shape == (1, 194, HIDDEN_DIM)
    print(" PASS test_image_sequencer")


def test_image_sequencer_frozen():
    """Check that no parameter of ImageSequencer.vit requires gradients."""
    sequencer = ImageSequencer()
    assert not any(weight.requires_grad for weight in sequencer.vit.parameters())
    print(" PASS test_image_sequencer_frozen")


def test_target_alignment():
    """Check that logits minus their last position line up with targets taken from x[:, 3:]."""
    model = ARBModel()
    token_ids = [SPECIAL_VOCAB["BOS"], 10, 20, 30, 40, 50, SPECIAL_VOCAB["EOS"]]
    x = torch.tensor([token_ids])
    targets = x[:, 3:]

    logits, losses, _, _ = model(x, targets=targets)
    assert losses is not None, "Losses should be computed"

    usable_positions = logits[:, :-1, :].shape[1]
    assert usable_positions == targets.shape[1], "Target alignment mismatch"
    print(" PASS test_target_alignment")


def test_model_forward():
    """Check a target-free forward pass returns (B, T - 2, VOCAB) logits and no losses."""
    B, T = 2, 66
    model = ARBModel()
    x = torch.randint(0, VOCAB, (B, T))
    logits, losses, _, _ = model(x)
    assert logits.shape == (B, T - 2, VOCAB), f"Shape: {logits.shape}, expected ({B}, {T-2}, {VOCAB})"
    assert losses is None, "Losses should be None without targets"
    print(" PASS test_model_forward")


def test_generate():
    """Check generate() extends a 4-token seed by 10 tokens that all lie inside the vocabulary."""
    model = ARBModel()
    model.eval()
    seed_ids = [SPECIAL_VOCAB["BOS"], *map(ord, "Hel")]
    seed = torch.tensor([seed_ids])
    with torch.no_grad():
        output = model.generate(seed, max_new_token=10, temperature=1.0)
    assert output.shape == (1, 14), f"Shape: {output.shape}"
    assert (output >= 0).all() and (output < VOCAB).all(), "Tokens out of vocab range"
    print(" PASS test_generate")


def test_param_count():
    """Check the total ARBModel parameter count lies strictly between 120M and 150M."""
    model = ARBModel()
    sizes = [p.numel() for p in model.parameters()]
    total = sum(sizes)
    print(f"  Param count: {total:,}")
    assert 120e6 < total < 150e6, f"Param count {total:,} outside expected range (frozen ViT-base + Whisper-tiny + MemGram + LSTM)"
    print(" PASS test_param_count")


def test_gradient_flow():
    """Check every trainable, non-exempt parameter receives a gradient after one backward pass.

    A parameter is exempt when its name contains any of the fragments listed below.
    """
    exempt_fragments = (
        "embed", "graph_pool.query",
        "moe.W_gate", "moe.W_transform", "moe.W_gate_norms", "moe.W_transform_norms",
        "moe.router.bias",
        "moe.router_h",
        "memgram", "conv_vq", "lstm",
        "patch_proj", "image_sequencer.projection", "image_sequencer.norm",
        "audio_sequencer", "multimodal_sequencer.audio",
        "bridge.image_vq", "bridge.audio_vq", "bridge.bridge_norm", "modality_gate",
    )

    model = ARBModel()
    x = torch.randint(0, VOCAB, (4, 20))
    targets = x[:, 3:]
    _, losses, _, _ = model(x, targets=targets)
    losses.total.backward()

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if any(fragment in name for fragment in exempt_fragments):
            continue
        assert param.grad is not None, f"No gradient for {name}"
    print(" PASS test_gradient_flow")


def test_model_forward_with_targets():
    """Check a CTX-length forward pass with targets returns a positive scalar LossComponents total."""
    batch, seq_len = 4, CTX
    model = ARBModel()
    x = torch.randint(0, VOCAB, (batch, seq_len))
    targets = torch.randint(0, VOCAB, (batch, seq_len - 3))
    _, losses, _, _ = model(x, targets=targets)

    assert losses is not None
    assert isinstance(losses, LossComponents)
    total = losses.total
    assert total.ndim == 0
    assert total > 0
    print(" PASS test_model_forward_with_targets")


def test_save_load_roundtrip():
    """Save an ARBModel, load it back, and check both models yield the same logits.

    Prints a SKIP line and returns when the converter module cannot be imported.
    """
    import tempfile  # local import: only this test needs a scratch directory

    try:
        from arbitor.converters.convert_to_ternary8 import save_model, load_model
    except ImportError:
        print(" SKIP test_save_load_roundtrip (convert_to_ternary not available)")
        return

    model = ARBModel()
    # A private temporary directory keeps concurrent runs from clobbering each
    # other and removes the checkpoint afterwards, unlike a fixed path in /tmp.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ckpt_path = os.path.join(tmp_dir, "test-morph-roundtrip.pt")
        save_model(model, ckpt_path)
        loaded = load_model(ckpt_path)

        x = torch.randint(0, VOCAB, (1, 10))
        model.eval()
        loaded.eval()
        with torch.no_grad():
            # Index rather than unpack: the other tests in this file unpack four
            # values from the model, so a fixed three-way unpack here would raise
            # ValueError if the loaded model returns the same tuple.
            logits_orig = model(x)[0]
            logits_loaded = loaded(x)[0]
        assert torch.allclose(logits_orig, logits_loaded, atol=1e-6), "Save/load roundtrip mismatch"
    print(" PASS test_save_load_roundtrip")


# ===== Phase 2: VQ Tests =====

def test_vq_adapter_shapes():
    """Check VQAdapter output shape, index shape and dtype, and a non-negative loss."""
    batch_shape = (2, 10)
    adapter = VQAdapter()
    out, vq_loss, indices = adapter(torch.randn(*batch_shape, HIDDEN_DIM))
    assert out.shape == (2, 10, HIDDEN_DIM), f"VQ output shape: {out.shape}"
    assert indices.shape == batch_shape, f"VQ indices shape: {indices.shape}"
    assert indices.dtype == torch.long, "Indices must be long"
    assert vq_loss.item() >= 0, "VQ loss must be non-negative"
    print(" PASS test_vq_adapter_shapes")


def test_vq_integration():
    """Check the full model returns (2, 64, VOCAB) logits together with (2, 64) VQ indices."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, _, vq_indices, _ = model(tokens)
    assert logits.shape == (2, 64, VOCAB), f"Logits shape: {logits.shape}"
    assert vq_indices is not None, "VQ indices must be returned"
    assert vq_indices.shape == (2, 64), f"VQ indices shape wrong: {vq_indices.shape}"
    print(" PASS test_vq_integration")


def test_vq_disabled():
    """Check that with VQ, graph and MoE switched off the model returns no VQ indices."""
    model = ARBModel()
    for flag in ("vq_enabled", "graph_enabled", "moe_enabled"):
        setattr(model, flag, False)

    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, _, vq_indices, _ = model(tokens)
    assert vq_indices is None, "Indices should be None when VQ disabled"
    assert logits.shape == (2, 64, VOCAB)
    print(" PASS test_vq_disabled")


def test_vq_with_targets():
    """Check that supplying targets produces a strictly positive total loss."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    shifted = tokens[:, 3:66]
    _, losses, _, _ = model(tokens, targets=shifted)
    assert losses is not None and losses.total.item() > 0, "Loss should be positive with targets"
    print(" PASS test_vq_with_targets")


def test_l2_distance_matching():
    """Check l2_distance_matching returns (2, 10) indices and non-negative (2, 10) distances."""
    want_shape = (2, 10)
    adapter = VQAdapter()
    projected = torch.randn(2, 10, 32)
    l2_indices, l2_dists = adapter.l2_distance_matching(projected)
    assert l2_indices.shape == want_shape, f"L2 indices shape: {l2_indices.shape}"
    assert l2_dists.shape == want_shape, f"L2 distances shape: {l2_dists.shape}"
    assert (l2_dists >= 0).all(), "L2 distances must be non-negative"
    print(" PASS test_l2_distance_matching")


def test_vq_ternary_projections():
    """Check both VQAdapter projections are TernaryScaleTensor and the adapter still runs."""
    adapter = VQAdapter()
    proj_in_ok = isinstance(adapter.proj_in, TernaryScaleTensor)
    assert proj_in_ok, f"proj_in should be TernaryScaleTensor, got {type(adapter.proj_in)}"
    proj_out_ok = isinstance(adapter.proj_out, TernaryScaleTensor)
    assert proj_out_ok, f"proj_out should be TernaryScaleTensor, got {type(adapter.proj_out)}"

    out, vq_loss, _ = adapter(torch.randn(2, 10, HIDDEN_DIM))
    assert out.shape == (2, 10, HIDDEN_DIM), f"VQ output shape: {out.shape}"
    assert vq_loss.item() >= 0, "VQ loss must be non-negative"
    print(" PASS test_vq_ternary_projections")


# ===== Phase 6: Multi-Modal Bridge, Gate, and Graph Tests =====

def test_multimodal_vq_bridge_text_only():
    """Check the bridge with text input only: shape kept, text loss reported, indices below 8192."""
    bridge = MultimodalVQBridge()
    inputs = {'text': torch.randn(2, 10, 512)}
    combined, losses, indices = bridge(inputs)
    assert combined.shape == (2, 10, 512)
    assert 'text_vq' in losses
    text_indices = indices['text']
    assert (text_indices < 8192).all()
    print(" PASS test_multimodal_vq_bridge_text_only")


def test_multimodal_vq_bridge_text_image():
    """Check text+image input is concatenated along the sequence and image indices fall in [8192, 12288)."""
    bridge = MultimodalVQBridge()
    text_in = torch.randn(2, 10, 512)
    image_in = torch.randn(2, 20, 512)
    combined, _, indices = bridge({'text': text_in, 'image': image_in})
    assert combined.shape == (2, 30, 512)

    image_indices = indices['image']
    assert (image_indices >= 8192).all()
    assert (image_indices < 12288).all()
    print(" PASS test_multimodal_vq_bridge_text_image")


def test_modality_gate_shapes():
    """Check ModalityGate returns a dict of weights, a count of at least 1 and at least 2 hops."""
    gate = ModalityGate()
    modalities = ['text']
    weights, count, hops = gate(modalities)
    assert isinstance(weights, dict)
    assert count >= 1
    assert hops >= 2
    print(" PASS test_modality_gate_shapes")


def test_ternary_graph_multicodebook():
    """Check TernaryGraph handles indices drawn from three concatenated codebook ranges."""
    graph = TernaryGraph(total_vocab_size=16384)

    # Text, image and audio codebook embeddings, concatenated in that order.
    codebook_parts = [torch.randn(1, rows, 32) for rows in (8192, 4096, 4096)]
    graph._codebook_embed = torch.cat(codebook_parts, dim=1)

    vq_out = torch.randn(2, 21, 512)
    # (low, high, positions) for each modality; 8 + 7 + 6 = 21 positions in total.
    index_ranges = ((0, 8192, 8), (8192, 12288, 7), (12288, 16384, 6))
    index_parts = [torch.randint(low, high, (2, count)) for low, high, count in index_ranges]
    vq_idx = torch.cat(index_parts, dim=1)

    per_pos, _, _ = graph(vq_out, vq_idx, 0.05)
    assert per_pos.shape == (2, 21, 512)
    print(" PASS test_ternary_graph_multicodebook")


def test_vq_no_float_cast_in_model():
    """Check nn.Linear and nn.Embedding appear only under allow-listed module names."""
    # Name fragments under which a plain nn.Linear / nn.Embedding is tolerated.
    linear_allowed = (
        "image_sequencer",
        "multimodal_sequencer.image",
        "moe.router",
        "lstm.",
        "multimodal_sequencer.audio",
    )
    embedding_allowed = ("hop_lora.scale", "lstm.", "whisper.", "vit.")

    model = ARBModel()
    x = torch.randint(0, VOCAB, (2, 66))
    logits, _, _, _ = model(x)
    assert logits.shape == (2, 64, VOCAB), f"Logits shape: {logits.shape}"

    for name, mod in model.named_modules():
        if isinstance(mod, nn.Linear):
            linear_ok = any(fragment in name for fragment in linear_allowed)
            assert linear_ok, f"Unexpected nn.Linear: {name} — only moe.router, image_sequencer, lstm, and audio_sequencer are allowed"
            continue
        if isinstance(mod, nn.Embedding):
            embedding_ok = any(fragment in name for fragment in embedding_allowed)
            assert embedding_ok, f"nn.Embedding found: {name} — only hop_lora.scale, lstm.*, whisper.*, vit.* are allowed"
    print(" PASS test_vq_no_float_cast_in_model")


def test_zero_fp32_params():
    """Check every non-exempt ARBModel parameter is owned by a TERNARY_MODULES instance.

    Parameters are exempt when their name contains one of ``exempt_fragments``
    or starts with one of ``exempt_prefixes``.
    """
    exempt_fragments = (
        "bridge.text_vq.vq", "bridge.image_vq.vq", "bridge.audio_vq.vq",  # VQ internals
        "moe.router",
        "hop_lora.scale",
        "image_sequencer.vit", "multimodal_sequencer.image.vit",
        "patch_proj",
        "mfcc_proj", "frame_proj",
        "whisper",
    )
    exempt_prefixes = ("memgram", "conv_vq", "lstm")

    def _exempt(param_name):
        if param_name.startswith(exempt_prefixes):
            return True
        return any(fragment in param_name for fragment in exempt_fragments)

    model = ARBModel()
    non_ternary_non_vq = sum(
        param.numel()
        for name, param in model.named_parameters()
        if not _exempt(name) and not _is_ternary_param(model, name)
    )
    assert non_ternary_non_vq == 0, \
        f"Found {non_ternary_non_vq} non-ternary, non-VQ, non-router params"
    print(" PASS test_zero_fp32_params")


def test_sticky_zone_ste_gradient():
    """Check StickyZoneSTE gradients against expected per-element values (tolerance 0.02)."""
    threshold = 0.05
    w = torch.tensor([-0.01, -0.03, -0.049, 0.06, 0.10], requires_grad=True)
    StickyZoneSTE.apply(w, threshold).sum().backward()

    expected = [0.2, 0.6, 0.98, 1.0, 1.0]
    grads = w.grad.tolist()
    for i, exp_ratio in enumerate(expected):
        actual = grads[i]
        deviation = abs(actual - exp_ratio)
        assert deviation < 0.02, f"w={w[i].item():.3f}: expected ratio {exp_ratio}, got {actual:.3f}"
    print(" PASS test_sticky_zone_ste_gradient")


# ===== Phase 3: Graph Tests (updated for GraphMoEGate) =====

def test_graph_moe_gate_shape():
    """Check GraphMoEGate output shapes, that alpha lies in [0, 1], and the query size."""
    gate = GraphMoEGate(dim=HIDDEN_DIM)
    pooled, alpha = gate(torch.randn(2, 10, HIDDEN_DIM))
    assert pooled.shape == (2, HIDDEN_DIM), f"Pooled shape: {pooled.shape}"
    assert alpha.shape == (2, 10, 1), f"Alpha shape: {alpha.shape}"
    assert (alpha >= 0).all() and (alpha <= 1).all(), "Alpha out of [0,1]"
    assert gate.query.numel() == HIDDEN_DIM, f"Gate params: {gate.query.numel()}"
    print(" PASS test_graph_moe_gate_shape")


def test_ternary_graph_shapes():
    """Check the three TernaryGraph outputs have the expected shapes for a (2, 10) batch."""
    batch, positions = 2, 10
    graph = TernaryGraph(codebook_size=CODEBOOK_SIZE, codebook_dim=CODEBOOK_DIM, max_hops=2)
    graph._codebook_embed = torch.randn(1, CODEBOOK_SIZE, CODEBOOK_DIM)
    vq_output = torch.randn(batch, positions, HIDDEN_DIM)
    vq_indices = torch.randint(0, CODEBOOK_SIZE, (batch, positions))

    per_pos, gpool, gate_alpha = graph(vq_output, vq_indices, 0.05)
    assert per_pos.shape == (2, 10, HIDDEN_DIM), f"per_position shape: {per_pos.shape}"
    assert gpool.shape == (2, HIDDEN_DIM), f"graph_pool shape: {gpool.shape}"
    assert gate_alpha.shape == (2, 10, 1), f"gate_alpha shape: {gate_alpha.shape}"
    print(" PASS test_ternary_graph_shapes")


def test_graph_gradient_flow():
    """Check that backward through TernaryGraph populates grads on edge_attr and on the input."""
    graph = TernaryGraph(codebook_size=CODEBOOK_SIZE, codebook_dim=CODEBOOK_DIM, max_hops=2)
    graph._codebook_embed = torch.randn(1, CODEBOOK_SIZE, CODEBOOK_DIM)
    vq_output = torch.randn(2, 10, HIDDEN_DIM, requires_grad=True)
    vq_indices = torch.randint(0, CODEBOOK_SIZE, (2, 10))

    outputs = graph(vq_output, vq_indices, 0.05)
    outputs[0].sum().backward()
    assert graph.edge_attr.grad is not None, "edge_attr should have gradient"
    assert vq_output.grad is not None, "vq_output should have gradient"
    print(" PASS test_graph_gradient_flow")


def test_graph_connectivity_monitor():
    """Check monitor_graph_health reports the expected keys with sane sparsity and isolated-node values."""
    graph = TernaryGraph(codebook_size=CODEBOOK_SIZE, codebook_dim=CODEBOOK_DIM, max_hops=2)
    health = graph.monitor_graph_health(threshold=0.05)
    for key in ('sparsity', 'isolated_nodes', 'avg_polarity', 'dead_edges'):
        assert key in health
    sparsity = health['sparsity']
    assert 0.0 <= sparsity <= 1.0
    assert health['isolated_nodes'] >= 0
    print(" PASS test_graph_connectivity_monitor")


def test_model_forward_with_graph():
    """Check the default model exposes ``ternary_graph`` and returns VQ indices with its logits."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, _, vq_indices, _ = model(tokens)
    assert logits.shape == (2, 64, VOCAB), f"Logits shape: {logits.shape}"
    assert vq_indices is not None, "VQ indices required for graph"
    assert hasattr(model, 'ternary_graph'), "Model missing ternary_graph"
    print(" PASS test_model_forward_with_graph")


def test_model_graph_disabled():
    """Check the forward pass still returns (2, 64, VOCAB) logits with graph and MoE disabled."""
    model = ARBModel()
    for flag in ("graph_enabled", "moe_enabled"):
        setattr(model, flag, False)
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits = model(tokens)[0]
    assert logits.shape == (2, 64, VOCAB)
    print(" PASS test_model_graph_disabled")


def test_ternary_graph_in_modules():
    """Check the graph, gate and MoE classes are all registered in TERNARY_MODULES."""
    required = (
        (TernaryGraph, "TernaryGraph not in TERNARY_MODULES"),
        (GraphMoEGate, "GraphMoEGate not in TERNARY_MODULES"),
        (SharedProjectionMoE, "SharedProjectionMoE not in TERNARY_MODULES"),
    )
    for module_cls, message in required:
        assert module_cls in TERNARY_MODULES, message
    print(" PASS test_ternary_graph_in_modules")


# ===== Phase 4: MoE Tests =====

def test_moe_shapes():
    """Check SharedProjectionMoE keeps the input shape and returns a non-negative scalar aux loss."""
    moe = SharedProjectionMoE(
        hidden_size=512,
        num_experts=8,
        top_k=2,
        core_rank=192,
        shared_inter=3072,
        tscale_type=TScaleType.T32,
    )
    out, aux = moe(torch.randn(4, 10, 512))
    assert out.shape == (4, 10, 512), f'MoE output shape: {out.shape}'
    assert aux.ndim == 0, f'Aux loss should be scalar, got ndim={aux.ndim}'
    assert aux.item() >= 0, 'Aux loss should be non-negative'
    print(" PASS test_moe_shapes")


def test_moe_router():
    """Check routing indices recorded in train mode, then run once more in eval mode.

    The eval-mode call carries no assertion; it only has to complete without raising.
    """
    moe = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2, noise_std=0.25)
    x = torch.randn(4, 20, 512)

    moe.train()
    moe(x)
    assert moe._last_topk_idx is not None
    # 4 * 20 = 80 routed tokens, each with top_k = 2 expert ids.
    assert moe._last_topk_idx.shape == (80, 2), f'topk_idx shape: {moe._last_topk_idx.shape}'
    assert (moe._last_topk_idx >= 0).all() and (moe._last_topk_idx < 8).all()

    moe.eval()
    moe(x)
    print(" PASS test_moe_router")


def test_moe_aux_loss():
    """Check the auxiliary loss returned by SharedProjectionMoE is non-negative."""
    moe = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2)
    aux = moe(torch.randn(4, 10, 512))[1]
    assert aux.item() >= 0, 'Aux loss must be non-negative'
    print(" PASS test_moe_aux_loss")


def test_shared_expert():
    """Check the shared-expert projections are TernaryScaleTensor and the MoE output is non-zero."""
    moe = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2)
    for attr in ("shared_expert_gate", "shared_expert_up", "shared_expert_down"):
        assert isinstance(getattr(moe, attr), TernaryScaleTensor)

    out = moe(torch.randn(2, 5, 512))[0]
    assert out.norm().item() > 0, 'Shared expert output should be non-zero'
    print(" PASS test_shared_expert")


def test_moe_gradient_flow():
    """Check backward through the MoE reaches the input and sets ``_hook_grad_T_sign`` on router and W_gate[0]."""
    moe = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2)
    x = torch.randn(2, 10, 512).requires_grad_(True)
    out, aux = moe(x)
    objective = out.sum() + aux
    objective.backward()

    assert x.grad is not None, 'No gradient on input'
    assert hasattr(moe.router, '_hook_grad_T_sign'), 'No grad captured on router'
    assert hasattr(moe.W_gate[0], '_hook_grad_T_sign'), 'No grad captured on W_gate[0]'
    print(" PASS test_moe_gradient_flow")


def test_moe_zero_fp32():
    """Check every SharedProjectionMoE parameter is owned by a TERNARY_MODULES instance."""
    moe = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2)
    non_ternary = sum(
        param.numel()
        for name, param in moe.named_parameters()
        if not _is_ternary_param(moe, name)
    )
    assert non_ternary == 0, f'Expected 0 non-ternary params, got {non_ternary}'
    print(" PASS test_moe_zero_fp32")


def test_ternary_graph_with_gate():
    """Check the gate alpha returned by TernaryGraph has shape (2, 10, 1) and lies in [0, 1]."""
    graph = TernaryGraph(codebook_size=CODEBOOK_SIZE, codebook_dim=CODEBOOK_DIM)
    graph._codebook_embed = torch.randn(1, CODEBOOK_SIZE, CODEBOOK_DIM)
    vq_output = torch.randn(2, 10, HIDDEN_DIM)
    vq_indices = torch.randint(0, CODEBOOK_SIZE, (2, 10))

    gate_alpha = graph(vq_output, vq_indices, 0.05)[2]
    assert gate_alpha.shape == (2, 10, 1), f'gate_alpha shape: {gate_alpha.shape}'
    assert (gate_alpha >= 0).all() and (gate_alpha <= 1).all()
    print(" PASS test_ternary_graph_with_gate")


def test_model_forward_with_moe():
    """Check the default model returns (2, 64, VOCAB) logits and non-None VQ indices."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, _, vq_indices, _ = model(tokens)
    assert logits.shape == (2, 64, VOCAB), f'Logits shape: {logits.shape}'
    assert vq_indices is not None
    print(" PASS test_model_forward_with_moe")


def test_model_moe_disabled():
    """Check the forward pass still returns (2, 64, VOCAB) logits with ``moe_enabled`` off."""
    model = ARBModel()
    model.moe_enabled = False
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits = model(tokens)[0]
    assert logits.shape == (2, 64, VOCAB)
    print(" PASS test_model_moe_disabled")


def test_model_moe_loss_components():
    """Check a forward pass with targets fills the LM/VQ/MoE/graph loss fields and caches MoE routing state."""
    model = ARBModel()
    x = torch.randint(0, VOCAB, (2, 66))
    targets = x[:, 3:]
    _, losses, _, _ = model(x, targets=targets)

    assert losses is not None and isinstance(losses, LossComponents)
    assert losses.lm is not None and losses.lm > 0
    for field in ("vq_commitment", "moe_aux", "graph_l1"):
        assert getattr(losses, field) is not None
    assert losses.total > 0

    moe = model.moe
    assert moe._last_topk_idx is not None, 'MoE should have routing info after forward'
    assert moe._last_aux_loss is not None, 'MoE should have aux_loss cached after forward'
    print(" PASS test_model_moe_loss_components")


def test_model_moe_gate_modulation():
    """Check the default forward pass yields logits of shape (2, 64, VOCAB)."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits = model(tokens)[0]
    assert logits.shape == (2, 64, VOCAB)
    print(" PASS test_model_moe_gate_modulation")


def test_param_count_with_moe():
    """Check the ARBModel parameter total lies strictly between 120M and 150M."""
    model = ARBModel()
    sizes = [p.numel() for p in model.parameters()]
    total = sum(sizes)
    print(f"  Param count with MoE: {total:,}")
    assert 120e6 < total < 150e6, f'Expected ~133M (frozen ViT-base + Whisper-tiny + ternary buffers), got {total:,}'
    print(" PASS test_param_count_with_moe")


def test_moe_monitoring():
    """Check a forward pass leaves routing indices (width top_k) and an aux loss cached on model.moe."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    model(tokens)

    moe = model.moe
    assert moe._last_topk_idx is not None, '_last_topk_idx should be set after forward'
    assert moe._last_aux_loss is not None, '_last_aux_loss should be set after forward'
    assert moe._last_topk_idx.shape[1] == moe.top_k
    print(" PASS test_moe_monitoring")


# ===== Explore: LossComponents + GNN LoRA Tests =====

def test_loss_components():
    """Check LossComponents.total is a scalar equal to the expected sum and is differentiable w.r.t. lm."""
    def _leaf(value):
        return torch.tensor(value, requires_grad=True)

    lm = _leaf(5.0)
    vq = _leaf(0.5)
    moe = _leaf(0.01)
    graph = _leaf(0.001)
    lc = LossComponents(lm=lm, vq_commitment=vq, moe_aux=moe, graph_l1=graph)
    assert lc.total.ndim == 0, f"Total should be scalar, got ndim={lc.total.ndim}"

    # Only the graph L1 term is scaled by its LossWeights entry in this expectation.
    expected = 5.0 + 0.5 + 0.01 + LossWeights.graph_l1 * 0.001
    assert abs(lc.total.item() - expected) < 1e-5, f"Total mismatch: {lc.total.item()} vs {expected}"

    lc.total.backward()
    assert lm.grad is not None, "LM loss should have gradient"
    print(" PASS test_loss_components")


def test_loss_components_none_fields():
    """Check that when every optional component is None the total equals the LM loss."""
    lm = torch.tensor(3.0, requires_grad=True)
    optional = {"vq_commitment": None, "moe_aux": None, "graph_l1": None}
    lc = LossComponents(lm=lm, **optional)
    assert lc.total.item() == 3.0, f"Total with None fields: {lc.total.item()}"
    print(" PASS test_loss_components_none_fields")


def test_loss_components_backward():
    """Check LossComponents.backward() populates gradients on both the LM and VQ leaves."""
    lm = torch.tensor(4.0, requires_grad=True)
    vq = torch.tensor(0.3, requires_grad=True)
    LossComponents(lm=lm, vq_commitment=vq).backward()
    assert lm.grad is not None, "LM should have gradient after backward"
    assert vq.grad is not None, "VQ should have gradient after backward"
    print(" PASS test_loss_components_backward")


def test_gnn_lora_adapter():
    """Check hops agree at initialisation and diverge once one hop's scale row is changed."""
    lora = GNNLoRAAdapter(dim=512, rank=32, max_hops=4)
    x = torch.randn(8192, 512)

    out0 = lora(x, hop_t=0)
    out1 = lora(x, hop_t=1)
    assert out0.shape == (8192, 512), f"LoRA output shape: {out0.shape}"
    assert torch.allclose(out0, out1, atol=1e-6), "Zero-init scales should produce same output at init"

    # Push hop 1's scale row away from hop 0's, then re-run hop 1.
    scale_rows = lora.scale.weight.data
    scale_rows[1] = scale_rows[0] + 1.0
    out1_modified = lora(x, hop_t=1)
    assert not torch.allclose(out0, out1_modified), "Non-zero scales should differentiate hops"
    print(" PASS test_gnn_lora_adapter")


def test_gnn_lora_gradient():
    """Check backward through GNNLoRAAdapter reaches both the input and the scale weights."""
    lora = GNNLoRAAdapter(dim=512, rank=32, max_hops=4)
    x = torch.randn(8192, 512, requires_grad=True)
    lora(x, hop_t=0).sum().backward()
    assert x.grad is not None, "Input should have gradient"
    assert lora.scale.weight.grad is not None, "LoRA scale should have gradient"
    print(" PASS test_gnn_lora_gradient")


def test_shared_gnn_weight_tying():
    """Check TernaryGraph has ``gnn`` and ``hop_lora`` attributes, no ``gnn_layers``, and keeps max_hops."""
    graph = TernaryGraph(codebook_size=CODEBOOK_SIZE, codebook_dim=CODEBOOK_DIM, max_hops=3)
    has_shared_gnn = hasattr(graph, 'gnn')
    assert has_shared_gnn, "Graph should have single shared GNN layer"
    has_layer_list = hasattr(graph, 'gnn_layers')
    assert not has_layer_list, "Graph should NOT have gnn_layers list"
    has_hop_lora = hasattr(graph, 'hop_lora')
    assert has_hop_lora, "Graph should have hop_lora adapter"
    assert graph.max_hops == 3, f"max_hops should be 3, got {graph.max_hops}"
    print(" PASS test_shared_gnn_weight_tying")


def test_shared_gnn_multi_hop():
    """Check a 4-hop TernaryGraph with lora_rank=32 keeps the per-position shape."""
    graph = TernaryGraph(
        codebook_size=CODEBOOK_SIZE,
        codebook_dim=CODEBOOK_DIM,
        max_hops=4,
        lora_rank=32,
    )
    graph._codebook_embed = torch.randn(1, CODEBOOK_SIZE, CODEBOOK_DIM)
    vq_output = torch.randn(2, 10, HIDDEN_DIM)
    vq_indices = torch.randint(0, CODEBOOK_SIZE, (2, 10))
    per_pos = graph(vq_output, vq_indices, 0.05)[0]
    assert per_pos.shape == (2, 10, HIDDEN_DIM), f"per_position shape: {per_pos.shape}"
    print(" PASS test_shared_gnn_multi_hop")


def test_model_losses_components_type():
    """Check the model's LossComponents total equals the weighted sum of its present components."""
    model = ARBModel()
    x = torch.randint(0, VOCAB, (2, 66))
    targets = x[:, 3:]
    _, losses, _, _ = model(x, targets=targets)

    assert isinstance(losses, LossComponents), f"Expected LossComponents, got {type(losses)}"
    assert losses.lm is not None
    assert losses.vq_commitment is not None
    assert losses.moe_aux is not None
    assert losses.graph_l1 is not None

    total = losses.total.item()
    w = losses.weights
    # The four components asserted above always contribute.
    manual = w.lm * losses.lm.item() + w.vq_commitment * losses.vq_commitment.item() + w.moe_aux * losses.moe_aux.item() + w.graph_l1 * losses.graph_l1.item()
    # The remaining components contribute only when the model produced them.
    optional_fields = (
        "graph_ponder",
        "moe_ponder",
        "conv_vq_commitment",
        "memgram_decay_reg",
        "lstm_hidden_reg",
    )
    for field in optional_fields:
        component = getattr(losses, field)
        if component is not None:
            manual += getattr(w, field) * component.item()
    assert abs(total - manual) < 1e-4, f"Total {total} != weighted sum {manual}"
    print(" PASS test_model_losses_components_type")


# ===== Phase 5: ACT Adaptive Computation Tests =====

def test_halting_unit_shapes():
    """Check HaltingUnit returns values strictly inside (0, 1) per position and is differentiable."""
    hu = HaltingUnit(dim=512, tscale_type=TScaleType.T32)
    x = torch.randn(4, 10, 512).requires_grad_(True)
    p = hu(x)
    assert p.shape == (4, 10, 1), f"Shape: {p.shape}"
    assert (p > 0).all() and (p < 1).all(), f"Range: ({p.min():.4f}, {p.max():.4f})"

    p.sum().backward()
    assert x.grad is not None, "No gradient on input"
    print(" PASS test_halting_unit_shapes")


def test_halting_unit_ternary_pure():
    """Check HaltingUnit contains no nn.Linear or nn.Embedding submodules."""
    hu = HaltingUnit(dim=512)
    for name, mod in hu.named_modules():
        assert not isinstance(mod, nn.Linear), f"nn.Linear found: {name}"
        assert not isinstance(mod, nn.Embedding), f"nn.Embedding found: {name}"
    print(" PASS test_halting_unit_ternary_pure")


def test_graph_act_cell_shapes():
    """Check GraphACTCell output shapes, a positive scalar ponder value, and gradient flow to the input."""
    graph = TernaryGraph(codebook_size=8192, codebook_dim=32, max_hops=2, tscale_type=TScaleType.T32)
    graph._codebook_embed = torch.randn(1, 8192, 32)
    act = GraphACTCell(graph, max_hops=4, halt_threshold=0.01)

    vq_out = torch.randn(2, 10, 512).requires_grad_(True)
    vq_idx = torch.randint(0, 8192, (2, 10))
    per_pos, gpool, gate_alpha, ponder = act(vq_out, vq_idx, 0.05)

    assert per_pos.shape == (2, 10, 512), f"per_pos: {per_pos.shape}"
    assert gpool.shape == (2, 512), f"gpool: {gpool.shape}"
    assert gate_alpha.shape == (2, 10, 1), f"gate_alpha: {gate_alpha.shape}"
    assert ponder.ndim == 0
    assert ponder.item() > 0

    per_pos.sum().backward()
    assert vq_out.grad is not None, "No gradient on input"
    print(" PASS test_graph_act_cell_shapes")


def test_moe_act_cell_shapes():
    """Check MoEACTCell output shape, scalar aux/ponder values, and gradient flow to the input."""
    moe = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2, tscale_type=TScaleType.T32)
    act = MoEACTCell(moe, dim=512, max_iters=4, halt_threshold=0.01)

    x = torch.randn(2, 10, 512).requires_grad_(True)
    out, aux, ponder = act(x)

    assert out.shape == (2, 10, 512), f"out: {out.shape}"
    assert aux.ndim == 0
    assert ponder.ndim == 0
    assert aux.item() >= 0
    assert ponder.item() > 0

    out.sum().backward()
    assert x.grad is not None, "No gradient on input"
    print(" PASS test_moe_act_cell_shapes")


def test_act_early_halt():
    """Check a GraphACTCell with halt_threshold=1e-6 ponders less than one with halt_threshold=100.0.

    Both cells wrap the same TernaryGraph and see the same inputs.
    """
    graph = TernaryGraph(codebook_size=8192, codebook_dim=32, max_hops=2, tscale_type=TScaleType.T32)
    graph._codebook_embed = torch.randn(1, 8192, 32)

    act = GraphACTCell(graph, max_hops=8, halt_threshold=100.0)
    vq_out = torch.randn(2, 10, 512)
    vq_idx = torch.randint(0, 8192, (2, 10))
    ponder = act(vq_out, vq_idx, 0.05)[3]

    act_low = GraphACTCell(graph, max_hops=8, halt_threshold=1e-6)
    ponder_low = act_low(vq_out, vq_idx, 0.05)[3]

    assert ponder_low.item() < ponder.item(), \
        f"Early halt ponder ({ponder_low:.4f}) should be less than no-halt ponder ({ponder:.4f})"
    print(" PASS test_act_early_halt")


def test_act_weight_sum_one():
    """Check MoEACTCell outputs at two halt thresholds are NaN-free and have a non-zero combined sum."""
    moe = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2, tscale_type=TScaleType.T32)

    act = MoEACTCell(moe, dim=512, max_iters=3, halt_threshold=1e-6)
    x = torch.randn(2, 10, 512)
    out_fast = act(x)[0]

    act_slow = MoEACTCell(moe, dim=512, max_iters=3, halt_threshold=100.0)
    out_slow = act_slow(x)[0]

    fast_total = out_fast.sum().item()
    slow_total = out_slow.sum().item()
    out_sum = fast_total + slow_total
    assert not torch.isnan(out_fast).any(), "NaN in fast ACT output"
    assert not torch.isnan(out_slow).any(), "NaN in slow ACT output"
    assert out_sum != 0, "Outputs should be non-zero (weights sum to 1.0)"
    print(" PASS test_act_weight_sum_one")


def test_act_gradient_flow():
    """Backpropagating out.sum() + aux + ponder through MoEACTCell must populate the input gradient."""
    experts = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2, tscale_type=TScaleType.T32)
    cell = MoEACTCell(experts, dim=512, max_iters=3, halt_threshold=0.01)
    inputs = torch.randn(2, 10, 512, requires_grad=True)
    out, aux, ponder = cell(inputs)
    combined = out.sum() + aux + ponder
    combined.backward()
    assert inputs.grad is not None, "Input grad is None"
    print(" PASS test_act_gradient_flow")


def test_loss_components_ponder_fields():
    """LossComponents.total with lm, graph_ponder and moe_ponder equals their plain sum and reaches all three leaves."""
    lm, gp, mp = (torch.tensor(v, requires_grad=True) for v in (5.0, 0.1, 0.2))
    lc = LossComponents(lm=lm, graph_ponder=gp, moe_ponder=mp)
    expected = 5.0 + 0.1 + 0.2
    assert abs(lc.total.item() - expected) < 1e-5, f"Total: {lc.total.item()} vs {expected}"
    lc.total.backward()
    # Every leaf that fed the total must have received a gradient.
    for leaf in (lm, gp, mp):
        assert leaf.grad is not None
    print(" PASS test_loss_components_ponder_fields")


def test_loss_components_ponder_none():
    """With both ponder fields set to None, LossComponents.total is just the lm term and still backpropagates."""
    lm_term = torch.tensor(3.0, requires_grad=True)
    components = LossComponents(lm=lm_term, graph_ponder=None, moe_ponder=None)
    assert abs(components.total.item() - 3.0) < 1e-5
    components.total.backward()
    assert lm_term.grad is not None
    print(" PASS test_loss_components_ponder_none")


def test_act_graph_moe_sequential():
    """Chain GraphACTCell into MoEACTCell, blend with gate_alpha, and check shape, scalar ponders and input grad."""
    graph = TernaryGraph(codebook_size=8192, codebook_dim=32, max_hops=2, tscale_type=TScaleType.T32)
    graph._codebook_embed = torch.randn(1, 8192, 32)
    graph_cell = GraphACTCell(graph, max_hops=3, halt_threshold=0.01)
    experts = SharedProjectionMoE(hidden_size=512, num_experts=8, top_k=2, tscale_type=TScaleType.T32)
    moe_cell = MoEACTCell(experts, dim=512, max_iters=3, halt_threshold=0.01)

    vq_out = torch.randn(2, 10, 512).requires_grad_(True)
    vq_idx = torch.randint(0, 8192, (2, 10))

    # Stage 1: graph ACT; stage 2: MoE ACT on the per-position graph output.
    per_pos, _gpool, gate_alpha, graph_ponder = graph_cell(vq_out, vq_idx, 0.05)
    moe_out, _aux, moe_ponder = moe_cell(per_pos)
    final = gate_alpha * moe_out + (1 - gate_alpha) * per_pos

    assert final.shape == (2, 10, 512), f"Sequential output: {final.shape}"
    for ponder in (graph_ponder, moe_ponder):
        assert ponder.ndim == 0
    final.sum().backward()
    assert vq_out.grad is not None, "Input grad is None"
    print(" PASS test_act_graph_moe_sequential")


# ===== Model-level ACT Integration Tests =====

def test_model_forward_with_act():
    """ARBModel forward with targets yields (2, 64, VOCAB) logits and a LossComponents carrying both ponder terms."""
    model = ARBModel(tscale_type=TScaleType.T32)
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, losses, _, _ = model(tokens, targets=tokens[:, 3:])
    assert logits.shape == (2, 64, VOCAB), f"Logits: {logits.shape}"
    assert isinstance(losses, LossComponents)
    for field in ("graph_ponder", "moe_ponder"):
        assert getattr(losses, field) is not None
    assert losses.total > 0
    print(" PASS test_model_forward_with_act")


def test_model_act_forward_without_targets():
    """Without targets, ARBModel still returns (2, 64, VOCAB) logits but no loss object."""
    model = ARBModel(tscale_type=TScaleType.T32)
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, losses, _, _ = model(tokens)
    assert logits.shape == (2, 64, VOCAB)
    assert losses is None
    print(" PASS test_model_act_forward_without_targets")


def test_model_act_loss_components():
    """All six listed loss fields are populated and the total exceeds the sum of the two ponder terms."""
    model = ARBModel(tscale_type=TScaleType.T32)
    tokens = torch.randint(0, VOCAB, (2, 66))
    _, losses, _, _ = model(tokens, targets=tokens[:, 3:])
    required = ("lm", "vq_commitment", "moe_aux", "graph_l1", "graph_ponder", "moe_ponder")
    for field in required:
        assert getattr(losses, field) is not None
    # Falsy ponder terms are dropped before summing.
    ponder_sum = sum(p for p in (losses.graph_ponder, losses.moe_ponder) if p)
    assert losses.total > ponder_sum
    print(" PASS test_model_act_loss_components")


def test_model_act_backward():
    """After a backward pass on the model losses, ternary_graph.edge_attr must hold a gradient."""
    model = ARBModel(tscale_type=TScaleType.T32)
    tokens = torch.randint(0, VOCAB, (2, 66))
    _, losses, _, _ = model(tokens, targets=tokens[:, 3:])
    # NOTE(review): sibling tests call `losses.total.backward()`; this one calls
    # `backward()` on the LossComponents object itself — confirm that method exists.
    losses.backward()
    edge_grad = model.ternary_graph.edge_attr.grad
    assert edge_grad is not None, "edge_attr grad None"
    print(" PASS test_model_act_backward")


def test_model_act_disabled():
    """With both ACT flags switched off, the forward pass reports no ponder losses.

    Also checks that the forward pass leaves the two flags disabled.
    """
    model = ARBModel(tscale_type=TScaleType.T32)
    model.graph_act_enabled = False
    model.moe_act_enabled = False
    x = torch.randint(0, VOCAB, (2, 66))
    targets = x[:, 3:]
    logits, losses, _, _ = model(x, targets=targets)
    assert logits.shape == (2, 64, VOCAB)
    assert losses.graph_ponder is None
    assert losses.moe_ponder is None
    # Truthiness checks instead of `== False` comparisons (PEP 8 / E712).
    assert not model.graph_act_enabled
    assert not model.moe_act_enabled
    print(" PASS test_model_act_disabled")


def test_model_act_warmup_mode():
    """act_warmup_mode=True suppresses both ponder losses; act_warmup_mode=False restores them."""
    model = ARBModel(tscale_type=TScaleType.T32)
    tokens = torch.randint(0, VOCAB, (2, 66))
    shifted = tokens[:, 3:]

    # Warmup pass: ponder terms must be absent.
    _, losses, _, _ = model(tokens, targets=shifted, act_warmup_mode=True)
    assert losses.graph_ponder is None, f"During warmup, graph_ponder should be None: {losses.graph_ponder}"
    assert losses.moe_ponder is None, f"During warmup, moe_ponder should be None: {losses.moe_ponder}"

    # Regular pass on the same model: ponder terms must be present.
    _, losses2, _, _ = model(tokens, targets=shifted, act_warmup_mode=False)
    assert losses2.graph_ponder is not None, "Without warmup, graph_ponder should be present"
    assert losses2.moe_ponder is not None, "Without warmup, moe_ponder should be present"
    print(" PASS test_model_act_warmup_mode")


def test_model_act_ponder_cached():
    """A forward pass with targets leaves positive values in _last_graph_ponder and _last_moe_ponder."""
    model = ARBModel(tscale_type=TScaleType.T32)
    tokens = torch.randint(0, VOCAB, (2, 66))
    model(tokens, targets=tokens[:, 3:])
    assert model._last_graph_ponder > 0, f"_last_graph_ponder={model._last_graph_ponder}"
    assert model._last_moe_ponder > 0, f"_last_moe_ponder={model._last_moe_ponder}"
    print(" PASS test_model_act_ponder_cached")


# ===== ACT Warmup and Monitoring Tests =====

def test_act_warmup_schedule():
    """With 50000 total steps, steps 0 and 9999 are warmup while 10000 and 50000 are not."""
    from train import compute_act_warmup
    cases = (
        (0, True, "step 0 is warmup"),
        (9999, True, "step 9999 is warmup"),
        (10000, False, "step 10000 not warmup"),
        (50000, False, "step 50000 not warmup"),
    )
    for step, in_warmup, message in cases:
        assert compute_act_warmup(step, 50000) == in_warmup, message
    print(" PASS test_act_warmup_schedule")


def test_act_ponder_lambda():
    """get_ponder_lambda starts at 0.1, ends at 0.01 at step 10000 of 50000, and lies strictly between at step 5000."""
    from train import get_ponder_lambda
    schedule = dict(warmup_frac=0.2, start_lambda=0.1, end_lambda=0.01)

    lam0 = get_ponder_lambda(0, 50000, **schedule)
    assert abs(lam0 - 0.1) < 1e-6, f"start lambda: {lam0}"

    lam_mid = get_ponder_lambda(5000, 50000, **schedule)
    assert 0.01 < lam_mid < 0.1, f"mid lambda: {lam_mid}"

    lam_end = get_ponder_lambda(10000, 50000, **schedule)
    assert abs(lam_end - 0.01) < 1e-6, f"end lambda: {lam_end}"
    print(" PASS test_act_ponder_lambda")


def test_model_ponder_lambda_scaling():
    """When both runs report graph_ponder, ponder_lambda=0.5 must give a larger value than ponder_lambda=0.01."""
    model = ARBModel(tscale_type=TScaleType.T32)
    tokens = torch.randint(0, VOCAB, (2, 66))
    shifted = tokens[:, 3:]
    _, losses_high, _, _ = model(tokens, targets=shifted, ponder_lambda=0.5)
    _, losses_low, _, _ = model(tokens, targets=shifted, ponder_lambda=0.01)
    high, low = losses_high.graph_ponder, losses_low.graph_ponder
    # The comparison is skipped if either run produced no graph ponder term.
    if not (high is None or low is None):
        assert high.item() > low.item(), \
            "Higher ponder_lambda should produce larger ponder loss"
    print(" PASS test_model_ponder_lambda_scaling")


# ===== Phase 6: Integration Tests =====

def test_text_only_forward():
    """A default ARBModel fed only tokens returns (2, 64, VOCAB) logits and non-None indices."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, _losses, indices, _ = model(tokens)
    assert logits.shape == (2, 64, VOCAB)
    assert indices is not None
    print(" PASS test_text_only_forward")


def test_image_forward():
    """Passing a (2, 3, 224, 224) image batch alongside tokens still yields (2, 64, VOCAB) logits and indices."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    pixels = torch.randn(2, 3, 224, 224)
    logits, _losses, indices, _ = model(tokens, images=pixels)
    assert logits.shape == (2, 64, VOCAB)
    assert indices is not None
    print(" PASS test_image_forward")


def test_multimodal_backward():
    """After text+image backward, every trainable parameter has a grad unless its name matches the skip list."""
    # Name fragments of parameters that are allowed to end up without a gradient.
    no_grad_ok = (
        'vit', 'embedding', 'patch_proj', 'frame_proj', 'router.bias', 'router_h',
        'W_gate', 'W_transform', 'hop_lora.scale', 'modality_gate', 'graph_pool.query',
        'memgram', 'conv_vq', 'lstm', 'mfcc_proj', 'audio_sequencer', 'audio_vq',
    )
    model = ARBModel()
    x = torch.randint(0, VOCAB, (2, 66))
    img = torch.randn(2, 3, 224, 224)
    _logits, losses, _, _ = model(x, targets=x[:, 3:], images=img)
    assert losses is not None
    losses.total.backward()
    for name, param in model.named_parameters():
        if not param.requires_grad or param.grad is not None:
            continue
        assert any(skip in name for skip in no_grad_ok), f'No gradient for {name}'
    print(" PASS test_multimodal_backward")


def test_no_stale_trigram_encoder():
    """Assert that the `trigram` module, if it can be loaded, no longer exposes `TrigramEncoder`.

    The previous lookup `sys.modules['trigram']` raised KeyError whenever the
    module had not already been imported. The module is now resolved through
    importlib (which returns the already-loaded entry when present); a missing
    module counts as "no stale encoder".
    """
    import importlib

    try:
        trigram_mod = importlib.import_module('trigram')
    except ImportError:
        # No such module at all, so nothing stale can be exported.
        trigram_mod = None
    assert trigram_mod is None or not hasattr(trigram_mod, 'TrigramEncoder'), 'TrigramEncoder should be removed'
    print(" PASS test_no_stale_trigram_encoder")


def test_vocab():
    """Pin the vocabulary constants: 288 total ids, 32 special tokens, IMAGE at id 278."""
    assert VOCAB == 288
    special_count = len(SPECIAL_VOCAB)
    assert special_count == 32
    image_id = SPECIAL_VOCAB["IMAGE"]
    assert image_id == 278
    print(" PASS test_vocab")


# ===== Phase 6b: Audio + Quantized Encoder Tests =====

def test_audio_sequencer_construction():
    """A default AudioSequencer has modality 'audio', window_size 5, a whisper and frame_proj, and frozen whisper params."""
    sequencer = AudioSequencer()
    assert sequencer.modality == 'audio'
    assert sequencer.window_size == 5
    for attr in ('whisper', 'frame_proj'):
        assert getattr(sequencer, attr) is not None
    for p in sequencer.whisper.parameters():
        assert not p.requires_grad, f"Whisper param {p} should be frozen"
    print(" PASS test_audio_sequencer_construction")


def test_audio_sequencer_forward_waveform():
    """A (2, 80000) waveform maps to a 3-D tensor with batch 2, a non-empty middle axis and HIDDEN_DIM features."""
    sequencer = AudioSequencer()
    features = sequencer(torch.randn(2, 80000))
    assert features.dim() == 3
    batch, frames, width = features.shape
    assert batch == 2
    assert width == HIDDEN_DIM
    assert frames > 0
    print(" PASS test_audio_sequencer_forward_waveform")


def test_audio_sequencer_forward_precomputed_mel():
    """A (2, 80, 3000) input maps to a 3-D tensor with batch 2 and HIDDEN_DIM features."""
    sequencer = AudioSequencer()
    features = sequencer(torch.randn(2, 80, 3000))
    assert features.dim() == 3
    batch, _frames, width = features.shape
    assert batch == 2
    assert width == HIDDEN_DIM
    print(" PASS test_audio_sequencer_forward_precomputed_mel")


def test_audio_sequencer_quantization_fp8():
    """quantize_weights='fp8' must leave at least one QLinear-named module in whisper; its weight data, if exposed, is float8_e4m3fn."""
    aseq = AudioSequencer(quantize_weights='fp8')
    # Only the first module whose class name contains 'QLinear' is inspected.
    mod = next(
        (m for _, m in aseq.whisper.named_modules() if 'QLinear' in type(m).__name__),
        None,
    )
    if mod is not None and hasattr(mod, 'weight') and hasattr(mod.weight, '_data'):
        assert mod.weight._data.dtype == torch.float8_e4m3fn, f"Expected fp8 data, got {mod.weight._data.dtype}"
    assert mod is not None, "No QLinear modules found — quantization may not have been applied"
    print(" PASS test_audio_sequencer_quantization_fp8")


def test_audio_sequencer_quantization_int8():
    """quantize_weights='int8' must leave at least one QLinear-named module inside whisper."""
    aseq = AudioSequencer(quantize_weights='int8')
    found_qlinear = any(
        'QLinear' in type(mod).__name__ for _, mod in aseq.whisper.named_modules()
    )
    assert found_qlinear, "No QLinear modules found — int8 quantization may not have been applied"
    print(" PASS test_audio_sequencer_quantization_int8")


def test_audio_sequencer_no_quantize():
    """quantize_weights=None leaves no QLinear-named module in whisper and all whisper params in bfloat16."""
    aseq = AudioSequencer(quantize_weights=None)
    for _, mod in aseq.whisper.named_modules():
        assert 'QLinear' not in type(mod).__name__, "No QLinear should exist when quantize_weights=None"
    for p in aseq.whisper.parameters():
        assert p.dtype == torch.bfloat16, f"Expected bfloat16, got {p.dtype}"
    print(" PASS test_audio_sequencer_no_quantize")


def test_image_sequencer_hf_vit():
    """A default ImageSequencer has modality 'image', window_size 3 and maps one 224x224 image to (1, 194, HIDDEN_DIM)."""
    sequencer = ImageSequencer()
    assert sequencer.modality == 'image'
    assert sequencer.window_size == 3
    features = sequencer(torch.randn(1, 3, 224, 224))
    assert features.shape == (1, 194, HIDDEN_DIM)
    print(" PASS test_image_sequencer_hf_vit")


def test_image_sequencer_quantization_fp8():
    """quantize_weights='fp8' must leave at least one QLinear-named module in vit; its weight data, if exposed, is float8_e4m3fn."""
    iseq = ImageSequencer(quantize_weights='fp8')
    # Only the first module whose class name contains 'QLinear' is inspected.
    mod = next(
        (m for _, m in iseq.vit.named_modules() if 'QLinear' in type(m).__name__),
        None,
    )
    if mod is not None and hasattr(mod, 'weight') and hasattr(mod.weight, '_data'):
        assert mod.weight._data.dtype == torch.float8_e4m3fn, f"Expected fp8, got {mod.weight._data.dtype}"
    assert mod is not None, "No QLinear modules found in ViT — fp8 quantization may not have been applied"
    print(" PASS test_image_sequencer_quantization_fp8")


def test_image_sequencer_no_quantize():
    """quantize_weights=None leaves no QLinear-named module in vit and all vit params in bfloat16."""
    iseq = ImageSequencer(quantize_weights=None)
    for _, mod in iseq.vit.named_modules():
        assert 'QLinear' not in type(mod).__name__, "No QLinear should exist when quantize_weights=None"
    for p in iseq.vit.parameters():
        assert p.dtype == torch.bfloat16, f"Expected bfloat16, got {p.dtype}"
    print(" PASS test_image_sequencer_no_quantize")


def test_multimodal_sequencer_all_modalities():
    """A default MultimodalSequencer lists text, image and audio as enabled and exposes a sub-sequencer for each."""
    mseq = MultimodalSequencer()
    modalities = ('text', 'image', 'audio')
    for modality in modalities:
        assert modality in mseq.enabled_modalities
    for modality in modalities:
        assert getattr(mseq, modality) is not None
    print(" PASS test_multimodal_sequencer_all_modalities")


def test_multimodal_sequencer_text_only():
    """With image and audio disabled, only the text path exists and only 'text' appears in the output.

    The unused token tensor the original built before `embedded` has been
    removed; no assertion here depends on tensor values.
    """
    mseq = MultimodalSequencer(enable_image=False, enable_audio=False)
    assert mseq.enabled_modalities == ['text']
    assert mseq.image is None
    assert mseq.audio is None
    embedded = torch.randn(2, 20, EMBEDDING_DIM)
    out = mseq({'text': embedded})
    assert 'text' in out
    assert 'image' not in out
    assert 'audio' not in out
    print(" PASS test_multimodal_sequencer_text_only")


def test_multimodal_sequencer_full_forward():
    """Feeding text, image and audio returns an entry per modality whose last axis is HIDDEN_DIM."""
    mseq = MultimodalSequencer()
    inputs = {
        'text': torch.randn(2, 20, EMBEDDING_DIM),
        'image': torch.randn(2, 3, 224, 224),
        'audio': torch.randn(2, 80000),
    }
    out = mseq(inputs)
    modalities = ('text', 'image', 'audio')
    for modality in modalities:
        assert modality in out
    for modality in modalities:
        assert out[modality].shape[2] == HIDDEN_DIM
    print(" PASS test_multimodal_sequencer_full_forward")


def test_multimodal_vq_bridge_text_audio():
    """Text (10 steps) plus audio (15 steps) combine to (2, 25, 512); audio indices fall in [12288, 16384)."""
    bridge = MultimodalVQBridge()
    streams = {'text': torch.randn(2, 10, 512), 'audio': torch.randn(2, 15, 512)}
    combined, losses, indices = bridge(streams)
    assert combined.shape == (2, 25, 512)
    assert 'audio_vq' in losses
    audio_ids = indices['audio']
    assert (audio_ids >= 12288).all()
    assert (audio_ids < 16384).all()
    print(" PASS test_multimodal_vq_bridge_text_audio")


def test_multimodal_vq_bridge_all_three():
    """Three modalities combine to (2, 45, 512), each with a *_vq loss and its own index range."""
    bridge = MultimodalVQBridge()
    streams = {
        'text': torch.randn(2, 10, 512),
        'image': torch.randn(2, 20, 512),
        'audio': torch.randn(2, 15, 512),
    }
    combined, losses, indices = bridge(streams)
    assert combined.shape == (2, 45, 512)
    for loss_key in ('text_vq', 'image_vq', 'audio_vq'):
        assert loss_key in losses
    # Index ranges checked: text < 8192 <= image < 12288 <= audio < 16384.
    assert (indices['text'] < 8192).all()
    assert (indices['image'] >= 8192).all()
    assert (indices['image'] < 12288).all()
    assert (indices['audio'] >= 12288).all()
    assert (indices['audio'] < 16384).all()
    print(" PASS test_multimodal_vq_bridge_all_three")


def test_modality_gate_three_modalities():
    """ModalityGate over text/image/audio reports count 3, a weight per modality and at least 2 hops."""
    gate = ModalityGate(num_modalities=3)
    active = ['text', 'image', 'audio']
    weights, count, hops = gate(active)
    assert count == 3
    for modality in ('text', 'image', 'audio'):
        assert modality in weights
    assert hops >= 2
    print(" PASS test_modality_gate_three_modalities")


def test_modality_gate_audio_only():
    """ModalityGate given only 'audio' reports count 1 and includes an 'audio' weight."""
    gate = ModalityGate(num_modalities=3)
    weights, count, _hops = gate(['audio'])
    assert count == 1
    assert 'audio' in weights
    print(" PASS test_modality_gate_audio_only")


def test_model_forward_with_audio():
    """Tokens plus a (2, 80000) audio batch give logits with batch 2 and VOCAB classes, plus non-None indices."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    wave = torch.randn(2, 80000)
    logits, _losses, indices, _ = model(tokens, audio=wave)
    assert logits.shape[0] == 2
    assert logits.shape[2] == VOCAB
    assert indices is not None
    print(" PASS test_model_forward_with_audio")


def test_model_forward_all_modalities():
    """Text, image and audio together with targets yield a LossComponents whose total is a positive scalar."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    pixels = torch.randn(2, 3, 224, 224)
    wave = torch.randn(2, 80000)
    _logits, losses, _indices, _ = model(tokens, targets=tokens[:, 3:], images=pixels, audio=wave)
    assert losses is not None
    assert isinstance(losses, LossComponents)
    total = losses.total
    assert total.ndim == 0
    assert total > 0
    print(" PASS test_model_forward_all_modalities")


def test_model_audio_disabled_raises():
    """Passing audio to a model built with enable_audio=False must raise ValueError.

    The failure path raises AssertionError explicitly in an `else` branch, so
    it is not stripped under `python -O` the way `assert False` would be, and
    the `try` body holds only the call under test.
    """
    model = ARBModel(enable_audio=False)
    x = torch.randint(0, VOCAB, (2, 66))
    audio = torch.randn(2, 80000)
    try:
        model(x, audio=audio)
    except ValueError:
        pass  # expected: audio input is rejected
    else:
        raise AssertionError("Should have raised ValueError")
    print(" PASS test_model_audio_disabled_raises")


def test_audio_sequencer_gradient_flow():
    """Backpropagating the summed AudioSequencer output must give frame_proj.weight a gradient.

    The original second assertion ended in `or True` and could never fail; all
    it really did was require `aseq.projection.T_accum` to exist. That
    existence check is now stated explicitly.
    NOTE(review): whether T_accum should also receive a `.grad` is not
    verifiable from this file — tighten this if it should.
    """
    aseq = AudioSequencer()
    waveform = torch.randn(2, 80000)
    out = aseq(waveform)
    loss = out.sum()
    loss.backward()
    assert aseq.frame_proj.weight.grad is not None, "frame_proj should get gradients"
    assert hasattr(aseq.projection, "T_accum"), "projection should participate"
    print(" PASS test_audio_sequencer_gradient_flow")


def test_vq_bridge_audio_codebook_utilization():
    """After a text+audio pass, both utilization and dead-code reports contain an 'audio' entry."""
    bridge = MultimodalVQBridge()
    audio_in = torch.randn(4, 50, 512)
    text_in = torch.randn(4, 10, 512)
    _combined, _losses, _indices = bridge({'text': text_in, 'audio': audio_in})
    utilization = bridge.get_codebook_utilization()
    assert 'audio' in utilization
    dead_codes = bridge.get_dead_code_count()
    assert 'audio' in dead_codes
    print(" PASS test_vq_bridge_audio_codebook_utilization")


# ===== Phase 7: Memory Module Tests =====

def test_loss_components_nine_fields_total():
    """LossComponents.total equals the default-LossWeights-weighted sum of all nine populated fields."""
    w = LossWeights()
    # Raw value per loss field, in the order the expected sum is accumulated.
    raw = {
        "lm": 1.0,
        "vq_commitment": 0.1,
        "moe_aux": 0.1,
        "graph_l1": 0.1,
        "graph_ponder": 0.1,
        "moe_ponder": 0.1,
        "conv_vq_commitment": 0.1,
        "memgram_decay_reg": 0.01,
        "lstm_hidden_reg": 0.01,
    }
    lc = LossComponents(**{name: torch.tensor(value, requires_grad=True) for name, value in raw.items()})
    total = lc.total
    expected = 0.0
    for name, value in raw.items():
        expected += getattr(w, name) * value
    assert abs(total.item() - expected) < 1e-6, f"total {total.item()} != {expected}"
    print(" PASS test_loss_components_nine_fields_total")

def test_loss_components_nine_fields_log():
    """LossComponents.log with prefix 'loss' emits scalars for the three newer fields and for the total."""
    from types import SimpleNamespace

    # Minimal stand-in for a summary writer: records every add_scalar call.
    writer = SimpleNamespace(logged=[])

    def record(name, val, step):
        writer.logged.append((name, val, step))

    writer.add_scalar = record

    raw = {
        "lm": 1.0,
        "vq_commitment": 0.1,
        "moe_aux": 0.1,
        "graph_l1": 0.1,
        "graph_ponder": 0.1,
        "moe_ponder": 0.1,
        "conv_vq_commitment": 0.1,
        "memgram_decay_reg": 0.01,
        "lstm_hidden_reg": 0.01,
    }
    lc = LossComponents(**{name: torch.tensor(value, requires_grad=True) for name, value in raw.items()})
    lc.log(writer, step=0, prefix="loss")
    names = [entry[0] for entry in writer.logged]
    for tag in ("loss/conv_vq_commitment", "loss/memgram_decay_reg", "loss/lstm_hidden_reg", "loss/total"):
        assert tag in names
    print(" PASS test_loss_components_nine_fields_log")


def test_loss_weights_custom_total():
    """Custom LossWeights (lm=2, vq=0.5, moe_aux=0, graph_l1=10) scale each term in LossComponents.total."""
    w = LossWeights(lm=2.0, vq_commitment=0.5, moe_aux=0.0, graph_l1=10.0)
    lc = LossComponents(
        lm=torch.tensor(1.0, requires_grad=True),
        vq_commitment=torch.tensor(2.0, requires_grad=True),
        moe_aux=torch.tensor(5.0, requires_grad=True),
        graph_l1=torch.tensor(0.01, requires_grad=True),
        weights=w,
    )
    expected = 2.0*1.0 + 0.5*2.0 + 0.0*5.0 + 10.0*0.01
    assert abs(lc.total.item() - expected) < 1e-5, f"Custom weights total {lc.total.item()} != {expected}"
    print(" PASS test_loss_weights_custom_total")


def test_loss_weights_zero_skips():
    """Zero-weighted terms contribute neither to the total nor to the gradient of the shared parameter."""
    w = LossWeights(vq_commitment=0.0, moe_aux=0.0)
    shared = torch.nn.Parameter(torch.tensor(1.0))
    # All three terms depend on the same parameter, with slopes 2, 3 and 4.
    lc = LossComponents(lm=shared * 2.0, vq_commitment=shared * 3.0, moe_aux=shared * 4.0, weights=w)
    total_val = lc.total.item()
    assert abs(total_val - (1.0*2.0 + 0.0*3.0 + 0.0*4.0)) < 1e-5, f"Zero-weight total {total_val}"
    lc.total.backward()
    grad_val = shared.grad.item()
    assert abs(grad_val - 2.0) < 1e-5, f"Zero-weight grad {grad_val} (should be 2.0, only lm)"
    print(" PASS test_loss_weights_zero_skips")


def test_loss_weights_backward_compat():
    """Pin the default value of every LossWeights field."""
    defaults = LossWeights()
    pinned = (
        ("lm", 1.0),
        ("vq_commitment", 1.0),
        ("moe_aux", 1.0),
        ("graph_l1", 0.001),
        ("graph_ponder", 1.0),
        ("moe_ponder", 1.0),
        ("conv_vq_commitment", 0.1),
        ("memgram_decay_reg", 0.01),
        ("lstm_hidden_reg", 0.01),
    )
    for field, value in pinned:
        assert abs(getattr(defaults, field) - value) < 1e-5
    print(" PASS test_loss_weights_backward_compat")


def test_model_forward_loss_weights():
    """A LossWeights passed to the forward call is carried on the returned losses with its values intact."""
    model = ARBModel()
    custom = LossWeights(lm=2.0, vq_commitment=0.5)
    tokens = torch.randint(0, VOCAB, (2, 66))
    _, losses, _, _ = model(tokens, targets=tokens[:, 3:], loss_weights=custom)
    assert losses is not None
    carried = losses.weights
    assert isinstance(carried, LossWeights)
    assert abs(carried.lm - 2.0) < 1e-5
    assert abs(carried.vq_commitment - 0.5) < 1e-5
    print(" PASS test_model_forward_loss_weights")


def test_model_forward_no_hardcoded_graph_l1():
    """Run once without targets, then with targets, and check graph_l1 and vq_commitment are populated."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    # First pass has no targets; its loss slot is unpacked but not inspected.
    _, _losses_no_target, _, _ = model(tokens)
    _, losses, _, _ = model(tokens, targets=tokens[:, 3:])
    assert losses is not None
    for field in ("graph_l1", "vq_commitment"):
        assert getattr(losses, field) is not None
    print(" PASS test_model_forward_no_hardcoded_graph_l1")


def test_build_param_groups_shape():
    """build_param_groups returns 'graph', 'memory' and a projection group, each with 'lr' and 'params' keys."""
    from train import build_param_groups
    model = ARBModel()
    groups = build_param_groups(model, base_lr=1e-3, vq_lr_scale=2.0, memory_lr_scale=0.5)
    group_names = [group['name'] for group in groups]
    for required in ('graph', 'memory'):
        assert required in group_names
    assert 'patch_proj' in group_names or 'frame_proj' in group_names
    for group in groups:
        for key in ('lr', 'params'):
            assert key in group
    print(f"  Groups: {group_names}")
    print(" PASS test_build_param_groups_shape")


def test_pinpoint_gradient_isolation():
    """Check that pinpoint_backward with one non-zero loss weight leaves other groups without grads.

    One forward pass produces `losses`; pinpoint_backward is then called three
    times on it (each with free_graph=False), with only vq_commitment, only
    moe_aux, and only graph_l1 weighted. Before each call every grad is reset
    to None, and afterwards every param outside the allowed group names must
    still have `grad is None`.
    """
    from train import pinpoint_backward, build_param_groups
    model = ARBModel()
    param_groups = build_param_groups(model, base_lr=1e-3)

    x = torch.randint(0, VOCAB, (2, 66))
    _, losses, _, _ = model(x, targets=x[:, 3:])

    # Only vq_commitment active → grads only on vq_projection, NOT lm_core
    lw_vq = LossWeights(lm=0.0, vq_commitment=1.0, moe_aux=0.0, graph_l1=0.0,
                        graph_ponder=0.0, moe_ponder=0.0, conv_vq_commitment=0.0,
                        memgram_decay_reg=0.0, lstm_hidden_reg=0.0)
    # Reset all grads so leftovers from a previous pass cannot mask a leak.
    for g in param_groups:
        for p in g['params']:
            p.grad = None
    pinpoint_backward(losses, lw_vq, param_groups, free_graph=False)

    for g in param_groups:
        for p in g['params']:
            if g['name'] in ('vq_projection', 'patch_proj', 'frame_proj'):
                continue  # allowed to have grads
            if g['name'] == 'vq_codebook':
                continue  # buffers, no grads
            assert p.grad is None, f"{g['name']} param should NOT have grad with only vq_commitment"

    # Only moe_aux active → grads only on moe groups
    lw_moe = LossWeights(lm=0.0, vq_commitment=0.0, moe_aux=1.0, graph_l1=0.0,
                         graph_ponder=0.0, moe_ponder=0.0, conv_vq_commitment=0.0,
                         memgram_decay_reg=0.0, lstm_hidden_reg=0.0)
    for g in param_groups:
        for p in g['params']:
            p.grad = None
    pinpoint_backward(losses, lw_moe, param_groups, free_graph=False)

    for g in param_groups:
        for p in g['params']:
            if g['name'] in ('moe', 'moe_act'):
                continue
            assert p.grad is None, f"{g['name']} param should NOT have grad with only moe_aux"

    # Only graph_l1 active → grads only on graph group
    lw_graph = LossWeights(lm=0.0, vq_commitment=0.0, moe_aux=0.0, graph_l1=1.0,
                           graph_ponder=0.0, moe_ponder=0.0, conv_vq_commitment=0.0,
                           memgram_decay_reg=0.0, lstm_hidden_reg=0.0)
    for g in param_groups:
        for p in g['params']:
            p.grad = None
    pinpoint_backward(losses, lw_graph, param_groups, free_graph=False)

    for g in param_groups:
        for p in g['params']:
            if g['name'] == 'graph':
                continue
            assert p.grad is None, f"{g['name']} param should NOT have grad with only graph_l1"

    print(" PASS test_pinpoint_gradient_isolation")


def test_pinpoint_backward_accumulation():
    """Check that grad_accum=2 halves the gradient norm relative to grad_accum=1.

    Both pinpoint_backward calls run on the same `losses` with only the lm
    weight active. The first param found with a non-negligible norm in the
    first run and a grad in the second is compared; the ratio must be within
    0.01 of 0.5.
    """
    from train import pinpoint_backward, build_param_groups
    model = ARBModel()
    param_groups = build_param_groups(model, base_lr=1e-3)

    x = torch.randint(0, VOCAB, (2, 66))
    _, losses, _, _ = model(x, targets=x[:, 3:])

    lw = LossWeights(lm=1.0, vq_commitment=0.0, moe_aux=0.0, graph_l1=0.0,
                     graph_ponder=0.0, moe_ponder=0.0, conv_vq_commitment=0.0,
                     memgram_decay_reg=0.0, lstm_hidden_reg=0.0)

    # grad_accum=1 on the same loss
    for g in param_groups:
        for p in g['params']:
            p.grad = None
    pinpoint_backward(losses, lw, param_groups, grad_accum=1, free_graph=False)

    # Reference gradient norms, keyed by parameter identity.
    norms_full = {}
    for g in param_groups:
        for p in g['params']:
            if p.grad is not None:
                norms_full[id(p)] = p.grad.data.norm().item()

    # grad_accum=2 on the same loss (graph retained from free_graph=False above)
    for g in param_groups:
        for p in g['params']:
            p.grad = None
    pinpoint_backward(losses, lw, param_groups, grad_accum=2, free_graph=False)

    # Compare only the first eligible param, then stop both loops.
    checked = False
    for g in param_groups:
        for p in g['params']:
            if p.grad is not None and id(p) in norms_full and norms_full[id(p)] > 1e-8:
                n_half = p.grad.data.norm().item()
                ratio = n_half / norms_full[id(p)]
                assert abs(ratio - 0.5) < 0.01, \
                    f"grad_accum=2 should halve grads, got {ratio:.4f}"
                checked = True
                break
        if checked:
            break
    assert checked, "no param with non-zero grad in both runs"
    print(" PASS test_pinpoint_backward_accumulation")


def test_pinpoint_backward_rescale_effect():
    """Run pinpoint_backward with memory_grad_scale=0.25 and check gradients still arrive.

    Checks performed: if the first 'memory' param received a grad then every
    'memory' param must have one, and at least one 'lm_core' param must have
    a grad.

    The original also built a `grads_reference` dict holding one random tensor
    per parameter that was never read; that dead allocation is removed.
    NOTE(review): nothing here measures the 0.25 scaling itself — only that
    grads are present.
    """
    # NOTE(review): DEFAULT_LOSS_TARGET_MAP is unused below; the import is left
    # unchanged so the test keeps requiring `train` to export it — confirm intent.
    from train import pinpoint_backward, build_param_groups, DEFAULT_LOSS_TARGET_MAP
    model = ARBModel()
    loss_weights = LossWeights()
    param_groups = build_param_groups(model, base_lr=1e-3)
    x = torch.randint(0, VOCAB, (2, 66))
    _, losses, _, _ = model(x, targets=x[:, 3:], loss_weights=loss_weights)

    # Zero grads and run pinpoint with aggressive rescale
    for g in param_groups:
        for p in g['params']:
            p.grad = None
    pinpoint_backward(losses, loss_weights, param_groups, memory_grad_scale=0.25)

    # Memory params: only checked when the first one actually received a grad.
    memory_group = next((g for g in param_groups if g.get('name') == 'memory'), None)
    if memory_group and memory_group['params'] and memory_group['params'][0].grad is not None:
        for p in memory_group['params']:
            assert p.grad is not None, "memory param should have grad"
    # Verify lm_core params still get gradients
    lm_group = next((g for g in param_groups if g.get('name') == 'lm_core'), None)
    if lm_group and lm_group['params']:
        lm_has_grad = any(p.grad is not None for p in lm_group['params'])
        assert lm_has_grad, "lm_core should have grads from lm loss"
    print(" PASS test_pinpoint_backward_rescale_effect")

def test_moe_router_h_with_h_t():
    """With lstm_enabled and an h_t supplied, backward must leave `_hook_grad_T_sign` on router_h."""
    moe = SharedProjectionMoE(
        hidden_size=64, num_experts=4, top_k=2,
        core_rank=16, shared_inter=128, noise_std=0.0,
    )
    moe.lstm_enabled = True
    tokens = torch.randn(2, 10, 64)
    hidden_state = torch.randn(2, 64)
    out, _aux = moe(tokens, h_t=hidden_state)
    assert out.shape == (2, 10, 64)
    out.sum().backward()
    assert hasattr(moe.router_h, '_hook_grad_T_sign'), "router_h not trained"
    print(" PASS test_moe_router_h_with_h_t")

def test_moe_router_without_h_t():
    """With h_t=None, backward must leave `_hook_grad_T_sign` on the plain router."""
    moe = SharedProjectionMoE(
        hidden_size=64, num_experts=4, top_k=2,
        core_rank=16, shared_inter=128, noise_std=0.0,
    )
    tokens = torch.randn(2, 10, 64)
    out, _aux = moe(tokens, h_t=None)
    assert out.shape == (2, 10, 64)
    out.sum().backward()
    assert hasattr(moe.router, '_hook_grad_T_sign'), "original router not trained when no h_t"
    print(" PASS test_moe_router_without_h_t")

def test_memgram_shapes():
    """MemGram without conv codes returns a (4, 20, 512) output and a 0-dim decay term."""
    memgram = MemGram(struct_primes=[101, 103, 107, 109], conv_primes=[53, 59, 61, 67])
    vq_idx = torch.randint(0, 100, (4, 20))
    hidden = torch.randn(4, 20, 512)
    out, decay = memgram(vq_idx, None, None, hidden, timestep=100)
    assert out.shape == (4, 20, 512), f"output shape {out.shape}"
    assert decay.ndim == 0
    print(" PASS test_memgram_shapes")

def test_memgram_hash_indices():
    """_hash_pairs yields one column per prime, and each column stays below its prime."""
    memgram = MemGram(struct_primes=[101, 103], conv_primes=[53, 59])
    prev_codes = torch.randint(0, 100, (4, 19))
    curr_codes = torch.randint(0, 100, (4, 19))
    h = memgram._hash_pairs(prev_codes, curr_codes, [101, 103])
    assert h.shape == (4, 19, 2), f"hash shape {h.shape}"
    for column, prime in enumerate((101, 103)):
        assert (h[..., column] < prime).all(), "hash exceeds prime range"
    print(" PASS test_memgram_hash_indices")

def test_memgram_bilinear_gate_range():
    """MemGram with key_dim=8 and embed_dim=16 returns a finite (2, 10, 512) output."""
    memgram = MemGram(struct_primes=[101, 103], conv_primes=[53, 59], key_dim=8, embed_dim=16)
    vq_idx = torch.randint(0, 80, (2, 10))
    hidden = torch.randn(2, 10, 512)
    out, _decay = memgram(vq_idx, None, None, hidden, timestep=50)
    assert out.shape == (2, 10, 512)
    assert torch.isfinite(out).all()
    print(" PASS test_memgram_bilinear_gate_range")

def test_memgram_decay_formula():
    """_compute_decay(0, 0, 100) must equal sigmoid(0) * exp(-exp(0) * 100)."""
    memgram = MemGram(struct_primes=[101, 103], conv_primes=[53, 59])
    zero_s = torch.zeros(1)
    zero_r = torch.zeros(1)
    decay = memgram._compute_decay(zero_s, zero_r, torch.tensor(100.0))
    gate = torch.sigmoid(torch.zeros(1))
    falloff = torch.exp(-torch.exp(torch.zeros(1)) * 100.0)
    expected = gate * falloff
    assert torch.allclose(decay, expected, atol=1e-6), f"decay {decay} != {expected}"
    print(" PASS test_memgram_decay_formula")

def test_memgram_gradient_flow():
    """Backpropagating out.sum() + decay must give struct_emb[0] a non-zero gradient."""
    memgram = MemGram(struct_primes=[101, 103], conv_primes=[53, 59], embed_dim=16, key_dim=8, hidden_dim=64)
    vq_idx = torch.randint(0, 80, (2, 10))
    hidden = torch.randn(2, 10, 64)
    out, decay = memgram(vq_idx, None, None, hidden, timestep=50)
    (out.sum() + decay).backward()
    assert memgram.struct_emb[0].grad is not None, "no gradient to struct_emb"
    assert memgram.struct_emb[0].grad.abs().sum().item() > 0
    print(" PASS test_memgram_gradient_flow")

def test_memgram_conv_path():
    """Check that supplying conversation codes changes MemGram's output."""
    memgram = MemGram(
        struct_primes=[101, 103],
        conv_primes=[53, 59],
        embed_dim=16,
        key_dim=8,
        hidden_dim=64,
    )
    batch, seq_len, width = 2, 10, 64
    codes = torch.randint(0, 80, (batch, seq_len))
    hidden = torch.randn(batch, seq_len, width)
    # Baseline: both conversation-code arguments are None.
    baseline, _ = memgram(codes, None, None, hidden, timestep=50)
    conv_code = torch.randint(0, 50, (batch,))
    # Same inputs, with the same code tensor passed for both conversation slots.
    with_conv, _ = memgram(codes, conv_code, conv_code, hidden, timestep=50)
    assert not torch.allclose(baseline, with_conv, atol=1e-6), "conv path should change output"
    print(" PASS test_memgram_conv_path")

def test_conv_vq_shapes():
    """Check the shapes of the three values returned by an enabled ConvVQCodebook."""
    codebook = ConvVQCodebook(codebook_size=16, code_dim=8)
    batch, width = 4, 512
    inputs = torch.randn(batch, width)
    code, quantized, commitment = codebook(inputs, step=500, enabled=True)
    assert code.shape == (batch,), f"code shape {code.shape}"
    assert quantized.shape == (batch, width), f"quantized shape {quantized.shape}"
    # The commitment term must be a 0-dim (scalar) tensor.
    assert commitment.ndim == 0
    print(" PASS test_conv_vq_shapes")

def test_conv_vq_hard_cap():
    """Feed 12 batches into an 8-entry codebook and check `n_active` ends at 8."""
    codebook = ConvVQCodebook(codebook_size=8, code_dim=8)
    for step in range(12):
        codebook(torch.randn(4, 512), step=step, enabled=True)
    n_active = codebook.n_active.item()
    assert n_active == 8, f"n_active={n_active} (should be 8)"
    print(" PASS test_conv_vq_hard_cap")

def test_conv_vq_deferred_activation():
    """With `enabled=False`, codes must be all-zero longs and the commitment 0."""
    codebook = ConvVQCodebook(codebook_size=8, code_dim=8)
    batch = 4
    inputs = torch.randn(batch, 512)
    code, _quantized, commitment = codebook(inputs, step=500, enabled=False)
    zero_codes = torch.zeros(batch, dtype=torch.long)
    assert torch.equal(code, zero_codes), "code should be zeros when disabled"
    assert commitment.item() == 0.0, "commitment should be 0 when disabled"
    print(" PASS test_conv_vq_deferred_activation")

def test_conv_vq_ema_update():
    """Check that `embed` changes when more inputs arrive after the first four calls."""
    codebook = ConvVQCodebook(codebook_size=4, code_dim=8)
    # First phase: four single-row inputs, each shifted by its step index.
    for step in range(4):
        codebook(torch.randn(1, 512) + step, step=step, enabled=True)
    snapshot = codebook.embed.clone()
    # Second phase: four unshifted inputs at steps 10..13.
    for offset in range(4):
        codebook(torch.randn(1, 512), step=10 + offset, enabled=True)
    updated = codebook.embed.clone()
    assert not torch.allclose(snapshot, updated), "entries should change through EMA + replacement"
    print(" PASS test_conv_vq_ema_update")

def test_conv_vq_persistence():
    """Round-trip a ConvVQCodebook through `state_dict` / `load_state_dict`."""
    source = ConvVQCodebook(codebook_size=8, code_dim=8)
    source(torch.randn(2, 512), step=0, enabled=True)
    saved = source.state_dict()

    restored = ConvVQCodebook(codebook_size=8, code_dim=8)
    restored.load_state_dict(saved)

    assert torch.allclose(source.embed, restored.embed)
    # The remaining state is compared for exact equality.
    for attr in ("timestamps", "n_active", "cluster_size"):
        assert torch.equal(getattr(source, attr), getattr(restored, attr))
    print(" PASS test_conv_vq_persistence")

def test_conv_vq_fuzzy_retrieve():
    """After three updates, `fuzzy_retrieve(top_k=3)` must return three indices and similarities."""
    codebook = ConvVQCodebook(codebook_size=16, code_dim=8)
    for step in range(3):
        codebook(torch.randn(2, 512), step=step, enabled=True)
    top_k = 3
    # The query has 8 elements, the same number as code_dim above.
    query = torch.randn(8)
    idx, sim = codebook.fuzzy_retrieve(query, top_k=top_k)
    assert idx.numel() == top_k, f"retrieved {idx.numel()} elements, expected 3"
    assert sim.numel() == top_k
    print(" PASS test_conv_vq_fuzzy_retrieve")

def test_conv_vq_commitment_nonneg():
    """The commitment value from an enabled ConvVQCodebook must not be negative."""
    codebook = ConvVQCodebook(codebook_size=8, code_dim=8)
    inputs = torch.randn(2, 512)
    _code, _quantized, commitment = codebook(inputs, step=0, enabled=True)
    assert commitment.item() >= 0, f"negative commitment {commitment.item()}"
    print(" PASS test_conv_vq_commitment_nonneg")

def test_lstm_shapes():
    """Check that all five state tensors from ConversationLSTM are (4, 64) and `reg` is scalar."""
    batch, hidden = 4, 64
    lstm = ConversationLSTM(input_dim=hidden, hidden_dim=hidden)
    x = torch.randn(batch, hidden)
    h_out, c_focus, h_topic, c_topic, c_proj, reg = lstm(x, None)
    outputs = (
        ("h_out", h_out),
        ("c_focus", c_focus),
        ("h_topic", h_topic),
        ("c_topic", c_topic),
        ("c_proj", c_proj),
    )
    for label, tensor in outputs:
        assert tensor.shape == (batch, hidden), f"{label} shape {tensor.shape}"
    assert reg.ndim == 0
    print(" PASS test_lstm_shapes")

def test_lstm_forget_gate_bias():
    """Check `bias_ih[64:128]` is 1.0 on the focus cell and 1.5 on the topic cell."""
    hidden = 64
    lstm = ConversationLSTM(input_dim=hidden, hidden_dim=hidden)
    # The asserted slice is the second hidden-sized chunk of bias_ih.
    forget_slice = slice(hidden, 2 * hidden)
    focus_bias = lstm.focus_cell.bias_ih[forget_slice]
    assert torch.allclose(focus_bias, torch.ones_like(focus_bias)), "focus forget gate bias not 1.0"
    topic_bias = lstm.topic_cell.bias_ih[forget_slice]
    assert torch.allclose(topic_bias, torch.full_like(topic_bias, 1.5)), "topic forget gate bias not 1.5"
    print(" PASS test_lstm_forget_gate_bias")

def test_lstm_bptt_detach():
    """Check `h_out` keeps a grad_fn after 49 calls and loses it on the 50th.

    The LSTM is built with bptt_focus=5 and bptt_topic=10; state is detached
    by the test between calls.
    """
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64, bptt_focus=5, bptt_topic=10)
    x = torch.randn(2, 64)
    state = None
    for _step in range(49):
        h_out, c_focus, h_topic, c_topic, _proj, _reg = lstm(x, state)
        state = tuple(t.detach() for t in (h_out, c_focus, h_topic, c_topic))
    assert h_out.grad_fn is not None, "grad_fn should exist before BPTT boundary"
    # 50th call.
    h_out, c_focus, h_topic, c_topic, _proj, _reg = lstm(x, state)
    assert h_out.grad_fn is None, "h_out should be detached at BPTT focus boundary"
    print(" PASS test_lstm_bptt_detach")

def test_lstm_hidden_reg():
    """Check that the regulariser returned by ConversationLSTM equals mean(h_out ** 2).

    `reg` and `h_out` are taken from the same forward call. Comparing values
    from two separate calls would only be valid if both calls gave identical
    output, and the extra call would also advance the module's internal state.
    """
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    x = torch.randn(2, 64)
    h_out, _, _, _, _, reg = lstm(x, None)
    expected = (h_out ** 2).mean()
    assert torch.allclose(reg, expected, atol=1e-6), f"reg {reg} != expected {expected}"
    print(" PASS test_lstm_hidden_reg")

def test_lstm_c_t_proj_ternary():
    """Both cell-state projections of ConversationLSTM must be TernaryScaleTensor modules."""
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    for attr in ("c_focus_proj", "c_topic_proj"):
        assert isinstance(getattr(lstm, attr), TernaryScaleTensor), f"{attr} not TernaryScaleTensor"
    print(" PASS test_lstm_c_t_proj_ternary")

def test_memory_modules_backward_compat():
    """A default ARBModel forward with targets returns batch-2 logits and non-None losses."""
    model = ARBModel()
    batch = 2
    tokens = torch.randint(0, VOCAB, (batch, CTX))
    # Targets are the input with its first three positions dropped.
    logits, losses, _indices, _mem = model(tokens, targets=tokens[:, 3:])
    assert logits.shape[0] == batch
    assert losses is not None
    print(" PASS test_memory_modules_backward_compat")


# ===== Phase 7: Forward Pipeline Integration Tests =====

def test_forward_no_memory_backward_compat():
    """With default flags, forward returns (2, 64, VOCAB) logits and a None memory state."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    logits, _losses, _indices, mem_state = model(tokens, targets=tokens[:, 3:])
    assert logits.shape == (2, 64, VOCAB), f"logits shape {logits.shape}"
    assert mem_state is None, f"memory_state should be None when lstm disabled, got {mem_state}"
    print(" PASS test_forward_no_memory_backward_compat")

def test_forward_lstm_enabled_h_t_passed():
    """With `lstm_enabled`, forward returns a 4-tuple memory state of (2, 512) tensors."""
    model = ARBModel()
    model.lstm_enabled = True
    tokens = torch.randint(0, VOCAB, (2, 66))
    _logits, _losses, _indices, mem_state = model(
        tokens, targets=tokens[:, 3:], memory_state=None, timestep=0
    )
    assert mem_state is not None, "memory_state should be tuple when lstm enabled"
    h_out, c_focus, h_topic, c_topic = mem_state
    state_parts = (
        ("h_out", h_out),
        ("c_focus", c_focus),
        ("h_topic", h_topic),
        ("c_topic", c_topic),
    )
    for label, tensor in state_parts:
        assert tensor.shape == (2, 512), f"{label} shape {tensor.shape}"
    print(" PASS test_forward_lstm_enabled_h_t_passed")

def test_forward_lstm_c_t_residual():
    """Logits must differ between a run with the LSTM off and a run with it on."""
    model = ARBModel()
    tokens = torch.randint(0, VOCAB, (2, 66))
    targets = tokens[:, 3:]
    # First pass uses the model's default flags.
    logits_off, _l0, _i0, _m0 = model(tokens, targets=targets)
    model.lstm_enabled = True
    logits_on, _l1, _i1, _m1 = model(tokens, targets=targets, memory_state=None, timestep=0)
    assert not torch.allclose(logits_off, logits_on, atol=1e-4), "c_t_proj should modify output"
    print(" PASS test_forward_lstm_c_t_residual")

def test_forward_memgram_injection():
    """With `memgram_enabled`, the returned losses must carry `memgram_decay_reg`."""
    model = ARBModel()
    model.memgram_enabled = True
    tokens = torch.randint(0, VOCAB, (2, 66))
    _logits, losses, _indices, _mem = model(tokens, targets=tokens[:, 3:], timestep=100)
    assert losses.memgram_decay_reg is not None, "memgram_decay_reg should be set"
    print(" PASS test_forward_memgram_injection")

def test_forward_conv_vq_deferred():
    """With conv-VQ enabled but `_conv_vq_ready` False, the commitment is None or 0."""
    model = ARBModel()
    model.conv_vq_enabled = True
    model._conv_vq_ready = False
    tokens = torch.randint(0, VOCAB, (2, 66))
    _logits, losses, _indices, _mem = model(tokens, targets=tokens[:, 3:], timestep=100)
    commitment = losses.conv_vq_commitment
    deferred = commitment is None or commitment.item() == 0.0
    assert deferred, f"conv_vq should be deferred, got {commitment}"
    print(" PASS test_forward_conv_vq_deferred")

def test_generate_carries_lstm_state():
    """With the LSTM enabled, `generate` extends a (1, 10) prompt to (1, 20) long tokens."""
    model = ARBModel()
    model.lstm_enabled = True
    prompt_len, new_tokens = 10, 10
    prompt = torch.zeros((1, prompt_len), dtype=torch.long)
    out = model.generate(prompt, max_new_token=new_tokens)
    assert out.shape == (1, prompt_len + new_tokens), f"output shape {out.shape}"
    assert out.dtype == torch.long
    print(" PASS test_generate_carries_lstm_state")


# ===== Phase 7: Training Schedule Tests =====

def test_memory_schedule_warmup():
    """At step 0 of 10000, every flag from `compute_memory_schedule` must be falsy."""
    from train import compute_memory_schedule
    step, total_steps = 0, 10000
    lstm_on, memgram_on, conv_vq_on, decay_reg_on = compute_memory_schedule(step, total_steps)
    anything_on = lstm_on or memgram_on or conv_vq_on or decay_reg_on
    assert not anything_on, "all off during warmup"
    print(" PASS test_memory_schedule_warmup")

def test_memory_schedule_lstm_first():
    """At step 2500 of 10000, the LSTM flag is on and the MemGram flag is off."""
    from train import compute_memory_schedule
    step, total_steps = 2500, 10000
    flags = compute_memory_schedule(step, total_steps)
    lstm_on, memgram_on, _conv_vq_on, _decay_reg_on = flags
    assert lstm_on, "lstm should be on after warmup"
    assert not memgram_on, "memgram should be off at 25%"
    print(" PASS test_memory_schedule_lstm_first")

def test_memory_schedule_memgram_second():
    """At step 3500 of 10000 with vq_utilization=0.4: LSTM, MemGram and conv-VQ on; decay_reg off."""
    from train import compute_memory_schedule
    step, total_steps = 3500, 10000
    lstm_on, memgram_on, conv_vq_on, decay_reg_on = compute_memory_schedule(
        step, total_steps, vq_utilization=0.4
    )
    modules_on = lstm_on and memgram_on and conv_vq_on
    assert modules_on, "lstm+memgram+conv_vq on at 35% with util>30%"
    assert not decay_reg_on, "decay_reg should be off at 35%"
    print(" PASS test_memory_schedule_memgram_second")

def test_memory_schedule_all_on():
    """At step 5000 of 10000 with vq_utilization=0.4, all four schedule flags are truthy."""
    from train import compute_memory_schedule
    step, total_steps = 5000, 10000
    lstm_on, memgram_on, conv_vq_on, decay_reg_on = compute_memory_schedule(
        step, total_steps, vq_utilization=0.4
    )
    everything_on = lstm_on and memgram_on and conv_vq_on and decay_reg_on
    assert everything_on, "all on at 50%"
    print(" PASS test_memory_schedule_all_on")

def test_memory_schedule_conv_vq_requires_vq_util():
    """At step 4000 of 10000 with vq_utilization=0.1, the LSTM is on but conv-VQ is off."""
    from train import compute_memory_schedule
    step, total_steps = 4000, 10000
    flags = compute_memory_schedule(step, total_steps, vq_utilization=0.1)
    lstm_on, _memgram_on, conv_vq_on, _decay_reg_on = flags
    assert lstm_on, "lstm should be on"
    assert not conv_vq_on, "conv_vq should be off when util < 30%"
    print(" PASS test_memory_schedule_conv_vq_requires_vq_util")

def test_memory_schedule_decay_reg_last():
    """At step 4500 of 10000 with vq_utilization=0.4, the decay_reg flag is on."""
    from train import compute_memory_schedule
    step, total_steps = 4500, 10000
    flags = compute_memory_schedule(step, total_steps, vq_utilization=0.4)
    _lstm_on, _memgram_on, _conv_vq_on, decay_reg_on = flags
    assert decay_reg_on, "decay_reg should be on at 45%"
    print(" PASS test_memory_schedule_decay_reg_last")

def test_lstm_state_reset_per_batch():
    """Run two no-grad forwards from a None memory state and compare state shapes.

    Only the shapes of `h_out` and `c_focus` are compared; values are not.
    """
    model = ARBModel()
    model.lstm_enabled = True
    model.eval()
    tokens = torch.randint(0, VOCAB, (2, 66))
    with torch.no_grad():
        _lg1, _ls1, _ix1, first_state = model(tokens, memory_state=None, timestep=0)
        _lg2, _ls2, _ix2, second_state = model(tokens, memory_state=None, timestep=1)
    h_first, c_focus_first, _h_topic_first, _c_topic_first = first_state
    h_second, c_focus_second, _h_topic_second, _c_topic_second = second_state
    assert h_first.shape == h_second.shape, "h_out shapes should match"
    assert c_focus_first.shape == c_focus_second.shape, "c_focus shapes should match"
    print(" PASS test_lstm_state_reset_per_batch")

def test_bptt_counter_separate():
    """Check `step_count` is 4 after four calls and `c_focus` has no grad_fn on the fifth.

    The LSTM is built with bptt_focus=5 and bptt_topic=10.
    """
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64, bptt_focus=5, bptt_topic=10)
    x = torch.randn(2, 64)
    state = None
    for _step in range(4):
        h_out, c_f, h_t, c_t, _proj, _reg = lstm(x, state)
        state = tuple(t.detach() for t in (h_out, c_f, h_t, c_t))
    assert lstm.step_count == 4, f"step_count={lstm.step_count}"
    # Fifth call.
    h_out, c_f, h_t, c_t, _proj, _reg = lstm(x, state)
    assert c_f.grad_fn is None, "c_focus should be detached at BPTT focus boundary"
    print(" PASS test_bptt_counter_separate")


# ===== FocusGate Tests =====

def test_focus_gate_no_boundary():
    """With no boundary signal, FocusGate returns all-ones reset (2, 1) and dampen (2, 64)."""
    batch, hidden = 2, 64
    gate = FocusGate(hidden_dim=hidden)
    x = torch.randn(batch, hidden)
    reset, dampen = gate(x, boundary_signal=None)
    assert reset.shape == (batch, 1), f"reset shape {reset.shape}"
    assert dampen.shape == (batch, hidden), f"dampen shape {dampen.shape}"
    reset_is_identity = torch.allclose(reset, torch.ones_like(reset))
    assert reset_is_identity, "reset should be 1.0 for no boundary"
    dampen_is_identity = torch.allclose(dampen, torch.ones_like(dampen))
    assert dampen_is_identity, "dampen should be 1.0 for no boundary"
    print(" PASS test_focus_gate_no_boundary")

def test_focus_gate_boundary_signal():
    """For a BOS boundary, check gate shapes and that each gate's mean lies in [0, 1]."""
    batch, hidden = 2, 64
    gate = FocusGate(hidden_dim=hidden)
    x = torch.randn(batch, hidden)
    reset_bos, dampen_bos = gate(x, boundary_signal=SPECIAL_VOCAB['BOS'])
    assert reset_bos.shape == (batch, 1)
    assert dampen_bos.shape == (batch, hidden)
    # Only the mean of each gate is range-checked, not every element.
    assert 0.0 <= reset_bos.mean().item() <= 1.0, f"reset out of range: {reset_bos.mean().item()}"
    assert 0.0 <= dampen_bos.mean().item() <= 1.0, f"dampen out of range: {dampen_bos.mean().item()}"
    print(" PASS test_focus_gate_boundary_signal")

def test_focus_gate_all_boundary_types():
    """BOS, SYSTEM, USER and ASSISTANT must each give mean reset below 1; no boundary gives 1."""
    gate = FocusGate(hidden_dim=64)
    x = torch.randn(2, 64)
    boundary_tokens = [
        (name, SPECIAL_VOCAB[name])
        for name in ('BOS', 'SYSTEM', 'USER', 'ASSISTANT')
    ]
    for tok_name, tok_id in boundary_tokens:
        reset, _dampen = gate(x, boundary_signal=tok_id)
        assert reset.mean().item() < 1.0, f"{tok_name} should reduce reset from 1.0"
    # Control: without a boundary signal the reset gate stays at one.
    no_reset, _ = gate(x, boundary_signal=None)
    assert torch.allclose(no_reset, torch.ones_like(no_reset)), "no-boundary reset should be 1.0"
    print(" PASS test_focus_gate_all_boundary_types")

def test_focus_gate_unknown_token():
    """A boundary signal of 9999 must give all-ones reset and dampen gates."""
    gate = FocusGate(hidden_dim=64)
    x = torch.randn(2, 64)
    reset, dampen = gate(x, boundary_signal=9999)
    for label, values in (("reset", reset), ("dampen", dampen)):
        unchanged = torch.allclose(values, torch.ones_like(values))
        assert unchanged, f"unknown token should return {label}=1.0"
    print(" PASS test_focus_gate_unknown_token")

def test_focus_gate_c_focus_modulation():
    """Multiplying a cell state by the gates changes it on a USER boundary and not otherwise."""
    gate = FocusGate(hidden_dim=64)
    x = torch.randn(2, 64)
    c_focus = torch.randn(2, 64)
    original = c_focus.clone()

    # With a USER boundary the gated state must differ from the original.
    reset, dampen = gate(x, boundary_signal=SPECIAL_VOCAB['USER'])
    gated = c_focus * reset * dampen
    assert not torch.allclose(gated, original, atol=1e-6), "focus gate should modify c_focus on boundary"

    # With no boundary the gated state must equal the original.
    reset_none, dampen_none = gate(x, boundary_signal=None)
    untouched = c_focus * reset_none * dampen_none
    assert torch.allclose(untouched, c_focus, atol=1e-6), "focus gate should not modify c_focus without boundary"
    print(" PASS test_focus_gate_c_focus_modulation")


# ===== ConversationStack Tests =====

def test_conv_stack_push_pop():
    """A state pushed under "conv_1" must come back from `pop` with all four tensors intact."""
    stack = ConversationStack(max_conversations=4, hidden_dim=64)
    h, c_f, h_t, c_t = (torch.randn(64) for _ in range(4))
    stack.push("conv_1", h, c_f, h_t, c_t, "cpu")
    restored = stack.pop("conv_1", "cpu")
    assert restored is not None, "pop should find pushed conversation"
    h_rest, c_f_rest, h_t_rest, c_t_rest = restored
    comparisons = (
        ("h_focus", h_rest, h),
        ("c_focus", c_f_rest, c_f),
        ("h_topic", h_t_rest, h_t),
        ("c_topic", c_t_rest, c_t),
    )
    for label, got, want in comparisons:
        assert torch.allclose(got, want, atol=1e-6), f"{label} not preserved"
    print(" PASS test_conv_stack_push_pop")

def test_conv_stack_pop_missing():
    """Popping an id that was never pushed must return None."""
    empty_stack = ConversationStack(max_conversations=4, hidden_dim=64)
    missing = empty_stack.pop("nonexistent", "cpu")
    assert missing is None, "pop on nonexistent should return None"
    print(" PASS test_conv_stack_pop_missing")

def test_conv_stack_clear():
    """After `clear("conv_1")`, popping "conv_1" must return None."""
    stack = ConversationStack(max_conversations=4, hidden_dim=64)
    state = [torch.randn(64) for _ in range(4)]
    stack.push("conv_1", *state, "cpu")
    stack.clear("conv_1")
    leftover = stack.pop("conv_1", "cpu")
    assert leftover is None, "cleared conversation should not be found"
    print(" PASS test_conv_stack_clear")

def test_conv_stack_lru_eviction():
    """Pushing three conversations into a two-slot stack must drop the first one pushed."""
    stack = ConversationStack(max_conversations=2, hidden_dim=64)
    for slot in range(3):
        # Each conversation's first tensor is filled with its own index.
        marker = torch.full((64,), float(slot))
        stack.push(f"conv_{slot}", marker, torch.zeros(64), torch.zeros(64), torch.zeros(64), "cpu")
    evicted = stack.pop("conv_0", "cpu")
    assert evicted is None, "conv_0 should be evicted (LRU)"
    for conv_id in ("conv_1", "conv_2"):
        survivor = stack.pop(conv_id, "cpu")
        assert survivor is not None, f"{conv_id} should still exist"
    print(" PASS test_conv_stack_lru_eviction")

def test_conv_stack_reset():
    """`reset()` must remove every stored conversation and set `active_slot` to -1."""
    stack = ConversationStack(max_conversations=4, hidden_dim=64)
    conv_ids = ("conv_1", "conv_2")
    for conv_id in conv_ids:
        stack.push(conv_id, torch.randn(64), torch.randn(64), torch.randn(64), torch.randn(64), "cpu")
    stack.reset()
    for conv_id in conv_ids:
        assert stack.pop(conv_id, "cpu") is None, f"{conv_id} should be gone after reset"
    assert stack.active_slot == -1, "active_slot should be -1 after reset"
    print(" PASS test_conv_stack_reset")


# ===== ConversationLSTM Tests =====

def test_conversation_lstm_forward():
    """One forward step with bptt_focus=50 / bptt_topic=200 returns (4, 64) tensors and a scalar `reg`."""
    batch, hidden = 4, 64
    lstm = ConversationLSTM(input_dim=hidden, hidden_dim=hidden, bptt_focus=50, bptt_topic=200)
    x = torch.randn(batch, hidden)
    h_out, c_focus, h_topic, c_topic, c_proj, reg = lstm(x, None)
    for tensor in (h_out, c_focus, h_topic, c_topic, c_proj):
        assert tensor.shape == (batch, hidden)
    assert reg.ndim == 0
    print(" PASS test_conversation_lstm_forward")

def test_conversation_lstm_dual_state():
    """After eleven steps on the same input, both cell states must be non-zero."""
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    x = torch.randn(2, 64)
    h_out, c_focus, h_topic, c_topic, _proj, _reg = lstm(x, None)
    for _step in range(10):
        # Carry the previous state forward, detached from the graph.
        carried = (h_out.detach(), c_focus.detach(), h_topic.detach(), c_topic.detach())
        h_out, c_focus, h_topic, c_topic, _proj, _reg = lstm(x, carried)
    assert c_focus.abs().sum() > 0, "c_focus should be nonzero after steps"
    assert c_topic.abs().sum() > 0, "c_topic should be nonzero after steps"
    print(" PASS test_conversation_lstm_dual_state")

def test_conversation_lstm_boundary_reset():
    """Check that a BOS boundary step leaves `lstm._last_reset` below 1.0.

    The LSTM is advanced six steps on the same input (state detached between
    steps), then stepped once more with `boundary_signal=BOS`. Only the
    recorded `_last_reset` value is asserted; the cell state before and after
    the boundary is not compared.
    """
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    x = torch.randn(2, 64)
    h_out, c_focus, h_topic, c_topic, _, _ = lstm(x, None)
    for _ in range(5):
        h_out, c_focus, h_topic, c_topic, _, _ = lstm(x, (h_out.detach(), c_focus.detach(), h_topic.detach(), c_topic.detach()))
    # The outputs of the boundary step are not needed; the call is made for
    # its effect on lstm._last_reset.
    lstm(x, (h_out.detach(), c_focus.detach(), h_topic.detach(), c_topic.detach()), boundary_signal=SPECIAL_VOCAB['BOS'])
    assert lstm._last_reset < 1.0, "BOS boundary should reduce reset from 1.0"
    print(" PASS test_conversation_lstm_boundary_reset")

def test_conversation_lstm_topic_gate():
    """After a single step from a None state, `h_topic` must be non-zero."""
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    x = torch.randn(2, 64)
    _h_out, _c_focus, h_topic_0, _c_topic, _proj, _reg = lstm(x, None)
    assert h_topic_0.abs().sum() > 0, "h_topic should be nonzero after one step"
    print(" PASS test_conversation_lstm_topic_gate")

def test_conversation_lstm_bptt_dual_windows():
    """With bptt_focus=3, `h_out` has a grad_fn after two calls and `c_focus` has none on the third."""
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64, bptt_focus=3, bptt_topic=6)
    x = torch.randn(2, 64)
    state = None
    for _step in range(2):
        h_out, c_f, h_t, c_t, _proj, _reg = lstm(x, state)
        state = tuple(t.detach() for t in (h_out, c_f, h_t, c_t))
    assert h_out.grad_fn is not None, "h_out should have grad before focus BPTT"
    # Third call.
    h_out, c_f, h_t, c_t, _proj, _reg = lstm(x, state)
    assert c_f.grad_fn is None, "c_focus should be detached at focus BPTT boundary (step 3)"
    print(" PASS test_conversation_lstm_bptt_dual_windows")

def test_conversation_lstm_topic_preserves_on_boundary():
    """On a USER boundary, the topic norm ratio must be at least 0.9x the focus norm ratio.

    Each ratio is the post-boundary cell-state norm divided by the norm just
    before the boundary step, with the divisor floored at 1e-8.
    """
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    x = torch.randn(2, 64)
    state = None
    for _step in range(5):
        h_out, c_f, h_t, c_t, _proj, _reg = lstm(x, state)
        state = tuple(t.detach() for t in (h_out, c_f, h_t, c_t))
    c_topic_before = c_t.clone()
    _h, c_f_after, _ht, c_t_after, _proj, _reg = lstm(x, state, boundary_signal=SPECIAL_VOCAB['USER'])
    topic_floor = max(c_topic_before.norm(), 1e-8)
    decay_ratio = c_t_after.norm() / topic_floor
    focus_floor = max(c_f.norm(), 1e-8)
    focus_decay = c_f_after.norm() / focus_floor
    assert decay_ratio >= focus_decay * 0.9, f"topic should decay less than focus on boundary: topic={decay_ratio:.3f} focus={focus_decay:.3f}"
    print(" PASS test_conversation_lstm_topic_preserves_on_boundary")

def test_conversation_lstm_h_out_is_sum():
    """Run one eval-mode, no-grad step and check `h_out` and `c_proj` are (2, 64).

    Despite the name, only these two shapes are asserted.
    """
    batch, hidden = 2, 64
    lstm = ConversationLSTM(input_dim=hidden, hidden_dim=hidden)
    lstm.eval()
    x = torch.randn(batch, hidden)
    with torch.no_grad():
        h_out, _c_f, _h_topic, _c_topic, c_proj, _reg = lstm(x, None)
    for tensor in (h_out, c_proj):
        assert tensor.shape == (batch, hidden)
    print(" PASS test_conversation_lstm_h_out_is_sum")

def test_extract_boundary_from_input():
    """`_extract_boundary_from_input` finds BOS and USER tokens and returns None otherwise."""
    # BOS in the first position.
    bos_id = SPECIAL_VOCAB['BOS']
    found = _extract_boundary_from_input(torch.tensor([[bos_id, 10, 20]]))
    assert found == bos_id, "should detect BOS"
    # USER in the middle of the sequence.
    user_id = SPECIAL_VOCAB['USER']
    found = _extract_boundary_from_input(torch.tensor([[10, user_id, 20]]))
    assert found == user_id, "should detect USER"
    # Tokens 10, 20, 30 contain no boundary.
    found = _extract_boundary_from_input(torch.tensor([[10, 20, 30]]))
    assert found is None, "should return None for no boundary"
    print(" PASS test_extract_boundary_from_input")


# ===== Integration: ConversationLSTM + Model =====

def test_model_switch_conversation():
    """`switch_conversation("conv_A")` must record "conv_A" as the stack's current id."""
    model = ARBModel()
    model.lstm_enabled = True
    conv_id = "conv_A"
    model.switch_conversation(conv_id)
    current = model.lstm.conv_stack._current_conv_id
    assert current == conv_id
    print(" PASS test_model_switch_conversation")

def test_model_reset_conversation():
    """Resetting the active conversation must leave the stack's current id as None."""
    model = ARBModel()
    model.lstm_enabled = True
    conv_id = "conv_A"
    model.switch_conversation(conv_id)
    model.reset_conversation(conv_id)
    current = model.lstm.conv_stack._current_conv_id
    assert current is None
    print(" PASS test_model_reset_conversation")

def test_model_generate_with_conversation_id():
    """`generate` with a conversation id extends a (1, 10) prompt by five tokens."""
    model = ARBModel()
    model.lstm_enabled = True
    prompt_len, new_tokens = 10, 5
    prompt = torch.zeros((1, prompt_len), dtype=torch.long)
    out = model.generate(prompt, max_new_token=new_tokens, conversation_id="conv_test")
    assert out.shape == (1, prompt_len + new_tokens), f"output shape {out.shape}"
    print(" PASS test_model_generate_with_conversation_id")

def test_conversation_lstm_ternary_projections():
    """`c_focus_proj` and `c_topic_proj` must both be TernaryScaleTensor instances."""
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    for projection in (lstm.c_focus_proj, lstm.c_topic_proj):
        assert isinstance(projection, TernaryScaleTensor)
    print(" PASS test_conversation_lstm_ternary_projections")

def test_conversation_lstm_focus_gate_params():
    """Check the types of ConversationLSTM's focus gate and its three sub-modules."""
    lstm = ConversationLSTM(input_dim=64, hidden_dim=64)
    gate = lstm.focus_gate
    assert isinstance(gate, FocusGate)
    expected_types = (
        ("boundary_embed", nn.Embedding),
        ("reset_fc", nn.Linear),
        ("dampen_fc", nn.Linear),
    )
    for attr, module_type in expected_types:
        # Attribute access goes through lstm.focus_gate each time.
        assert isinstance(getattr(lstm.focus_gate, attr), module_type)
    print(" PASS test_conversation_lstm_focus_gate_params")

def test_conversation_lstm_topic_cell_bias():
    """`topic_cell.bias_ih[64:128]` must be filled with 1.5."""
    hidden = 64
    lstm = ConversationLSTM(input_dim=hidden, hidden_dim=hidden)
    bias_topic = lstm.topic_cell.bias_ih[hidden:2 * hidden]
    target = torch.full_like(bias_topic, 1.5)
    assert torch.allclose(bias_topic, target), f"topic forget gate bias should be 1.5, got {bias_topic.mean().item()}"
    print(" PASS test_conversation_lstm_topic_cell_bias")


if __name__ == "__main__":
    # Ordered registry of the tests in this file; the runner below calls them
    # one after another in exactly this order.
    tests = [
        test_sticky_zone_ste,
        test_sticky_zone_ste_dtype_preservation,
        test_scaled_ternary_linear,
        test_rmsnorm,
        test_byte_embedding,
        test_text_sequencer,
        test_trigram_window,
        test_image_sequencer,
        test_image_sequencer_frozen,
        test_target_alignment,
        test_model_forward,
        test_generate,
        test_param_count,
        test_gradient_flow,
        test_model_forward_with_targets,
        test_save_load_roundtrip,
        test_vq_adapter_shapes,
        test_vq_integration,
        test_vq_disabled,
        test_vq_with_targets,
        test_l2_distance_matching,
        test_vq_ternary_projections,
        test_multimodal_vq_bridge_text_only,
        test_multimodal_vq_bridge_text_image,
        test_modality_gate_shapes,
        test_ternary_graph_multicodebook,
        test_vq_no_float_cast_in_model,
        test_zero_fp32_params,
        test_sticky_zone_ste_gradient,
        test_graph_moe_gate_shape,
        test_ternary_graph_shapes,
        test_graph_gradient_flow,
        test_graph_connectivity_monitor,
        test_model_forward_with_graph,
        test_model_graph_disabled,
        test_ternary_graph_in_modules,
        test_moe_shapes,
        test_moe_router,
        test_moe_aux_loss,
        test_shared_expert,
        test_moe_gradient_flow,
        test_moe_zero_fp32,
        test_ternary_graph_with_gate,
        test_model_forward_with_moe,
        test_model_moe_disabled,
        test_model_moe_loss_components,
        test_model_moe_gate_modulation,
        test_param_count_with_moe,
        test_moe_monitoring,
        test_loss_components,
        test_loss_components_none_fields,
        test_loss_components_backward,
        test_gnn_lora_adapter,
        test_gnn_lora_gradient,
        test_shared_gnn_weight_tying,
        test_shared_gnn_multi_hop,
        test_model_losses_components_type,
        test_halting_unit_shapes,
        test_halting_unit_ternary_pure,
        test_graph_act_cell_shapes,
        test_moe_act_cell_shapes,
        test_act_early_halt,
        test_act_weight_sum_one,
        test_act_gradient_flow,
        test_loss_components_ponder_fields,
        test_loss_components_ponder_none,
        test_act_graph_moe_sequential,
        test_model_forward_with_act,
        test_model_act_forward_without_targets,
        test_model_act_loss_components,
        test_model_act_backward,
        test_model_act_disabled,
        test_model_act_warmup_mode,
        test_model_act_ponder_cached,
        test_act_warmup_schedule,
        test_act_ponder_lambda,
        test_model_ponder_lambda_scaling,
        test_text_only_forward,
        test_image_forward,
        test_multimodal_backward,
        test_no_stale_trigram_encoder,
        test_vocab,
        test_memgram_shapes,
        test_memgram_hash_indices,
        test_memgram_bilinear_gate_range,
        test_memgram_decay_formula,
        test_memgram_gradient_flow,
        test_memgram_conv_path,
        test_conv_vq_shapes,
        test_conv_vq_hard_cap,
        test_conv_vq_deferred_activation,
        test_conv_vq_ema_update,
        test_conv_vq_persistence,
        test_conv_vq_fuzzy_retrieve,
        test_conv_vq_commitment_nonneg,
        test_lstm_shapes,
        test_lstm_forget_gate_bias,
        test_lstm_bptt_detach,
        test_lstm_hidden_reg,
        test_lstm_c_t_proj_ternary,
        test_memory_modules_backward_compat,
        test_loss_components_nine_fields_total,
        test_loss_components_nine_fields_log,
        test_moe_router_h_with_h_t,
        test_moe_router_without_h_t,
        test_forward_no_memory_backward_compat,
        test_forward_lstm_enabled_h_t_passed,
        test_forward_lstm_c_t_residual,
        test_forward_memgram_injection,
        test_forward_conv_vq_deferred,
        test_generate_carries_lstm_state,
        test_memory_schedule_warmup,
        test_memory_schedule_lstm_first,
        test_memory_schedule_memgram_second,
        test_memory_schedule_all_on,
        test_memory_schedule_conv_vq_requires_vq_util,
        test_memory_schedule_decay_reg_last,
        test_lstm_state_reset_per_batch,
        test_bptt_counter_separate,
        test_focus_gate_no_boundary,
        test_focus_gate_boundary_signal,
        test_focus_gate_all_boundary_types,
        test_focus_gate_unknown_token,
        test_focus_gate_c_focus_modulation,
        test_conv_stack_push_pop,
        test_conv_stack_pop_missing,
        test_conv_stack_clear,
        test_conv_stack_lru_eviction,
        test_conv_stack_reset,
        test_conversation_lstm_forward,
        test_conversation_lstm_dual_state,
        test_conversation_lstm_boundary_reset,
        test_conversation_lstm_topic_gate,
        test_conversation_lstm_bptt_dual_windows,
        test_conversation_lstm_topic_preserves_on_boundary,
        test_conversation_lstm_h_out_is_sum,
        test_extract_boundary_from_input,
        test_model_switch_conversation,
        test_model_reset_conversation,
        test_model_generate_with_conversation_id,
        test_conversation_lstm_ternary_projections,
        test_conversation_lstm_focus_gate_params,
        test_conversation_lstm_topic_cell_bias,
    ]
    print("Running MORPH tests (Phase 1 + Phase 2 VQ + Phase 3 Graph + Phase 4 MoE + Explore + Phase 5 ACT + Phase 6 Multi-Modal + Phase 7 Memory)...\n")
    passed = 0
    failed = 0
    for t in tests:
        try:
            t()
            passed += 1
        except Exception as e:
            # Catching broadly is deliberate: one failing test must not stop
            # the rest from running. The exception type is printed because
            # bare `assert` statements carry no message of their own.
            print(f" FAIL {t.__name__}: {type(e).__name__}: {e}")
            failed += 1
    print(f"\n{passed} passed, {failed} failed out of {len(tests)} tests")
    # Report failure through the exit status so shells and CI can detect it.
    if failed:
        sys.exit(1)