CLIWorks
/

Spider-FLEXITOKENS

Model card Files Files and versions

xet

Community

CLIWorks committed 14 days ago

Commit

b3b689e

verified ·

1 Parent(s): 20f5e73

Upload spider.py with huggingface_hub

Browse files

Files changed (1) hide show

spider.py +1568 -0

spider.py ADDED Viewed

	@@ -0,0 +1,1568 @@

+#!/usr/bin/env python3
+"""Spider: MoE + RDT (Recurrent-Depth Transformer) architecture v5.
+Canonical architecture ported from mythos-fineweb-moe.py (SpiderPortal v5-Dense)
+with the following adaptations per Phase 02 decisions:
+- Full Spider rebrand (no SpiderPortal prefix) per D-07
+- Byte-level vocab: 272 tokens (256 bytes + 16 specials) per D-06
+- MLA (Multi-Latent Attention) with compressed KV cache per D-10
+- Engram conditional memory at recurrent layers 1 and 4
+- MoE: 32 routed experts + 1 shared expert, top-2 routing (shared-projection MoE per D-20/D-21)
+- Sliding window attention (sliding_window=8192) with 256k context (YaRN factor=8.0)
+- Weight-tied embeddings per v5 canonical config (tie_word_embeddings=True)
+- LTI Injection + ACT Halting + LoRA Adapter for RDT loops
+- BoundaryPredictor + downsample/upsample for FlexiToken integration
+- 272-token byte-level vocab with sentinel tokens for multimodal (D-11)
+Architecture: RDT (2 prelude + 6 recurrent + 2 coda) with:
+  - 2x Prelude (MLA + dense FFN)
+  - 6x Recurrent (MLA + Engram@L1,L4 + MoE) -- with gradient checkpointing
+  - 2x Coda (MLA + dense FFN)
+  - LTI Injection + ACT Halting + LoRA Adapter
+Config: hidden_size=2048, 6 recurrent layers, 32 experts, top-2 routing
+"""
+import math
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+# ============================================================================
+# Spider Configuration
+# ============================================================================
+@dataclass
+class SpiderConfig:
+    """Spider model configuration (hidden_size=2048, byte-level vocab).
+    Based on mythos-fineweb-moe.py SpiderPortalConfig with byte-level
+    tokenization, MLA attention, and Engram memory.
+    Every field has a default, so ``SpiderConfig()`` is a complete config.
+    Fields whose consumers are not visible in this part of the file keep
+    their original short comments only.
+    """
+    # Core architecture
+    vocab_size: int = 272  # 256 bytes + 16 specials (D-06)
+    hidden_size: int = 2048
+    num_hidden_layers: int = 6  # recurrent layers
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 4  # not used directly in MLA but kept for compat
+    intermediate_size: int = 1024  # default SpiderExpert width; also used by SimpleMoE experts
+    hidden_act: str = "silu"
+    # MoE configuration (D-20, D-21: shared-projection MoE)
+    num_experts: int = 32  # number of routed experts
+    num_experts_per_tok: int = 2  # k of the router's top-k selection in SharedProjectionMoE
+    num_shared_experts: int = 1
+    router_aux_loss_coef: float = 0.05
+    shared_intermediate_size: int = 6144  # width of SharedProjectionMoE.shared_up / shared_down
+    expert_core_rank: int = 256  # inner rank of the per-expert W_gate / W_transform factors
+    shared_expert_intermediate_size: int = 7424  # width of the always-active shared expert
+    prelude_coda_intermediate_size: int = 4096  # FFN width of SpiderDenseLayer (prelude/coda)
+    # RDT configuration
+    max_loop_iters: int = 16
+    act_threshold: float = 0.5
+    prelude_layers: int = 2
+    coda_layers: int = 2
+    lora_rank: int = 128
+    loop_embed_dim: int = 128
+    # MLA parameters (DeepSeek-V2 style, scaled for hidden_size=2048)
+    kv_lora_rank: int = 128  # width of the shared compressed K/V latent
+    q_lora_rank: int = 256  # low-rank Q bottleneck; a value <= 0 selects a single full q_proj
+    qk_rope_head_dim: int = 64  # per-head query/key channels that receive RoPE
+    qk_nope_head_dim: int = 64  # per-head query/key channels without positional rotation
+    v_head_dim: int = 64  # per-head value width
+    # Engram parameters (DeepSeek conditional memory, offloaded to CPU)
+    engram_layers: List[int] = field(default_factory=lambda: [1, 4])
+    engram_ngram_orders: Tuple[int, ...] = (2, 3)  # n-gram lengths hashed by SpiderEngram
+    engram_hash_heads: int = 4  # hash heads per n-gram order
+    engram_table_size: int = 8191  # prime, sized for byte vocab=272
+    engram_conv_kernel: int = 4  # kernel size of SpiderEngram's depthwise Conv1d
+    engram_conv_dilation: int = 3  # stored as SpiderEngram.conv_dilation; not passed to the Conv1d there
+    engram_dim: int = 128  # per-head embedding dimension
+    engram_offload: bool = True  # True: table is a persistent buffer; False: table is an nn.Parameter
+    # Attention / RoPE
+    max_position_embeddings: int = 262144  # 256k context
+    rope_theta: float = 10000000.0
+    # "type" == "yarn" makes SpiderMLA rescale its inverse frequencies using
+    # "factor" and "original_max_position_embeddings"; otherwise plain RoPE is used.
+    rope_scaling: Optional[Dict] = field(default_factory=lambda: {
+        "type": "yarn",
+        "factor": 8.0,
+        "original_max_position_embeddings": 32768,
+    })
+    sliding_window: int = 8192  # local attention window
+    attention_dropout: float = 0.0  # SDPA dropout probability, applied in training mode only
+    rms_norm_eps: float = 1e-6
+    initializer_range: float = 0.02
+    # Embeddings / head
+    tie_word_embeddings: bool = True  # per v5 canonical config
+    # Multimodal
+    vision_hidden_size: int = 2048
+    audio_hidden_size: int = 512
+    vision_num_frames: int = 60
+    vision_tokens_per_frame: int = 256
+    vision_temporal_tokens: int = 64
+    vision_temporal_layers: int = 2
+    # Metadata
+    model_type: str = "spider"
+    torch_dtype: str = "bfloat16"
+    # BoundaryPredictor (for FlexiToken integration)
+    bp_d_inner: int = 8192
+    @property
+    def head_dim(self) -> int:
+        """Full per-head query/key width: non-rotary plus rotary channels."""
+        return self.qk_nope_head_dim + self.qk_rope_head_dim  # 128 with the defaults above
+def spider_flexitokens_997m() -> SpiderConfig:
+    """Spider-FLEXITOKENS 995.1M config per D-20."""
+    return SpiderConfig()
+# ============================================================================
+# Sentinel Token Vocabulary (D-06, D-11)
+# ============================================================================
+# 272-token vocab: 256 bytes + 16 specials
+# Sentinel tokens at indices 259-264 mark modality region boundaries
+SENTINEL_TOKENS = {
+    'PAD': 256, 'BOS': 257, 'EOS': 258,
+    'IMG_START': 259, 'IMG_END': 260,
+    'AUD_START': 261, 'AUD_END': 262,
+    'VID_START': 263, 'VID_END': 264,
+    'MASK': 265, 'im_start': 266, 'im_end': 267,
+    'prefix': 268, 'suffix': 269, 'middle': 270,
+    'THINK': 271,
+}
+# Sentinel pairs for modality regions (start_id, end_id)
+_SENTINEL_PAIRS = [
+    (SENTINEL_TOKENS['IMG_START'], SENTINEL_TOKENS['IMG_END']),  # (259, 260)
+    (SENTINEL_TOKENS['AUD_START'], SENTINEL_TOKENS['AUD_END']),  # (261, 262)
+    (SENTINEL_TOKENS['VID_START'], SENTINEL_TOKENS['VID_END']),  # (263, 264)
+]
+# Set of modality sentinel token IDs (259-264 only)
+_MODALITY_SENTINEL_IDS = {259, 260, 261, 262, 263, 264}
+# Reverse mapping (computed once at module level, per IN-01)
+_TOKEN_NAMES_BY_ID = {v: k for k, v in SENTINEL_TOKENS.items()}
+def is_sentinel_token(token_id: int) -> bool:
+    """Return True if token_id is one of the 6 modality sentinel tokens (259-264).
+    These are the sentinel tokens that mark modality region boundaries:
+    IMG_START/END, AUD_START/END, VID_START/END.
+    Other special tokens (PAD, BOS, EOS, MASK, etc.) are NOT modality sentinels.
+    """
+    return token_id in _MODALITY_SENTINEL_IDS
+def create_modality_mask(input_ids: torch.Tensor, strict: bool = True) -> torch.Tensor:
+    """Create boolean mask (B×L) marking sentinel and modality token positions.
+    Per D-11: Sentinel-gated passthrough ensures modality tokens bypass the
+    BoundaryPredictor entirely. This mask marks positions where:
+    - Sentinel tokens (IMG_START/END, AUD_START/END, VID_START/END) appear
+    - Modality tokens (between sentinel pairs) appear
+    The BoundaryPredictor uses this mask to force boundary=1.0 at these
+    positions, ensuring no boundary merging across modality boundaries.
+    Args:
+        input_ids: Token IDs of shape [B, L] with values in 0-271 range.
+        strict: If True, raise on mismatched sentinel pairs (training mode).
+            If False, skip mismatched pairs gracefully (generation mode).
+    Returns:
+        Boolean tensor of shape [B, L], True at sentinel+modality positions.
+    Raises:
+        ValueError: If strict=True and sentinel pairs are mismatched.
+    """
+    B, L = input_ids.shape
+    mask = torch.zeros(B, L, dtype=torch.bool, device=input_ids.device)
+    # Mark direct sentinel token positions
+    for sid in _MODALITY_SENTINEL_IDS:
+        mask |= (input_ids == sid)
+    # Mark regions between sentinel pairs (inclusive of sentinels)
+    for start_id, end_id in _SENTINEL_PAIRS:
+        for b in range(B):
+            starts = (input_ids[b] == start_id).nonzero(as_tuple=True)[0]
+            ends = (input_ids[b] == end_id).nonzero(as_tuple=True)[0]
+            # T-02-04 mitigation: validate sentinel pairs are matched (strict mode only)
+            if strict and len(starts) != len(ends):
+                raise ValueError(
+                    f"Batch {b}: mismatched sentinel pairs — "
+                    f"{len(starts)} {_TOKEN_NAMES_BY_ID[start_id]}(s) vs "
+                    f"{len(ends)} {_TOKEN_NAMES_BY_ID[end_id]}(s). "
+                    f"Every {_TOKEN_NAMES_BY_ID[start_id]} must have a matching "
+                    f"{_TOKEN_NAMES_BY_ID[end_id]}."
+                )
+            # Match pairs min(starts, ends) — skip unmatched in non-strict mode
+            n_pairs = min(len(starts), len(ends))
+            for i in range(n_pairs):
+                s, e = starts[i].item(), ends[i].item()
+                if s > e:
+                    if strict:
+                        raise ValueError(
+                            f"Batch {b}: {_TOKEN_NAMES_BY_ID[start_id]} at position {s} "
+                            f"appears after {_TOKEN_NAMES_BY_ID[end_id]} at position {e}. "
+                            f"Sentinel pairs must be properly ordered."
+                        )
+                    continue
+                mask[b, s:e + 1] = True
+    return mask
+# ============================================================================
+# RMSNorm
+# ============================================================================
+class SpiderRMSNorm(nn.Module):
+    """RMS normalization (bf16-only, no dtype conversions)."""
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=torch.float32))  # IN-02: RMSNorm weight is float32 per convention
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states
+# ============================================================================
+# MLA: Multi-Latent Attention (DeepSeek-V2 style)
+# ============================================================================
+class SpiderMLA(nn.Module):
+    """Multi-Latent Attention with compressed KV cache.
+    For hidden_size=2048, num_heads=16:
+    - qk_nope_head_dim=64, qk_rope_head_dim=64 -> total head_dim=128
+    - kv_lora_rank=128 -> 10.7x compression vs full 2048-dim KV
+    - v_head_dim=64 -> value projection
+        - sliding_window=8192 -> local attention window
+    """
+    def __init__(self, config: SpiderConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.kv_lora_rank = config.kv_lora_rank
+        self.q_lora_rank = config.q_lora_rank
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.v_head_dim = config.v_head_dim
+        self.head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
+        self.sliding_window = getattr(config, 'sliding_window', 0)
+        # Q projection: optional low-rank -> full Q
+        if self.q_lora_rank > 0:
+            self.q_a_proj = nn.Linear(config.hidden_size, self.q_lora_rank, bias=False)
+            self.q_a_layernorm = SpiderRMSNorm(self.q_lora_rank)
+            self.q_b_proj = nn.Linear(self.q_lora_rank, self.num_heads * self.head_dim, bias=False)
+        else:
+            self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
+        # KV compression: hidden -> kv_lora_rank (shared latent)
+        self.kv_a_proj_with_mqa = nn.Linear(
+            config.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+        )
+        self.kv_a_layernorm = SpiderRMSNorm(self.kv_lora_rank)
+        # Decompress: kv_lora_rank -> nope heads + v heads
+        self.kv_b_proj = nn.Linear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+        )
+        # Output projection: [hidden_size, num_heads * v_head_dim]
+        # Per D-08 and MLA architecture: o_proj maps from num_heads*v_head_dim back to hidden_size
+        self.o_proj = nn.Linear(self.num_heads * self.v_head_dim, config.hidden_size, bias=False)
+        # RoPE frequencies
+        rope_scaling = getattr(config, 'rope_scaling', None)
+        if rope_scaling and rope_scaling.get("type") == "yarn":
+            factor = rope_scaling.get("factor", 1.0)
+            orig_max_pos = rope_scaling.get(
+                "original_max_position_embeddings", config.max_position_embeddings
+            )
+            inv_freq = self._compute_yarn_inv_freq(
+                self.qk_rope_head_dim, config.rope_theta, factor, orig_max_pos
+            )
+        else:
+            inv_freq = 1.0 / (
+                config.rope_theta
+                ** (torch.arange(0, self.qk_rope_head_dim, 2).float() / self.qk_rope_head_dim)
+            )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+    @staticmethod
+    def _compute_yarn_inv_freq(head_dim, rope_theta, factor, orig_max, beta_fast=32.0, beta_slow=1.0):
+        dim = head_dim
+        orig_inv_freq = 1.0 / (rope_theta ** (torch.arange(0, dim, 2).float() / dim))
+        pos_freqs = torch.arange(0, dim, 2).float() / dim
+        beta = (pos_freqs * math.log(rope_theta) / math.log(orig_max))
+        scale = torch.where(
+            beta < beta_slow, torch.ones_like(beta),
+            torch.where(
+                beta > beta_fast, torch.ones_like(beta) / factor,
+                1.0 - (beta - beta_slow) / (beta_fast - beta_slow) * (1.0 - 1.0 / factor)
+            )
+        )
+        return orig_inv_freq * scale
+    def _rotate_half(self, x):
+        x1 = x[..., :x.shape[-1] // 2]
+        x2 = x[..., x.shape[-1] // 2:]
+        return torch.cat((-x2, x1), dim=-1)
+    def _apply_rotary(self, x, cos, sin):
+        return (x * cos) + (self._rotate_half(x) * sin)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask=None,
+        position_ids=None,
+        past_key_value=None,
+        use_cache=False,
+    ):
+        bsz, q_len, _ = hidden_states.size()
+        # Q projection
+        if self.q_lora_rank > 0:
+            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        else:
+            q = self.q_proj(hidden_states)
+        q = q.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        q_nope, q_rope = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        # KV: compress to latent, then decompress
+        kv_hidden = self.kv_a_proj_with_mqa(hidden_states)
+        kv_latent, k_rope = torch.split(
+            kv_hidden, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+        )
+        kv_latent_norm = self.kv_a_layernorm(kv_latent)
+        kv_b_out = self.kv_b_proj(kv_latent_norm)
+        k_nope, v = torch.split(
+            kv_b_out,
+            [self.num_heads * self.qk_nope_head_dim, self.num_heads * self.v_head_dim],
+            dim=-1,
+        )
+        k_nope = k_nope.view(bsz, q_len, self.num_heads, self.qk_nope_head_dim).transpose(1, 2)
+        v = v.view(bsz, q_len, self.num_heads, self.v_head_dim).transpose(1, 2)
+        k_rope = k_rope.unsqueeze(1)  # [B, 1, L, qk_rope_head_dim]
+        # RoPE on Q and K rope parts
+        if position_ids is None:
+            position_ids = torch.arange(q_len, device=hidden_states.device).unsqueeze(0).expand(bsz, -1)
+        max_pos = position_ids.max().item() + 1
+        seq_len = max(max_pos, q_len)
+        t = torch.arange(seq_len, device=hidden_states.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(t, self.inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        cos, sin = emb.cos(), emb.sin()
+        cos_full = cos[position_ids].unsqueeze(1)
+        sin_full = sin[position_ids].unsqueeze(1)
+        q_rope = self._apply_rotary(q_rope, cos_full, sin_full)
+        k_rope = self._apply_rotary(k_rope, cos_full, sin_full)
+        # Assemble full K
+        k_rope_expanded = k_rope.expand(-1, self.num_heads, -1, -1)
+        k_full = torch.cat([k_nope, k_rope_expanded], dim=-1)
+        q_full = torch.cat([q_nope, q_rope], dim=-1)
+        # KV cache
+        past_kv = None
+        if past_key_value is not None:
+            k_full = torch.cat([past_key_value[0], k_full], dim=2)
+            v = torch.cat([past_key_value[1], v], dim=2)
+        if use_cache:
+            past_kv = (k_full, v)
+        # Attention with SDPA
+        attn_mask = None
+        if self.sliding_window > 0 and k_full.shape[2] > self.sliding_window:
+            kv_len = k_full.shape[2]
+            q_positions = torch.arange(kv_len - q_len, kv_len, device=q_full.device)
+            k_positions = torch.arange(kv_len, device=q_full.device)
+            diff = q_positions.unsqueeze(1) - k_positions.unsqueeze(0)
+            causal = diff >= 0
+            window = diff < self.sliding_window
+            attn_mask = (causal & window).float().unsqueeze(0).unsqueeze(0)
+            attn_mask = attn_mask.masked_fill(attn_mask == 0, float('-inf'))
+        attn_output = F.scaled_dot_product_attention(
+            q_full, k_full, v,
+            attn_mask=attn_mask,
+            dropout_p=self.config.attention_dropout if self.training else 0.0,
+            is_causal=(attn_mask is None),
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+        return self.o_proj(attn_output), past_kv
+# ============================================================================
+# Engram: Conditional Memory via Scalable Lookup (DeepSeek style)
+# ============================================================================
+def _tokenizer_compress(token_ids, vocab_size=272):
+    """Simulate NFKC + lowercase canonical ID projection.
+    Per D-06: vocab_size=272 for byte-level Spider vocab.
+    """
+    return token_ids % (vocab_size * 77 // 100)
+class SpiderEngram(nn.Module):
+    """Conditional memory module via NN-gram lookup.
+    Applied only at specific recurrent layers (config.engram_layers).
+    Ported from SpiderPortalEngram in mythos-fineweb-moe.py.
+    """
+    def __init__(self, config: SpiderConfig):
+        super().__init__()
+        self.config = config
+        self.ngram_orders = list(config.engram_ngram_orders)
+        self.num_heads_per_order = config.engram_hash_heads
+        self.table_size = config.engram_table_size
+        self.d_mem = config.engram_dim
+        self.total_mem_dim = len(self.ngram_orders) * self.num_heads_per_order * self.d_mem
+        # Stacked embedding table with offsets: [orders, heads, table_size, d_mem]
+        # Per DeepSeek Engram: static memory, offloaded to CPU, accessed via deterministic hash.
+        embed_data = torch.randn(len(self.ngram_orders), self.num_heads_per_order, self.table_size, self.d_mem) * 0.02
+        if config.engram_offload:
+            self.register_buffer("embed", embed_data, persistent=True)
+        else:
+            self.embed = nn.Parameter(embed_data)
+        # Seeds per (order, head) in a stable head_counter ordering.
+        seeds = []
+        for _order in self.ngram_orders:
+            for h in range(self.num_heads_per_order):
+                seeds.append((h + 1) * 2654435761)
+        self.register_buffer("hash_seeds", torch.tensor(seeds, dtype=torch.int64), persistent=False)
+        self.W_k = nn.Linear(self.total_mem_dim, config.hidden_size, bias=False)
+        self.W_v = nn.Linear(self.total_mem_dim, config.hidden_size, bias=False)
+        self.conv = nn.Conv1d(
+            config.hidden_size, config.hidden_size,
+            kernel_size=config.engram_conv_kernel,
+            padding=config.engram_conv_kernel - 1,
+            groups=config.hidden_size,
+        )
+        self.conv_dilation = config.engram_conv_dilation
+        with torch.no_grad():
+            self.conv.weight.zero_()
+            if self.conv.bias is not None:
+                self.conv.bias.zero_()
+        self.q_norm = SpiderRMSNorm(config.hidden_size)
+        self.k_norm = SpiderRMSNorm(config.hidden_size)
+    def _compute_hash(self, compressed, n, head_counter, bsz, seq_len):
+        """Compute n-gram hash indices (PyTorch-only path, no Numba/CUDA dependency)."""
+        pad = torch.zeros(bsz, n - 1, dtype=compressed.dtype, device=compressed.device)
+        padded = torch.cat([pad, compressed], dim=1)
+        ngrams = torch.stack([padded[:, i : i + seq_len] for i in range(n)], dim=-1)
+        h_val = torch.zeros(bsz, seq_len, dtype=torch.int64, device=compressed.device)
+        for i in range(n):
+            h_val = h_val * 31 + ngrams[:, :, i].to(torch.int64)
+            h_val = h_val % self.table_size
+        return h_val
+    def _retrieve(self, token_ids):
+        """Retrieve memory vectors for a batch of token sequences."""
+        bsz, seq_len = token_ids.shape
+        compressed = _tokenizer_compress(token_ids)
+        # PyTorch fallback (CPU and GPU, no external kernel dependency)
+        all_parts = []
+        head_counter = 0
+        for order_idx, n in enumerate(self.ngram_orders):
+            h_val = self._compute_hash(compressed, n, head_counter, bsz, seq_len)
+            seeds_slice = self.hash_seeds[head_counter : head_counter + self.num_heads_per_order]
+            indices_pt = (h_val.unsqueeze(-1) * seeds_slice.view(1, 1, -1)) % self.table_size
+            emb_table = self.embed[order_idx]
+            idx = indices_pt.permute(0, 2, 1).unsqueeze(-1).expand(-1, -1, -1, self.d_mem)
+            mem = torch.gather(emb_table.unsqueeze(0).expand(bsz, -1, -1, -1), dim=2, index=idx)
+            mem = mem.permute(0, 2, 1, 3).reshape(bsz, seq_len, self.num_heads_per_order * self.d_mem)
+            all_parts.append(mem)
+            head_counter += self.num_heads_per_order
+        return torch.cat(all_parts, dim=-1)
+    def forward(self, hidden_states, token_ids, layer_id: int):
+        mem = self._retrieve(token_ids)
+        q = hidden_states
+        k = self.W_k(mem)
+        v = self.W_v(mem)
+        q_norm = self.q_norm(q)
+        k_norm = self.k_norm(k)
+        alpha = torch.sigmoid(
+            (q_norm * k_norm).sum(dim=-1, keepdim=True) / math.sqrt(q.shape[-1])
+        )
+        v_gated = alpha * v
+        v_gated_t = v_gated.transpose(1, 2)
+        conv_out = self.conv(v_gated_t)
+        conv_out = conv_out[:, :, :v_gated_t.shape[-1]]
+        conv_out = conv_out.transpose(1, 2)
+        y = F.silu(conv_out) + v_gated
+        return y
+# ============================================================================
+# FFN Expert (SwiGLU)
+# ============================================================================
+class SpiderExpert(nn.Module):
+    """SwiGLU FFN expert for dense layers and MoE shared expert."""
+    def __init__(self, config: SpiderConfig, intermediate_size=None):
+        super().__init__()
+        inter_size = intermediate_size or config.intermediate_size
+        self.gate_proj = nn.Linear(config.hidden_size, inter_size, bias=False)
+        self.up_proj = nn.Linear(config.hidden_size, inter_size, bias=False)
+        self.down_proj = nn.Linear(inter_size, config.hidden_size, bias=False)
+        self.act_fn = nn.SiLU()
+    def forward(self, hidden_states):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
+# ============================================================================
+# Simple MoE (top-1 routing, no torchtitan dependency)
+# ============================================================================
+class SimpleMoE(nn.Module):
+    """Mixture of Experts with top-1 routing and shared expert.
+    This is a self-contained MoE implementation that does not depend on
+    torchtitan's MoE. Used by SpiderRecurrentLayer when torchtitan
+    is not available (e.g., during weight transfer and testing).
+    """
+    def __init__(self, config: SpiderConfig):
+        super().__init__()
+        self.num_experts = config.num_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        # Shared expert
+        self.shared_expert = SpiderExpert(config, intermediate_size=config.intermediate_size)
+        # Routed experts
+        self.experts = nn.ModuleList([
+            SpiderExpert(config, intermediate_size=config.intermediate_size)
+            for _ in range(config.num_experts)
+        ])
+        # Router
+        self.router = nn.Linear(config.hidden_size, config.num_experts, bias=True)
+        # router.bias is named router_bias in the state dict for compatibility
+        self.router.bias = nn.Parameter(torch.zeros(config.num_experts, dtype=torch.float32))  # IN-02
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Forward pass with top-1 routing.
+        Returns:
+            Tuple of (output, aux_loss) where aux_loss is the load balancing loss.
+        """
+        B, L, D = x.shape
+        # Shared expert output (always applied)
+        shared_out = self.shared_expert(x)
+        # Router logits
+        router_logits = self.router(x)  # [B, L, num_experts]
+        router_probs = F.softmax(router_logits, dim=-1)
+        # Top-1 routing
+        top1_indices = router_probs.argmax(dim=-1)  # [B, L]
+        top1_probs = router_probs.gather(-1, top1_indices.unsqueeze(-1)).squeeze(-1)  # [B, L]
+        # Compute expert outputs for top-1
+        x_flat = x.reshape(B * L, D)
+        top1_flat = top1_indices.reshape(B * L)
+        expert_outs = torch.zeros_like(x_flat)
+        for e in range(self.num_experts):
+            mask = (top1_flat == e)
+            if mask.any():
+                expert_input = x_flat[mask]
+                expert_out = self.experts[e](expert_input)
+                expert_outs[mask] = expert_out
+        expert_outs = expert_outs.reshape(B, L, D)
+        routed_out = expert_outs * top1_probs.unsqueeze(-1)
+        # Aux loss: z-loss for load balancing
+        z_loss = (router_logits.logsumexp(dim=-1) ** 2).mean()
+        return shared_out + routed_out, z_loss
+# ============================================================================
+# Shared-Projection MoE (D-20, D-21: top-2 routing with shared projections)
+# ============================================================================
+class SharedProjectionMoE(nn.Module):
+    """Mixture of Experts with shared projections and low-rank expert cores.
+    Per D-20: 32 experts, top-2 routing, shared_intermediate_size=6144.
+    Per D-21: Shared up/down projections computed once per token, rank-192
+    expert cores specialize on the shared representation.
+    Architecture:
+    - shared_up: Linear(hidden, shared_inter) — computed once for all experts
+    - shared_down: Linear(shared_inter, hidden) — computed once for all experts
+    - W_gate: [num_experts, hidden, expert_core_rank] — per-expert gating
+    - W_transform: [num_experts, expert_core_rank, shared_inter] — per-expert transform
+    - shared_expert: SpiderExpert(hidden, shared_expert_inter=4096) — always active
+    Forward: shared_hidden = SiLU(shared_up(x))
+             routed_out = sum(top2_weights * shared_down(core_i(shared_hidden)))
+             output = routed_out + shared_expert(x)
+    """
+    def __init__(self, config: SpiderConfig):
+        super().__init__()
+        self.num_experts = config.num_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        self.shared_inter = config.shared_intermediate_size
+        self.expert_core_rank = config.expert_core_rank
+        self.hidden_size = config.hidden_size
+        self.shared_up = nn.Linear(config.hidden_size, config.shared_intermediate_size, bias=False)
+        self.shared_down = nn.Linear(config.shared_intermediate_size, config.hidden_size, bias=False)
+        self.W_gate = nn.Parameter(
+            torch.randn(config.num_experts, config.hidden_size, config.expert_core_rank) * 0.02
+        )
+        self.W_transform = nn.Parameter(
+            torch.randn(config.num_experts, config.expert_core_rank, config.shared_intermediate_size) * 0.02
+        )
+        self.shared_expert = SpiderExpert(config, intermediate_size=config.shared_expert_intermediate_size)
+        self.router = nn.Linear(config.hidden_size, config.num_experts, bias=True)
+        self.router.bias = nn.Parameter(torch.zeros(config.num_experts, dtype=torch.float32))
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        B, L, D = x.shape
+        shared_hidden = F.silu(self.shared_up(x))
+        shared_out = self.shared_expert(x)
+        router_logits = self.router(x)
+        router_probs = F.softmax(router_logits, dim=-1)
+        top2_probs, top2_indices = router_probs.topk(self.num_experts_per_tok, dim=-1)
+        top2_probs = top2_probs / top2_probs.sum(dim=-1, keepdim=True)
+        x_flat = x.reshape(B * L, D)
+        shared_hidden_flat = shared_hidden.reshape(B * L, self.shared_inter)
+        routed_out = torch.zeros(B * L, D, device=x.device, dtype=x.dtype)
+        for k in range(self.num_experts_per_tok):
+            expert_indices = top2_indices[:, :, k].reshape(B * L)
+            expert_weights = top2_probs[:, :, k].reshape(B * L)
+            for e in range(self.num_experts):
+                mask = (expert_indices == e)
+                if not mask.any():
+                    continue
+                expert_input = x_flat[mask]
+                expert_sh = shared_hidden_flat[mask]
+                gate = expert_input @ self.W_gate[e]
+                core = gate @ self.W_transform[e]
+                expert_output = self.shared_down(core * expert_sh)
+                routed_out[mask] += expert_weights[mask].unsqueeze(-1) * expert_output
+        routed_out = routed_out.reshape(B, L, D)
+        z_loss = (router_logits.logsumexp(dim=-1) ** 2).mean()
+        return shared_out + routed_out, z_loss
+# ============================================================================
+# Prelude/Coda Dense Layer (uses MLA)
+# ============================================================================
+class SpiderDenseLayer(nn.Module):
+    """Prelude/coda dense layer with MLA attention."""
+    def __init__(self, config: SpiderConfig):
+        super().__init__()
+        self.self_attn = SpiderMLA(config)
+        dense_intermediate = config.prelude_coda_intermediate_size
+        self.ffn = SpiderExpert(config, intermediate_size=dense_intermediate)
+        self.input_layernorm = SpiderRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = SpiderRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        past_key_value=None,
+        use_cache=False,
+    ):
+        attn_input = self.input_layernorm(hidden_states)
+        attn_output, past_kv = self.self_attn(
+            attn_input, attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+        )
+        hidden_states = hidden_states + attn_output
+        ffn_input = self.post_attention_layernorm(hidden_states)
+        ffn_output = self.ffn(ffn_input)
+        hidden_states = hidden_states + ffn_output
+        return hidden_states, past_kv
+# ============================================================================
+# Recurrent Layer (uses MLA + optional Engram + MoE)
+# ============================================================================
class SpiderRecurrentLayer(nn.Module):
    """Recurrent-core block: MLA attention, optional Engram memory, then MoE.

    ``layer_idx`` is stored and handed to the Engram module as ``layer_id``.
    The Engram sub-module and its trailing RMSNorm exist only when
    ``has_engram`` is true.
    """
    def __init__(self, config: SpiderConfig, layer_idx: int, has_engram: bool = False):
        super().__init__()
        width, eps = config.hidden_size, config.rms_norm_eps
        self.layer_idx = layer_idx
        self.has_engram = has_engram
        # Construction order is kept identical to preserve parameter order.
        self.self_attn = SpiderMLA(config)
        if has_engram:
            self.engram = SpiderEngram(config)
        self.moe = SharedProjectionMoE(config)
        self.input_layernorm = SpiderRMSNorm(width, eps=eps)
        self.post_attention_layernorm = SpiderRMSNorm(width, eps=eps)
        self.post_engram_layernorm = SpiderRMSNorm(width, eps=eps) if has_engram else None
    def forward(
        self,
        hidden_states,
        token_ids=None,
        attention_mask=None,
        position_ids=None,
        past_key_value=None,
        use_cache=False,
    ):
        """Run attention -> (Engram) -> MoE with residual connections.

        Args:
            hidden_states: Input activations for this layer.
            token_ids: Token ids passed to the Engram module; when ``None``
                the Engram step is skipped even if the layer owns one.
            attention_mask, position_ids, past_key_value, use_cache:
                Forwarded unchanged to ``self.self_attn``.
        Returns:
            Tuple ``(hidden_states, aux_loss, past_kv)``; ``aux_loss`` is the
            second value returned by the MoE module and ``past_kv`` the second
            value returned by the attention module.
        """
        attn_out, present = self.self_attn(
            self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
        )
        stream = hidden_states + attn_out
        engram_active = self.has_engram and token_ids is not None
        if engram_active:
            stream = stream + self.engram(stream, token_ids, layer_id=self.layer_idx)
            # The post-Engram norm is applied to the residual stream itself.
            if self.post_engram_layernorm is not None:
                stream = self.post_engram_layernorm(stream)
        moe_out, aux_loss = self.moe(self.post_attention_layernorm(stream))
        return stream + moe_out, aux_loss, present
+# ============================================================================
+# BoundaryPredictor (D-04, D-11)
+# ============================================================================
class BoundaryPredictor(nn.Module):
    """Boundary predictor for learnable byte-level tokenization.

    A 2-layer MLP (Linear(d_model, d_inner) -> GELU -> Linear(d_inner, 1))
    scores every position; the scores are turned into boundary decisions by
    sampling a relaxed Bernoulli and thresholding it with a straight-through
    estimator (ported from FLEXITOKENS fxt.py).

    Per D-11: when ``modality_mask`` is provided, boundaries are forced to 1.0
    at the masked (sentinel / modality) positions so that merges never cross a
    modality border.
    """
    def __init__(
        self,
        config: SpiderConfig,
        temp: float = 1.0,
        threshold: float = 0.5,
    ):
        super().__init__()
        self.temp = temp
        self.threshold = threshold
        self.boundary_predictor = nn.Sequential(
            nn.Linear(config.hidden_size, config.bp_d_inner),
            nn.GELU(),
            nn.Linear(config.bp_d_inner, 1),
        )
    def forward(self, hidden, modality_mask=None):
        """Predict boundary decisions for token merging.

        Args:
            hidden: Hidden states of shape [B, L, D] (batch-first per D-08).
            modality_mask: Optional [B, L] mask, truthy where sentinel or
                modality tokens appear. Any dtype/device is accepted; it is
                converted to a boolean mask on the boundaries' device.
        Returns:
            Tuple ``(soft_boundaries, hard_boundaries)``, each [B, L].
            - soft_boundaries: relaxed-Bernoulli samples (differentiable).
            - hard_boundaries: float32 0/1 decisions whose gradient is that of
              ``soft_boundaries`` (straight-through).
        Note:
            A fresh sample is drawn on every call, in eval mode as well, so
            the output is stochastic unless the RNG is seeded.
        """
        boundary_logits = self.boundary_predictor(hidden).squeeze(-1)
        boundary_probs = torch.sigmoid(boundary_logits)
        relaxed = torch.distributions.relaxed_bernoulli.RelaxedBernoulli(
            temperature=self.temp,
            probs=boundary_probs,
        )
        soft_boundaries = relaxed.rsample()
        # Kept in float32 on purpose: downstream code takes a cumulative sum
        # of these values and needs exact integer counts.
        hard_boundaries = (soft_boundaries > self.threshold).float()
        # Straight-through estimator: forward value is hard, gradient is soft.
        hard_boundaries = (
            hard_boundaries - soft_boundaries.detach() + soft_boundaries
        )
        if modality_mask is not None:
            # masked_fill only accepts boolean masks; coerce so that int/float
            # masks (and masks living on another device) work too.
            forced = modality_mask.to(device=soft_boundaries.device, dtype=torch.bool)
            soft_boundaries = soft_boundaries.masked_fill(forced, 1.0)
            hard_boundaries = hard_boundaries.masked_fill(forced, 1.0)
        return soft_boundaries, hard_boundaries
+# ============================================================================
+# Downsample / Upsample (D-05, D-08, D-11)
+# ============================================================================
def _downsample_common(boundaries: torch.Tensor, upsample: bool = False):
    """Build the position-to-segment offset tensor shared by down/upsample.

    Based on FLEXITOKENS shortening.py, adapted for batch-first layout.
    Entry ``[b, l, s]`` of the result is ``s - group(b, l)``, so it is zero
    exactly where position ``l`` belongs to segment ``s``. For downsampling a
    boundary position is counted in the group it closes; for upsampling in
    the group it opens.

    Batch items with fewer segments than the batch maximum have trailing
    "padded" columns. Those columns are set to a non-zero value, because
    ``_downsample_final`` treats zero as membership; writing zeros there
    would assign every position to the padded segments.

    Args:
        boundaries: [B, L] binary boundary tensor (1 = boundary).
        upsample: If True, compute upsample offsets; else downsample offsets.
    Returns:
        Offset tensor [B, L, S], or None if there are no segments.
    """
    # Counts are read from a detached copy; rounding protects against
    # straight-through boundaries that equal 1.0 only up to float error.
    counts = boundaries.detach().sum(dim=-1)
    if counts.is_floating_point():
        counts = counts.round()
    if upsample:
        counts = counts + 1
    n_segments = int(counts.max().item())
    if n_segments == 0:
        return None
    segment_index = torch.arange(
        n_segments, device=boundaries.device, dtype=boundaries.dtype
    )
    group_of_position = boundaries.cumsum(dim=-1)
    if not upsample:
        # A boundary position belongs to the group that ends there.
        group_of_position = group_of_position - boundaries
    foo = segment_index - group_of_position.unsqueeze(-1)  # [B, L, S]
    # Columns at or beyond an item's own segment count are padding.
    padded = segment_index.unsqueeze(0) >= counts.unsqueeze(-1).to(segment_index.dtype)
    return foo.masked_fill(padded.unsqueeze(1), 1)
def _downsample_final(foo: torch.Tensor, upsample: bool = False) -> torch.Tensor:
    """Turn segment offsets into normalised float32 einsum weights.

    A weight is non-zero only where ``foo`` is exactly zero. Weights are
    normalised over positions (dim 1) for downsampling and over segments
    (dim 2) for upsampling; the 1e-9 term keeps all-zero slices at zero
    instead of producing NaN.
    """
    not_member = foo != 0
    weights = (1.0 - foo.float()).masked_fill(not_member, 0.0)
    norm_dim = 2 if upsample else 1
    totals = weights.sum(dim=norm_dim, keepdim=True)
    return weights / (totals + 1e-9)
def downsample(boundaries: torch.Tensor, hidden: torch.Tensor, null_group: torch.Tensor) -> torch.Tensor:
    """Downsample hidden states using boundary decisions.

    Per D-05: einsum port from FLEXITOKENS shortening.py.
    Per D-08: batch-first input layout [B, L, D].
    Per D-11: sentinel tokens are forced to boundary=1 upstream, so each
    sentinel/modality group stays a separate merge group.

    Args:
        boundaries: [B, L] binary boundary tensor.
        hidden: [B, L, D] hidden states.
        null_group: Null-group token prepended to the output, either
            per-item [1, B, D] or shared [1, 1, D] (broadcast over the batch).
    Returns:
        shortened_hidden: [S + 1, B, D]: the null group followed by the S
        pooled segments. When no boundary exists anywhere in the batch the
        result is just the null group, [1, B, D].
    """
    batch_size = hidden.size(0)
    # expand() is a no-op for a [1, B, D] null group and broadcasts a shared
    # [1, 1, D] one; repeating a [1, B, D] tensor B times would instead
    # produce a [1, B*B, D] result.
    null_group = null_group.expand(1, batch_size, -1)
    foo = _downsample_common(boundaries, upsample=False)
    if foo is None:
        return null_group.clone()
    # The pooling weights are produced in float32; einsum needs both operands
    # in the same dtype, so match the hidden states (e.g. bf16 / fp16 models).
    bar = _downsample_final(foo, upsample=False).to(hidden.dtype)
    # [B, L, D] x [B, L, S] -> [B, S, D], then to segment-first [S, B, D].
    shortened_hidden = torch.einsum('bld,bls->bsd', hidden, bar).permute(1, 0, 2)
    return torch.cat([null_group, shortened_hidden], dim=0)
def upsample(boundaries: torch.Tensor, shortened_hidden: torch.Tensor) -> torch.Tensor:
    """Upsample shortened hidden states back to original sequence length.

    Per D-05: einsum port from FLEXITOKENS shortening.py.
    Every position copies the segment it is assigned to; positions before
    the first boundary read segment 0.

    Args:
        boundaries: [B, L] binary boundary tensor.
        shortened_hidden: [S, B, D] shortened sequence (segment-first).
    Returns:
        upsampled_hidden: [B, L, D] upsampled sequence.
    """
    # With upsample=True there is always at least one segment, so the helper
    # never returns None here.
    foo = _downsample_common(boundaries, upsample=True)
    # Weights come back as float32; cast so einsum also works when the model
    # runs in half precision.
    bar = _downsample_final(foo, upsample=True).to(shortened_hidden.dtype)
    return torch.einsum('sbd,bls->bld', shortened_hidden, bar)
+# ============================================================================
+# LTI Injection, ACT Halting, LoRA Adapter
+# ============================================================================
class LTIInjection(nn.Module):
    """Linear Time-Invariant injection module.

    Computes ``A * h_t + B(e)`` with a per-channel ``A = -exp(log_A)`` and a
    bias-free linear map ``B``. ``delta_t`` is registered as a parameter but
    is not read by ``forward``.
    """
    def __init__(self, config: SpiderConfig):
        super().__init__()
        width = config.hidden_size
        self.hidden_size = width
        self.log_A = nn.Parameter(torch.full((width,), -2.0))
        self.delta_t = nn.Parameter(torch.tensor(1.0))
        self.B = nn.Linear(width, width, bias=False)
        # Small dedicated init for the input map.
        nn.init.normal_(self.B.weight, mean=0.0, std=0.01)
    def get_A(self):
        """Return the (always negative) per-channel state coefficient."""
        return self.log_A.exp().neg()
    def forward(self, h_t, e):
        """Return ``A * h_t + B(e)`` for state ``h_t`` and input ``e``."""
        state_term = self.get_A() * h_t
        input_term = self.B(e)
        return state_term + input_term
class ACTHalting(nn.Module):
    """Adaptive Computation Time halting head.

    A single linear unit followed by a sigmoid yields one halting
    probability per position. ``threshold`` mirrors ``config.act_threshold``
    and is only stored here.
    """
    def __init__(self, config: SpiderConfig):
        super().__init__()
        self.halt_predictor = nn.Linear(config.hidden_size, 1)
        self.threshold = config.act_threshold
    def forward(self, hidden_states):
        """Return halting probabilities with a trailing singleton dimension."""
        halt_logits = self.halt_predictor(hidden_states)
        return halt_logits.sigmoid()
class LoRAAdapter(nn.Module):
    """LoRA adapter for per-loop adaptation in recurrent layers.

    Output is ``(down(x) * scale[loop_t]) @ B``.

    Per CR-01: the up-projection ``B`` starts at exactly zero, so the adapter
    contributes nothing at initialisation (standard LoRA convention, Hu et
    al., 2021). The per-loop ``scale`` starts at one rather than zero: with
    both ``scale`` and ``B`` at zero every gradient flowing through the
    product is zero and the adapter could never leave its initial state.
    """
    def __init__(self, config: SpiderConfig):
        super().__init__()
        rank = config.lora_rank
        self.down = nn.Linear(config.hidden_size, rank, bias=False)
        self.B = nn.Parameter(torch.zeros(rank, config.hidden_size, dtype=torch.float32))  # CR-01: zeros
        self.scale = nn.Embedding(config.max_loop_iters, rank)
        # Identity scaling keeps the output at zero (B == 0) while letting
        # gradients reach B on the first backward pass.
        nn.init.ones_(self.scale.weight)
        nn.init.normal_(self.down.weight, mean=0.0, std=0.001)
    def forward(self, x, loop_t):
        """Return the low-rank update for loop iteration ``loop_t``.

        Args:
            x: Activations whose last dimension is the hidden size.
            loop_t: Loop iteration index; values beyond the last row of the
                scale table reuse that last row.
        """
        last_row = self.scale.num_embeddings - 1
        t_idx = min(loop_t, last_row)
        s = self.scale(torch.tensor(t_idx, device=x.device))
        down = self.down(x) * s
        return down @ self.B
def _loop_index_embedding(h, loop_t, loop_dim, theta=10000.0):
    """Add a sinusoidal encoding of the loop index to the leading channels.

    The first ``loop_dim`` channels of ``h`` receive ``[sin | cos]`` of
    ``loop_t`` times a geometric frequency ladder with base ``theta``; the
    remaining channels are left unchanged. ``h`` itself is not modified.
    """
    exponents = torch.arange(0, loop_dim, 2, device=h.device, dtype=h.dtype) / loop_dim
    phase = loop_t * (1.0 / (theta ** exponents))
    table = torch.cat((phase.sin(), phase.cos()), dim=-1)[:loop_dim]
    offset = h.new_zeros(h.shape[-1])
    offset[:loop_dim] = table
    # Broadcast the per-channel offset over the two leading dimensions.
    return h + offset.view(1, 1, -1)
def _checkpoint(func, *args, **kwargs):
    """Run ``func`` under activation checkpointing when gradients are enabled.

    With gradients disabled the function is simply called. Otherwise the call
    goes through the non-reentrant checkpoint implementation, which
    recomputes ``func`` during backward instead of storing its activations.
    """
    if not torch.is_grad_enabled():
        return func(*args, **kwargs)
    # Imported explicitly: ``import torch`` alone does not guarantee that the
    # ``torch.utils.checkpoint`` submodule has been loaded.
    from torch.utils.checkpoint import checkpoint
    return checkpoint(func, *args, use_reentrant=False, **kwargs)
+# ============================================================================
+# Full Spider Model (with FlexiToken integration)
+# ============================================================================
class SpiderModel(nn.Module):
    """Full RDT model with MLA attention + Engram memory + FlexiToken.

    Architecture:
    2x Prelude (MLA + dense FFN)
    6x Recurrent (MLA + Engram@L1,L4 + MoE) -- with gradient checkpointing
    2x Coda (MLA + dense FFN)
    LTI Injection + ACT Halting + LoRA Adapter
    BoundaryPredictor + downsample/upsample for FlexiToken
    """
    def __init__(self, config: SpiderConfig):
        super().__init__()
        self.config = config
        self.prelude_layers = nn.ModuleList([
            SpiderDenseLayer(config) for _ in range(config.prelude_layers)
        ])
        self.recurrent_layers = nn.ModuleList([
            SpiderRecurrentLayer(config, i, has_engram=(i in config.engram_layers))
            for i in range(config.num_hidden_layers)
        ])
        self.coda_layers = nn.ModuleList([
            SpiderDenseLayer(config) for _ in range(config.coda_layers)
        ])
        self.norm = SpiderRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.injection = LTIInjection(config)
        self.act_halting = ACTHalting(config)
        self.lora_adapter = LoRAAdapter(config)
        self.loop_embed_dim = config.loop_embed_dim
        self._gradient_checkpointing = False
    def gradient_checkpointing_enable(self):
        """Checkpoint the prelude/coda layers while gradients are enabled."""
        self._gradient_checkpointing = True
    def gradient_checkpointing_disable(self):
        """Stop checkpointing the prelude/coda layers.

        The recurrent layers always go through ``_checkpoint`` and are not
        affected by this flag.
        """
        self._gradient_checkpointing = False
    def _run_dense_stack(self, layers, hidden_states, attention_mask, position_ids):
        """Run a prelude or coda stack, checkpointing each layer if enabled."""
        use_checkpoint = self._gradient_checkpointing and torch.is_grad_enabled()
        for layer in layers:
            if use_checkpoint:
                hidden_states, _ = _checkpoint(
                    layer, hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                )
            else:
                hidden_states, _ = layer(
                    hidden_states, attention_mask=attention_mask,
                    position_ids=position_ids,
                )
        return hidden_states
    @staticmethod
    def _shorten_token_ids(token_ids, hard_boundaries):
        """Pick one representative token id per boundary group.

        For group g (1-based, from the cumulative sum of the boundaries) the
        token at the first position whose cumulative sum equals g is taken;
        items that do not have group g fall back to position 0. A dummy id 0
        is prepended to line up with the null group added by ``downsample``.
        Returns a [B, n_groups + 1] tensor.
        """
        group_ids = hard_boundaries.cumsum(dim=-1)  # [B, L], 1-based group index
        n_groups = int(group_ids.max().item())
        batch = hard_boundaries.shape[0]
        short_ids = torch.zeros(batch, n_groups, device=token_ids.device, dtype=token_ids.dtype)
        for g in range(1, n_groups + 1):
            first_pos = (group_ids == g).float().argmax(dim=-1)  # [B]
            short_ids[:, g - 1] = token_ids.gather(1, first_pos.unsqueeze(1)).squeeze(1)
        null_token = torch.zeros(batch, 1, device=token_ids.device, dtype=token_ids.dtype)
        return torch.cat([null_token, short_ids], dim=1)
    def forward(
        self,
        hidden_states,
        input_embedding=None,
        attention_mask=None,
        position_ids=None,
        past_key_values=None,
        use_cache=False,
        n_loops=None,
        token_ids=None,
        hard_boundaries=None,
    ):
        """Run prelude -> (downsample) -> recurrent loops -> (upsample) -> coda.

        Args:
            hidden_states: [B, L, D] input activations.
            input_embedding: Tensor re-injected through the LTI module on
                loops after the first; defaults to ``hidden_states``. It is
                replaced by the shortened sequence when ``hard_boundaries``
                is given.
            attention_mask, position_ids: Forwarded unchanged to every layer.
            past_key_values: One cache entry per recurrent layer; only
                consumed on the first loop iteration.
            use_cache: Forwarded to the recurrent layers.
            n_loops: Number of recurrent iterations (``None`` or 0 means 1).
            token_ids: [B, L] token ids for the Engram layers; shortened to
                one id per group when ``hard_boundaries`` is given.
            hard_boundaries: Optional [B, L] boundary tensor; when given the
                recurrent core runs on the downsampled sequence and its
                output is upsampled back to length L.
        Returns:
            Tuple ``(hidden_states, total_aux_loss, new_past_key_values)``
            where the cache list holds the values returned by the recurrent
            layers during the last executed loop iteration.
        """
        n_loops = n_loops or 1
        if input_embedding is None:
            input_embedding = hidden_states
        hidden_states = self._run_dense_stack(
            self.prelude_layers, hidden_states, attention_mask, position_ids
        )
        # FlexiToken: shorten the sequence before the recurrent core.
        if hard_boundaries is not None:
            hidden_normed = self.norm(hidden_states)
            null_group = torch.zeros(
                1, hidden_states.shape[0], hidden_states.shape[-1],
                device=hidden_states.device, dtype=hidden_states.dtype,
            )
            shortened = downsample(hard_boundaries, hidden_normed, null_group)
            hidden_states = shortened.permute(1, 0, 2)  # [S, B, D] -> [B, S, D]
            if token_ids is not None:
                token_ids = self._shorten_token_ids(token_ids, hard_boundaries)
            # The injected embedding must match the shortened length.
            input_embedding = hidden_states.clone()
        # Recurrent core with RDT looping and ACT halting.
        # NOTE(review): the sinusoidal loop-index embedding
        # (_loop_index_embedding / self.loop_embed_dim) is not applied to the
        # residual stream; the loop index reaches the layers only through the
        # LoRA adapter. Confirm the intent before wiring it in, since that
        # would change the numerics of existing checkpoints.
        batch, seq_len, _ = hidden_states.shape
        threshold = self.config.act_threshold
        halted = torch.zeros(batch, seq_len, device=hidden_states.device, dtype=torch.bool)
        cumulative_p = torch.zeros(batch, seq_len, device=hidden_states.device, dtype=hidden_states.dtype)
        h_out = torch.zeros_like(hidden_states)
        total_aux_loss = 0.0
        if past_key_values is None:
            past_key_values = [None] * len(self.recurrent_layers)
        new_past_key_values = []
        for t in range(n_loops):
            if t > 0:
                hidden_states = hidden_states + self.injection(hidden_states, input_embedding)
            new_past_key_values = []
            for i, layer in enumerate(self.recurrent_layers):
                hidden_states, aux_loss, past_kv = _checkpoint(
                    layer, hidden_states,
                    token_ids=token_ids,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values[i] if t == 0 else None,
                    use_cache=use_cache,
                )
                total_aux_loss = total_aux_loss + aux_loss
                new_past_key_values.append(past_kv)
            hidden_states = hidden_states + self.lora_adapter(hidden_states, t)
            halt_prob = self.act_halting(hidden_states).squeeze(-1)
            # Positions that halted earlier contribute nothing further.
            running = (~halted).to(hidden_states.dtype)
            remainder = (1.0 - cumulative_p).clamp(min=0)
            # A position crossing the threshold spends its remaining mass.
            crosses = cumulative_p + halt_prob >= threshold
            weight = torch.where(crosses, remainder, halt_prob) * running
            h_out = h_out + weight.unsqueeze(-1) * hidden_states
            cumulative_p = cumulative_p + halt_prob * running
            halted = halted | (cumulative_p >= threshold)
            if halted.all() and not self.training:
                break
        # Positions that never halted keep their final state on top of the
        # weighted sum accumulated so far.
        never_halted = (~halted).to(hidden_states.dtype).unsqueeze(-1)
        hidden_states = h_out + never_halted * hidden_states
        # FlexiToken: restore the original sequence length.
        if hard_boundaries is not None:
            hidden_states_sbd = hidden_states.permute(1, 0, 2)  # [S, B, D]
            hidden_states = upsample(hard_boundaries, hidden_states_sbd)  # [B, L, D]
        hidden_states = self._run_dense_stack(
            self.coda_layers, hidden_states, attention_mask, position_ids
        )
        hidden_states = self.norm(hidden_states)
        return hidden_states, total_aux_loss, new_past_key_values
+# ============================================================================
+# SpiderForConditionalGeneration
+# ============================================================================
+class SpiderForConditionalGeneration(nn.Module):
+    """Spider model with embedding, LM head, and FlexiToken boundary prediction.
+    Forward flow:
+    1. embed_tokens(input_ids) -> hidden_states
+    2. Inject modality features at sentinel positions
+    3. Prelude layers
+    4. BoundaryPredictor with modality_mask -> boundaries
+    5. SpiderModel (downsample -> recurrent -> upsample -> coda)
+    6. lm_head -> logits
+    """
+    def __init__(self, config: SpiderConfig):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.boundary_predictor = BoundaryPredictor(config)
+        self.model = SpiderModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.embed_tokens.weight
+        self.apply(self._init_weights)
+    def gradient_checkpointing_enable(self):
+        self.model.gradient_checkpointing_enable()
+    def gradient_checkpointing_disable(self):
+        self.model.gradient_checkpointing_disable()
+    def enable_input_require_grads(self):
+        def _make_inputs_require_grad(module, input, output):
+            output.requires_grad_(True)
+        self.embed_tokens.register_forward_hook(_make_inputs_require_grad)
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            if hasattr(self, 'model') and module is self.model.injection.B:
+                return  # LTI injection B has its own init
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+    def _inject_modality_features(
+        self,
+        hidden_states: torch.Tensor,
+        input_ids: torch.Tensor,
+        features: list,
+        modality: str = 'IMG',
+    ) -> torch.Tensor:
+        """Replace placeholder embeddings with actual encoder features at modality regions.
+        Per D-11: Modality tokens (vision, audio, video) are injected at
+        sentinel-marked positions. Between sentinel pairs, the initial
+        embeddings are placeholders -- this method replaces them with the
+        actual encoder features.
+        T-02-06 mitigation: Validates feature shape and sentinel pair count.
+        """
+        start_token = SENTINEL_TOKENS[f'{modality}_START']
+        end_token = SENTINEL_TOKENS[f'{modality}_END']
+        for b in range(hidden_states.shape[0]):
+            starts = (input_ids[b] == start_token).nonzero(as_tuple=True)[0]
+            ends = (input_ids[b] == end_token).nonzero(as_tuple=True)[0]
+            if len(starts) != len(ends):
+                raise ValueError(
+                    f"Batch {b}: mismatched {modality} sentinel pairs -- "
+                    f"{len(starts)} {_TOKEN_NAMES_BY_ID[start_token]}(s) vs "
+                    f"{len(ends)} {_TOKEN_NAMES_BY_ID[end_token]}(s)."
+                )
+            if len(starts) != len(features):
+                raise ValueError(
+                    f"Batch {b}: {modality} sentinel pair count ({len(starts)}) "
+                    f"doesn't match feature count ({len(features)})."
+                )
+            for s, e, feat in zip(starts, ends, features):
+                num_tokens = e - s - 1
+                if feat.shape[0] != num_tokens:
+                    raise ValueError(
+                        f"Batch {b}: {modality} feature has {feat.shape[0]} tokens "
+                        f"but sentinel region has {num_tokens} positions "
+                        f"(from pos {s+1} to {e-1})."
+                    )
+                if feat.shape[1] != hidden_states.shape[-1]:
+                    raise ValueError(
+                        f"Batch {b}: {modality} feature hidden_size {feat.shape[1]} "
+                        f"doesn't match model hidden_size {hidden_states.shape[-1]}."
+                    )
+                hidden_states[b, s + 1:e] = feat.to(hidden_states.dtype)
+        return hidden_states
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask=None,
+        position_ids=None,
+        labels=None,
+        n_loops=None,
+        use_cache=False,
+        vision_features=None,
+        audio_features=None,
+        video_features=None,
+        **kwargs,
+    ):
+        hidden_states = self.embed_tokens(input_ids)
+        model_dtype = next(self.model.parameters()).dtype
+        hidden_states = hidden_states.to(model_dtype)
+        input_embedding = hidden_states.clone()
+        # Inject modality features at sentinel positions
+        if vision_features is not None:
+            hidden_states = self._inject_modality_features(
+                hidden_states, input_ids, vision_features, 'IMG'
+            )
+        if audio_features is not None:
+            hidden_states = self._inject_modality_features(
+                hidden_states, input_ids, audio_features, 'AUD'
+            )
+        if video_features is not None:
+            hidden_states = self._inject_modality_features(
+                hidden_states, input_ids, video_features, 'VID'
+            )
+        # Create modality mask and predict boundaries
+        modality_mask = create_modality_mask(input_ids, strict=(labels is not None))
+        soft_boundaries, hard_boundaries = self.boundary_predictor(
+            hidden_states, modality_mask=modality_mask
+        )
+        # Run model with FlexiToken boundaries
+        hidden_states, aux_loss, past_kv = self.model(
+            hidden_states,
+            input_embedding=input_embedding,
+            attention_mask=None,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            n_loops=n_loops,
+            token_ids=input_ids,
+            hard_boundaries=hard_boundaries,
+        )
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        return {
+            "loss": loss,
+            "logits": logits,
+            "aux_loss": aux_loss,
+            "past_key_values": past_kv,
+            "soft_boundaries": soft_boundaries,
+            "hard_boundaries": hard_boundaries,
+        }
+    @torch.inference_mode()
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        max_new_tokens: int = 100,
+        temperature: float = 1.0,
+        top_k: Optional[int] = None,
+        n_loops: int = 1,
+        use_cache: bool = True,
+        boundary_mode: str = 'adaptive',
+    ) -> torch.Tensor:
+        """Token-level generation with compressed-prefix KV cache per D-28.
+        Strategy: Encode the prefix through prelude + BP + downsample to get
+        a compressed KV cache, then autoregressively decode byte-by-byte using
+        that cached prefix. The speedup comes from the prefix being shorter in
+        the KV cache (~3.3x fewer entries for English text).
+        Flow:
+        1. Embed prefix → prelude layers → BP → downsample → recurrent core
+           → collect KV cache for compressed prefix
+        2. Coda + lm_head on last position → sample first new byte
+        3. For each subsequent byte: embed → recurrent (with KV cache) → coda
+           → lm_head → sample → append
+        4. Stop at max_new_tokens or EOS
+        Args:
+            input_ids: Prefix token IDs [B, L] (byte values 0-255 + BOS/EOS)
+            max_new_tokens: Maximum number of new bytes to generate
+            temperature: Sampling temperature (0 = greedy, 1.0 = default)
+            top_k: If set, only sample from top-k logits
+            n_loops: Number of recurrent loops during generation
+            use_cache: Use KV cache for incremental decoding
+            boundary_mode: 'adaptive' (threshold) or 'fixed' (top-k) for BP
+        Returns:
+            Generated token IDs [B, N] where N ≤ max_new_tokens
+        """
+        B = input_ids.shape[0]
+        device = input_ids.device
+        model_dtype = next(self.model.parameters()).dtype
+        # --- Step 1: Encode prefix and collect KV cache ---
+        hidden_states = self.embed_tokens(input_ids).to(model_dtype)
+        # Prelude layers (byte-level, no compression)
+        for layer in self.model.prelude_layers:
+            hidden_states, _ = layer(hidden_states)
+        # Boundary prediction on prefix (strict=False for generation)
+        modality_mask = create_modality_mask(input_ids, strict=False)
+        soft_boundaries, hard_boundaries = self.boundary_predictor(
+            hidden_states, modality_mask=modality_mask
+        )
+        # Apply boundary mode
+        if boundary_mode == 'adaptive':
+            hard_boundaries = (soft_boundaries > 0.5).float()
+            hard_boundaries = hard_boundaries - soft_boundaries.detach() + soft_boundaries
+        elif boundary_mode == 'fixed':
+            k = max(1, int(soft_boundaries.shape[-1] / 3.3))
+            topk_vals, topk_idx = soft_boundaries.topk(k, dim=-1)
+            hard_boundaries = torch.zeros_like(soft_boundaries)
+            hard_boundaries.scatter_(-1, topk_idx, 1.0)
+            hard_boundaries = hard_boundaries - soft_boundaries.detach() + soft_boundaries
+        # Downsample prefix for compressed KV cache
+        hidden_normed = self.model.norm(hidden_states)
+        null_group = torch.zeros(
+            1, B, hidden_states.shape[-1], device=device, dtype=hidden_states.dtype
+        )
+        shortened = downsample(hard_boundaries, hidden_normed, null_group)
+        hidden_states = shortened.permute(1, 0, 2)  # [B, S, D]
+        input_embedding = hidden_states.clone()
+        # Run through recurrent core + coda (hard_boundaries=None skips downsample/upsample)
+        hidden_states, _, past_key_values = self.model(
+            hidden_states,
+            input_embedding=input_embedding,
+            use_cache=use_cache,
+            n_loops=n_loops,
+            hard_boundaries=None,
+        )
+        # Get logits for last position of prefix (norm + lm_head only, coda already applied)
+        logits = self.lm_head(hidden_states[:, -1:, :])  # [B, 1, vocab]
+        next_token = self._sample_token(logits, temperature, top_k)  # [B, 1]
+        generated = [next_token]
+        # --- Step 2: Autoregressive byte-level decoding with KV cache ---
+        for _ in range(max_new_tokens - 1):
+            # Check EOS
+            if (next_token == SENTINEL_TOKENS['EOS']).all():
+                break
+            # Embed the last generated token
+            hidden_states = self.embed_tokens(next_token).to(model_dtype)  # [B, 1, D]
+            input_embedding = hidden_states.clone()
+            if use_cache:
+                # Incremental forward: 1 new token, cached prefix in past_key_values
+                hidden_states, _, past_key_values = self.model(
+                    hidden_states,
+                    input_embedding=input_embedding,
+                    past_key_values=past_key_values,
+                    use_cache=True,
+                    n_loops=n_loops,
+                    hard_boundaries=None,
+                )
+            else:
+                # Naive: re-run full forward from scratch (no KV cache)
+                all_ids = torch.cat([input_ids, torch.cat(generated, dim=1)], dim=1)
+                output = self.forward(
+                    all_ids, n_loops=n_loops, use_cache=False,
+                )
+                logits_full = output['logits']
+                next_logits = logits_full[:, -1, :] / max(temperature, 1e-8)
+                if top_k is not None and top_k > 0:
+                    v, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
+                    next_logits = next_logits.masked_fill(next_logits < v[:, [-1]], float('-inf'))
+                if temperature < 1e-8:
+                    next_token = next_logits.argmax(dim=-1, keepdim=True)
+                else:
+                    probs = torch.softmax(next_logits, dim=-1)
+                    next_token = torch.multinomial(probs, num_samples=1)
+                generated.append(next_token)
+                continue
+            # lm_head on last position (coda + norm already applied by self.model)
+            logits = self.lm_head(hidden_states[:, -1:, :]) # [B, 1, vocab]
+            next_token = self._sample_token(logits, temperature, top_k)
+            generated.append(next_token)
+        return torch.cat(generated, dim=1)  # [B, N]
+    @staticmethod
+    def _sample_token(logits: torch.Tensor, temperature: float, top_k: Optional[int]) -> torch.Tensor:
+        """Sample next token from logits with temperature and top-k."""
+        logits = logits.squeeze(1)  # [B, vocab]
+        if temperature < 1e-8:
+            return logits.argmax(dim=-1, keepdim=True)  # greedy
+        logits = logits / temperature
+        if top_k is not None and top_k > 0:
+            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+            logits = logits.masked_fill(logits < v[:, [-1]], float('-inf'))
+        probs = torch.softmax(logits, dim=-1)
+        return torch.multinomial(probs, num_samples=1)  # [B, 1]
+    def get_num_params(self):
+        total = sum(p.numel() for p in self.parameters())
+        return {"total": total, "trainable": total}