inv0krr committed
Commit 52bd035 · verified · 1 Parent(s): 2f44c12

Add core architecture: Memory, SLM, BLM, full system

Files changed (1)
  1. leworld_architecture.py +990 -0
leworld_architecture.py ADDED
@@ -0,0 +1,990 @@
"""
LeWorld Memory Architecture — Complete Implementation
=====================================================
Component 1: Artificial Memory (CPU-style bit storage)
Component 2: SLMs (Small LeWorld Models, ~1.5M params each)
Component 3: BLM (Big LeWorld Model, ~12M params)
Component 4: Full System with training loop
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
from typing import Tuple, List, Optional


# =============================================================================
# Configuration
# =============================================================================

@dataclass
class MemoryConfig:
    """CPU-style artificial memory configuration."""
    num_words: int = 65536       # 64K addressable words (like 64K RAM)
    word_size: int = 32          # 32 bits per word
    address_bits: int = 16       # 2^16 = 65536 addresses
    max_read_range: int = 256    # max words per single read operation


@dataclass
class SLMConfig:
    """Small LeWorld Model configuration (~1.5M params)."""
    d_model: int = 128           # internal dimension
    n_heads: int = 4             # attention heads
    n_layers: int = 2            # transformer layers
    state_dim: int = 64          # state vector dimension
    char_dim: int = 32           # characteristics vector dimension
    address_space: int = 65536   # must match MemoryConfig.num_words
    max_read_range: int = 256    # must match MemoryConfig.max_read_range
    dropout: float = 0.1


@dataclass
class BLMConfig:
    """Big LeWorld Model configuration (~12M params)."""
    d_model: int = 384           # internal dimension
    n_heads: int = 6             # attention heads
    n_layers: int = 6            # transformer layers
    state_dim: int = 64          # state vector dimension
    n_slms: int = 3              # number of SLMs to route over
    memory_read_dim: int = 256   # dimension of encoded memory reads
    info_query_dim: int = 128    # dimension of "what info do I need" query
    dropout: float = 0.1

# =============================================================================
# Component 1: Artificial Memory
# =============================================================================

class ArtificialMemory(nn.Module):
    """
    CPU-style bit-level memory with address-range access.

    Stores data as actual bits (0/1 tensors), organized into addressable words.
    Supports:
    - READ(start_addr, end_addr) → returns bit block
    - WRITE(start_addr, data) → writes bits to memory
    - Bit-to-embedding projection (for neural network consumption)

    This mimics how a CPU accesses RAM:
    - Each address points to a word (32 bits)
    - Contiguous reads fetch a range of words
    - No inherent "meaning" — bits are just bits until interpreted
    """

    def __init__(self, config: MemoryConfig):
        super().__init__()
        self.config = config

        # The actual memory: (num_words, word_size) binary tensor
        # Initialized randomly — represents "existing knowledge base"
        self.register_buffer(
            'memory',
            torch.randint(0, 2, (config.num_words, config.word_size)).float()
        )

        # Bit-to-embedding projection: converts raw bits into dense vectors
        # This is learnable — the system learns what bit patterns mean
        self.bit_encoder = nn.Sequential(
            nn.Linear(config.word_size, 64),
            nn.GELU(),
            nn.Linear(64, 128),
            nn.LayerNorm(128)
        )

        # Write projection: converts dense vectors back to bit probabilities
        self.bit_decoder = nn.Sequential(
            nn.Linear(128, 64),
            nn.GELU(),
            nn.Linear(64, config.word_size),
            nn.Sigmoid()  # output probabilities for each bit
        )

    def read(self, start_addr: torch.Tensor, end_addr: torch.Tensor
             ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Read a contiguous range of words from memory.

        Args:
            start_addr: (batch,) integer tensor of start addresses
            end_addr: (batch,) integer tensor of end addresses

        Returns:
            bit_block: (batch, max_range, word_size) raw bits
            encoded: (batch, max_range, 128) encoded memory content
            valid_mask: (batch, max_range) True where the address lies in [start, end)
        """
        batch_size = start_addr.shape[0]
        max_range = self.config.max_read_range

        # Clamp addresses to valid range (element-wise, staying on the input device)
        start_addr = start_addr.clamp(0, self.config.num_words - 1)
        upper = torch.minimum(
            start_addr + max_range,
            torch.full_like(start_addr, self.config.num_words)
        )
        end_addr = torch.minimum(torch.maximum(end_addr, start_addr), upper)

        # Gather memory contents for each batch element
        # Create index tensor for the address ranges
        offsets = torch.arange(max_range, device=start_addr.device).unsqueeze(0)  # (1, max_range)
        addresses = start_addr.unsqueeze(1) + offsets  # (batch, max_range)
        addresses = addresses.clamp(0, self.config.num_words - 1)

        # Create validity mask (addresses within [start, end) are valid)
        range_lengths = (end_addr - start_addr).unsqueeze(1)  # (batch, 1)
        valid_mask = offsets < range_lengths  # (batch, max_range)

        # Gather bits
        bit_block = self.memory[addresses]  # (batch, max_range, word_size)
        bit_block = bit_block * valid_mask.unsqueeze(-1).float()  # zero out invalid

        # Encode bits to dense vectors
        encoded = self.bit_encoder(bit_block)  # (batch, max_range, 128)
        encoded = encoded * valid_mask.unsqueeze(-1).float()

        return bit_block, encoded, valid_mask

    def write(self, start_addr: torch.Tensor, data: torch.Tensor):
        """
        Write data to memory via a hard (thresholded) bit write.

        The decoder output passes through a straight-through threshold, but the
        stored bits are detached: the memory buffer itself is not a gradient path.

        Args:
            start_addr: (batch,) start addresses
            data: (batch, n_words, 128) encoded data to write
        """
        n_words = data.shape[1]

        # Decode to bit probabilities
        bit_probs = self.bit_decoder(data)  # (batch, n_words, word_size)

        # Hard bits via straight-through
        hard_bits = (bit_probs > 0.5).float()
        bits_to_write = hard_bits - bit_probs.detach() + bit_probs  # ST trick

        # Write to memory (if ranges overlap, later batch elements win)
        for b in range(start_addr.shape[0]):
            addr = int(start_addr[b].item())
            end = min(addr + n_words, self.config.num_words)
            actual_n = end - addr
            self.memory[addr:end] = bits_to_write[b, :actual_n].detach()

    def soft_read(self, attention_weights: torch.Tensor) -> torch.Tensor:
        """
        Content-based soft read using attention weights over the entire memory.
        Used for differentiable end-to-end training.

        Args:
            attention_weights: (batch, num_words) soft address distribution

        Returns:
            encoded: (batch, 128) weighted memory content
        """
        # Encode all memory (expensive but differentiable)
        all_encoded = self.bit_encoder(self.memory)  # (num_words, 128)
        # Weighted sum
        encoded = torch.matmul(attention_weights, all_encoded)  # (batch, 128)
        return encoded

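# Illustrative usage sketch (an added example, not part of the system wiring below;
# shapes assume the default MemoryConfig — write() is not otherwise called in this file):
#
#   mem = ArtificialMemory(MemoryConfig())
#   start = torch.tensor([0, 4096])
#   end = torch.tensor([8, 4104])
#   bits, encoded, mask = mem.read(start, end)   # (2, 256, 32), (2, 256, 128), (2, 256)
#   mem.write(start, encoded[:, :8, :])          # decode the 8 valid words back into bits
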
# =============================================================================
# Component 2: Small LeWorld Model (SLM)
# =============================================================================

class StateEncoder(nn.Module):
    """Encodes past_state and current_state into a joint representation."""

    def __init__(self, state_dim: int, d_model: int):
        super().__init__()
        self.past_proj = nn.Linear(state_dim, d_model)
        self.curr_proj = nn.Linear(state_dim, d_model)
        self.combiner = nn.Sequential(
            nn.Linear(d_model * 2, d_model),
            nn.GELU(),
            nn.LayerNorm(d_model)
        )

    def forward(self, past_state: torch.Tensor, current_state: torch.Tensor) -> torch.Tensor:
        """
        Args:
            past_state: (batch, state_dim)
            current_state: (batch, state_dim)
        Returns:
            combined: (batch, d_model)
        """
        past_enc = F.gelu(self.past_proj(past_state))
        curr_enc = F.gelu(self.curr_proj(current_state))
        combined = self.combiner(torch.cat([past_enc, curr_enc], dim=-1))
        return combined


class CharacteristicsEncoder(nn.Module):
    """Encodes static characteristics/context."""

    def __init__(self, char_dim: int, d_model: int):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(char_dim, d_model),
            nn.GELU(),
            nn.LayerNorm(d_model)
        )

    def forward(self, characteristics: torch.Tensor) -> torch.Tensor:
        return self.encoder(characteristics)


class TransformerBlock(nn.Module):
    """Standard transformer block with pre-norm."""

    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Self-attention with pre-norm
        normed = self.norm1(x)
        attn_out, _ = self.attn(normed, normed, normed)
        x = x + attn_out
        # FFN with pre-norm
        x = x + self.ffn(self.norm2(x))
        return x


class CrossAttentionBlock(nn.Module):
    """Cross-attention: state attends to characteristics."""

    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        super().__init__()
        self.norm_q = nn.LayerNorm(d_model)
        self.norm_kv = nn.LayerNorm(d_model)
        self.cross_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.norm_ff = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, query: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
        normed_q = self.norm_q(query)
        normed_kv = self.norm_kv(context)
        attn_out, _ = self.cross_attn(normed_q, normed_kv, normed_kv)
        x = query + attn_out
        x = x + self.ffn(self.norm_ff(x))
        return x


class AddressHead(nn.Module):
    """
    Produces memory address range (start_addr, end_addr) from hidden state.

    Uses two approaches:
    1. HARD mode: argmax over address space (for inference)
    2. SOFT mode: attention weights over memory (for differentiable training)
    """

    def __init__(self, d_model: int, address_space: int, max_range: int):
        super().__init__()
        self.address_space = address_space
        self.max_range = max_range

        # Produce start address logits
        # We don't have a linear over 65K — that's too many params
        # Instead: predict address as composition of sub-addresses (like product keys)
        self.addr_bits = int(math.log2(address_space))  # 16 for 65536
        assert 2 ** self.addr_bits == address_space, "address_space must be power of 2"

        # Split address into high byte and low byte (8+8 = 16 bits)
        self.half_bits = self.addr_bits // 2  # 8
        self.half_space = 2 ** self.half_bits  # 256

        # Predict high and low parts separately (product key approach)
        self.start_high = nn.Linear(d_model, self.half_space)  # 256 outputs
        self.start_low = nn.Linear(d_model, self.half_space)   # 256 outputs

        # Predict range length (how many words to read)
        self.range_head = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Linear(d_model // 2, max_range)
        )

        # Confidence head
        self.confidence_head = nn.Sequential(
            nn.Linear(d_model, d_model // 4),
            nn.GELU(),
            nn.Linear(d_model // 4, 1),
            nn.Sigmoid()
        )

    def forward(self, hidden: torch.Tensor) -> dict:
        """
        Args:
            hidden: (batch, d_model)

        Returns:
            dict with:
                start_addr: (batch,) integer addresses
                end_addr: (batch,) integer addresses
                range_length: (batch,) how many words to read
                confidence: (batch,) read confidence score
                start_logits_high: (batch, 256) for soft addressing
                start_logits_low: (batch, 256) for soft addressing
                range_logits: (batch, max_range) for soft range selection
        """
        batch_size = hidden.shape[0]

        # Product-key address generation
        high_logits = self.start_high(hidden)  # (batch, 256)
        low_logits = self.start_low(hidden)    # (batch, 256)

        # Hard address via argmax
        high_idx = high_logits.argmax(dim=-1)  # (batch,)
        low_idx = low_logits.argmax(dim=-1)    # (batch,)
        start_addr = high_idx * self.half_space + low_idx  # (batch,) 0..65535

        # Range length
        range_logits = self.range_head(hidden)  # (batch, max_range)
        range_length = range_logits.argmax(dim=-1) + 1  # (batch,) 1..max_range
        end_addr = (start_addr + range_length).clamp(max=self.address_space - 1)

        # Confidence
        confidence = self.confidence_head(hidden).squeeze(-1)  # (batch,)

        return {
            'start_addr': start_addr,
            'end_addr': end_addr,
            'range_length': range_length,
            'confidence': confidence,
            'start_logits_high': high_logits,
            'start_logits_low': low_logits,
            'range_logits': range_logits,
        }

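# Worked example of the product-key addressing above (added for illustration): with
# half_space = 256, a high index of 3 and a low index of 17 compose to
# start_addr = 3 * 256 + 17 = 785; a predicted range length of 64 then gives
# end_addr = 785 + 64 = 849, i.e. a request for memory words [785, 849).
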
class SmallLeWorldModel(nn.Module):
    """
    SLM: Small LeWorld Model (~1.5M params)

    Takes (past_state, current_state, characteristics) and produces
    a memory address range pointing to the most useful memory for
    next-state prediction.

    Architecture:
    1. Encode past + current state → state representation
    2. Encode characteristics
    3. Cross-attend: state attends to characteristics
    4. Self-attention transformer layers
    5. Address head: output (start_addr, end_addr, confidence)
    """

    def __init__(self, config: SLMConfig, slm_id: int = 0):
        super().__init__()
        self.config = config
        self.slm_id = slm_id

        # Encoders
        self.state_encoder = StateEncoder(config.state_dim, config.d_model)
        self.char_encoder = CharacteristicsEncoder(config.char_dim, config.d_model)

        # Cross-attention: state ← characteristics
        self.cross_attn = CrossAttentionBlock(config.d_model, config.n_heads, config.dropout)

        # Self-attention transformer
        self.transformer_layers = nn.ModuleList([
            TransformerBlock(config.d_model, config.n_heads, config.dropout)
            for _ in range(config.n_layers)
        ])
        self.final_norm = nn.LayerNorm(config.d_model)

        # Address output head
        self.address_head = AddressHead(config.d_model, config.address_space, config.max_read_range)

    def forward(
        self,
        past_state: torch.Tensor,       # (batch, state_dim)
        current_state: torch.Tensor,    # (batch, state_dim)
        characteristics: torch.Tensor,  # (batch, char_dim)
    ) -> dict:
        """
        Forward pass: state + characteristics → memory address range.

        Returns dict with address info + internal hidden state.
        """
        # Encode states
        state_repr = self.state_encoder(past_state, current_state)  # (batch, d_model)

        # Encode characteristics
        char_repr = self.char_encoder(characteristics)  # (batch, d_model)

        # Cross-attention: state queries characteristics
        # Unsqueeze to sequence dim for attention
        state_seq = state_repr.unsqueeze(1)  # (batch, 1, d_model)
        char_seq = char_repr.unsqueeze(1)    # (batch, 1, d_model)

        enriched = self.cross_attn(state_seq, char_seq)  # (batch, 1, d_model)

        # Self-attention layers
        hidden = enriched
        for layer in self.transformer_layers:
            hidden = layer(hidden)

        hidden = self.final_norm(hidden)
        hidden = hidden.squeeze(1)  # (batch, d_model)

        # Produce address range
        addr_output = self.address_head(hidden)
        addr_output['hidden'] = hidden  # keep for BLM to use
        addr_output['slm_id'] = self.slm_id

        return addr_output

# =============================================================================
# Component 3: Big LeWorld Model (BLM)
# =============================================================================

class StraightThroughSigmoid(torch.autograd.Function):
    """
    Binary gate: hard 0/1 in forward, sigmoid gradient in backward.
    Cf. the straight-through estimator (Bengio et al., 2013), Gumbel-Softmax
    (Jang et al., 2017), and Switch Transformer routing. Kept as a reusable
    helper; BLMRouter below applies the same trick inline.
    """
    @staticmethod
    def forward(ctx, logits):
        probs = torch.sigmoid(logits)
        ctx.save_for_backward(probs)
        return (probs > 0.5).float()

    @staticmethod
    def backward(ctx, grad_output):
        probs, = ctx.saved_tensors
        # Sigmoid derivative: p * (1-p)
        return grad_output * probs * (1 - probs)


class BLMRouter(nn.Module):
    """
    Routes/selects which SLMs to activate.
    Produces a binary mask like [1, 0, 1].

    Uses a straight-through sigmoid for differentiable binary selection.
    Includes a load-balancing loss to prevent degenerate routing.
    """

    def __init__(self, d_model: int, n_slms: int):
        super().__init__()
        self.n_slms = n_slms

        self.gate = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Linear(d_model // 2, n_slms)
        )

        # Temperature for annealing (start warm, cool down)
        self.register_buffer('temperature', torch.tensor(1.0))

    def forward(self, state_repr: torch.Tensor) -> Tuple[torch.Tensor, dict]:
        """
        Args:
            state_repr: (batch, d_model) encoded current state

        Returns:
            binary_mask: (batch, n_slms) hard 0/1 selection
            routing_info: dict with probs, losses, etc.
        """
        logits = self.gate(state_repr)  # (batch, n_slms)

        # Scale by temperature
        scaled_logits = logits / self.temperature.clamp(min=0.1)

        probs = torch.sigmoid(scaled_logits)  # (batch, n_slms)

        # Straight-through binary: hard in forward, soft in backward
        hard_mask = (probs > 0.5).float()
        binary_mask = hard_mask - probs.detach() + probs  # straight-through trick

        # Ensure at least one SLM is selected (don't want all zeros)
        # If all zeros, force-select the highest probability SLM
        all_zero = (binary_mask.sum(dim=-1) == 0)  # (batch,)
        if all_zero.any():
            max_idx = probs[all_zero].argmax(dim=-1)
            forced = torch.zeros_like(probs[all_zero])
            forced.scatter_(1, max_idx.unsqueeze(1), 1.0)
            binary_mask[all_zero] = forced

        # Load balance loss: encourage roughly equal usage of SLMs
        usage_per_slm = binary_mask.mean(dim=0)  # (n_slms,)
        target_usage = 1.0 / self.n_slms
        balance_loss = ((usage_per_slm - target_usage) ** 2).sum()

        # Entropy loss: encourage decisive routing (not all ~0.5).
        # Reported in routing_info; not currently added to the training loss.
        entropy = -(probs * torch.log(probs + 1e-8) +
                    (1 - probs) * torch.log(1 - probs + 1e-8))
        entropy_loss = entropy.mean()

        routing_info = {
            'probs': probs,
            'binary_mask': binary_mask,
            'balance_loss': balance_loss,
            'entropy_loss': entropy_loss,
            'logits': logits,
        }

        return binary_mask, routing_info

    def anneal_temperature(self, step: int, anneal_rate: float = 3e-5, min_temp: float = 0.1):
        """Anneal temperature: start warm (exploratory), cool down (decisive)."""
        new_temp = max(min_temp, math.exp(-anneal_rate * step))
        self.temperature.fill_(new_temp)

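# How the straight-through selection in BLMRouter.forward behaves (added illustration):
# for probs = 0.7 the forward value is 1.0 (since 0.7 > 0.5), while
# binary_mask = hard_mask - probs.detach() + probs makes the backward gradient equal
# d(probs)/d(logits) = probs * (1 - probs) / temperature, so the gate remains trainable.
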
class InfoRequestHead(nn.Module):
    """
    Produces a query vector representing "what information do I need next?"

    This is the key innovation: instead of passively receiving all SLM outputs,
    the BLM actively requests specific information. This query modulates which
    memory regions the SLMs should focus on in the NEXT timestep.
    """

    def __init__(self, d_model: int, query_dim: int):
        super().__init__()
        self.query_generator = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Linear(d_model, query_dim),
            nn.LayerNorm(query_dim)
        )

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden: (batch, d_model) BLM's internal state
        Returns:
            info_query: (batch, query_dim) "what do I need next?"
        """
        return self.query_generator(hidden)


class BigLeWorldModel(nn.Module):
    """
    BLM: Big LeWorld Model (~12M params)

    Two roles:
    1. ROUTER: Select which SLMs to activate (binary mask)
    2. PREDICTOR: Given selected memory contents, predict next state

    Plus: Info-Request Head that asks "what information is needed next?"

    Architecture:
    1. Encode current state → routing decision
    2. Receive memory reads from selected SLMs
    3. Transformer processes (state + memories)
    4. Predict next state
    5. Generate info request for next timestep
    """

    def __init__(self, config: BLMConfig):
        super().__init__()
        self.config = config

        # State encoder (maps state_dim → d_model)
        self.state_encoder = nn.Sequential(
            nn.Linear(config.state_dim, config.d_model),
            nn.GELU(),
            nn.LayerNorm(config.d_model)
        )

        # Memory read encoder (maps encoded memory → d_model)
        self.memory_encoder = nn.Sequential(
            nn.Linear(128, config.d_model),  # 128 from ArtificialMemory bit_encoder
            nn.GELU(),
            nn.LayerNorm(config.d_model)
        )

        # SLM hidden state encoder (maps SLM hidden → d_model)
        self.slm_hidden_encoder = nn.Sequential(
            nn.Linear(128, config.d_model),  # 128 = SLM d_model
            nn.GELU(),
            nn.LayerNorm(config.d_model)
        )

        # Router: selects which SLMs to use
        self.router = BLMRouter(config.d_model, config.n_slms)

        # Transformer backbone
        self.transformer_layers = nn.ModuleList([
            TransformerBlock(config.d_model, config.n_heads, config.dropout)
            for _ in range(config.n_layers)
        ])
        self.final_norm = nn.LayerNorm(config.d_model)

        # Prediction heads
        self.next_state_head = nn.Sequential(
            nn.Linear(config.d_model, config.d_model),
            nn.GELU(),
            nn.Linear(config.d_model, config.state_dim)
        )

        # Info request head: "what do I need next?"
        self.info_request = InfoRequestHead(config.d_model, config.info_query_dim)

        # Learnable tokens
        self.cls_token = nn.Parameter(torch.randn(1, 1, config.d_model) * 0.02)
        self.state_type_embed = nn.Parameter(torch.randn(1, 1, config.d_model) * 0.02)
        self.memory_type_embed = nn.Parameter(torch.randn(1, 1, config.d_model) * 0.02)

    def forward(
        self,
        past_state: torch.Tensor,          # (batch, state_dim)
        current_state: torch.Tensor,       # (batch, state_dim)
        slm_outputs: List[dict],           # list of SLM output dicts
        memory_reads: List[torch.Tensor],  # list of (batch, range, 128) encoded memory
        info_query_prev: Optional[torch.Tensor] = None,  # (batch, query_dim) from previous step
    ) -> dict:
        """
        Full BLM forward pass.

        Returns:
            dict with next_state, binary_mask, info_query, losses, etc.
        """
        batch_size = current_state.shape[0]

        # 1. Encode current state for routing decision
        state_enc = self.state_encoder(current_state)  # (batch, d_model)

        # 2. Route: select which SLMs to use
        binary_mask, routing_info = self.router(state_enc)  # (batch, n_slms)

        # 3. Aggregate selected memory reads
        # For each SLM, apply its binary gate and encode its memory read
        memory_tokens = []
        for i, (slm_out, mem_read) in enumerate(zip(slm_outputs, memory_reads)):
            gate = binary_mask[:, i:i+1]  # (batch, 1)

            # Gate the SLM's hidden representation
            slm_hidden = self.slm_hidden_encoder(slm_out['hidden'])  # (batch, d_model)
            slm_hidden = slm_hidden * gate  # zero if SLM not selected

            # Gate and encode the memory read
            # mem_read: (batch, range_len, 128)
            mem_enc = self.memory_encoder(mem_read)  # (batch, range_len, d_model)
            mem_enc = mem_enc * gate.unsqueeze(-1)   # zero if SLM not selected

            # Pool memory read to single token (mean pool over range)
            mem_pooled = mem_enc.mean(dim=1, keepdim=True)  # (batch, 1, d_model)

            memory_tokens.append(slm_hidden.unsqueeze(1))  # SLM hidden as token
            memory_tokens.append(mem_pooled)               # memory content as token

        # 4. Build input sequence for transformer
        # [CLS] + [state] + [slm_0_hidden, slm_0_mem, slm_1_hidden, slm_1_mem, ...]
        cls = self.cls_token.expand(batch_size, -1, -1)
        state_token = state_enc.unsqueeze(1) + self.state_type_embed  # (batch, 1, d_model)

        # Add memory type embedding to memory tokens
        mem_sequence = torch.cat(memory_tokens, dim=1)  # (batch, 2*n_slms, d_model)
        mem_sequence = mem_sequence + self.memory_type_embed

        sequence = torch.cat([cls, state_token, mem_sequence], dim=1)
        # Shape: (batch, 1 + 1 + 2*n_slms, d_model)

        # 5. Transformer processing
        hidden = sequence
        for layer in self.transformer_layers:
            hidden = layer(hidden)
        hidden = self.final_norm(hidden)

        # 6. Extract predictions from CLS token
        cls_output = hidden[:, 0, :]  # (batch, d_model)

        # 7. Predict next state
        next_state_pred = self.next_state_head(cls_output)  # (batch, state_dim)

        # 8. Generate info request for next timestep
        info_query = self.info_request(cls_output)  # (batch, query_dim)

        return {
            'next_state': next_state_pred,
            'binary_mask': binary_mask,
            'info_query': info_query,
            'routing_info': routing_info,
            'cls_output': cls_output,
        }


# =============================================================================
# Component 4: Full LeWorld System
# =============================================================================

class LeWorldSystem(nn.Module):
    """
    Complete LeWorld Memory Architecture.

    Orchestrates:
    - Artificial Memory (bit-level storage)
    - 3 SLMs (produce memory address ranges)
    - 1 BLM (selects SLMs, reads memory, predicts next state)

    Per-step flow (repeated over a trajectory during training):
    1. BLM sees current state → routes to SLMs
    2. Selected SLMs produce address ranges
    3. Memory is read at those ranges
    4. BLM aggregates memory + state → predicts next state
    5. BLM generates info-request for next step

    Losses:
    - next_state_loss: MSE between predicted and actual next state
    - routing_balance_loss: encourage balanced SLM usage
    - address_diversity_loss: encourage SLMs to read different memory regions
    - info_utility_loss: did the info request lead to useful retrievals?
      (described here; not yet implemented below)
    """

    def __init__(
        self,
        mem_config: Optional[MemoryConfig] = None,
        slm_config: Optional[SLMConfig] = None,
        blm_config: Optional[BLMConfig] = None,
    ):
        super().__init__()
        # Build default configs lazily rather than as shared default arguments
        mem_config = mem_config if mem_config is not None else MemoryConfig()
        slm_config = slm_config if slm_config is not None else SLMConfig()
        blm_config = blm_config if blm_config is not None else BLMConfig()

        # Artificial Memory
        self.memory = ArtificialMemory(mem_config)

        # SLMs (blm_config.n_slms of them, default 3)
        self.slms = nn.ModuleList([
            SmallLeWorldModel(slm_config, slm_id=i)
            for i in range(blm_config.n_slms)
        ])

        # BLM
        self.blm = BigLeWorldModel(blm_config)

        # Info-query → SLM modulation: the BLM's info request
        # influences what SLMs look for in the next timestep
        self.info_to_slm = nn.Linear(blm_config.info_query_dim, slm_config.state_dim)

        self.config = {
            'mem': mem_config,
            'slm': slm_config,
            'blm': blm_config,
        }

    def forward(
        self,
        past_state: torch.Tensor,       # (batch, state_dim)
        current_state: torch.Tensor,    # (batch, state_dim)
        characteristics: torch.Tensor,  # (batch, char_dim)
        next_state_target: Optional[torch.Tensor] = None,  # (batch, state_dim) for training
        info_query_prev: Optional[torch.Tensor] = None,    # from previous timestep
    ) -> dict:
        """
        Full system forward pass.
        """
        batch_size = current_state.shape[0]

        # If we have a previous info query, modulate the current state
        # This is how the BLM's "what do I need?" influences retrieval
        if info_query_prev is not None:
            info_modulation = self.info_to_slm(info_query_prev)  # (batch, state_dim)
            modulated_state = current_state + 0.1 * info_modulation  # gentle modulation
        else:
            modulated_state = current_state

        # 1. Run all SLMs to get address ranges
        slm_outputs = []
        for slm in self.slms:
            out = slm(past_state, modulated_state, characteristics)
            slm_outputs.append(out)

        # 2. Read memory at each SLM's address range
        memory_reads = []
        for slm_out in slm_outputs:
            _, encoded, valid_mask = self.memory.read(
                slm_out['start_addr'],
                slm_out['end_addr']
            )
            memory_reads.append(encoded)

        # 3. BLM processes everything
        blm_output = self.blm(
            past_state, current_state,
            slm_outputs, memory_reads,
            info_query_prev
        )

        # 4. Compute losses if training
        losses = {}
        if next_state_target is not None:
            # Primary loss: next state prediction
            losses['next_state_loss'] = F.mse_loss(
                blm_output['next_state'], next_state_target
            )

            # Routing balance loss
            losses['balance_loss'] = blm_output['routing_info']['balance_loss']

            # Address diversity loss: penalize SLMs for reading the same regions
            addresses = torch.stack([
                slm_out['start_addr'].float() for slm_out in slm_outputs
            ], dim=1)  # (batch, n_slms)
            # Pairwise distance between SLM addresses (want to maximize).
            # Note: start_addr comes from an argmax, so this term is non-differentiable
            # w.r.t. the SLMs; it affects the reported loss but contributes no gradient.
            addr_diff = torch.cdist(addresses.unsqueeze(-1), addresses.unsqueeze(-1))
            diversity_loss = -addr_diff.mean()  # negative = encourage large distances
            losses['diversity_loss'] = diversity_loss

            # Total loss
            losses['total_loss'] = (
                losses['next_state_loss']
                + 0.01 * losses['balance_loss']
                + 0.001 * losses['diversity_loss']
            )

        return {
            'next_state': blm_output['next_state'],
            'binary_mask': blm_output['binary_mask'],
            'info_query': blm_output['info_query'],
            'slm_outputs': slm_outputs,
            'memory_reads': memory_reads,
            'losses': losses,
            'routing_info': blm_output['routing_info'],
        }

    def multi_step_forward(
        self,
        states: torch.Tensor,           # (batch, T, state_dim) sequence of states
        characteristics: torch.Tensor,  # (batch, char_dim) static
        n_steps: Optional[int] = None,
    ) -> dict:
        """
        Run the system over multiple timesteps with teacher forcing: each step is
        conditioned on the ground-truth current state, while the BLM's info query
        is carried over from the previous step.
        """
        batch_size, T, state_dim = states.shape
        if n_steps is None:
            n_steps = T - 1  # predict all future states

        all_predictions = []
        all_masks = []
        total_loss = None
        info_query = None

        for t in range(min(n_steps, T - 1)):
            past_state = states[:, max(0, t-1), :]
            current_state = states[:, t, :]
            next_state_target = states[:, t+1, :]

            output = self.forward(
                past_state, current_state, characteristics,
                next_state_target, info_query
            )

            all_predictions.append(output['next_state'])
            all_masks.append(output['binary_mask'])
            info_query = output['info_query']

            if output['losses']:
                if total_loss is None:
                    total_loss = output['losses']['total_loss']
                else:
                    total_loss = total_loss + output['losses']['total_loss']

        if total_loss is None:
            total_loss = torch.tensor(0.0, device=states.device)
        return {
            'predictions': torch.stack(all_predictions, dim=1),
            'masks': torch.stack(all_masks, dim=1),
            'total_loss': total_loss / max(1, min(n_steps, T - 1)),
            'final_info_query': info_query,
        }

# =============================================================================
# Parameter Count Verification
# =============================================================================

def count_params(model, name="Model"):
    """Count and display parameter breakdown."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\n{'='*60}")
    print(f"{name}: {total:,} total params ({trainable:,} trainable)")
    print(f"{'='*60}")

    for child_name, child in model.named_children():
        child_params = sum(p.numel() for p in child.parameters())
        if child_params > 0:
            print(f"  {child_name}: {child_params:,}")

    return total


# =============================================================================
# Demo / Test
# =============================================================================

if __name__ == "__main__":
    print("LeWorld Memory Architecture — Component Verification")
    print("=" * 60)

    # Configs
    mem_config = MemoryConfig()
    slm_config = SLMConfig()
    blm_config = BLMConfig()

    # Build system
    system = LeWorldSystem(mem_config, slm_config, blm_config)

    # Count parameters
    print("\n--- Parameter Counts ---")
    count_params(system.memory, "Artificial Memory")
    for i, slm in enumerate(system.slms):
        count_params(slm, f"SLM-{i}")
    count_params(system.blm, "BLM")
    count_params(system, "Full System")

    # Test forward pass
    print("\n--- Forward Pass Test ---")
    batch_size = 4
    state_dim = slm_config.state_dim
    char_dim = slm_config.char_dim

    past_state = torch.randn(batch_size, state_dim)
    current_state = torch.randn(batch_size, state_dim)
    characteristics = torch.randn(batch_size, char_dim)
    next_state = torch.randn(batch_size, state_dim)

    output = system(past_state, current_state, characteristics, next_state)

    print(f"Next state prediction shape: {output['next_state'].shape}")
    print(f"Binary mask (SLM selection): {output['binary_mask']}")
    print(f"Info query shape: {output['info_query'].shape}")
    print(f"Losses: {output['losses']}")

    # Test multi-step
    print("\n--- Multi-Step Test ---")
    T = 10
    states = torch.randn(batch_size, T, state_dim)

    ms_output = system.multi_step_forward(states, characteristics)
    print(f"Predictions shape: {ms_output['predictions'].shape}")
    print(f"Masks shape: {ms_output['masks'].shape}")
    print(f"Average loss: {ms_output['total_loss'].item():.4f}")

    # Show routing patterns over time
    print("\n--- Routing Patterns Over Time ---")
    masks = ms_output['masks'][0].detach()  # first batch element
    for t in range(masks.shape[0]):
        mask = masks[t].int().tolist()
        print(f"  Step {t}: SLMs selected = {mask}")

    print("\n✅ All components verified successfully!")