"""
LeWorld Memory Architecture — Complete Implementation
=====================================================
Component 1: Artificial Memory (CPU-style bit storage)
Component 2: SLMs (Small LeWorld Models, ~1.5M params each)
Component 3: BLM (Big LeWorld Model, ~12M params)
Component 4: Full System with training loop
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
from typing import Tuple, List, Optional
# =============================================================================
# Configuration
# =============================================================================
@dataclass
class MemoryConfig:
"""CPU-style artificial memory configuration."""
num_words: int = 65536 # 64K addressable words (like 64K RAM)
word_size: int = 32 # 32 bits per word
address_bits: int = 16 # 2^16 = 65536 addresses
max_read_range: int = 256 # max words per single read operation
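# A quick size check at these defaults (simple arithmetic, not from the
# source): 65,536 words * 32 bits = 2,097,152 bits = 256 KiB of addressable
# storage, held as a (65536, 32) float tensor of 0s and 1s.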
@dataclass
class SLMConfig:
"""Small LeWorld Model configuration (~1.5M params)."""
d_model: int = 128 # internal dimension
n_heads: int = 4 # attention heads
n_layers: int = 2 # transformer layers
state_dim: int = 64 # state vector dimension
char_dim: int = 32 # characteristics vector dimension
address_space: int = 65536 # must match MemoryConfig.num_words
max_read_range: int = 256 # must match MemoryConfig.max_read_range
dropout: float = 0.1
@dataclass
class BLMConfig:
"""Big LeWorld Model configuration (~12M params)."""
d_model: int = 384 # internal dimension
n_heads: int = 6 # attention heads
n_layers: int = 6 # transformer layers
state_dim: int = 64 # state vector dimension
n_slms: int = 3 # number of SLMs to route over
memory_read_dim: int = 256 # dimension of encoded memory reads
info_query_dim: int = 128 # dimension of "what info do I need" query
dropout: float = 0.1
# =============================================================================
# Component 1: Artificial Memory
# =============================================================================
class ArtificialMemory(nn.Module):
"""
CPU-style bit-level memory with address-range access.
Stores data as actual bits (0/1 tensors), organized into addressable words.
Supports:
- READ(start_addr, end_addr) → returns bit block
- WRITE(start_addr, data) → writes bits to memory
- Bit-to-embedding projection (for neural network consumption)
This mimics how a CPU accesses RAM:
- Each address points to a word (32 bits)
- Contiguous reads fetch a range of words
- No inherent "meaning" — bits are just bits until interpreted
"""
def __init__(self, config: MemoryConfig):
super().__init__()
self.config = config
# The actual memory: (num_words, word_size) binary tensor
# Initialized randomly — represents "existing knowledge base"
self.register_buffer(
'memory',
torch.randint(0, 2, (config.num_words, config.word_size)).float()
)
# Bit-to-embedding projection: converts raw bits into dense vectors
# This is learnable — the system learns what bit patterns mean
self.bit_encoder = nn.Sequential(
nn.Linear(config.word_size, 64),
nn.GELU(),
nn.Linear(64, 128),
nn.LayerNorm(128)
)
# Write projection: converts dense vectors back to bit probabilities
self.bit_decoder = nn.Sequential(
nn.Linear(128, 64),
nn.GELU(),
nn.Linear(64, config.word_size),
nn.Sigmoid() # output probabilities for each bit
)
def read(self, start_addr: torch.Tensor, end_addr: torch.Tensor) -> torch.Tensor:
"""
Read a contiguous range of words from memory.
Args:
start_addr: (batch,) integer tensor of start addresses
end_addr: (batch,) integer tensor of end addresses
        Returns:
            bit_block: (batch, max_range, word_size) raw bits
            encoded: (batch, max_range, 128) encoded memory content
            valid_mask: (batch, max_range) bool, True inside [start, end)
"""
batch_size = start_addr.shape[0]
max_range = self.config.max_read_range
        # Clamp addresses to valid ranges; derive the upper bound from
        # start_addr directly so every tensor stays on the input device
        start_addr = start_addr.clamp(0, self.config.num_words - 1)
        max_end = (start_addr + max_range).clamp(max=self.config.num_words)
        end_addr = torch.minimum(torch.maximum(end_addr, start_addr), max_end)
# Gather memory contents for each batch element
# Create index tensor for the address ranges
offsets = torch.arange(max_range, device=start_addr.device).unsqueeze(0) # (1, max_range)
addresses = start_addr.unsqueeze(1) + offsets # (batch, max_range)
addresses = addresses.clamp(0, self.config.num_words - 1)
# Create validity mask (addresses within [start, end) are valid)
range_lengths = (end_addr - start_addr).unsqueeze(1) # (batch, 1)
valid_mask = offsets < range_lengths # (batch, max_range)
# Gather bits
bit_block = self.memory[addresses] # (batch, max_range, word_size)
bit_block = bit_block * valid_mask.unsqueeze(-1).float() # zero out invalid
# Encode bits to dense vectors
encoded = self.bit_encoder(bit_block) # (batch, max_range, 128)
encoded = encoded * valid_mask.unsqueeze(-1).float()
return bit_block, encoded, valid_mask
    def write(self, start_addr: torch.Tensor, data: torch.Tensor) -> torch.Tensor:
        """
        Write data to memory. The stored bits are detached, so the buffer
        update itself is a hard (non-differentiable) write; the returned
        straight-through bits keep a gradient path into bit_decoder for any
        caller that wants to supervise the written content.
        Args:
            start_addr: (batch,) start addresses
            data: (batch, n_words, 128) encoded data to write
        Returns:
            bits_to_write: (batch, n_words, word_size) ST-estimated bits
        """
        n_words = data.shape[1]
        # Decode to bit probabilities
        bit_probs = self.bit_decoder(data)  # (batch, n_words, word_size)
        # Straight-through bits: hard 0/1 forward, sigmoid gradient backward
        hard_bits = (bit_probs > 0.5).float()
        bits_to_write = hard_bits - bit_probs.detach() + bit_probs
        # Write to memory (on overlapping ranges, later batch elements win)
        for b in range(start_addr.shape[0]):
            addr = int(start_addr[b])
            end = min(addr + n_words, self.config.num_words)
            self.memory[addr:end] = bits_to_write[b, :end - addr].detach()
        return bits_to_write
def soft_read(self, attention_weights: torch.Tensor) -> torch.Tensor:
"""
Content-based soft read using attention weights over entire memory.
Used for differentiable end-to-end training.
Args:
attention_weights: (batch, num_words) soft address distribution
Returns:
encoded: (batch, 128) weighted memory content
"""
# Encode all memory (expensive but differentiable)
all_encoded = self.bit_encoder(self.memory) # (num_words, 128)
# Weighted sum
encoded = torch.matmul(attention_weights, all_encoded) # (batch, 128)
return encoded
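# A minimal, hedged usage sketch for ArtificialMemory on a tiny config. It
# only illustrates the read/write contract above (shapes, masking, hard
# writes); the config values and addresses are arbitrary, not the defaults.
def _sketch_memory_roundtrip():
    mem = ArtificialMemory(MemoryConfig(num_words=1024, word_size=32,
                                        address_bits=10, max_read_range=16))
    start = torch.tensor([0, 100])
    end = torch.tensor([4, 108])  # ranges of 4 and 8 words
    bits, encoded, mask = mem.read(start, end)
    # bits:    (2, 16, 32) raw 0/1 words, zero-padded past each range
    # encoded: (2, 16, 128) learned dense view of the same words
    # mask:    (2, 16) True inside [start, end)
    mem.write(start, encoded[:, :4, :])  # decode back to bits and store
    return bits.shape, encoded.shape, mask.shape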
# =============================================================================
# Component 2: Small LeWorld Model (SLM)
# =============================================================================
class StateEncoder(nn.Module):
"""Encodes past_state and current_state into a joint representation."""
def __init__(self, state_dim: int, d_model: int):
super().__init__()
self.past_proj = nn.Linear(state_dim, d_model)
self.curr_proj = nn.Linear(state_dim, d_model)
self.combiner = nn.Sequential(
nn.Linear(d_model * 2, d_model),
nn.GELU(),
nn.LayerNorm(d_model)
)
def forward(self, past_state: torch.Tensor, current_state: torch.Tensor) -> torch.Tensor:
"""
Args:
past_state: (batch, state_dim)
current_state: (batch, state_dim)
Returns:
combined: (batch, d_model)
"""
past_enc = F.gelu(self.past_proj(past_state))
curr_enc = F.gelu(self.curr_proj(current_state))
combined = self.combiner(torch.cat([past_enc, curr_enc], dim=-1))
return combined
class CharacteristicsEncoder(nn.Module):
"""Encodes static characteristics/context."""
def __init__(self, char_dim: int, d_model: int):
super().__init__()
self.encoder = nn.Sequential(
nn.Linear(char_dim, d_model),
nn.GELU(),
nn.LayerNorm(d_model)
)
def forward(self, characteristics: torch.Tensor) -> torch.Tensor:
return self.encoder(characteristics)
class TransformerBlock(nn.Module):
"""Standard transformer block with pre-norm."""
def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
super().__init__()
self.norm1 = nn.LayerNorm(d_model)
self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
self.norm2 = nn.LayerNorm(d_model)
self.ffn = nn.Sequential(
nn.Linear(d_model, d_model * 4),
nn.GELU(),
nn.Linear(d_model * 4, d_model),
nn.Dropout(dropout)
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
# Self-attention with pre-norm
normed = self.norm1(x)
attn_out, _ = self.attn(normed, normed, normed)
x = x + attn_out
# FFN with pre-norm
x = x + self.ffn(self.norm2(x))
return x
class CrossAttentionBlock(nn.Module):
"""Cross-attention: state attends to characteristics."""
def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
super().__init__()
self.norm_q = nn.LayerNorm(d_model)
self.norm_kv = nn.LayerNorm(d_model)
self.cross_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
self.norm_ff = nn.LayerNorm(d_model)
self.ffn = nn.Sequential(
nn.Linear(d_model, d_model * 4),
nn.GELU(),
nn.Linear(d_model * 4, d_model),
nn.Dropout(dropout)
)
def forward(self, query: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
normed_q = self.norm_q(query)
normed_kv = self.norm_kv(context)
attn_out, _ = self.cross_attn(normed_q, normed_kv, normed_kv)
x = query + attn_out
x = x + self.ffn(self.norm_ff(x))
return x
class AddressHead(nn.Module):
"""
Produces memory address range (start_addr, end_addr) from hidden state.
Uses two approaches:
1. HARD mode: argmax over address space (for inference)
2. SOFT mode: attention weights over memory (for differentiable training)
"""
def __init__(self, d_model: int, address_space: int, max_range: int):
super().__init__()
self.address_space = address_space
self.max_range = max_range
        # Produce start-address logits. A single linear layer over all 65,536
        # addresses would cost too many parameters, so the address is predicted
        # as a composition of sub-addresses (product-key style).
self.addr_bits = int(math.log2(address_space)) # 16 for 65536
assert 2 ** self.addr_bits == address_space, "address_space must be power of 2"
# Split address into high byte and low byte (8+8 = 16 bits)
self.half_bits = self.addr_bits // 2 # 8
self.half_space = 2 ** self.half_bits # 256
# Predict high and low parts separately (product key approach)
self.start_high = nn.Linear(d_model, self.half_space) # 256 outputs
self.start_low = nn.Linear(d_model, self.half_space) # 256 outputs
# Predict range length (how many words to read)
self.range_head = nn.Sequential(
nn.Linear(d_model, d_model // 2),
nn.GELU(),
nn.Linear(d_model // 2, max_range)
)
# Confidence head
self.confidence_head = nn.Sequential(
nn.Linear(d_model, d_model // 4),
nn.GELU(),
nn.Linear(d_model // 4, 1),
nn.Sigmoid()
)
def forward(self, hidden: torch.Tensor) -> dict:
"""
Args:
hidden: (batch, d_model)
Returns:
dict with:
start_addr: (batch,) integer addresses
end_addr: (batch,) integer addresses
range_length: (batch,) how many words to read
confidence: (batch,) read confidence score
start_logits_high: (batch, 256) for soft addressing
start_logits_low: (batch, 256) for soft addressing
range_logits: (batch, max_range) for soft range selection
"""
batch_size = hidden.shape[0]
# Product-key address generation
high_logits = self.start_high(hidden) # (batch, 256)
low_logits = self.start_low(hidden) # (batch, 256)
# Hard address via argmax
high_idx = high_logits.argmax(dim=-1) # (batch,)
low_idx = low_logits.argmax(dim=-1) # (batch,)
start_addr = high_idx * self.half_space + low_idx # (batch,) 0..65535
# Range length
range_logits = self.range_head(hidden) # (batch, max_range)
range_length = range_logits.argmax(dim=-1) + 1 # (batch,) 1..max_range
end_addr = (start_addr + range_length).clamp(max=self.address_space - 1)
# Confidence
confidence = self.confidence_head(hidden).squeeze(-1) # (batch,)
return {
'start_addr': start_addr,
'end_addr': end_addr,
'range_length': range_length,
'confidence': confidence,
'start_logits_high': high_logits,
'start_logits_low': low_logits,
'range_logits': range_logits,
}
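# Worked example of the product-key address composition above (plain
# arithmetic, nothing new): with half_space = 256, high_idx = 3 and
# low_idx = 17 compose to start_addr = 3 * 256 + 17 = 785; conversely
# 785 // 256 = 3 and 785 % 256 = 17. Two 256-way heads therefore cover all
# 65,536 addresses with ~2 * 256 * d_model output weights instead of
# 65,536 * d_model for a single flat head.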
class SmallLeWorldModel(nn.Module):
"""
SLM: Small LeWorld Model (~1.5M params)
Takes (past_state, current_state, characteristics) and produces
a memory address range pointing to the most useful memory for
next-state prediction.
Architecture:
1. Encode past + current state → state representation
2. Encode characteristics
3. Cross-attend: state attends to characteristics
4. Self-attention transformer layers
5. Address head: output (start_addr, end_addr, confidence)
"""
def __init__(self, config: SLMConfig, slm_id: int = 0):
super().__init__()
self.config = config
self.slm_id = slm_id
# Encoders
self.state_encoder = StateEncoder(config.state_dim, config.d_model)
self.char_encoder = CharacteristicsEncoder(config.char_dim, config.d_model)
# Cross-attention: state ← characteristics
self.cross_attn = CrossAttentionBlock(config.d_model, config.n_heads, config.dropout)
# Self-attention transformer
self.transformer_layers = nn.ModuleList([
TransformerBlock(config.d_model, config.n_heads, config.dropout)
for _ in range(config.n_layers)
])
self.final_norm = nn.LayerNorm(config.d_model)
# Address output head
self.address_head = AddressHead(config.d_model, config.address_space, config.max_read_range)
def forward(
self,
past_state: torch.Tensor, # (batch, state_dim)
current_state: torch.Tensor, # (batch, state_dim)
characteristics: torch.Tensor, # (batch, char_dim)
) -> dict:
"""
Forward pass: state + characteristics → memory address range.
Returns dict with address info + internal hidden state.
"""
# Encode states
state_repr = self.state_encoder(past_state, current_state) # (batch, d_model)
# Encode characteristics
char_repr = self.char_encoder(characteristics) # (batch, d_model)
# Cross-attention: state queries characteristics
# Unsqueeze to sequence dim for attention
state_seq = state_repr.unsqueeze(1) # (batch, 1, d_model)
char_seq = char_repr.unsqueeze(1) # (batch, 1, d_model)
enriched = self.cross_attn(state_seq, char_seq) # (batch, 1, d_model)
# Self-attention layers
hidden = enriched
for layer in self.transformer_layers:
hidden = layer(hidden)
hidden = self.final_norm(hidden)
hidden = hidden.squeeze(1) # (batch, d_model)
# Produce address range
addr_output = self.address_head(hidden)
addr_output['hidden'] = hidden # keep for BLM to use
addr_output['slm_id'] = self.slm_id
return addr_output
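# Standalone usage sketch for a single SLM (hedged: random inputs at the
# default config, shown only to illustrate the output contract):
#
#   slm = SmallLeWorldModel(SLMConfig())
#   out = slm(torch.randn(2, 64), torch.randn(2, 64), torch.randn(2, 32))
#   out['start_addr'], out['end_addr']  # (2,) integer tensors in [0, 65535]
#   out['range_length']                 # (2,) in [1, 256]
#   out['confidence']                   # (2,) in (0, 1)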
# =============================================================================
# Component 3: Big LeWorld Model (BLM)
# =============================================================================
class StraightThroughSigmoid(torch.autograd.Function):
"""
Binary gate: hard 0/1 in forward, sigmoid gradient in backward.
From literature: ST-GS (Jang et al. 2017) + Switch Transformer routing.
"""
@staticmethod
def forward(ctx, logits):
probs = torch.sigmoid(logits)
ctx.save_for_backward(probs)
return (probs > 0.5).float()
@staticmethod
def backward(ctx, grad_output):
probs, = ctx.saved_tensors
# Sigmoid derivative: p * (1-p)
return grad_output * probs * (1 - probs)
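# A hedged sanity sketch of the straight-through behaviour (not part of the
# original system; the inputs are arbitrary): the forward pass is hard 0/1,
# yet every logit still receives a sigmoid-shaped gradient.
def _sketch_st_sigmoid():
    logits = torch.tensor([-2.0, 0.5, 3.0], requires_grad=True)
    gated = StraightThroughSigmoid.apply(logits)  # tensor([0., 1., 1.])
    gated.sum().backward()
    # logits.grad == sigmoid(logits) * (1 - sigmoid(logits)) > 0 everywhere,
    # so even switched-off gates keep receiving a learning signal.
    return gated, logits.grad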
class BLMRouter(nn.Module):
"""
Routes/selects which SLMs to activate.
Produces binary mask like [1, 0, 1].
Uses Straight-Through Sigmoid for differentiable binary selection.
Includes load-balancing loss to prevent degenerate routing.
"""
def __init__(self, d_model: int, n_slms: int):
super().__init__()
self.n_slms = n_slms
self.gate = nn.Sequential(
nn.Linear(d_model, d_model // 2),
nn.GELU(),
nn.Linear(d_model // 2, n_slms)
)
# Temperature for annealing (start warm, cool down)
self.register_buffer('temperature', torch.tensor(1.0))
def forward(self, state_repr: torch.Tensor) -> Tuple[torch.Tensor, dict]:
"""
Args:
state_repr: (batch, d_model) encoded current state
Returns:
binary_mask: (batch, n_slms) hard 0/1 selection
routing_info: dict with probs, losses, etc.
"""
logits = self.gate(state_repr) # (batch, n_slms)
# Scale by temperature
scaled_logits = logits / self.temperature.clamp(min=0.1)
probs = torch.sigmoid(scaled_logits) # (batch, n_slms)
# Straight-through binary: hard in forward, soft in backward
hard_mask = (probs > 0.5).float()
binary_mask = hard_mask - probs.detach() + probs # THE ST TRICK
        # Ensure at least one SLM is selected (avoid an all-zero mask):
        # force-select the highest-probability SLM for all-zero rows.
        # torch.where avoids mutating an autograd intermediate in place.
        all_zero = binary_mask.sum(dim=-1, keepdim=True) == 0  # (batch, 1)
        top1 = F.one_hot(probs.argmax(dim=-1), self.n_slms).float()
        binary_mask = torch.where(all_zero, top1, binary_mask)
# Load balance loss: encourage roughly equal usage of SLMs
usage_per_slm = binary_mask.mean(dim=0) # (n_slms,)
target_usage = 1.0 / self.n_slms
balance_loss = ((usage_per_slm - target_usage) ** 2).sum()
# Entropy loss: encourage decisive routing (not all ~0.5)
entropy = -(probs * torch.log(probs + 1e-8) +
(1 - probs) * torch.log(1 - probs + 1e-8))
entropy_loss = entropy.mean()
routing_info = {
'probs': probs,
'binary_mask': binary_mask,
'balance_loss': balance_loss,
'entropy_loss': entropy_loss,
'logits': logits,
}
return binary_mask, routing_info
def anneal_temperature(self, step: int, anneal_rate: float = 3e-5, min_temp: float = 0.1):
"""Anneal temperature: start warm (exploratory), cool down (decisive)."""
new_temp = max(min_temp, math.exp(-anneal_rate * step))
self.temperature.fill_(new_temp)
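# Worked values of the schedule above at the defaults (anneal_rate=3e-5,
# min_temp=0.1), i.e. max(min_temp, exp(-anneal_rate * step)):
#   step       0 -> 1.000 (soft, exploratory routing)
#   step  10,000 -> exp(-0.30) ~ 0.741
#   step  50,000 -> exp(-1.50) ~ 0.223
#   step 100,000 -> exp(-3.00) ~ 0.050, clamped to min_temp = 0.1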
class InfoRequestHead(nn.Module):
"""
Produces a query vector representing "what information do I need next?"
This is the key innovation: instead of passively receiving all SLM outputs,
the BLM actively requests specific information. This query modulates which
memory regions the SLMs should focus on in the NEXT timestep.
"""
def __init__(self, d_model: int, query_dim: int):
super().__init__()
self.query_generator = nn.Sequential(
nn.Linear(d_model, d_model),
nn.GELU(),
nn.Linear(d_model, query_dim),
nn.LayerNorm(query_dim)
)
def forward(self, hidden: torch.Tensor) -> torch.Tensor:
"""
Args:
hidden: (batch, d_model) BLM's internal state
Returns:
info_query: (batch, query_dim) "what do I need next?"
"""
return self.query_generator(hidden)
class BigLeWorldModel(nn.Module):
"""
BLM: Big LeWorld Model (~12M params)
Two roles:
1. ROUTER: Select which SLMs to activate (binary mask)
2. PREDICTOR: Given selected memory contents, predict next state
Plus: Info-Request Head that asks "what information is needed next?"
Architecture:
1. Encode current state → routing decision
2. Receive memory reads from selected SLMs
3. Transformer processes (state + memories)
4. Predict next state
5. Generate info request for next timestep
"""
def __init__(self, config: BLMConfig):
super().__init__()
self.config = config
# State encoder (maps state_dim → d_model)
self.state_encoder = nn.Sequential(
nn.Linear(config.state_dim, config.d_model),
nn.GELU(),
nn.LayerNorm(config.d_model)
)
# Memory read encoder (maps encoded memory → d_model)
self.memory_encoder = nn.Sequential(
nn.Linear(128, config.d_model), # 128 from ArtificialMemory bit_encoder
nn.GELU(),
nn.LayerNorm(config.d_model)
)
# SLM hidden state encoder (maps SLM hidden → d_model)
self.slm_hidden_encoder = nn.Sequential(
            nn.Linear(128, config.d_model),  # must match SLMConfig.d_model (128)
nn.GELU(),
nn.LayerNorm(config.d_model)
)
# Router: selects which SLMs to use
self.router = BLMRouter(config.d_model, config.n_slms)
# Transformer backbone
self.transformer_layers = nn.ModuleList([
TransformerBlock(config.d_model, config.n_heads, config.dropout)
for _ in range(config.n_layers)
])
self.final_norm = nn.LayerNorm(config.d_model)
# Prediction heads
self.next_state_head = nn.Sequential(
nn.Linear(config.d_model, config.d_model),
nn.GELU(),
nn.Linear(config.d_model, config.state_dim)
)
# Info request head: "what do I need next?"
self.info_request = InfoRequestHead(config.d_model, config.info_query_dim)
# Learnable tokens
self.cls_token = nn.Parameter(torch.randn(1, 1, config.d_model) * 0.02)
self.state_type_embed = nn.Parameter(torch.randn(1, 1, config.d_model) * 0.02)
self.memory_type_embed = nn.Parameter(torch.randn(1, 1, config.d_model) * 0.02)
def forward(
self,
        past_state: torch.Tensor,  # (batch, state_dim); currently unused, kept for interface symmetry
current_state: torch.Tensor, # (batch, state_dim)
slm_outputs: List[dict], # list of SLM output dicts
memory_reads: List[torch.Tensor], # list of (batch, range, 128) encoded memory
        info_query_prev: Optional[torch.Tensor] = None,  # (batch, query_dim); consumed upstream by LeWorldSystem
) -> dict:
"""
Full BLM forward pass.
Returns:
dict with next_state, binary_mask, info_query, losses, etc.
"""
batch_size = current_state.shape[0]
# 1. Encode current state for routing decision
state_enc = self.state_encoder(current_state) # (batch, d_model)
# 2. Route: select which SLMs to use
binary_mask, routing_info = self.router(state_enc) # (batch, n_slms)
# 3. Aggregate selected memory reads
# For each SLM, apply its binary gate and encode its memory read
memory_tokens = []
for i, (slm_out, mem_read) in enumerate(zip(slm_outputs, memory_reads)):
gate = binary_mask[:, i:i+1] # (batch, 1)
# Gate the SLM's hidden representation
slm_hidden = self.slm_hidden_encoder(slm_out['hidden']) # (batch, d_model)
slm_hidden = slm_hidden * gate # zero if SLM not selected
# Gate and encode the memory read
# mem_read: (batch, range_len, 128)
mem_enc = self.memory_encoder(mem_read) # (batch, range_len, d_model)
mem_enc = mem_enc * gate.unsqueeze(-1) # zero if SLM not selected
# Pool memory read to single token (mean pool over range)
mem_pooled = mem_enc.mean(dim=1, keepdim=True) # (batch, 1, d_model)
memory_tokens.append(slm_hidden.unsqueeze(1)) # SLM hidden as token
memory_tokens.append(mem_pooled) # memory content as token
# 4. Build input sequence for transformer
# [CLS] + [state] + [slm_0_hidden, slm_0_mem, slm_1_hidden, slm_1_mem, ...]
cls = self.cls_token.expand(batch_size, -1, -1)
state_token = state_enc.unsqueeze(1) + self.state_type_embed # (batch, 1, d_model)
# Add memory type embedding to memory tokens
mem_sequence = torch.cat(memory_tokens, dim=1) # (batch, 2*n_slms, d_model)
mem_sequence = mem_sequence + self.memory_type_embed
sequence = torch.cat([cls, state_token, mem_sequence], dim=1)
# Shape: (batch, 1 + 1 + 2*n_slms, d_model)
# 5. Transformer processing
hidden = sequence
for layer in self.transformer_layers:
hidden = layer(hidden)
hidden = self.final_norm(hidden)
# 6. Extract predictions from CLS token
cls_output = hidden[:, 0, :] # (batch, d_model)
# 7. Predict next state
next_state_pred = self.next_state_head(cls_output) # (batch, state_dim)
# 8. Generate info request for next timestep
info_query = self.info_request(cls_output) # (batch, query_dim)
return {
'next_state': next_state_pred,
'binary_mask': binary_mask,
'info_query': info_query,
'routing_info': routing_info,
'cls_output': cls_output,
}
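# Token-budget check for the BLM input sequence (simple arithmetic on the
# shapes in forward() above): with the default n_slms = 3, the transformer
# sees 1 [CLS] + 1 state token + 2 * 3 SLM-hidden/memory tokens = 8 tokens
# of width d_model = 384 per batch element.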
# =============================================================================
# Component 4: Full LeWorld System
# =============================================================================
class LeWorldSystem(nn.Module):
"""
Complete LeWorld Memory Architecture.
Orchestrates:
- Artificial Memory (bit-level storage)
- 3 SLMs (produce memory address ranges)
- 1 BLM (selects SLMs, reads memory, predicts next state)
Training loop:
1. BLM sees current state → routes to SLMs
2. Selected SLMs produce address ranges
3. Memory is read at those ranges
4. BLM aggregates memory + state → predicts next state
5. BLM generates info-request for next step
Losses:
- next_state_loss: MSE between predicted and actual next state
- routing_balance_loss: encourage balanced SLM usage
- address_diversity_loss: encourage SLMs to read different memory regions
- info_utility_loss: did the info request lead to useful retrievals?
"""
def __init__(
self,
mem_config: MemoryConfig = MemoryConfig(),
slm_config: SLMConfig = SLMConfig(),
blm_config: BLMConfig = BLMConfig(),
):
super().__init__()
# Artificial Memory
self.memory = ArtificialMemory(mem_config)
# 3 SLMs
self.slms = nn.ModuleList([
SmallLeWorldModel(slm_config, slm_id=i)
for i in range(blm_config.n_slms)
])
# BLM
self.blm = BigLeWorldModel(blm_config)
# Info-query → SLM modulation: the BLM's info request
# influences what SLMs look for in the next timestep
self.info_to_slm = nn.Linear(blm_config.info_query_dim, slm_config.state_dim)
self.config = {
'mem': mem_config,
'slm': slm_config,
'blm': blm_config,
}
def forward(
self,
past_state: torch.Tensor, # (batch, state_dim)
current_state: torch.Tensor, # (batch, state_dim)
characteristics: torch.Tensor, # (batch, char_dim)
next_state_target: Optional[torch.Tensor] = None, # (batch, state_dim) for training
info_query_prev: Optional[torch.Tensor] = None, # from previous timestep
) -> dict:
"""
Full system forward pass.
"""
batch_size = current_state.shape[0]
# If we have a previous info query, modulate the current state
# This is how the BLM's "what do I need?" influences retrieval
if info_query_prev is not None:
info_modulation = self.info_to_slm(info_query_prev) # (batch, state_dim)
modulated_state = current_state + 0.1 * info_modulation # gentle modulation
else:
modulated_state = current_state
# 1. Run all 3 SLMs to get address ranges
slm_outputs = []
for slm in self.slms:
out = slm(past_state, modulated_state, characteristics)
slm_outputs.append(out)
# 2. Read memory at each SLM's address range
memory_reads = []
for slm_out in slm_outputs:
_, encoded, valid_mask = self.memory.read(
slm_out['start_addr'],
slm_out['end_addr']
)
memory_reads.append(encoded)
# 3. BLM processes everything
blm_output = self.blm(
past_state, current_state,
slm_outputs, memory_reads,
info_query_prev
)
# 4. Compute losses if training
losses = {}
if next_state_target is not None:
# Primary loss: next state prediction
losses['next_state_loss'] = F.mse_loss(
blm_output['next_state'], next_state_target
)
# Routing balance loss
losses['balance_loss'] = blm_output['routing_info']['balance_loss']
            # Address diversity loss: encourage SLMs to read different regions.
            # Hard argmax addresses carry no gradient, so use each SLM's soft
            # expected start address computed from its product-key logits.
            half_space = self.slms[0].address_head.half_space
            arange = torch.arange(half_space, device=current_state.device,
                                  dtype=torch.float32)
            soft_addrs = torch.stack([
                F.softmax(out['start_logits_high'], dim=-1) @ arange * half_space
                + F.softmax(out['start_logits_low'], dim=-1) @ arange
                for out in slm_outputs
            ], dim=1)  # (batch, n_slms)
            # Pairwise distance between expected addresses (want to maximize)
            addr_diff = torch.cdist(soft_addrs.unsqueeze(-1), soft_addrs.unsqueeze(-1))
            diversity_loss = -addr_diff.mean()  # negative = encourage large distances
            losses['diversity_loss'] = diversity_loss
# Total loss
losses['total_loss'] = (
losses['next_state_loss']
+ 0.01 * losses['balance_loss']
+ 0.001 * losses['diversity_loss']
)
return {
'next_state': blm_output['next_state'],
'binary_mask': blm_output['binary_mask'],
'info_query': blm_output['info_query'],
'slm_outputs': slm_outputs,
'memory_reads': memory_reads,
'losses': losses,
'routing_info': blm_output['routing_info'],
}
def multi_step_forward(
self,
states: torch.Tensor, # (batch, T, state_dim) sequence of states
characteristics: torch.Tensor, # (batch, char_dim) static
        n_steps: Optional[int] = None,
) -> dict:
"""
Run the system over multiple timesteps autoregressively.
For training: teacher forcing with ground-truth states
"""
batch_size, T, state_dim = states.shape
if n_steps is None:
n_steps = T - 1 # predict all future states
all_predictions = []
all_masks = []
total_loss = None
info_query = None
for t in range(min(n_steps, T - 1)):
past_state = states[:, max(0, t-1), :]
current_state = states[:, t, :]
next_state_target = states[:, t+1, :]
output = self.forward(
past_state, current_state, characteristics,
next_state_target, info_query
)
all_predictions.append(output['next_state'])
all_masks.append(output['binary_mask'])
info_query = output['info_query']
if output['losses']:
if total_loss is None:
total_loss = output['losses']['total_loss']
else:
total_loss = total_loss + output['losses']['total_loss']
if total_loss is None:
total_loss = torch.tensor(0.0, device=states.device)
return {
'predictions': torch.stack(all_predictions, dim=1),
'masks': torch.stack(all_masks, dim=1),
'total_loss': total_loss / max(1, min(n_steps, T - 1)),
'final_info_query': info_query,
}
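# A minimal, hedged training-step sketch (not the author's training code;
# `optimizer`, `states`, and `characteristics` are placeholders supplied by
# the caller): backprop through multi_step_forward, clip, step, and cool
# the router so routing becomes decisive over time.
def _sketch_training_step(system: LeWorldSystem,
                          optimizer: torch.optim.Optimizer,
                          states: torch.Tensor,           # (batch, T, state_dim)
                          characteristics: torch.Tensor,  # (batch, char_dim)
                          step: int = 0) -> float:
    optimizer.zero_grad()
    out = system.multi_step_forward(states, characteristics)
    out['total_loss'].backward()
    torch.nn.utils.clip_grad_norm_(system.parameters(), max_norm=1.0)
    optimizer.step()
    system.blm.router.anneal_temperature(step)  # warm -> decisive routing
    return out['total_loss'].item()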
# =============================================================================
# Parameter Count Verification
# =============================================================================
def count_params(model, name="Model"):
"""Count and display parameter breakdown."""
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n{'='*60}")
print(f"{name}: {total:,} total params ({trainable:,} trainable)")
print(f"{'='*60}")
for child_name, child in model.named_children():
child_params = sum(p.numel() for p in child.parameters())
if child_params > 0:
print(f" {child_name}: {child_params:,}")
return total
# =============================================================================
# Demo / Test
# =============================================================================
if __name__ == "__main__":
print("LeWorld Memory Architecture — Component Verification")
print("=" * 60)
# Configs
mem_config = MemoryConfig()
slm_config = SLMConfig()
blm_config = BLMConfig()
# Build system
system = LeWorldSystem(mem_config, slm_config, blm_config)
# Count parameters
print("\n--- Parameter Counts ---")
count_params(system.memory, "Artificial Memory")
for i, slm in enumerate(system.slms):
count_params(slm, f"SLM-{i}")
count_params(system.blm, "BLM")
count_params(system, "Full System")
# Test forward pass
print("\n--- Forward Pass Test ---")
batch_size = 4
state_dim = slm_config.state_dim
char_dim = slm_config.char_dim
past_state = torch.randn(batch_size, state_dim)
current_state = torch.randn(batch_size, state_dim)
characteristics = torch.randn(batch_size, char_dim)
next_state = torch.randn(batch_size, state_dim)
output = system(past_state, current_state, characteristics, next_state)
print(f"Next state prediction shape: {output['next_state'].shape}")
print(f"Binary mask (SLM selection): {output['binary_mask']}")
print(f"Info query shape: {output['info_query'].shape}")
print(f"Losses: {output['losses']}")
# Test multi-step
print("\n--- Multi-Step Test ---")
T = 10
states = torch.randn(batch_size, T, state_dim)
ms_output = system.multi_step_forward(states, characteristics)
print(f"Predictions shape: {ms_output['predictions'].shape}")
print(f"Masks shape: {ms_output['masks'].shape}")
print(f"Average loss: {ms_output['total_loss'].item():.4f}")
# Show routing patterns over time
print("\n--- Routing Patterns Over Time ---")
masks = ms_output['masks'][0].detach() # first batch element
for t in range(masks.shape[0]):
mask = masks[t].int().tolist()
print(f" Step {t}: SLMs selected = {mask}")
print("\n✅ All components verified successfully!")