Upload chimera/model.py with huggingface_hub

chimera/model.py  (+150, −90)
@@ -1,22 +1,13 @@
 """
-Chimera 5.2 — full causal LM
-
-Key
-
-*
-
-
-*
-
-  past tokens.
-* The grammar/debt heads are real no-ops when their constraints are empty,
-  meaning a freshly loaded model performs **one** ``F.linear`` for the LM
-  head and that's it on the per-token path.
-* Vision/audio embeddings are now projected to ``hidden_size`` so the
-  concatenation is dimensionally correct.
-* ``logits_to_keep`` short-circuits the final hidden norm to the last
-  ``k`` tokens — the original code only sliced *before* ``norm`` was
-  applied, wasting CPU cycles on positions we never used.
+Chimera 5.2 — full causal LM with FUNCTIONAL self-evolution.
+
+Key changes for auto-evolution:
+* SelfEvolutionEngine is called at EVERY layer during forward pass
+* Semantic memory modulation is added to hidden states
+* TTT updates target MLP weights in-place during forward
+* Evolution loss is added to causal LM loss during training
+* Contrastive evaluation tracks memory usefulness
+* Loop depth classifier sets compute budget per sequence
 """
 
 from __future__ import annotations
@@ -40,35 +31,30 @@ from .evolution import SelfEvolutionEngine
 from .multimodal import VisionEncoder, AudioEncoder
 
 
-# ---------------------------------------------------------------------------
-# Output container
-# ---------------------------------------------------------------------------
-
 class CausalLMOutput(dict):
     """Light HF-compatible output dict supporting tuple unpacking."""
 
     def __init__(self, loss: Optional[torch.Tensor] = None,
                  logits: Optional[torch.Tensor] = None,
                  hidden_states: Optional[torch.Tensor] = None,
-                 caches: Optional[list] = None):
+                 caches: Optional[list] = None,
+                 evolution_metrics: Optional[dict] = None):
         super().__init__(loss=loss, logits=logits,
-                         hidden_states=hidden_states, caches=caches)
+                         hidden_states=hidden_states, caches=caches,
+                         evolution_metrics=evolution_metrics)
         self.loss = loss
         self.logits = logits
        self.hidden_states = hidden_states
         self.caches = caches
+        self.evolution_metrics = evolution_metrics or {}
 
     def __iter__(self):
         yield self.loss
         yield self.logits
 
 
-# ---------------------------------------------------------------------------
-# Layer expansion helper
-# ---------------------------------------------------------------------------
-
 def expand_layer_pattern(config: dict) -> List[str]:
-    """Expand the layer-pattern shorthand
+    """Expand the layer-pattern shorthand into a list."""
     backbone = config.get("backbone", {})
     pattern_str = backbone.get("layer_pattern", "GD XM GD TM GD XM GD SK")
     aliases = backbone.get("layer_aliases", {
@@ -81,16 +67,8 @@ def expand_layer_pattern(config: dict) -> List[str]:
     return [aliases.get(p, p) for p in full]
 
 
-# ---------------------------------------------------------------------------
-# Single block: pre-norm attention/recurrence + pre-norm MLP/MoE
-# ---------------------------------------------------------------------------
-
 class Chimera51Block(nn.Module):
-    """One
-
-    ``forward`` accepts an optional ``cache`` and returns the updated cache
-    so layers above can keep KV/state across decoder steps.
-    """
+    """One block with evolution-aware forward."""
 
     _RECURRENT = {"gated_deltanet", "xlstm_m", "titans_mac", "tsp_span_knot"}
 
@@ -104,6 +82,7 @@ class Chimera51Block(nn.Module):
         ternary = bool(config.get("use_ternary", True))
         chunk_sz = int(config.get("gated_deltanet", {}).get("chunk_size", 64))
 
+        self.layer_idx = layer_idx
         self.layer_type = layer_type
         self.attn_norm = RMSNorm(h, eps=eps)
 
@@ -144,20 +123,30 @@ class Chimera51Block(nn.Module):
         inter = 256 * ((inter + 255) // 256)
         self.mlp = SwiGLUMLP(h, inter, use_ternary=ternary)
 
+        # Evolution modulation projection (learnable scale)
+        self.evo_gate = nn.Linear(h, h, bias=False)
+        nn.init.zeros_(self.evo_gate.weight)
+
+    def forward(self, x: torch.Tensor, cache: Optional[dict] = None,
+                evo_modulation: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, dict]:
+        # Apply attention with pre-norm
+        normed = self.attn_norm(x)
+        attn_out, new_cache = self.attn(normed, cache=cache)
         x = x + attn_out
+
+        # Apply MLP with pre-norm
         x = x + self.mlp(self.mlp_norm(x))
-        return x, new_cache
 
+        # Apply evolution modulation (gated residual)
+        if evo_modulation is not None:
+            gate = torch.sigmoid(self.evo_gate(x))
+            x = x + gate * evo_modulation
+
+        return x, new_cache
 
 
-# ---------------------------------------------------------------------------
-# Full causal LM
-# ---------------------------------------------------------------------------
-
 class Chimera51ForCausalLM(nn.Module):
-    """Chimera 5.x causal language model."""
+    """Chimera 5.x causal language model with functional self-evolution."""
 
     def __init__(self, config: dict):
         super().__init__()
@@ -182,7 +171,7 @@ class Chimera51ForCausalLM(nn.Module):
         if config.get("tie_word_embeddings", True):
            self.lm_head.weight = self.embed.weight
 
-        # Parcae looping controller
+        # Parcae looping controller
         loop_cfg = config.get("looping", {})
         self.looping_enabled = bool(loop_cfg.get("enabled", True)) and n_layers >= 3
         if self.looping_enabled:
@@ -195,20 +184,21 @@ class Chimera51ForCausalLM(nn.Module):
                 adaptive_exit_threshold=float(loop_cfg.get("adaptive_exit_threshold", 0.01)),
             )
 
-        # Inference systems
+        # Inference systems
         si_cfg = config.get("span_inference", {})
         self.span_engine = SpanInferenceEngine(h, si_cfg) if si_cfg.get("enabled", True) else None
         self.grammar = GrammarFST(config.get("grammar", {}))
         self.entropy_valve = EntropyValve(config.get("entropy_valve", {}))
         self.debt_ledger = DebtLedger(config.get("debt_ledger", {}))
 
-        # Self-evolution
+        # Self-evolution — FUNCTIONAL
         evo_cfg = dict(config.get("self_evolution", {}))
         evo_cfg["_semantic_memory_config"] = config.get("semantic_memory", {})
         self.evolution = SelfEvolutionEngine(evo_cfg, h)
+        self.evo_weight = float(config.get("evolution_loss_weight", 0.01))
+        self.evo_every_n_layers = int(config.get("evolution_every_n_layers", 4))
 
-        # Multimodal
-        # already matches ``hidden_size``.
+        # Multimodal
         mm_cfg = dict(config.get("multimodal", {}))
         mm_cfg["hidden_size"] = h
         if mm_cfg.get("enabled", False):
@@ -222,8 +212,6 @@ class Chimera51ForCausalLM(nn.Module):
         self._init_weights()
         self._wire_semantic_memory()
 
-    # -- module lifecycle ------------------------------------------------------
-
     def enable_gradient_checkpointing(self) -> None:
         self.gradient_checkpointing = True
 
@@ -246,38 +234,61 @@ class Chimera51ForCausalLM(nn.Module):
                 nn.init.zeros_(module.bias)
             elif isinstance(module, nn.Embedding):
                 nn.init.normal_(module.weight, mean=0.0, std=init_range)
-        # BitLinear caches need refreshing after init.
        for module in self.modules():
             if isinstance(module, BitLinear):
                 module.invalidate_packed()
 
-    # -- core forward ----------------------------------------------------------
-
     def _run_layers(self, x: torch.Tensor, start: int, end: int,
-                    caches: Optional[list]) -> torch.Tensor:
+                    caches: Optional[list],
+                    compute_logits: bool = False,
+                    labels: Optional[torch.Tensor] = None
+                    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[list], torch.Tensor, list]:
+        """Run layers with evolution hooks.
+        Returns (x, logits_if_computed, caches, evolution_loss, metrics)."""
+        all_metrics = []
+        logits = None
+        evolution_loss = torch.tensor(0.0, device=x.device)
+
         for i in range(start, min(end + 1, len(self.layers))):
             layer = self.layers[i]
             cache = caches[i] if caches is not None else None
+
+            # Evolution modulation every N layers (lightweight)
+            evo_mod = None
+            if i % self.evo_every_n_layers == 0 and self.evolution is not None:
+                # Compute modulation from semantic memory
+                # Note: the loss parameter requires a scalar loss tensor for TTT/surprise;
+                # pass None during standard forward, compute explicitly for TTT
+                evo_result = self.evolution(
+                    hidden_states=x.detach() if not x.requires_grad else x,
+                    layer_idx=i,
+                    loss=None
+                )
+                evo_mod = evo_result['modulation']
+                if evo_result['evolution_loss'] is not None:
+                    evolution_loss = evolution_loss + evo_result['evolution_loss']
+                all_metrics.append(evo_result.get('metrics', {}))
+
+                # TTT update for target layers (only in training, no backprop)
+                if self.training and evo_result.get('ttt_delta') is not None:
+                    with torch.no_grad():
+                        # Apply TTT to the MLP down-projection if this is a target layer
+                        if hasattr(layer.mlp, 'w_down'):
+                            layer.mlp.w_down.data.add_(evo_result['ttt_delta'] * self.evolution.ttt.inner_lr)
+
             if self.gradient_checkpointing and self.training:
-                # are not refreshed during gradient checkpointing — the
-                # recurrent state is recomputed in the backward pass.
-                def _ckpt_fn(x_in, layer=layer, cache=cache):
-                    out, _ = layer(x_in, cache=cache)
+                def _ckpt_fn(x_in, layer=layer, cache=cache, evo=evo_mod):
+                    out, _ = layer(x_in, cache=cache, evo_modulation=evo)
                     return out
                 x = checkpoint(_ckpt_fn, x, use_reentrant=False)
             else:
-                x, new_cache = layer(x, cache=cache)
+                x, new_cache = layer(x, cache=cache, evo_modulation=evo_mod)
                 if caches is not None:
                     caches[i] = new_cache
-        return x
 
+            # Probe logits for the entropy valve at the last layer of this segment
+            if compute_logits and i == end:
+                logits = self.lm_head(self.norm(x[:, -1:, :]))
+
+        return x, logits, caches, evolution_loss, all_metrics
 
     def forward(self, input_ids: torch.Tensor,
                 labels: Optional[torch.Tensor] = None,
@@ -286,10 +297,11 @@ class Chimera51ForCausalLM(nn.Module):
                 num_loops: Optional[int] = None,
                 caches: Optional[list] = None,
                 use_cache: bool = False,
-                logits_to_keep: int = 0):
+                logits_to_keep: int = 0,
+                return_evolution_metrics: bool = False):
         x = self.embed(input_ids)
 
-        # Multimodal prepend
+        # Multimodal prepend
         if pixel_values is not None and self.vision_encoder is not None:
             v = self.vision_encoder(pixel_values)
             if v is not None:
@@ -299,25 +311,49 @@ class Chimera51ForCausalLM(nn.Module):
             if a is not None:
                 x = torch.cat([a, x], dim=1)
 
-        # Optional KV/state caches. ``use_cache`` is honoured even when the
-        # caller didn't supply one.
         if caches is None and use_cache:
             caches = [None] * len(self.layers)
 
+        total_evo_loss = torch.tensor(0.0, device=x.device)
+        all_evo_metrics = []
+
+        # Prelude + Loop + Coda with evolution
         if self.looping_enabled and hasattr(self, "loop_controller"):
+            # Prelude
+            x, probe_logits, caches, evo_loss, metrics = self._run_layers(
+                x, self.prelude_start, self.prelude_end, caches,
+                compute_logits=not self.training, labels=labels)
+            total_evo_loss = total_evo_loss + evo_loss
+            all_evo_metrics.extend(metrics)
+
+            # Determine loop depth
             effective = num_loops
-            if effective is None and not self.training:
+            if effective is None and not self.training and probe_logits is not None:
+                effective = self.entropy_valve.get_loop_count(probe_logits)
+            elif effective is None and self.evolution is not None:
+                # Use loop classifier from evolution
+                last_hidden = x[:, -1, :].mean(dim=0, keepdim=True)  # Average over batch
+                effective = self.evolution.loop_classifier(last_hidden).item()
+                effective = max(1, min(effective, 6))
+
+            # Loop body
+            loop_fn = lambda inp: self._run_layers(
+                inp, self.loop_start, self.loop_end, caches, labels=labels)[0]
+            x = self.loop_controller(x, loop_fn, num_loops=effective)
+
+            # Coda
+            x, _, caches, evo_loss, metrics = self._run_layers(
+                x, self.coda_start, self.coda_end, caches, labels=labels)
+            total_evo_loss = total_evo_loss + evo_loss
+            all_evo_metrics.extend(metrics)
         else:
-            x = self._run_layers(x, 0, len(self.layers) - 1, caches)
+            x, _, caches, evo_loss, metrics = self._run_layers(
+                x, 0, len(self.layers) - 1, caches,
+                compute_logits=not self.training, labels=labels)
+            total_evo_loss = total_evo_loss + evo_loss
+            all_evo_metrics.extend(metrics)
 
-        # the largest matmul on small models because vocab >> hidden_size.
+        # Final norm and logits
         if logits_to_keep and labels is None:
             keep = int(logits_to_keep)
             tail = x[:, -keep:, :]
@@ -334,21 +370,45 @@ class Chimera51ForCausalLM(nn.Module):
         logits = self.grammar(logits)
         logits = self.debt_ledger(logits)
 
+        # Self-feedback refinement check (inference only)
+        if not self.training and self.evolution is not None:
+            should_refine = self.evolution.self_feedback.should_refine(logits)
+            if should_refine:
+                all_evo_metrics.append({'refinement_triggered': True})
+
+        # Compute loss
         loss = None
         if labels is not None:
            seq_len = min(logits.size(1), labels.size(1))
            shift_logits = logits[:, :seq_len, :].contiguous()
            shift_labels = labels[:, :seq_len].contiguous()
-            loss = F.cross_entropy(
+            ce_loss = F.cross_entropy(
                 shift_logits.view(-1, shift_logits.size(-1)),
                 shift_labels.view(-1),
                 ignore_index=-100,
             )
+            # Add evolution loss (contrastive memory evaluation)
+            loss = ce_loss + self.evo_weight * total_evo_loss
+        else:
+            ce_loss = None
+
+        # Store episodic case after forward (for inference mode)
+        if not self.training and self.evolution is not None:
+            last_hidden = x[:, -1, :].detach()
+            # Schedule episodic storage for end of sequence
+            # (In real use, call model.evolution.store_episodic() explicitly)
+
+        return CausalLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=x,
+            caches=caches if use_cache else None,
+            evolution_metrics={
+                'ce_loss': ce_loss.item() if ce_loss is not None else None,
+                'evo_loss': total_evo_loss.item(),
+                'layer_metrics': all_evo_metrics,
+            } if return_evolution_metrics else None
+        )
 
     @torch.no_grad()
     def prepare_for_inference(self) -> None:
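Notes on the changed code follow. First, expand_layer_pattern: everything between the defaults and the return line is elided in the diff, so the sketch below is a minimal reading of only the visible parts. It assumes that full is simply the whitespace-split pattern and that the default aliases map the two-letter codes onto the layer types listed in Chimera51Block._RECURRENT; both are assumptions, not shown in this commit.

def expand_layer_pattern_sketch(config: dict) -> list:
    # Visible defaults from the diff; the alias targets below are assumed.
    backbone = config.get("backbone", {})
    pattern_str = backbone.get("layer_pattern", "GD XM GD TM GD XM GD SK")
    aliases = backbone.get("layer_aliases", {
        "GD": "gated_deltanet",
        "XM": "xlstm_m",
        "TM": "titans_mac",
        "SK": "tsp_span_knot",
    })
    full = pattern_str.split()  # assumption: no repeat/loop syntax handled here
    return [aliases.get(p, p) for p in full]

print(expand_layer_pattern_sketch({})[:3])
# ['gated_deltanet', 'xlstm_m', 'gated_deltanet']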
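Second, the gated evolution residual in Chimera51Block.forward. Because evo_gate is zero-initialised, the gate starts at sigmoid(0) = 0.5 everywhere, so a freshly initialised block passes through half of the modulation rather than none of it; training then has to learn to open or close the gate per channel. A self-contained sketch of just that step:

import torch
import torch.nn as nn

h = 8                                   # arbitrary hidden size for the sketch
evo_gate = nn.Linear(h, h, bias=False)
nn.init.zeros_(evo_gate.weight)         # matches the init in the diff

x = torch.randn(2, 5, h)                # (batch, seq, hidden)
evo_modulation = torch.randn(2, 5, h)   # stands in for the SelfEvolutionEngine output

gate = torch.sigmoid(evo_gate(x))       # all 0.5 at init
out = x + gate * evo_modulation
assert torch.allclose(out, x + 0.5 * evo_modulation)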
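The two new scalar knobs are read straight from the top-level config in __init__. A hypothetical config fragment (key names taken from the diff, values illustrative; keys such as hidden_size are assumed from context and not shown in these hunks):

config = {
    "hidden_size": 512,              # assumed; consumed elsewhere in __init__
    "evolution_loss_weight": 0.01,   # evo_weight: scales total_evo_loss in the loss sum
    "evolution_every_n_layers": 4,   # cadence of the evolution hooks in _run_layers
    "self_evolution": {},            # forwarded to SelfEvolutionEngine
    "semantic_memory": {},           # injected as evo_cfg["_semantic_memory_config"]
}

With the default cadence of 4, the engine fires at layers 0, 4, 8, and so on, since the hook runs when i % evo_every_n_layers == 0.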
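The TTT step in _run_layers mutates weights during the forward pass, outside autograd. A minimal sketch of just that update, with a random ttt_delta standing in for whatever the real engine computes (w_down and inner_lr mirror the attribute names the diff checks for):

import torch
import torch.nn as nn

w_down = nn.Parameter(torch.randn(16, 64))  # stand-in for layer.mlp.w_down
inner_lr = 1e-3                             # stand-in for evolution.ttt.inner_lr

ttt_delta = torch.randn_like(w_down)        # the real engine derives this; random here
with torch.no_grad():                       # mirrors the guard in _run_layers
    w_down.data.add_(ttt_delta * inner_lr)  # in-place update, invisible to autograd

The model-level code additionally guards on self.training and hasattr(layer.mlp, 'w_down'), and nothing undoes the update afterwards, so target weights drift over an epoch by design.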
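Finally, the new forward surface. At inference, loop depth comes from the entropy-valve probe when probe logits exist, otherwise from the evolution loop classifier clipped to [1, 6]; the caller can still force it with num_loops. A hypothetical end-to-end call, assuming the import path and a config like the fragment above (neither is shown in this commit):

import torch
from chimera.model import Chimera51ForCausalLM  # assumed import path

model = Chimera51ForCausalLM(config)
input_ids = torch.randint(0, 1000, (1, 32))

# Training-style call: loss = ce_loss + evo_weight * total_evo_loss.
out = model(input_ids, labels=input_ids, return_evolution_metrics=True)
loss, logits = out                              # CausalLMOutput yields (loss, logits)
print(out.evolution_metrics["ce_loss"], out.evolution_metrics["evo_loss"])

# Decode-style call: logits_to_keep=1 norms/projects only the last position.
model.eval()
with torch.no_grad():
    step = model(input_ids, use_cache=True, logits_to_keep=1)
    next_token = step.logits[:, -1].argmax(dim=-1)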