feat: v10 — P15 Selective Token Triage, P16 Plateau Breaker, P17 Batch Metabolism

Three new paradigms merged into the concept "Adaptive Token Metabolism":

P15 Token Triage (inspired by Rho-1, arXiv 2404.07965):
Compute per-token excess loss against an EMA baseline. The top 60% of tokens
get the full gradient; the bottom 40% get a 0.1× gradient. No reference model
needed — a running EMA of the per-position loss serves as the baseline. This
focuses ~90% of the gradient energy on the actually-learnable tokens.

P16 Plateau Breaker:
Track the variance of the loss EMA. When the loss stagnates (variance below a
threshold for 100 steps), trigger a "warm restart": boost the LR 3× for 50
steps, then restore the base LR. Inspired by SGDR (arXiv 1608.03983), but
triggered adaptively.

P17 Batch Metabolism (Online Hard Example Mining for LLMs):
Within each batch, weight sequences by their loss relative to the batch mean.
High-loss sequences get up to 2× weight; easy ones get 0.5×. The model
"digests" harder examples more thoroughly.
chimera_turbo.py  CHANGED  (+250 −164)
```diff
@@ -1,12 +1,9 @@
 """
 chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
 
-P12 Muon optimizer — 2× token efficiency via NS-orthogonalized momentum
-P13 Multi-Token Predict — 3× gradient signal per forward pass
-P14 EMA Self-Distill — dense soft targets from EMA teacher copy
+v10: Adaptive Token Metabolism — P15 Token Triage + P16 Plateau Breaker + P17 Batch Metabolism
+
+Stack: Muon + MTP + EMA Distill + Token Triage + Plateau Breaker + Batch Metabolism
 """
 
 import copy
@@ -16,34 +13,29 @@ import warnings
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from typing import Optional, Dict, Any, Tuple
+from typing import Optional, Dict, Any, Tuple, List
 from contextlib import nullcontext
+from collections import deque
 
 
 # ═══════════════════════════════════════════════════════════
-#
+# CPU Detection + Threading
 # ═══════════════════════════════════════════════════════════
 
-def detect_cpu_info()
+def detect_cpu_info():
     info = {}
     try:
         import multiprocessing
         logical = multiprocessing.cpu_count()
         physical = len(os.sched_getaffinity(0))
         info["physical_cores"] = logical // 2 if logical == physical else physical
-        info["logical_cores"] = logical
     except Exception:
         import multiprocessing
-        info["logical_cores"] = multiprocessing.cpu_count()
-        info["physical_cores"] = info["logical_cores"] // 2
+        info["physical_cores"] = multiprocessing.cpu_count() // 2
     try:
         info["capability"] = torch.backends.cpu.get_cpu_capability()
     except Exception:
         info["capability"] = "unknown"
-    cap = (info["capability"] or "").lower()
-    info["has_amx"] = "amx" in cap
-    info["has_avx512"] = "avx512" in cap
-    info["has_avx512_bf16"] = "avx512_bf16" in cap or info["has_amx"]
     try:
         import intel_extension_for_pytorch
         info["ipex_available"] = True
@@ -66,7 +58,6 @@ def configure_threading(cpu_info, reserve=1):
 # ═══════════════════════════════════════════════════════════
 
 def _zeropower_via_newtonschulz5(G, steps=5):
-    """Newton-Schulz iteration for polar factor. Pure PyTorch, CPU-safe."""
     assert G.ndim == 2
     a, b, c = 3.4445, -4.7750, 2.0315
     X = G.T if G.size(0) > G.size(1) else G.clone()
@@ -78,13 +69,6 @@ def _zeropower_via_newtonschulz5(G, steps=5):
 
 
 class Muon(torch.optim.Optimizer):
-    """Muon: MomentUm Orthogonalized by Newton-schulz.
-
-    2D weight matrices: SGD momentum → NS orthogonalize → scaled update.
-    Everything else (bias, norm, embed): standard AdamW.
-
-    ~2× token efficiency vs AdamW (arxiv 2502.16982, Table 3).
-    """
     def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                  ns_steps=5, weight_decay=0.0,
                  adamw_betas=(0.9, 0.98), adamw_eps=1e-8):
@@ -96,18 +80,12 @@ class Muon(torch.optim.Optimizer):
     @torch.no_grad()
     def step(self):
         for group in self.param_groups:
-            lr = group["lr"]
-            wd = group["weight_decay"]
-            mu = group["momentum"]
+            lr, wd, mu = group["lr"], group["weight_decay"], group["momentum"]
             b1, b2 = group["adamw_betas"]
-
             for p in group["params"]:
                 if p.grad is None:
                     continue
-                g = p.grad
-                s = self.state[p]
-
-                # ── Muon path: 2D matrices (not embeddings) ──
+                g, s = p.grad, self.state[p]
                 if p.ndim == 2 and not getattr(p, "_is_embed", False):
                     if "buf" not in s:
                         s["buf"] = torch.zeros_like(g)
@@ -118,8 +96,6 @@ class Muon(torch.optim.Optimizer):
                     if wd > 0:
                         p.mul_(1 - lr * wd)
                     p.add_(O, alpha=-lr * scale)
-
-                # ── AdamW path: 1D params, embeddings ──
                 else:
                     if "m" not in s:
                         s["m"] = torch.zeros_like(g)
@@ -128,16 +104,13 @@ class Muon(torch.optim.Optimizer):
                     s["t"] += 1
                     s["m"].mul_(b1).add_(g, alpha=1 - b1)
                     s["v"].mul_(b2).addcmul_(g, g, value=1 - b2)
-
-                    bc1 = 1 - b1 ** s["t"]
-                    bc2 = 1 - b2 ** s["t"]
-                    alr = lr * math.sqrt(bc2) / bc1
+                    alr = lr * math.sqrt(1 - b2 ** s["t"]) / (1 - b1 ** s["t"])
                     if wd > 0:
                         p.mul_(1 - lr * wd)
                     p.addcdiv_(s["m"], s["v"].sqrt().add_(group["adamw_eps"]), value=-alr)
 
 
 def create_muon_optimizer(model, lr=0.02, momentum=0.95, weight_decay=0.01):
-    """Create Muon optimizer with proper param group splitting."""
    params = []
     for name, p in model.named_parameters():
         if not p.requires_grad:
@@ -145,11 +118,8 @@ def create_muon_optimizer(model, lr=0.02, momentum=0.95, weight_decay=0.01):
         if any(k in name for k in ["embed", "lm_head", "wte", "wpe"]):
             p._is_embed = True
         params.append(p)
-    return Muon(
-        params,
-        lr=lr, momentum=momentum, weight_decay=weight_decay,
-        adamw_betas=(0.9, 0.98), adamw_eps=1e-8,
-    )
+    return Muon([{"params": params}], lr=lr, momentum=momentum,
+                weight_decay=weight_decay, adamw_betas=(0.9, 0.98))
```
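For orientation, a minimal sketch (not part of the commit) of how the parameter routing above plays out: 2-D weight matrices take the Muon path, while 1-D parameters and anything tagged `_is_embed` fall back to the internal AdamW. It assumes only the `create_muon_optimizer` defined in this file; the toy model is illustrative.

```python
# Illustrative only; assumes create_muon_optimizer from chimera_turbo.py.
import torch.nn as nn

toy = nn.Sequential(nn.Linear(64, 64), nn.LayerNorm(64))
opt = create_muon_optimizer(toy, lr=0.02, weight_decay=0.01)

for name, p in toy.named_parameters():
    muon = p.ndim == 2 and not getattr(p, "_is_embed", False)
    print(f"{name:10s} {str(tuple(p.shape)):10s} -> {'Muon' if muon else 'AdamW'}")
# 0.weight   (64, 64)   -> Muon
# 0.bias     (64,)      -> AdamW
# 1.weight   (64,)      -> AdamW   (LayerNorm weight is 1-D)
# 1.bias     (64,)      -> AdamW
```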
```diff
@@ -157,52 +127,31 @@ def create_muon_optimizer(model, lr=0.02, momentum=0.95, weight_decay=0.01):
 # ═══════════════════════════════════════════════════════════
 
 class MultiTokenPredictionLoss(nn.Module):
-    """Each forward pass yields N× gradient signal from the same hidden states.
-    Heads are lightweight linear projections sharing the trunk.
-    """
-    def __init__(self, hidden_size: int, vocab_size: int, n_future: int = 3):
+    def __init__(self, hidden_size, vocab_size, n_future=3):
         super().__init__()
-        self.n_future = n_future
-        # Extra heads for tokens +2, +3, ... (head for +1 is the main lm_head)
         self.extra_heads = nn.ModuleList([
             nn.Linear(hidden_size, vocab_size, bias=False)
             for _ in range(n_future - 1)
         ])
-        for head in self.extra_heads:
-            nn.init.normal_(head.weight, std=0.006)
+        for h in self.extra_heads:
+            nn.init.normal_(h.weight, std=0.006)
 
-    def forward(self, hidden_states: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
-        """Compute auxiliary MTP loss.
-
-            labels: [B, T] target token ids
-
-        Returns:
-            Scalar auxiliary loss (mean over all future positions and heads)
-        """
-        total_loss = torch.tensor(0.0, device=hidden_states.device)
-        count = 0
+    def forward(self, hidden_states, labels):
+        total, count = 0.0, 0
         for k, head in enumerate(self.extra_heads):
             shift = k + 2
             if shift >= labels.size(1):
                 continue
-            seq_len = min(logits.size(1), targets.size(1))
+            logits = head(hidden_states[:, :-shift])
+            targets = labels[:, shift:]
+            sl = min(logits.size(1), targets.size(1))
             loss = F.cross_entropy(
-                logits[:, :seq_len].reshape(-1, logits.size(-1)),
-                targets[:, :seq_len].reshape(-1),
-                ignore_index=-100,
-            )
+                logits[:, :sl].reshape(-1, logits.size(-1)),
+                targets[:, :sl].reshape(-1), ignore_index=-100)
             if torch.isfinite(loss):
+                total = total + loss
                 count += 1
-        return total_loss / max(count, 1)
+        return total / max(count, 1) if isinstance(total, torch.Tensor) else torch.tensor(0.0)
 
 
 # ═══════════════════════════════════════════════════════════
@@ -210,63 +159,189 @@ class MultiTokenPredictionLoss(nn.Module):
 # ═══════════════════════════════════════════════════════════
 
 class EMASelfDistiller:
-    """The EMA model's soft targets provide dense gradient signal across
-    the full vocabulary, vs sparse one-hot labels from hard targets.
-
-    α=0.5 blends hard CE and soft KL. T=2.0 temperature.
-    Recipe from Baby Llama (arxiv 2308.02019).
-    """
-    def __init__(self, model: nn.Module, decay: float = 0.999, alpha: float = 0.5,
-                 temperature: float = 2.0):
-        self.decay = decay
-        self.alpha = alpha
-        self.temperature = temperature
-        # Deep copy for EMA — no gradients needed
+    def __init__(self, model, decay=0.999, alpha=0.5, temperature=2.0):
+        self.decay, self.alpha, self.temperature = decay, alpha, temperature
         self.ema_model = copy.deepcopy(model)
         for p in self.ema_model.parameters():
             p.requires_grad_(False)
         self.ema_model.eval()
 
     @torch.no_grad()
-    def update(self, model: nn.Module):
-        """Update EMA weights. Call after optimizer.step()."""
+    def update(self, model):
         for p_ema, p in zip(self.ema_model.parameters(), model.parameters()):
             p_ema.data.mul_(self.decay).add_(p.data, alpha=1 - self.decay)
 
-    def distillation_loss(self, student_logits: torch.Tensor,
-                          hard_targets: torch.Tensor,
-                          input_ids: torch.Tensor) -> torch.Tensor:
-        """Compute blended hard + soft distillation loss."""
+    def distillation_loss(self, student_logits, hard_targets, input_ids):
         T = self.temperature
-        # Hard loss (standard CE)
-        seq_len = min(student_logits.size(1), hard_targets.size(1))
+        sl = min(student_logits.size(1), hard_targets.size(1))
         hard_loss = F.cross_entropy(
-            student_logits[:, :seq_len].reshape(-1, student_logits.size(-1)),
-            hard_targets[:, :seq_len].reshape(-1),
-            ignore_index=-100,
-        )
+            student_logits[:, :sl].reshape(-1, student_logits.size(-1)),
+            hard_targets[:, :sl].reshape(-1), ignore_index=-100)
 
-        # Soft loss (KL from EMA teacher)
         with torch.no_grad():
-            soft_teacher = F.softmax(teacher_logits[:, :t_seq] / T, dim=-1)
-            kl_loss = F.kl_div(soft_student, soft_teacher, reduction="batchmean") * (T * T)
+            t_out = self.ema_model(input_ids)
+            t_logits = t_out.logits if hasattr(t_out, "logits") else t_out[1]
+        tsl = min(student_logits.size(1), t_logits.size(1))
+        soft_s = F.log_softmax(student_logits[:, :tsl] / T, dim=-1)
+        soft_t = F.softmax(t_logits[:, :tsl] / T, dim=-1)
+        kl = F.kl_div(soft_s, soft_t, reduction="batchmean") * T * T
+        if not torch.isfinite(kl):
+            return hard_loss
+        return self.alpha * hard_loss + (1 - self.alpha) * kl
```
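One detail of the condensed `forward` worth spelling out: extra head `k` is trained to predict the token `k + 2` positions ahead, so the hidden states lose `shift` positions on the right while the labels lose `shift` on the left, and both sides end up the same length. A standalone shape check (a sketch, plain PyTorch only):

```python
import torch

B, T, H, V = 2, 8, 16, 32
hidden = torch.randn(B, T, H)             # trunk hidden states
labels = torch.randint(0, V, (B, T))
head = torch.nn.Linear(H, V, bias=False)  # one extra MTP head

shift = 2                                 # first extra head: predict token t+2
logits = head(hidden[:, :-shift])         # positions 0 .. T-3
targets = labels[:, shift:]               # tokens    2 .. T-1
assert logits.size(1) == targets.size(1) == T - shift
```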
+
# ═══════════════════════════════════════════════════════════
|
| 193 |
+
# P15 — Token Triage (inspiré Rho-1, arxiv 2404.07965)
|
| 194 |
+
# ═══════════════════════════════════════════════════════════
|
| 195 |
+
|
| 196 |
+
class TokenTriage:
|
| 197 |
+
"""Selective token-level gradient weighting without a reference model.
|
| 198 |
+
|
| 199 |
+
Instead of a separate reference model (expensive), use a running EMA
|
| 200 |
+
of per-token loss as the "expected" loss baseline. Tokens with excess
|
| 201 |
+
loss (actual - EMA) above the 40th percentile get full gradient;
|
| 202 |
+
tokens below get 10% gradient. This focuses ~90% of learning on the
|
| 203 |
+
actually-informative tokens.
|
| 204 |
+
|
| 205 |
+
Inspired by Rho-1 (arxiv 2404.07965) but self-referential: the model
|
| 206 |
+
IS its own reference, via temporal smoothing.
|
| 207 |
+
"""
|
| 208 |
+
def __init__(self, ema_decay=0.99, select_ratio=0.6, floor_weight=0.1):
|
| 209 |
+
self.ema_decay = ema_decay
|
| 210 |
+
self.select_ratio = select_ratio # top 60% tokens get full weight
|
| 211 |
+
self.floor_weight = floor_weight # bottom 40% get 10% weight
|
| 212 |
+
self._loss_ema = None # scalar EMA of mean token loss
|
| 213 |
+
|
| 214 |
+
def weighted_loss(self, logits, targets):
|
| 215 |
+
"""Compute token-weighted CE loss.
|
| 216 |
+
|
| 217 |
+
Returns weighted loss where informative tokens contribute more.
|
| 218 |
+
"""
|
| 219 |
+
B, T, V = logits.shape
|
| 220 |
+
# Per-token loss (no reduction)
|
| 221 |
+
per_token = F.cross_entropy(
|
| 222 |
+
logits.reshape(-1, V), targets.reshape(-1),
|
| 223 |
+
ignore_index=-100, reduction="none"
|
| 224 |
+
).reshape(B, T)
|
| 225 |
|
|
|
|
| 226 |
with torch.no_grad():
|
| 227 |
+
mean_loss = per_token.mean().item()
|
| 228 |
+
if self._loss_ema is None:
|
| 229 |
+
self._loss_ema = mean_loss
|
| 230 |
+
else:
|
| 231 |
+
self._loss_ema = self.ema_decay * self._loss_ema + (1 - self.ema_decay) * mean_loss
|
| 232 |
|
| 233 |
+
# Excess loss = how much harder this token is than expected
|
| 234 |
+
excess = per_token - self._loss_ema
|
|
|
|
|
|
|
| 235 |
|
| 236 |
+
# Top select_ratio% by excess loss → weight 1.0, rest → floor_weight
|
| 237 |
+
threshold = torch.quantile(excess.flatten(), 1.0 - self.select_ratio)
|
| 238 |
+
weights = torch.where(excess >= threshold, 1.0, self.floor_weight)
|
| 239 |
|
| 240 |
+
# Weighted mean
|
| 241 |
+
return (per_token * weights).sum() / weights.sum()
|
| 242 |
|
| 243 |
|
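A quick way to see the triage in action (a sketch, assuming the `TokenTriage` class above): roughly `select_ratio` of the token weights come out at 1.0 and the rest at the 0.1 floor, while `_loss_ema` tracks the running difficulty baseline across calls.

```python
import torch

triage = TokenTriage(ema_decay=0.99, select_ratio=0.6, floor_weight=0.1)
logits = torch.randn(4, 128, 256, requires_grad=True)  # [B, T, V]
targets = torch.randint(0, 256, (4, 128))

loss = triage.weighted_loss(logits, targets)
loss.backward()  # gradients flow only through the per-token CE term
print(f"loss={loss.item():.3f}  ema_baseline={triage._loss_ema:.3f}")
```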
```diff
 # ═══════════════════════════════════════════════════════════
+# P16 — Plateau Breaker (adaptive warm restarts)
+# ═══════════════════════════════════════════════════════════
+
+class PlateauBreaker:
+    """Detect loss plateaus and inject LR boosts to escape.
+
+    Tracks loss variance over a window. When variance drops below a
+    threshold for patience steps, temporarily boosts LR by multiplier
+    for burst_steps, then decays back. Like SGDR warm restarts but
+    triggered adaptively by loss stagnation.
+    """
+    def __init__(self, patience=100, variance_threshold=0.005,
+                 lr_multiplier=3.0, burst_steps=50):
+        self.patience = patience
+        self.var_threshold = variance_threshold
+        self.lr_mult = lr_multiplier
+        self.burst_steps = burst_steps
+        self._history = deque(maxlen=patience)
+        self._stagnant_count = 0
+        self._burst_remaining = 0
+        self._base_lr = None
+        self.total_bursts = 0
+
+    def check_and_adjust(self, loss_val, optimizer, step):
+        """Call every step. Returns True if a burst was triggered."""
+        if not math.isfinite(loss_val):
+            return False
+
+        self._history.append(loss_val)
+
+        # During a burst: count down, then restore the base LR
+        if self._burst_remaining > 0:
+            self._burst_remaining -= 1
+            if self._burst_remaining == 0 and self._base_lr is not None:
+                for pg in optimizer.param_groups:
+                    pg["lr"] = self._base_lr
+                self._base_lr = None
+            return False
+
+        if len(self._history) < self.patience:
+            return False
+
+        # Check variance over the window
+        vals = list(self._history)
+        mean = sum(vals) / len(vals)
+        var = sum((v - mean) ** 2 for v in vals) / len(vals)
+
+        if var < self.var_threshold:
+            self._stagnant_count += 1
+        else:
+            self._stagnant_count = 0
+
+        if self._stagnant_count >= self.patience // 2:
+            # TRIGGER BURST
+            self._base_lr = optimizer.param_groups[0]["lr"]
+            burst_lr = self._base_lr * self.lr_mult
+            for pg in optimizer.param_groups:
+                pg["lr"] = burst_lr
+            self._burst_remaining = self.burst_steps
+            self._stagnant_count = 0
+            self.total_bursts += 1
+            print(f" [P16] Plateau detected! LR burst: {self._base_lr:.2e} → {burst_lr:.2e} for {self.burst_steps} steps")
+            return True
+        return False
```
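The trigger logic can be exercised without a model: feed a perfectly flat loss stream and the burst fires once the window is full and `patience // 2` consecutive stagnant checks have accumulated (a sketch, assuming the `PlateauBreaker` above and any optimizer):

```python
import torch

opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.02)
pb = PlateauBreaker(patience=100, variance_threshold=0.005,
                    lr_multiplier=3.0, burst_steps=50)

for step in range(200):
    if pb.check_and_adjust(2.0, opt, step):     # flat loss, zero variance
        print(step, opt.param_groups[0]["lr"])  # step 148: lr 0.02 -> 0.06
```

Worth noting: a `LambdaLR` scheduler recomputes `pg["lr"]` from its own schedule on every `scheduler.step()`, so inside the full `training_step` the direct `pg["lr"]` write here only persists until the next scheduler step.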
```diff
+# ═══════════════════════════════════════════════════════════
+# P17 — Batch Metabolism (Online Hard Example Mining for LLMs)
+# ═══════════════════════════════════════════════════════════
+
+def batch_metabolism_loss(logits, targets, min_weight=0.5, max_weight=2.0):
+    """Weight sequences within a batch by their relative difficulty.
+
+    Hard sequences (above-average loss) get up to max_weight.
+    Easy sequences (below-average loss) get down to min_weight.
+    The model "digests" harder examples more thoroughly.
+    """
+    B, T, V = logits.shape
+    # Per-sequence loss
+    per_token = F.cross_entropy(
+        logits.reshape(-1, V), targets.reshape(-1),
+        ignore_index=-100, reduction="none"
+    ).reshape(B, T)
+    seq_loss = per_token.mean(dim=1)  # [B]
+
+    with torch.no_grad():
+        # Normalize: center on the mean, scale to [min_weight, max_weight]
+        mean_loss = seq_loss.mean()
+        std_loss = seq_loss.std().clamp(min=1e-6)
+        # z-score → sigmoid → rescale to [min_weight, max_weight]
+        z = (seq_loss - mean_loss) / std_loss
+        weights = torch.sigmoid(z) * (max_weight - min_weight) + min_weight  # [B]
+
+    # Weighted mean across the batch
+    weighted = (per_token * weights.unsqueeze(1)).sum() / (weights.sum() * T)
+    return weighted
```
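The weights are a smooth function of each sequence's z-scored loss: an exactly average sequence gets `sigmoid(0) * (2.0 - 0.5) + 0.5 = 1.25`, and the weights saturate toward 0.5 (easy) and 2.0 (hard), so the hardest sequences contribute up to 4× the gradient of the easiest. A standalone check (a sketch, assuming the function above):

```python
import torch

logits = torch.randn(8, 64, 256, requires_grad=True)  # [B, T, V]
targets = torch.randint(0, 256, (8, 64))

loss = batch_metabolism_loss(logits, targets, min_weight=0.5, max_weight=2.0)
loss.backward()  # per-sequence weights rescale each sequence's gradient
```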
```diff
 # ═══════════════════════════════════════════════════════════
-#
+# Utilities
 # ═══════════════════════════════════════════════════════════
 
 def invalidate_all_caches(model):
@@ -277,12 +352,7 @@ def invalidate_all_caches(model):
             m.invalidate_packed()
 
 
-# ═══════════════════════════════════════════════════════════
-# Scheduler
-# ═══════════════════════════════════════════════════════════
-
 def create_scheduler(optimizer, max_steps, warmup_steps=200):
-    """Short warmup (200 steps) then cosine decay. Warmup=750 was too long."""
     from torch.optim.lr_scheduler import LambdaLR
     def lr_lambda(step):
         if step < warmup_steps:
```
```diff
@@ -300,69 +370,76 @@ def apply(
     model, max_steps=10000, lr=0.02, weight_decay=0.01,
     warmup_steps=200, use_compile=False, use_ipex=True,
     use_muon=True, use_mtp=True, use_distill=True,
+    use_triage=True, use_plateau_breaker=True, use_metabolism=True,
     mtp_heads=3, verbose=True,
 ):
-    """Apply all turbo + revolutionary paradigms.
-
-    Returns: (model, optimizer, scheduler, extras)
-    where extras = dict with 'mtp_loss_fn', 'distiller', etc.
-    """
     cpu_info = detect_cpu_info()
     if verbose:
         print("=" * 65)
+        print("CHIMERA TURBO v10 — Adaptive Token Metabolism")
         print("=" * 65)
         print(f" Cores: {cpu_info['physical_cores']} CPU: {cpu_info['capability']}")
 
     n_threads = configure_threading(cpu_info)
     if verbose:
+        print(f"[TURBO] Threads: {n_threads}")
 
-    #
+    # P12: Muon
     if use_muon:
         optimizer = create_muon_optimizer(model, lr=lr, weight_decay=weight_decay)
         if verbose:
-            n_1d = sum(p.numel() for p in model.parameters()
-                       if p.requires_grad and (p.ndim < 2 or getattr(p, "_is_embed", False)))
-            print(f"[P12] Muon optimizer (lr={lr}, NS-5 orthogonalization)")
-            print(f"      Muon: {n_2d:,} params | AdamW fallback: {n_1d:,} params")
+            n_muon = sum(p.numel() for p in model.parameters()
+                         if p.requires_grad and p.ndim == 2 and not getattr(p, "_is_embed", False))
+            print(f"[P12] Muon (lr={lr}, NS-5) — {n_muon:,} params orthogonalized")
     else:
+        optimizer = torch.optim.AdamW(model.parameters(), lr=lr * 0.05,
+                                      betas=(0.9, 0.98), weight_decay=weight_decay)
 
     scheduler = create_scheduler(optimizer, max_steps, warmup_steps)
 
-    # ── P13: Multi-Token Prediction ──
     extras = {}
+    raw = getattr(model, "_orig_mod", model)
+
+    # P13: MTP
     if use_mtp:
-        v = raw.config["vocab_size"]
-        mtp = MultiTokenPredictionLoss(h, v, n_future=mtp_heads)
-        extras["mtp"] = mtp
+        h, v = raw.config["hidden_size"], raw.config["vocab_size"]
+        extras["mtp"] = MultiTokenPredictionLoss(h, v, n_future=mtp_heads)
         if verbose:
+            print(f"[P13] Multi-Token Prediction ({mtp_heads} heads)")
 
-    #
+    # P14: EMA Distillation
     if use_distill:
-        distiller = EMASelfDistiller(model, decay=0.999, alpha=0.5, temperature=2.0)
+        extras["distiller"] = EMASelfDistiller(model, decay=0.999, alpha=0.5, temperature=2.0)
+        if verbose:
+            print(f"[P14] EMA Self-Distillation (α=0.5, T=2.0)")
+
+    # P15: Token Triage
+    if use_triage:
+        extras["triage"] = TokenTriage(ema_decay=0.99, select_ratio=0.6, floor_weight=0.1)
+        if verbose:
+            print(f"[P15] Token Triage (top 60% tokens → full grad, bottom 40% → 10%)")
+
+    # P16: Plateau Breaker
+    if use_plateau_breaker:
+        extras["plateau"] = PlateauBreaker(patience=100, variance_threshold=0.005,
+                                           lr_multiplier=3.0, burst_steps=50)
         if verbose:
+            print(f"[P16] Plateau Breaker (detect stagnation → LR burst ×3)")
+
+    # P17: Batch Metabolism
+    if use_metabolism:
+        extras["metabolism"] = True
+        if verbose:
+            print(f"[P17] Batch Metabolism (hard examples → 2× weight)")
 
     if verbose:
-        if not cpu_info.get("tcmalloc"):
-            print(" ⚠️ No tcmalloc — LD_PRELOAD=...libtcmalloc.so.4 for +15%")
         print("=" * 65)
 
     return model, optimizer, scheduler, extras
```
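Taken together, a hedged sketch of the intended call (the `build_model` name is a placeholder; `apply` expects the unwrapped model to expose `config["hidden_size"]` and `config["vocab_size"]` when `use_mtp=True`):

```python
model = build_model(config)  # placeholder: any model exposing .config
model, optimizer, scheduler, extras = apply(
    model, max_steps=10_000, lr=0.02,
    use_muon=True, use_mtp=True, use_distill=True,
    use_triage=True, use_plateau_breaker=True, use_metabolism=True,
)
# extras -> {"mtp": ..., "distiller": ..., "triage": ..., "plateau": ..., "metabolism": True}
```

Note that the main-loss paths in `training_step` below are mutually exclusive: when logits are available, P15 takes priority over P17, which takes priority over P14's distillation loss, so with every flag enabled only Token Triage supplies the main loss.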
```diff
 # ═══════════════════════════════════════════════════════════
-# Training step
+# Training step — ALL paradigms active
 # ═══════════════════════════════════════════════════════════
 
 _nan_count = 0
 
@@ -371,12 +448,8 @@ def training_step(
     model, batch, optimizer, scheduler,
     extras=None, grad_accum_steps=1, step=0,
     max_grad_norm=1.0, autocast_dtype=None,
     mtp_weight=0.3,
 ) -> float:
-    """Training step with Muon + MTP + EMA distillation.
-
-    Loss = distill_loss (blended hard+soft) + mtp_weight * mtp_aux_loss
-    """
     global _nan_count
     extras = extras or {}
     is_accum_step = (step + 1) % grad_accum_steps == 0
@@ -389,18 +462,28 @@
         outputs = model(input_ids, labels=labels)
     else:
         outputs = model(batch)
-        input_ids = batch
-        labels = batch
+        input_ids = labels = batch
 
+    logits = outputs.logits if hasattr(outputs, "logits") else None
+
+    # ── Compute main loss ──
+    triage = extras.get("triage")
+    metabolism = extras.get("metabolism")
     distiller = extras.get("distiller")
+
+    if logits is not None and triage is not None:
+        # P15: Token Triage — selective token weighting
+        base_loss = triage.weighted_loss(logits, labels)
+    elif logits is not None and metabolism:
+        # P17: Batch Metabolism — sequence-level weighting
+        base_loss = batch_metabolism_loss(logits, labels)
+    elif distiller is not None and logits is not None:
+        # P14: EMA distillation
+        base_loss = distiller.distillation_loss(logits, labels, input_ids)
     else:
         base_loss = outputs.loss if hasattr(outputs, "loss") else outputs
 
-    # ── P13: MTP auxiliary
+    # ── P13: MTP auxiliary ──
     mtp = extras.get("mtp")
     if mtp is not None and hasattr(outputs, "hidden_states") and outputs.hidden_states is not None:
         mtp_loss = mtp(outputs.hidden_states, labels)
@@ -423,6 +506,11 @@
 
         _nan_count = 0
 
+    # ── P16: Plateau Breaker ──
+    plateau = extras.get("plateau")
+    if plateau is not None:
+        plateau.check_and_adjust(loss_val, optimizer, step)
+
    if grad_accum_steps > 1:
         total_loss = total_loss / grad_accum_steps
     total_loss.backward()
@@ -438,8 +526,6 @@
         scheduler.step()
         optimizer.zero_grad(set_to_none=True)
         invalidate_all_caches(model)
-
-        # P14: update EMA teacher
         if "distiller" in extras:
             extras["distiller"].update(model)
```
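And the outer loop, as a sketch (the `dataloader` is a placeholder; `training_step` internally handles gradient accumulation, clipping, the P16 LR adjustment, and the P14 EMA update):

```python
for step, batch in enumerate(dataloader):  # placeholder iterable of batches
    loss = training_step(
        model, batch, optimizer, scheduler,
        extras=extras, grad_accum_steps=4, step=step, mtp_weight=0.3,
    )
    if step % 100 == 0:
        print(f"step {step}  loss {loss:.4f}  bursts {extras['plateau'].total_bursts}")
```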