feat: P12 Muon optimizer, P13 Multi-Token Prediction, P14 EMA Self-Distillation\n\nThree new paradigms for revolutionary sample efficiency:\n\nP12 Muon: Newton-Schulz orthogonalized momentum for 2D weight matrices.\nSame loss in 52% of FLOPs vs AdamW (arxiv 2502.16982). AdamW fallback\nfor 1D params (biases, norms, embeddings).\n\nP13 MTP: predict next 3 tokens instead of 1. Each forward pass yields\n3x gradient signal. Implemented as auxiliary loss heads sharing the trunk.\n\nP14 EMA Self-Distillation: EMA copy of model acts as teacher. KL loss\nbetween student and EMA soft targets gives dense signal across full vocab\nvs sparse one-hot labels. α=0.5, T=2.0 (Baby Llama recipe, arxiv 2308.02019)."

Browse files

Files changed (1) hide show

chimera_turbo.py +335 -115

chimera_turbo.py CHANGED Viewed

@@ -1,10 +1,15 @@
 """
 chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
-Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
-v8: BitNet-paper aligned hyperparams — β2=0.98, wd=0.01, warmup=750
 """
 import math
 import os
 import warnings
@@ -15,12 +20,16 @@ from typing import Optional, Dict, Any, Tuple
 from contextlib import nullcontext
 def detect_cpu_info() -> Dict[str, Any]:
     info = {}
     try:
-        physical = len(os.sched_getaffinity(0))
         import multiprocessing
         logical = multiprocessing.cpu_count()
         info["physical_cores"] = logical // 2 if logical == physical else physical
         info["logical_cores"] = logical
     except Exception:
@@ -33,9 +42,8 @@ def detect_cpu_info() -> Dict[str, Any]:
         info["capability"] = "unknown"
     cap = (info["capability"] or "").lower()
     info["has_amx"] = "amx" in cap
-    info["has_avx512"] = "avx512" in cap or "avx512_vnni" in cap
     info["has_avx512_bf16"] = "avx512_bf16" in cap or info["has_amx"]
-    info["has_vnni"] = info["has_avx512"]
     try:
         import intel_extension_for_pytorch
         info["ipex_available"] = True
@@ -45,173 +53,381 @@ def detect_cpu_info() -> Dict[str, Any]:
     return info
-def configure_threading(cpu_info: Dict[str, Any], reserve_for_io: int = 1):
-    n_compute = max(1, cpu_info["physical_cores"] - reserve_for_io)
-    torch.set_num_threads(n_compute)
-    os.environ["OMP_NUM_THREADS"] = str(n_compute)
-    os.environ["MKL_NUM_THREADS"] = str(n_compute)
-    return n_compute
-def create_optimizer(
-    model: nn.Module,
-    lr: float = 1.5e-3,           # ← BitNet interpolated: 125M→2.4e-3, 350M→1.2e-3
-    weight_decay: float = 0.01,   # ← BitNet original (2310.11453 Table 5)
-    use_lion: bool = False,
-    betas: Tuple[float, float] = (0.9, 0.98),  # ← BitNet: β2=0.98 NOT 0.95/0.999
-) -> torch.optim.Optimizer:
-    """AdamW with BitNet-paper hyperparameters.
-    Key differences from standard:
-    - β2=0.98 (not 0.999): faster variance adaptation for ternary noise
-    - wd=0.01: original BitNet paper value, more stable than 0.05 for from-scratch
-    - lr=1.5e-3: interpolated from BitNet Table 5 (125M→2.4e-3, 350M→1.2e-3)
     """
-    decay_params, no_decay_params = [], []
-    for name, param in model.named_parameters():
-        if not param.requires_grad:
             continue
-        if param.ndim <= 1 or "bias" in name or "norm" in name or "embed" in name:
-            no_decay_params.append(param)
-        else:
-            decay_params.append(param)
-    param_groups = [
-        {"params": decay_params, "weight_decay": weight_decay},
-        {"params": no_decay_params, "weight_decay": 0.0},
-    ]
-    if use_lion:
-        try:
-            from lion_pytorch import Lion
-            return Lion(param_groups, lr=lr * 0.3, betas=(0.95, 0.98))
-        except ImportError:
-            pass
-    return torch.optim.AdamW(param_groups, lr=lr, betas=betas, eps=1e-8, fused=False)
-def create_scheduler(optimizer, max_steps: int, warmup_steps: int = 750):
-    """Cosine decay with 750-step warmup (BitNet paper-exact)."""
-    from torch.optim.lr_scheduler import LambdaLR
-    def lr_lambda(step):
-        if step < warmup_steps:
-            return step / max(1, warmup_steps)
-        progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
-        return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
-    return LambdaLR(optimizer, lr_lambda)
-def invalidate_all_caches(model: nn.Module):
     from chimera.quantization import BitLinear
-    for m in model.modules():
         if isinstance(m, BitLinear):
             m.invalidate_packed()
-def try_ipex_optimize(model, optimizer, cpu_info, dtype=None):
-    if not cpu_info.get("ipex_available"):
-        return model, optimizer
-    try:
-        import intel_extension_for_pytorch as ipex
-    except Exception:
-        return model, optimizer
-    if dtype is None:
-        dtype = torch.bfloat16 if (cpu_info["has_amx"] or cpu_info["has_avx512"]) else torch.float32
-    model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=dtype, level="O1", inplace=True)
-    return model, optimizer
-def try_compile_model(model: nn.Module, mode: str = "default") -> nn.Module:
-    if not hasattr(torch, "compile"):
-        return model
-    try:
-        compiled = torch.compile(model, backend="inductor", mode=mode, fullgraph=False)
-        print(f"[TURBO-2] torch.compile enabled (mode={mode})")
-        return compiled
-    except Exception:
-        return model
 def apply(
-    model: nn.Module, max_steps: int = 10000, lr: float = 1.5e-3,
-    weight_decay: float = 0.01, warmup_steps: int = 750,
-    use_compile: bool = True, use_ipex: bool = True,
-    use_lion: bool = False, verbose: bool = True,
-) -> Tuple[nn.Module, torch.optim.Optimizer, Any]:
     cpu_info = detect_cpu_info()
     if verbose:
         print("=" * 65)
-        print("CHIMERA TURBO v8 — BitNet-aligned hyperparams")
         print("=" * 65)
         print(f"  Cores: {cpu_info['physical_cores']}  CPU: {cpu_info['capability']}")
-        print(f"  IPEX: {cpu_info['ipex_available']}  tcmalloc: {cpu_info['tcmalloc']}")
     n_threads = configure_threading(cpu_info)
     if verbose:
         print(f"[TURBO-3] Threads: {n_threads}")
-    optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay)
-    scheduler = create_scheduler(optimizer, max_steps=max_steps, warmup_steps=warmup_steps)
-    if verbose:
-        n_params = sum(p.numel() for g in optimizer.param_groups for p in g["params"])
-        print(f"[TURBO-1] AdamW (lr={lr}, β=(0.9,0.98), wd={weight_decay}) — {n_params:,} params")
-    if use_ipex:
-        model, optimizer = try_ipex_optimize(model, optimizer, cpu_info)
-    if use_compile:
-        model = try_compile_model(model, mode="default")
     if verbose:
-        if not cpu_info["has_avx512_bf16"]:
-            print("  ⚠️  No BF16 hw — use --no-bf16")
-        if not cpu_info["tcmalloc"]:
             print("  ⚠️  No tcmalloc — LD_PRELOAD=...libtcmalloc.so.4 for +15%")
         print("=" * 65)
-    return model, optimizer, scheduler
-_nan_count = 0
-_MAX_CONSECUTIVE_NAN = 5
 def training_step(
-    model: nn.Module, batch, optimizer: torch.optim.Optimizer, scheduler,
-    grad_accum_steps: int = 1, step: int = 0,
-    max_grad_norm: float = 1.0,   # ← raised back to 1.0 (papers use none, this is light)
-    autocast_dtype: Optional[torch.dtype] = torch.bfloat16,
 ) -> float:
-    """NaN-safe training step with BitNet-aligned grad clipping.
-    BitNet papers use NO grad clipping. We keep a light clip (1.0) as safety
-    net for the evolution engine side-effects, but it should rarely activate.
     """
     global _nan_count
     is_accum_step = (step + 1) % grad_accum_steps == 0
     ctx = torch.autocast(device_type="cpu", dtype=autocast_dtype) if autocast_dtype else nullcontext()
     with ctx:
         if isinstance(batch, dict):
-            outputs = model(batch["input_ids"], labels=batch.get("labels"))
-        elif isinstance(batch, (tuple, list)):
-            outputs = model(*batch)
         else:
             outputs = model(batch)
-        loss = outputs if isinstance(outputs, torch.Tensor) else outputs.loss
-        loss_val = loss.item()
-    # ── NaN detection ──
     if not math.isfinite(loss_val):
         _nan_count += 1
         optimizer.zero_grad(set_to_none=True)
-        if _nan_count >= _MAX_CONSECUTIVE_NAN:
             for pg in optimizer.param_groups:
                 pg["lr"] *= 0.5
-            print(f"  [NaN] {_nan_count}x — LR halved to {optimizer.param_groups[0]['lr']:.2e}")
             _nan_count = 0
         return loss_val
     _nan_count = 0
     if grad_accum_steps > 1:
-        loss = loss / grad_accum_steps
-    loss.backward()
-    # Sanitize any NaN grads from evolution engine
     for p in model.parameters():
         if p.grad is not None and not torch.isfinite(p.grad).all():
             p.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)
@@ -223,4 +439,8 @@ def training_step(
         optimizer.zero_grad(set_to_none=True)
         invalidate_all_caches(model)
     return loss_val

 """
 chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
+v9: Muon optimizer + Multi-Token Prediction + EMA Self-Distillation
+New paradigms:
+  P12 Muon optimizer      — 2× token efficiency via NS-orthogonalized momentum
+  P13 Multi-Token Predict  — 3× gradient signal per forward pass
+  P14 EMA Self-Distill     — dense soft targets from EMA teacher copy
 """
+import copy
 import math
 import os
 import warnings
 from contextlib import nullcontext
+# ═══════════════════════════════════════════════════════════
+# P-TURBO-3 : CPU Detection + Threading
+# ═══════════════════════════════════════════════════════════
 def detect_cpu_info() -> Dict[str, Any]:
     info = {}
     try:
         import multiprocessing
         logical = multiprocessing.cpu_count()
+        physical = len(os.sched_getaffinity(0))
         info["physical_cores"] = logical // 2 if logical == physical else physical
         info["logical_cores"] = logical
     except Exception:
         info["capability"] = "unknown"
     cap = (info["capability"] or "").lower()
     info["has_amx"] = "amx" in cap
+    info["has_avx512"] = "avx512" in cap
     info["has_avx512_bf16"] = "avx512_bf16" in cap or info["has_amx"]
     try:
         import intel_extension_for_pytorch
         info["ipex_available"] = True
     return info
+def configure_threading(cpu_info, reserve=1):
+    n = max(1, cpu_info["physical_cores"] - reserve)
+    torch.set_num_threads(n)
+    os.environ["OMP_NUM_THREADS"] = str(n)
+    os.environ["MKL_NUM_THREADS"] = str(n)
+    return n
+# ═══════════════════════════════════════════════════════════
+# P12 — Muon Optimizer (arxiv 2502.16982)
+# ═══════════════════════════════════════════════════════════
+def _zeropower_via_newtonschulz5(G, steps=5):
+    """Newton-Schulz iteration for polar factor. Pure PyTorch, CPU-safe."""
+    assert G.ndim == 2
+    a, b, c = 3.4445, -4.7750, 2.0315
+    X = G.T if G.size(0) > G.size(1) else G.clone()
+    X = X / (X.norm() + 1e-7)
+    for _ in range(steps):
+        A = X @ X.T
+        X = a * X + (b * A + c * A @ A) @ X
+    return X.T if G.size(0) > G.size(1) else X
+class Muon(torch.optim.Optimizer):
+    """Muon: MomentUm Orthogonalized by Newton-schulz.
+    2D weight matrices: SGD momentum → NS orthogonalize → scaled update.
+    Everything else (bias, norm, embed): standard AdamW.
+    ~2× token efficiency vs AdamW (arxiv 2502.16982, Table 3).
     """
+    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
+                 ns_steps=5, weight_decay=0.0,
+                 adamw_betas=(0.9, 0.98), adamw_eps=1e-8):
+        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov,
+                        ns_steps=ns_steps, weight_decay=weight_decay,
+                        adamw_betas=adamw_betas, adamw_eps=adamw_eps)
+        super().__init__(params, defaults)
+    @torch.no_grad()
+    def step(self):
+        for group in self.param_groups:
+            lr = group["lr"]
+            wd = group["weight_decay"]
+            mu = group["momentum"]
+            b1, b2 = group["adamw_betas"]
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                g = p.grad
+                s = self.state[p]
+                # ── Muon path: 2D matrices (not embeddings) ──
+                if p.ndim == 2 and not getattr(p, "_is_embed", False):
+                    if "buf" not in s:
+                        s["buf"] = torch.zeros_like(g)
+                    s["buf"].mul_(mu).add_(g)
+                    ns_in = s["buf"] * mu + g if group["nesterov"] else s["buf"]
+                    O = _zeropower_via_newtonschulz5(ns_in, group["ns_steps"])
+                    scale = math.sqrt(max(1, p.size(0) / p.size(1)))
+                    if wd > 0:
+                        p.mul_(1 - lr * wd)
+                    p.add_(O, alpha=-lr * scale)
+                # ── AdamW path: 1D params, embeddings ──
+                else:
+                    if "m" not in s:
+                        s["m"] = torch.zeros_like(g)
+                        s["v"] = torch.zeros_like(g)
+                        s["t"] = 0
+                    s["t"] += 1
+                    s["m"].mul_(b1).add_(g, alpha=1 - b1)
+                    s["v"].mul_(b2).addcmul_(g, g, value=1 - b2)
+                    bc1 = 1 - b1 ** s["t"]
+                    bc2 = 1 - b2 ** s["t"]
+                    alr = lr * math.sqrt(bc2) / bc1
+                    if wd > 0:
+                        p.mul_(1 - lr * wd)
+                    p.addcdiv_(s["m"], s["v"].sqrt().add_(group["adamw_eps"]), value=-alr)
+def create_muon_optimizer(model, lr=0.02, momentum=0.95, weight_decay=0.01):
+    """Create Muon optimizer with proper param group splitting."""
+    params = []
+    for name, p in model.named_parameters():
+        if not p.requires_grad:
             continue
+        if any(k in name for k in ["embed", "lm_head", "wte", "wpe"]):
+            p._is_embed = True
+        params.append(p)
+    return Muon(
+        [{"params": params}],
+        lr=lr, momentum=momentum, weight_decay=weight_decay,
+        adamw_betas=(0.9, 0.98), adamw_eps=1e-8,
+    )
+# ═══════════════════════════════════════════════════════════
+# P13 — Multi-Token Prediction (arxiv 2404.19737)
+# ═══════════════════════════════════════════════════════════
+class MultiTokenPredictionLoss(nn.Module):
+    """Auxiliary loss: predict next N tokens instead of just 1.
+    Each forward pass yields N× gradient signal from the same hidden states.
+    Heads are lightweight linear projections sharing the trunk.
+    """
+    def __init__(self, hidden_size: int, vocab_size: int, n_future: int = 3):
+        super().__init__()
+        self.n_future = n_future
+        # Extra heads for tokens +2, +3, ... (head for +1 is the main lm_head)
+        self.extra_heads = nn.ModuleList([
+            nn.Linear(hidden_size, vocab_size, bias=False)
+            for _ in range(n_future - 1)
+        ])
+        # Init small to not destabilize early training
+        for head in self.extra_heads:
+            nn.init.normal_(head.weight, std=0.006)
+    def forward(self, hidden_states: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+        """Compute auxiliary MTP loss.
+        Args:
+            hidden_states: [B, T, H] from trunk (before lm_head)
+            labels: [B, T] target token ids
+        Returns:
+            Scalar auxiliary loss (mean over all future positions and heads)
+        """
+        total_loss = torch.tensor(0.0, device=hidden_states.device)
+        count = 0
+        for k, head in enumerate(self.extra_heads):
+            shift = k + 2  # head 0 predicts +2, head 1 predicts +3, etc.
+            if shift >= labels.size(1):
+                continue
+            # Hidden states predict token at position +shift
+            logits = head(hidden_states[:, :-shift])  # [B, T-shift, V]
+            targets = labels[:, shift:]                 # [B, T-shift]
+            seq_len = min(logits.size(1), targets.size(1))
+            loss = F.cross_entropy(
+                logits[:, :seq_len].reshape(-1, logits.size(-1)),
+                targets[:, :seq_len].reshape(-1),
+                ignore_index=-100,
+            )
+            if torch.isfinite(loss):
+                total_loss = total_loss + loss
+                count += 1
+        return total_loss / max(count, 1)
+# ═══════════════════════════════════════════════════════════
+# P14 — EMA Self-Distillation (arxiv 2308.02019)
+# ═══════════════════════════════════════════════════════════
+class EMASelfDistiller:
+    """Maintain EMA copy of model as teacher for self-distillation.
+    The EMA model's soft targets provide dense gradient signal across
+    the full vocabulary, vs sparse one-hot labels from hard targets.
+    α=0.5 blends hard CE and soft KL. T=2.0 temperature.
+    Recipe from Baby Llama (arxiv 2308.02019).
+    """
+    def __init__(self, model: nn.Module, decay: float = 0.999, alpha: float = 0.5,
+                 temperature: float = 2.0):
+        self.decay = decay
+        self.alpha = alpha
+        self.temperature = temperature
+        # Deep copy for EMA — no gradients needed
+        self.ema_model = copy.deepcopy(model)
+        for p in self.ema_model.parameters():
+            p.requires_grad_(False)
+        self.ema_model.eval()
+    @torch.no_grad()
+    def update(self, model: nn.Module):
+        """Update EMA weights. Call after optimizer.step()."""
+        for p_ema, p in zip(self.ema_model.parameters(), model.parameters()):
+            p_ema.data.mul_(self.decay).add_(p.data, alpha=1 - self.decay)
+    def distillation_loss(self, student_logits: torch.Tensor,
+                          hard_targets: torch.Tensor,
+                          input_ids: torch.Tensor) -> torch.Tensor:
+        """Compute blended hard + soft distillation loss."""
+        T = self.temperature
+        # Hard loss (standard CE)
+        seq_len = min(student_logits.size(1), hard_targets.size(1))
+        hard_loss = F.cross_entropy(
+            student_logits[:, :seq_len].reshape(-1, student_logits.size(-1)),
+            hard_targets[:, :seq_len].reshape(-1),
+            ignore_index=-100,
+        )
+        # Soft loss (KL from EMA teacher)
+        with torch.no_grad():
+            teacher_out = self.ema_model(input_ids)
+            teacher_logits = teacher_out.logits if hasattr(teacher_out, "logits") else teacher_out[1]
+        t_seq = min(student_logits.size(1), teacher_logits.size(1))
+        soft_student = F.log_softmax(student_logits[:, :t_seq] / T, dim=-1)
+        soft_teacher = F.softmax(teacher_logits[:, :t_seq] / T, dim=-1)
+        kl_loss = F.kl_div(soft_student, soft_teacher, reduction="batchmean") * (T * T)
+        if not torch.isfinite(kl_loss):
+            return hard_loss
+        return self.alpha * hard_loss + (1 - self.alpha) * kl_loss
+# ═══════════════════════════════════════════════════════════
+# Cache invalidation
+# ═══════════════════════════════════════════════════════════
+def invalidate_all_caches(model):
     from chimera.quantization import BitLinear
+    raw = getattr(model, "_orig_mod", model)
+    for m in raw.modules():
         if isinstance(m, BitLinear):
             m.invalidate_packed()
+# ═══════════════════════════════════════════════════════════
+# Scheduler
+# ══════════════════════════════════════════════════════════��
+def create_scheduler(optimizer, max_steps, warmup_steps=200):
+    """Short warmup (200 steps) then cosine decay. Warmup=750 was too long."""
+    from torch.optim.lr_scheduler import LambdaLR
+    def lr_lambda(step):
+        if step < warmup_steps:
+            return step / max(1, warmup_steps)
+        progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
+        return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
+    return LambdaLR(optimizer, lr_lambda)
+# ═══════════════════════════════════════════════════════════
+# MAIN: apply()
+# ═══════════════════════════════════════════════════════════
 def apply(
+    model, max_steps=10000, lr=0.02, weight_decay=0.01,
+    warmup_steps=200, use_compile=False, use_ipex=True,
+    use_muon=True, use_mtp=True, use_distill=True,
+    mtp_heads=3, verbose=True,
+):
+    """Apply all turbo + revolutionary paradigms.
+    Returns: (model, optimizer, scheduler, extras)
+    where extras = dict with 'mtp_loss_fn', 'distiller', etc.
+    """
     cpu_info = detect_cpu_info()
     if verbose:
         print("=" * 65)
+        print("CHIMERA TURBO v9 — Revolutionary Training Paradigms")
         print("=" * 65)
         print(f"  Cores: {cpu_info['physical_cores']}  CPU: {cpu_info['capability']}")
     n_threads = configure_threading(cpu_info)
     if verbose:
         print(f"[TURBO-3] Threads: {n_threads}")
+    # ── P12: Muon optimizer ──
+    if use_muon:
+        optimizer = create_muon_optimizer(model, lr=lr, weight_decay=weight_decay)
+        if verbose:
+            n_2d = sum(p.numel() for p in model.parameters()
+                       if p.requires_grad and p.ndim == 2
+                       and not getattr(p, "_is_embed", False))
+            n_1d = sum(p.numel() for p in model.parameters()
+                       if p.requires_grad and (p.ndim < 2 or getattr(p, "_is_embed", False)))
+            print(f"[P12] Muon optimizer (lr={lr}, NS-5 orthogonalization)")
+            print(f"       Muon: {n_2d:,} params | AdamW fallback: {n_1d:,} params")
+    else:
+        from chimera_turbo_legacy import create_optimizer
+        optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay)
+    scheduler = create_scheduler(optimizer, max_steps, warmup_steps)
+    # ── P13: Multi-Token Prediction ──
+    extras = {}
+    if use_mtp:
+        raw = getattr(model, "_orig_mod", model)
+        h = raw.config["hidden_size"]
+        v = raw.config["vocab_size"]
+        mtp = MultiTokenPredictionLoss(h, v, n_future=mtp_heads)
+        extras["mtp"] = mtp
+        if verbose:
+            print(f"[P13] Multi-Token Prediction ({mtp_heads} heads → {mtp_heads}× gradient signal)")
+    # ── P14: EMA Self-Distillation ──
+    if use_distill:
+        distiller = EMASelfDistiller(model, decay=0.999, alpha=0.5, temperature=2.0)
+        extras["distiller"] = distiller
+        if verbose:
+            print(f"[P14] EMA Self-Distillation (α=0.5, T=2.0, decay=0.999)")
     if verbose:
+        if not cpu_info.get("tcmalloc"):
             print("  ⚠️  No tcmalloc — LD_PRELOAD=...libtcmalloc.so.4 for +15%")
         print("=" * 65)
+    return model, optimizer, scheduler, extras
+# ══════════���════════════════════════════════════════════════
+# Training step with all paradigms
+# ═══════════════════════════════════════════════════════════
+_nan_count = 0
 def training_step(
+    model, batch, optimizer, scheduler,
+    extras=None, grad_accum_steps=1, step=0,
+    max_grad_norm=1.0, autocast_dtype=None,
+    mtp_weight=0.3, distill_weight=0.5,
 ) -> float:
+    """Training step with Muon + MTP + EMA distillation.
+    Loss = distill_loss (blended hard+soft) + mtp_weight * mtp_aux_loss
     """
     global _nan_count
+    extras = extras or {}
     is_accum_step = (step + 1) % grad_accum_steps == 0
     ctx = torch.autocast(device_type="cpu", dtype=autocast_dtype) if autocast_dtype else nullcontext()
     with ctx:
         if isinstance(batch, dict):
+            input_ids = batch["input_ids"]
+            labels = batch.get("labels", input_ids)
+            outputs = model(input_ids, labels=labels)
         else:
             outputs = model(batch)
+            input_ids = batch
+            labels = batch
+        # ── Base loss ──
+        distiller = extras.get("distiller")
+        if distiller is not None and hasattr(outputs, "logits"):
+            # P14: distillation loss replaces raw CE
+            base_loss = distiller.distillation_loss(outputs.logits, labels, input_ids)
+        else:
+            base_loss = outputs.loss if hasattr(outputs, "loss") else outputs
+        # ── P13: MTP auxiliary loss ──
+        mtp = extras.get("mtp")
+        if mtp is not None and hasattr(outputs, "hidden_states") and outputs.hidden_states is not None:
+            mtp_loss = mtp(outputs.hidden_states, labels)
+            total_loss = base_loss + mtp_weight * mtp_loss
+        else:
+            total_loss = base_loss
+        loss_val = total_loss.item()
+    # ── NaN guard ──
     if not math.isfinite(loss_val):
         _nan_count += 1
         optimizer.zero_grad(set_to_none=True)
+        if _nan_count >= 5:
             for pg in optimizer.param_groups:
                 pg["lr"] *= 0.5
+            print(f"  [NaN] 5× — LR halved to {optimizer.param_groups[0]['lr']:.2e}")
             _nan_count = 0
         return loss_val
     _nan_count = 0
     if grad_accum_steps > 1:
+        total_loss = total_loss / grad_accum_steps
+    total_loss.backward()
+    # Sanitize grads
     for p in model.parameters():
         if p.grad is not None and not torch.isfinite(p.grad).all():
             p.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)
         optimizer.zero_grad(set_to_none=True)
         invalidate_all_caches(model)
+        # P14: update EMA teacher
+        if "distiller" in extras:
+            extras["distiller"].update(model)
     return loss_val