perf: tune chimera_turbo.py for 300-step convergence + throughput

All changes jointly target ≥1000 tok/s throughput and near-optimal loss by step 300:
apply() defaults:
- lr 0.02→0.012: the ternary STE clamp zone [-1, 1] causes overshoot at 0.02
- weight_decay 0.01→0.02: the BitNet SLM paper finds wd=0.05 optimal; 0.02
  balances that against Muon's NS regularization
- warmup_steps 200→30: 200 wastes 67% of a 300-step budget at a sub-optimal LR
- mtp_heads 3→0: each head adds Linear(256, 200073) = 51M params; 3 heads =
  153M extra params (4.4× the model), destroying cache residency + throughput
- llrd_decay 0.85→0.90: 0.85^27 ≈ 0.012 nearly freezes the bottom layers;
  0.90^27 ≈ 0.058 gives roughly 5× more gradient at the bottom, critical for
  300 steps (both calculations are checked in the sketch after this list)
- grokfast_alpha 0.98→0.95: EMA window ~50 → ~20 steps, better for short runs
- grokfast_lambda 2.0→1.5: reduce instability risk with Muon NS updates
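
The mtp_heads and llrd_decay numbers above can be sanity-checked directly; the
snippet below uses only the figures quoted in this list (hidden=256,
vocab=200073, exponent 27) and is not code from chimera_turbo.py:

```python
# Sanity check of the mtp_heads and llrd_decay figures quoted above.
hidden, vocab, depth = 256, 200073, 27

per_head = hidden * vocab                  # one MTP head: 51,218,688 ≈ 51M params
print(per_head, 3 * per_head)              # 3 heads ≈ 153M extra params

print(0.85 ** depth, 0.90 ** depth)        # ≈ 0.012 vs ≈ 0.058 bottom-layer LR factor
print(round(0.90 ** depth / 0.85 ** depth, 1))  # ≈ 4.7, i.e. roughly 5x more gradient
```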
TokenTriage:
- select_ratio 0.6→0.50: focus on the top 50% most informative tokens
- floor_weight 0.1→0.15: ensure a minimum signal from all tokens
- warmup_steps 500→30: the original never activates in a 300-step run
  (a stand-alone sketch of the weighting follows this list)
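
The weighting scheme itself is not part of this diff; purely as an illustration
of what select_ratio and floor_weight mean, a minimal stand-alone sketch (not
the actual TokenTriage.compute_weights) might look like this:

```python
import torch

def triage_weights(per_token_loss, select_ratio=0.50, floor_weight=0.15):
    # Hypothetical illustration: the top select_ratio of tokens (ranked by
    # per-token loss) keep full weight, everything else is floored.
    with torch.no_grad():
        k = max(1, int(select_ratio * per_token_loss.numel()))
        threshold = torch.topk(per_token_loss.flatten(), k).values.min()
        return torch.where(per_token_loss >= threshold,
                           torch.ones_like(per_token_loss),
                           torch.full_like(per_token_loss, floor_weight))
```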
PlateauBreaker:
- patience 200→60: the original never fires within 300 steps
- variance_threshold 0.02→0.01: tighter detection for a converging loss
- lr_multiplier 2.0→1.8: gentler burst with Muon
- burst_steps 50→20: controlled escape window (intended behavior sketched below)
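
The breaker's internals are also outside this diff; the sketch below is a
hypothetical stand-in for how patience, variance_threshold, lr_multiplier and
burst_steps are intended to interact (class and method names are invented for
illustration only):

```python
from collections import deque

class PlateauBreakerSketch:
    """Hypothetical stand-in showing the patience/variance/burst semantics."""
    def __init__(self, patience=60, variance_threshold=0.01,
                 lr_multiplier=1.8, burst_steps=20):
        self.losses = deque(maxlen=patience)
        self.var_threshold = variance_threshold
        self.lr_mult = lr_multiplier
        self.burst_steps = burst_steps
        self.burst_remaining = 0

    def step(self, loss):
        """Return the LR scale to apply this step (1.0 = no burst active)."""
        self.losses.append(loss)
        if self.burst_remaining > 0:
            self.burst_remaining -= 1
            return self.lr_mult
        if len(self.losses) == self.losses.maxlen:
            mean = sum(self.losses) / len(self.losses)
            var = sum((x - mean) ** 2 for x in self.losses) / len(self.losses)
            if var < self.var_threshold:      # loss has flat-lined
                self.burst_remaining = self.burst_steps
                self.losses.clear()
        return 1.0
```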
Scheduler:
- cosine floor 0.01→0.05: keep the LR active through the final steps
  (numeric check below)
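
A quick numeric check of what the higher floor changes at the tail of a
300-step run, using the same lr_lambda as in the diff below:

```python
import math

def lr_lambda(step, max_steps=300, warmup_steps=30):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    return max(0.05, 0.5 * (1.0 + math.cos(math.pi * progress)))

print(lr_lambda(150))   # ≈ 0.59: mid-run, the floor has no effect yet
print(lr_lambda(290))   # 0.05: floor already active near the end
print(lr_lambda(300))   # 0.05 instead of 0.01 under the old floor
```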
configure_threading:
- Respect OMP_NUM_THREADS from launch_turbo.sh instead of overriding
chimera_turbo.py  +55 -17

@@ -10,6 +10,13 @@ Interaction-audited paradigm stack. Every paradigm verified cumulative.
 P17 Batch Metabolism — hard sequences weighted higher
 P18 Grokfast-EMA — amplify slow grads (1D params ONLY — NS cancels on 2D)
 P19 LLRD — layer-wise LR decay for ternary
+
+v12.1 — Tuned for 300-step convergence + ≥1000 tok/s on i7-14700T:
+  - MTP disabled (vocab/hidden=781:1 makes heads noisy + destroys cache)
+  - TokenTriage warmup=30, PlateauBreaker patience=60
+  - Grokfast alpha=0.95/lambda=1.5 for short-horizon training
+  - Scheduler cosine floor raised to 0.05 (keep learning through step 300)
+  - configure_threading respects shell-level OMP_NUM_THREADS (P-core pinning)
 """
 
 import math
@@ -43,9 +50,15 @@ def detect_cpu_info():
 
 
 def configure_threading(cpu_info, reserve=1):
-    n = max(1, cpu_info["physical_cores"] - reserve)
+    # Respect OMP_NUM_THREADS set by launch_turbo.sh (P-core pinning).
+    # Only auto-configure if the env var wasn't set externally.
+    env_threads = os.environ.get("OMP_NUM_THREADS")
+    if env_threads is not None:
+        n = int(env_threads)
+    else:
+        n = max(1, cpu_info["physical_cores"] - reserve)
+    os.environ["OMP_NUM_THREADS"] = str(n)
     torch.set_num_threads(n)
-    os.environ["OMP_NUM_THREADS"] = str(n)
     return n
 
 
@@ -187,13 +200,18 @@ class MultiTokenPredictionLoss(nn.Module):
 # ───────────────────────────────────────────────────────────
 
 class TokenTriage:
-    def __init__(self, ema_decay=0.99, select_ratio=0.6, floor_weight=0.1):
+    """Focus gradient signal on the most informative tokens.
+
+    v12.1: warmup=30, select_ratio=0.50, floor=0.15 for 300-step runs.
+    Original warmup=500 never activated in short training.
+    """
+    def __init__(self, ema_decay=0.99, select_ratio=0.50, floor_weight=0.15):
         self.ema_decay = ema_decay
         self.select_ratio = select_ratio
         self.floor_weight = floor_weight
         self._loss_ema = None
         self._step = 0
-        self.warmup_steps = 500
+        self.warmup_steps = 30  # Must be ≤ total steps; was 500
 
     def compute_weights(self, per_token_loss):
         with torch.no_grad():
@@ -218,8 +236,13 @@ class TokenTriage:
 # ───────────────────────────────────────────────────────────
 
 class PlateauBreaker:
-    def __init__(self, patience=200, variance_threshold=0.02,
-                 lr_multiplier=2.0, burst_steps=50):
+    """Adaptive LR burst when loss stagnates.
+
+    v12.1: patience=60, var_threshold=0.01, mult=1.8, burst=20 for 300-step runs.
+    Original patience=200 never fired in short training.
+    """
+    def __init__(self, patience=60, variance_threshold=0.01,
+                 lr_multiplier=1.8, burst_steps=20):
        self.patience = patience
        self.var_threshold = variance_threshold
        self.lr_mult = lr_multiplier
@@ -301,13 +324,18 @@ def invalidate_all_caches(model):
             m.invalidate_packed()
 
 
-def create_scheduler(optimizer, max_steps, warmup_steps=200):
+def create_scheduler(optimizer, max_steps, warmup_steps=30):
+    """Cosine schedule with warmup.
+
+    v12.1: floor raised from 0.01 to 0.05 so LR stays active through
+    the final steps of a 300-step run. warmup default lowered to 30.
+    """
     from torch.optim.lr_scheduler import LambdaLR
     def lr_lambda(step):
         if step < warmup_steps:
             return step / max(1, warmup_steps)
         progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
-        return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
+        return max(0.05, 0.5 * (1.0 + math.cos(math.pi * progress)))
     return LambdaLR(optimizer, lr_lambda)
 
 
@@ -315,14 +343,24 @@ def create_scheduler(optimizer, max_steps, warmup_steps=200):
 # apply()
 # ───────────────────────────────────────────────────────────
 
-def apply(model, max_steps=10000, lr=0.02, weight_decay=0.01,
-          warmup_steps=200, use_compile=False, mtp_heads=3,
-          llrd_decay=0.85, grokfast_alpha=0.98, grokfast_lambda=2.0,
+def apply(model, max_steps=10000, lr=0.012, weight_decay=0.02,
+          warmup_steps=30, use_compile=False, mtp_heads=0,
+          llrd_decay=0.90, grokfast_alpha=0.95, grokfast_lambda=1.5,
           verbose=True):
+    """Configure the GENESIS paradigm stack.
+
+    v12.1 defaults tuned for 300-step convergence on i7-14700T:
+      lr=0.012    (was 0.02; ternary STE clamp zone [-1,1])
+      wd=0.02     (was 0.01; BitNet SLM paper: wd=0.05 optimal)
+      warmup=30   (was 200; 10% of 300-step budget)
+      mtp_heads=0 (was 3; vocab/hidden=781:1 makes MTP noisy + slow)
+      llrd=0.90   (was 0.85; 0.85^27≈0.012 freezes bottom layers)
+      grokfast: a=0.95/l=1.5 (was 0.98/2.0; shorter EMA window)
+    """
     cpu_info = detect_cpu_info()
     if verbose:
         print("=" * 65)
-        print("CHIMERA GENESIS v12 —
+        print("CHIMERA GENESIS v12.1 — Tuned for 300-step convergence")
         print("=" * 65)
         print(f" CPU: {cpu_info['capability']} Cores: {cpu_info['physical_cores']}")
 
@@ -357,14 +395,14 @@ def apply(model, max_steps=10000, lr=0.02, weight_decay=0.01,
     else:
         print(f"[P13] MTP disabled (vocab/hidden ratio too high for CPU)")
 
-    extras["triage"] = TokenTriage(ema_decay=0.99, select_ratio=0.6, floor_weight=0.1)
+    extras["triage"] = TokenTriage(ema_decay=0.99, select_ratio=0.50, floor_weight=0.15)
     if verbose:
-        print(f"[P15] Token Triage (
+        print(f"[P15] Token Triage (select=0.50, floor=0.15, warmup=30)")
 
-    extras["plateau"] = PlateauBreaker(patience=200, variance_threshold=0.02,
-                                       lr_multiplier=2.0, burst_steps=50)
+    extras["plateau"] = PlateauBreaker(patience=60, variance_threshold=0.01,
+                                       lr_multiplier=1.8, burst_steps=20)
     if verbose:
-        print(f"[P16] Plateau Breaker (
+        print(f"[P16] Plateau Breaker (patience=60, x1.8 burst, 20 steps)")
 
     extras["grokfast"] = GrokfastEMA(alpha=grokfast_alpha, lamb=grokfast_lambda)
     if verbose: