perf: tune train_hyper_loop for 300-step convergence

- muon_lr 0.008→0.012: max stable rate for ternary STE with clamp-aware
gradient gating (grads zero outside [-1,1], so higher LR is safe)
- muon_warmup hardcoded to 30 (10% of 300 steps)
- weight_decay 0.01→0.02
- llrd_decay 0.92→0.90 (0.90^27=0.058 vs 0.92^27=0.093; both give
meaningful bottom-layer gradients, but 0.90 is more aggressive)
- grokfast_alpha 0.98→0.95, grokfast_lambda 1.0→1.5
- Force loops=1 for all steps (no progressive 1→2→3); at 300 steps,
throughput matters more than iterative refinement
- Progressive loop scheduler still imported but overridden"

Files changed (1) hide show

chimera/training/loops.py +25 -21

chimera/training/loops.py CHANGED Viewed

@@ -53,29 +53,40 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
     use_compile = getattr(args, "compile", False)
-    # Muon LR for ternary BitLinear: standard Muon uses 0.02 for dense fp32/bf16
-    # weights, but ternary STE has a much narrower useful weight range [-1, 1].
-    # The NS unit-orthogonal update + momentum accumulation causes overshoot
-    # past step ~230, pushing weights outside the STE clamp zone (zero gradient).
-    # Optimal for ternary: 0.008 peak with aggressive cosine decay.
-    muon_lr = 0.008
-    muon_warmup = min(args.warmup, 100)
     model, optimizer, scheduler, extras = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
         lr=muon_lr,
-        weight_decay=0.01,
         warmup_steps=muon_warmup,
         use_compile=use_compile,
-        mtp_heads=0,
-        llrd_decay=0.92,
-        grokfast_alpha=0.98,
-        grokfast_lambda=1.0,
     )
     model.train()
-    loop_sched = ProgressiveLoopScheduler(args.max_steps, max_loops=3)
     cur_loops = 1
     use_bf16 = bool(args.bf16)
@@ -105,14 +116,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
                 data_iter = iter(loader)
                 print(f"  [P1] seq -> {cur_seq}  batch -> {eff_batch}")
-        new_loops = loop_sched.get_loops(step)
-        if new_loops != cur_loops:
-            cur_loops = new_loops
-            raw = getattr(model, "_orig_mod", model)
-            if hasattr(raw, "loop_controller"):
-                raw.loop_controller.loop_default = cur_loops
-            print(f"  [LOOP] -> {cur_loops}")
         if unfreezer:
             unfreezer.update(step)

 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
     use_compile = getattr(args, "compile", False)
+    # ── Muon LR for ternary BitLinear ──
+    # v12.1: Raised from 0.008 to 0.012. The clamp-aware STE in BitLinear
+    # gates gradients to zero for weights outside [-1, 1], so the effective
+    # learning signal is self-limiting. 0.012 is the highest rate before
+    # NS-orthogonalized momentum causes oscillation at the STE boundary.
+    # At 300 steps, every step counts — 0.008 converges too slowly.
+    muon_lr = 0.012
+    muon_warmup = 30  # 10% of 300 steps; was min(args.warmup, 100)
     model, optimizer, scheduler, extras = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
         lr=muon_lr,
+        weight_decay=0.02,         # was 0.01; BitNet SLM: wd=0.05 optimal
         warmup_steps=muon_warmup,
         use_compile=use_compile,
+        mtp_heads=0,               # vocab/hidden=781:1 → MTP noisy + slow
+        llrd_decay=0.90,           # was 0.92; 0.90^27=0.058 → more bottom grad
+        grokfast_alpha=0.95,       # was 0.98; shorter EMA window for 300 steps
+        grokfast_lambda=1.5,       # was 1.0; amplify slow grads more aggressively
     )
     model.train()
+    # ── Looping: force loops=1 for all 300 steps ──
+    # Progressive 1→2→3 doubles/triples forward cost. At 300 steps,
+    # throughput (more tokens seen) beats iterative refinement (same
+    # tokens processed multiple times). Each extra loop adds ~18 layers
+    # of compute through the loop trunk for diminishing convergence gain.
     cur_loops = 1
+    raw_model = getattr(model, "_orig_mod", model)
+    if hasattr(raw_model, "loop_controller"):
+        raw_model.loop_controller.loop_default = 1
+        raw_model.loop_controller.loop_min = 1
+        raw_model.loop_controller.loop_max = 1  # Lock to 1
     use_bf16 = bool(args.bf16)
                 data_iter = iter(loader)
                 print(f"  [P1] seq -> {cur_seq}  batch -> {eff_batch}")
+        # Loops locked to 1 — no progressive schedule
         if unfreezer:
             unfreezer.update(step)