Upload chimera/training/loops.py

chimera/training/loops.py  +13 -9
@@ -53,26 +53,30 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
     use_compile = getattr(args, "compile", False)
 
-    #
-    #
-    #
-    #
-    #
+    # Muon needs higher LR than AdamW: NS orthogonalization normalizes
+    # update direction, so LR controls step SIZE not direction stability.
+    # 0.02 is the standard Muon LR; CLI default 1.5e-3 was for AdamW.
+    # Warmup shortened: NS already provides early stability.
+    #
+    # MTP DISABLED (mtp_heads=0): lm_head (256->200073) costs 4x the entire
+    # 28-layer stack. Each MTP head doubles that. At loss=13 the model can't
+    # predict token+1, so token+2 is noise. Re-enable once loss < 5.
+    muon_lr = max(args.lr, 0.02)
+    muon_warmup = min(args.warmup, 100)
     model, optimizer, scheduler, extras = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
-        lr=args.lr,
+        lr=muon_lr,
         weight_decay=0.01,
-        warmup_steps=args.warmup,
+        warmup_steps=muon_warmup,
         use_compile=use_compile,
-        mtp_heads=
+        mtp_heads=0,
         llrd_decay=0.92,
         grokfast_alpha=0.98,
         grokfast_lambda=1.0,
     )
     model.train()
 
-    # Progressive looping
     loop_sched = ProgressiveLoopScheduler(args.max_steps, max_loops=3)
     cur_loops = 1
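The "NS orthogonalization" the new comment leans on is Muon's Newton-Schulz iteration. chimera_turbo.apply is not shown in this file, so as a rough illustration only, here is a minimal sketch of a Muon-style update; the quintic coefficients are the ones from the public Muon reference implementation, and muon_step is a hypothetical helper, not this repo's code:

import torch

def newton_schulz(G: torch.Tensor, steps: int = 5) -> torch.Tensor:
    """Quintic Newton-Schulz iteration driving G toward an orthogonal matrix."""
    a, b, c = 3.4445, -4.7750, 2.0315   # coefficients from the public Muon reference
    X = G / (G.norm() + 1e-7)           # scale so the iteration converges
    transposed = X.size(0) > X.size(1)
    if transposed:
        X = X.T                          # iterate on the wide orientation
    for _ in range(steps):
        A = X @ X.T
        X = a * X + (b * A + c * (A @ A)) @ X
    return X.T if transposed else X

def muon_step(param, momentum_buf, grad, lr=0.02, beta=0.95):
    # Heavy-ball momentum, then orthogonalize the update direction.
    momentum_buf.mul_(beta).add_(grad)
    update = newton_schulz(momentum_buf)
    # The direction comes out (near-)orthogonal, so its scale is roughly
    # fixed regardless of gradient magnitude: lr alone sets step SIZE.
    # That is why Muon's ~0.02 sits far above AdamW-style values like 1.5e-3.
    param.add_(update, alpha=-lr)

Muon applies this per 2D weight matrix; embeddings and the lm_head are typically still handled by AdamW, which is presumably why the diff clamps only the optimizer LR via max(args.lr, 0.02) rather than replacing the CLI value outright.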
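The cost claim behind mtp_heads=0 can be sanity-checked with rough per-token matmul counts. The d=256, vocab=200073, and 28-layer figures come from the comment itself; the block shape (standard attention projections plus a 4x MLP) is an assumption, so treat the ratio as order-of-magnitude only:

# Rough forward FLOPs per token (one multiply-add = 2 FLOPs).
# d, vocab, layers come from the diff's comment; the block shape is assumed.
d, vocab, layers = 256, 200_073, 28

lm_head   = 2 * d * vocab                 # 256 -> 200073 projection
attn_proj = 2 * (4 * d * d)               # q, k, v, o projections
mlp       = 2 * (2 * d * (4 * d))         # up + down, 4x expansion
stack     = layers * (attn_proj + mlp)

print(f"lm_head: {lm_head / 1e6:6.1f} MFLOPs/token")   # ~102.4
print(f"stack:   {stack / 1e6:6.1f} MFLOPs/token")     # ~44.0
print(f"ratio:   {lm_head / stack:.1f}x")              # ~2.3x under these assumptions

A leaner block (smaller MLP ratio, grouped-query attention) pushes the ratio toward the ~4x the comment cites. Either way the vocabulary projection dominates at d=256, and each additional MTP head adds another full lm_head-sized projection, which is the rationale for keeping mtp_heads=0 until the base next-token head is learning.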