Commit 9d8c566 (verified) · parent 8b16586 · committed by Lgr54HFi

perf: tune train_hyper_loop for 300-step convergence

- muon_lr 0.008 → 0.012: the highest stable rate for ternary STE with
  clamp-aware gradient gating (gradients are zero outside [-1, 1], so a
  higher LR is safe; see the sketch after this list)
- muon_warmup hardcoded to 30 (10% of 300 steps)
- weight_decay 0.01 → 0.02
- llrd_decay 0.92 → 0.90 (0.90^27 ≈ 0.058 vs 0.92^27 ≈ 0.105; both give
  meaningful bottom-layer gradients, but 0.90 decays more aggressively)
- grokfast_alpha 0.98 → 0.95, grokfast_lambda 1.0 → 1.5
- Force loops=1 for all steps (no progressive 1→2→3); at 300 steps,
  throughput matters more than iterative refinement
- Progressive loop scheduler still imported but overridden
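
For context on the muon_lr bullet: the safety claim rests on BitLinear's
backward pass zeroing gradients for any latent weight that has drifted
outside [-1, 1]. The BitLinear implementation itself is not part of this
diff, so the snippet below is only a minimal sketch of that gating
behaviour; the class name TernarySTEGated, the 0.5 quantization threshold,
and the exact clamp test are illustrative assumptions, not chimera's
actual code.

    import torch

    class TernarySTEGated(torch.autograd.Function):
        """Straight-through estimator with clamp-aware gradient gating."""

        @staticmethod
        def forward(ctx, w):
            ctx.save_for_backward(w)
            # Quantize latent weights to {-1, 0, +1}; 0.5 is an assumed threshold.
            return torch.sign(w) * (w.abs() > 0.5).to(w.dtype)

        @staticmethod
        def backward(ctx, grad_out):
            (w,) = ctx.saved_tensors
            # Straight-through, but gated: weights outside [-1, 1] receive
            # zero gradient and simply stop moving, so a large step cannot
            # push them further out of range.
            return grad_out * (w.abs() <= 1.0).to(grad_out.dtype)

Because the gate caps how much of each weight tensor can receive updates,
raising the peak LR mainly speeds up the in-range weights rather than
amplifying divergence; that is the sense in which 0.012 can be stable where
the 0.02 dense-weight Muon default is not.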

Files changed (1):
  chimera/training/loops.py  +25 -21
chimera/training/loops.py CHANGED
@@ -53,29 +53,40 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
     use_compile = getattr(args, "compile", False)
 
-    # Muon LR for ternary BitLinear: standard Muon uses 0.02 for dense fp32/bf16
-    # weights, but ternary STE has a much narrower useful weight range [-1, 1].
-    # The NS unit-orthogonal update + momentum accumulation causes overshoot
-    # past step ~230, pushing weights outside the STE clamp zone (zero gradient).
-    # Optimal for ternary: 0.008 peak with aggressive cosine decay.
-    muon_lr = 0.008
-    muon_warmup = min(args.warmup, 100)
+    # ── Muon LR for ternary BitLinear ──
+    # v12.1: raised from 0.008 to 0.012. The clamp-aware STE in BitLinear
+    # gates gradients to zero for weights outside [-1, 1], so the effective
+    # learning signal is self-limiting. 0.012 is the highest rate before
+    # NS-orthogonalized momentum causes oscillation at the STE boundary.
+    # At 300 steps every step counts; 0.008 converges too slowly.
+    muon_lr = 0.012
+    muon_warmup = 30  # 10% of 300 steps; was min(args.warmup, 100)
 
     model, optimizer, scheduler, extras = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
         lr=muon_lr,
-        weight_decay=0.01,
+        weight_decay=0.02,    # was 0.01; BitNet SLM: wd=0.05 optimal
         warmup_steps=muon_warmup,
         use_compile=use_compile,
-        mtp_heads=0,
-        llrd_decay=0.92,
-        grokfast_alpha=0.98,
-        grokfast_lambda=1.0,
+        mtp_heads=0,          # vocab/hidden=781:1 → MTP noisy + slow
+        llrd_decay=0.90,      # was 0.92; 0.90^27 ≈ 0.058 → more aggressive decay
+        grokfast_alpha=0.95,  # was 0.98; shorter EMA window for 300 steps
+        grokfast_lambda=1.5,  # was 1.0; amplify slow grads more aggressively
     )
     model.train()
 
-    loop_sched = ProgressiveLoopScheduler(args.max_steps, max_loops=3)
+    # ── Looping: force loops=1 for all 300 steps ──
+    # Progressive 1→2→3 doubles/triples the forward cost. At 300 steps,
+    # throughput (more tokens seen) beats iterative refinement (the same
+    # tokens processed multiple times). Each extra loop adds ~18 layers
+    # of compute through the loop trunk for diminishing convergence gain.
     cur_loops = 1
+    raw_model = getattr(model, "_orig_mod", model)
+    if hasattr(raw_model, "loop_controller"):
+        raw_model.loop_controller.loop_default = 1
+        raw_model.loop_controller.loop_min = 1
+        raw_model.loop_controller.loop_max = 1  # lock to 1
 
     use_bf16 = bool(args.bf16)
@@ -105,14 +116,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
             data_iter = iter(loader)
             print(f" [P1] seq -> {cur_seq} batch -> {eff_batch}")
 
-        new_loops = loop_sched.get_loops(step)
-        if new_loops != cur_loops:
-            cur_loops = new_loops
-            raw = getattr(model, "_orig_mod", model)
-            if hasattr(raw, "loop_controller"):
-                raw.loop_controller.loop_default = cur_loops
-                print(f" [LOOP] -> {cur_loops}")
-
+        # Loops locked to 1; no progressive schedule
         if unfreezer:
             unfreezer.update(step)
 
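
A closing note on the two Grokfast knobs, since the filter itself lives
inside chimera_turbo.apply and is not shown in this diff. Below is a
minimal sketch of the standard Grokfast-EMA gradient filter that
grokfast_alpha and grokfast_lambda control; the function name and the
state handling are illustrative, not the project's actual API. alpha sets
the EMA window (roughly 1/(1 - alpha) steps, so 0.95 tracks a ~20-step
window versus ~50 for 0.98) and lambda scales how strongly the slow
gradient component is added back.

    import torch

    def grokfast_ema_filter(params, ema_state, alpha=0.95, lam=1.5):
        """Amplify the slow-varying component of each gradient (Grokfast-EMA)."""
        for p in params:
            if p.grad is None:
                continue
            # Per-parameter EMA of the raw gradient.
            ema = ema_state.setdefault(p, torch.zeros_like(p.grad))
            ema.mul_(alpha).add_(p.grad, alpha=1.0 - alpha)
            # g <- g + lambda * ema, applied between backward() and step().
            p.grad.add_(ema, alpha=lam)

For the llrd_decay bullet, the arithmetic assumes the usual layer-wise LR
decay scheme in which layer i runs at lr * decay ** (top - i); with 27
decay steps between the top and bottom layers, 0.92 ** 27 ≈ 0.105 and
0.90 ** 27 ≈ 0.058, so the bottom of the stack still trains at about 10.5%
and 5.8% of the peak rate respectively.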