Lgr54HFi committed
Commit 6a7521a · verified · 1 Parent(s): edcdcb3

Upload chimera/training/hyper.py

Files changed (1)
  1. chimera/training/hyper.py +14 -7
chimera/training/hyper.py CHANGED
@@ -125,17 +125,19 @@ class ProgressiveLoopScheduler:
     With STE+AdamW (not MeZO), multi-loop training is affordable.
     Progressive schedule avoids instability from deep loops early on.
 
-    Default: loops=1 for 20%, loops=2 for 40%, loops=3 for 40%.
+    FIX: Old schedule (1→2→3 at 20%/60%/100%) was too aggressive —
+    with 5000 steps, loops=2 at step 1000 while the model is still at
+    loss=10. Now: loops=1 for 50% (stabilize), loops=2 for 30%, loops=3
+    for 20%. This gives the model time to learn basics before iterating.
     """
 
     def __init__(self, total_steps: int, max_loops: int = 3):
         self._total = total_steps
         self._max_loops = max_loops
-        # Schedule: (fraction_done_threshold, num_loops)
         self._schedule = [
-            (0.20, 1),  # First 20%: stabilize weights
-            (0.60, 2),  # Next 40%: learn to iterate
-            (1.01, min(3, max_loops)),  # Last 40%: deep refinement
+            (0.50, 1),  # First 50%: stabilize weights with single pass
+            (0.80, 2),  # Next 30%: learn to iterate
+            (1.01, min(3, max_loops)),  # Last 20%: deep refinement
         ]
 
     def get_loops(self, step: int) -> int:
@@ -151,6 +153,11 @@ def patch_training_loops(model, num_loops=1) -> None:
     if hasattr(model, "loop_controller"):
         model.loop_controller.loop_default = num_loops
         model.loop_controller.loop_min = 1
-        model.loop_controller.loop_max = max(num_loops, 3)  # ← allow up to 3
+        model.loop_controller.loop_max = max(num_loops, 3)
+        # FIX: Evolution modulation is very expensive on CPU (HDC projections,
+        # Hamming distance queries over 50K entries, episodic retrieval).
+        # With evo_every_n_layers=4 and 28 layers, that's 7 calls per forward.
+        # Set to 28 → evolution fires once per full pass (at layer 0 only),
+        # which is enough for the memory to modulate the input embedding.
     if hasattr(model, "evo_every_n_layers"):
-        model.evo_every_n_layers = max(model.evo_every_n_layers, 8)
+        model.evo_every_n_layers = max(model.evo_every_n_layers, 28)
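The body of get_loops is truncated in the hunk above. A minimal sketch of what the (threshold, num_loops) schedule implies, assuming get_loops compares step/total_steps against each fraction threshold in order (the class fields come from the diff; the method body here is an assumption, not the repo's actual code):

    def get_loops(self, step: int) -> int:
        # Fraction of training completed so far.
        frac = step / max(self._total, 1)
        # Return the loop count of the first schedule entry whose
        # threshold has not yet been reached.
        for threshold, loops in self._schedule:
            if frac < threshold:
                return min(loops, self._max_loops)
        return self._max_loops

Under the new schedule, get_loops(1000) with total_steps=5000 (20% done) returns 1; under the old (0.20, 0.60, 1.01) thresholds the same step already returned 2, which is exactly the instability the FIX comment calls out.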
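The layer loop that consumes evo_every_n_layers is not part of this diff. A sketch of the gating the new comment describes, assuming 0-based layer indices and an illustrative evolve_modulate hook (both names are hypothetical, not the repo's actual API):

    def forward_layers(model, hidden_states):
        for layer_idx, layer in enumerate(model.layers):
            # Fire evolution modulation every N layers. With 28 layers and
            # evo_every_n_layers=4 this hits layers 0, 4, ..., 24 (7 calls
            # per forward); with evo_every_n_layers=28 only layer 0 fires.
            if layer_idx % model.evo_every_n_layers == 0:
                hidden_states = model.evolve_modulate(hidden_states)
            hidden_states = layer(hidden_states)
        return hidden_states

With the gate at layer 0 only, the expensive CPU-side work (HDC projections, Hamming-distance queries over 50K entries, episodic retrieval) runs once per pass and still modulates the input embedding before any layer sees it.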