Upload chimera/training/hyper.py

chimera/training/hyper.py  +14 -7
@@ -125,17 +125,19 @@ class ProgressiveLoopScheduler:
     With STE+AdamW (not MeZO), multi-loop training is affordable.
     Progressive schedule avoids instability from deep loops early on.
 
-
+    FIX: Old schedule (1→2→3 at 20%/60%/100%) was too aggressive —
+    with 5000 steps, loops=2 at step 1000 while the model is still at
+    loss=10. Now: loops=1 for 50% (stabilize), loops=2 for 30%, loops=3
+    for 20%. This gives the model time to learn basics before iterating.
     """
 
     def __init__(self, total_steps: int, max_loops: int = 3):
         self._total = total_steps
         self._max_loops = max_loops
-        # Schedule: (fraction_done_threshold, num_loops)
         self._schedule = [
-            (0.20, 1),
-            (0.60, 2),
-            (1.01, min(3, max_loops)),  # Last
+            (0.50, 1),  # First 50%: stabilize weights with single pass
+            (0.80, 2),  # Next 30%: learn to iterate
+            (1.01, min(3, max_loops)),  # Last 20%: deep refinement
         ]
 
     def get_loops(self, step: int) -> int:
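The body of get_loops falls outside the hunk. A minimal sketch of how it would consume the (fraction_done_threshold, num_loops) pairs, assuming a simple first-match scan; the 1.01 threshold on the last entry keeps the final step inside the deepest bucket:

# Hypothetical sketch; the real get_loops body is not shown in this diff.
SCHEDULE = [(0.50, 1), (0.80, 2), (1.01, 3)]  # (fraction_done_threshold, num_loops)

def get_loops(step: int, total_steps: int) -> int:
    fraction_done = step / max(total_steps, 1)
    for threshold, num_loops in SCHEDULE:
        if fraction_done < threshold:
            return num_loops
    return SCHEDULE[-1][1]

# With total_steps=5000: steps 0-2499 -> 1 loop, 2500-3999 -> 2, 4000-4999 -> 3.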
@@ -151,6 +153,11 @@ def patch_training_loops(model, num_loops=1) -> None:
     if hasattr(model, "loop_controller"):
         model.loop_controller.loop_default = num_loops
         model.loop_controller.loop_min = 1
-        model.loop_controller.loop_max = max(num_loops, 3)
+        model.loop_controller.loop_max = max(num_loops, 3)
+    # FIX: Evolution modulation is very expensive on CPU (HDC projections,
+    # Hamming distance queries over 50K entries, episodic retrieval).
+    # With evo_every_n_layers=4 and 28 layers, that's 7 calls per forward.
+    # Set to 28 → evolution fires once per full pass (at layer 0 only),
+    # which is enough for the memory to modulate the input embedding.
     if hasattr(model, "evo_every_n_layers"):
-        model.evo_every_n_layers = max(model.evo_every_n_layers, 4)
+        model.evo_every_n_layers = max(model.evo_every_n_layers, 28)
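The per-forward call count in the comment is easy to verify. A quick sketch, assuming the gate is a layer_idx % evo_every_n_layers == 0 check (an assumption; the actual condition lives in the model code, not this file):

# Assumed gating: evolution fires on layers where idx % evo_every_n_layers == 0.
num_layers = 28
for evo_every in (4, 28):
    calls = sum(1 for idx in range(num_layers) if idx % evo_every == 0)
    print(f"evo_every_n_layers={evo_every}: {calls} evolution calls per forward")
# evo_every_n_layers=4: 7 calls (layers 0, 4, 8, ..., 24)
# evo_every_n_layers=28: 1 call (layer 0 only)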
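Taken together, a hypothetical wiring of the two pieces in a training loop: ProgressiveLoopScheduler and patch_training_loops come from this file, while model, total_steps, and the step body are stand-ins.

# Hypothetical usage; `model` and the step body are placeholders.
total_steps = 5000
scheduler = ProgressiveLoopScheduler(total_steps, max_loops=3)

current_loops = None
for step in range(total_steps):
    num_loops = scheduler.get_loops(step)
    if num_loops != current_loops:
        # Re-patch only on transitions (steps 0, 2500, and 4000 here).
        patch_training_loops(model, num_loops=num_loops)
        current_loops = num_loops
    # ... one STE+AdamW optimization step ...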
|