feat: train_hyper_loop with progressive looping, evolution loss feedback, no progressive_unfreeze default\n\nActivates dormant ch1mera paradigms:\n1. Progressive looping: 1→2→3 Parcae loops during training\n2. Evolution receives prev_loss for surprise-based memory writes\n3. progressive_unfreeze disabled by default (all layers train from start)\n4. Logs loop count and NaN-safe averaging"
Browse files- chimera/training/loops.py +33 -10
chimera/training/loops.py
CHANGED
|
@@ -10,6 +10,7 @@ import torch
|
|
| 10 |
import chimera_turbo
|
| 11 |
|
| 12 |
from .common import cosine_lr, save_final_checkpoint, save_training_checkpoint
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def train_fast_loop(args, model, config, loader, compute_loss) -> str:
|
|
@@ -149,18 +150,20 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
|
|
| 149 |
def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
|
| 150 |
use_compile = getattr(args, "compile", False)
|
| 151 |
|
| 152 |
-
# Let chimera_turbo.apply() use its v8 BitNet-aligned defaults:
|
| 153 |
-
# lr=1.5e-3, wd=0.01, β=(0.9,0.98), warmup=750
|
| 154 |
-
# Only override lr if the user explicitly set it via CLI
|
| 155 |
model, optimizer, scheduler = chimera_turbo.apply(
|
| 156 |
model,
|
| 157 |
max_steps=args.max_steps,
|
| 158 |
-
lr=args.lr,
|
| 159 |
-
# weight_decay and warmup_steps use chimera_turbo defaults (0.01, 750)
|
| 160 |
use_compile=use_compile,
|
| 161 |
use_ipex=True,
|
| 162 |
)
|
| 163 |
model.train()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
print(f"[P5] Train mode: BitLinear STE (clamp-aware, NaN-safe)")
|
| 165 |
use_bf16 = bool(args.bf16)
|
| 166 |
|
|
@@ -168,6 +171,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
|
|
| 168 |
log_f = open(os.path.join(args.output_dir, "log_hyper.jsonl"), "w")
|
| 169 |
step = 0
|
| 170 |
total_loss = 0.0
|
|
|
|
| 171 |
best_loss = float("inf")
|
| 172 |
toks = 0
|
| 173 |
t0 = time.time()
|
|
@@ -179,10 +183,11 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
|
|
| 179 |
data_iter = iter(loader)
|
| 180 |
|
| 181 |
print(f"\n{'=' * 65}")
|
| 182 |
-
print(f"Training eff_batch={eff_batch} seq={cur_seq}")
|
| 183 |
print(f"{'=' * 65}\n")
|
| 184 |
|
| 185 |
while step < args.max_steps:
|
|
|
|
| 186 |
if grow:
|
| 187 |
ns = grow.get_seq_len(step)
|
| 188 |
if ns != cur_seq:
|
|
@@ -194,8 +199,21 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
|
|
| 194 |
)
|
| 195 |
data_iter = iter(loader)
|
| 196 |
print(f" [P1] seq → {cur_seq} batch → {eff_batch}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
if unfreezer:
|
| 198 |
unfreezer.update(step)
|
|
|
|
| 199 |
try:
|
| 200 |
batch = next(data_iter)
|
| 201 |
except StopIteration:
|
|
@@ -209,31 +227,36 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
|
|
| 209 |
)
|
| 210 |
|
| 211 |
cur_lr = optimizer.param_groups[0]["lr"]
|
| 212 |
-
|
|
|
|
|
|
|
| 213 |
toks += batch["input_ids"].numel()
|
| 214 |
step += 1
|
| 215 |
|
| 216 |
if step % args.log_every == 0:
|
| 217 |
dt = time.time() - t0
|
| 218 |
-
avg = total_loss /
|
| 219 |
ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
|
| 220 |
tps = toks / dt if dt > 0 else 0
|
| 221 |
eta = (args.max_steps - step) / (step / dt) / 3600 if dt > 0 else 0
|
| 222 |
log_f.write(
|
| 223 |
json.dumps({
|
| 224 |
-
"step": step,
|
|
|
|
| 225 |
"ppl": round(ppl, 2) if math.isfinite(ppl) else None,
|
| 226 |
"lr": round(cur_lr, 6), "tok/s": round(tps),
|
| 227 |
"seq_len": cur_seq, "eff_batch": eff_batch,
|
|
|
|
| 228 |
}) + "\n"
|
| 229 |
)
|
| 230 |
log_f.flush()
|
| 231 |
print(
|
| 232 |
f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | ppl {ppl:>8.2f} "
|
| 233 |
-
f"| lr {cur_lr:.2e} | {tps:,.0f} tok/s | seq {cur_seq} | ETA {eta:.1f}h"
|
| 234 |
)
|
| 235 |
best_loss = min(best_loss, avg) if math.isfinite(avg) else best_loss
|
| 236 |
total_loss = 0.0
|
|
|
|
| 237 |
toks = 0
|
| 238 |
t0 = time.time()
|
| 239 |
|
|
|
|
| 10 |
import chimera_turbo
|
| 11 |
|
| 12 |
from .common import cosine_lr, save_final_checkpoint, save_training_checkpoint
|
| 13 |
+
from .hyper import ProgressiveLoopScheduler
|
| 14 |
|
| 15 |
|
| 16 |
def train_fast_loop(args, model, config, loader, compute_loss) -> str:
|
|
|
|
| 150 |
def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
|
| 151 |
use_compile = getattr(args, "compile", False)
|
| 152 |
|
|
|
|
|
|
|
|
|
|
| 153 |
model, optimizer, scheduler = chimera_turbo.apply(
|
| 154 |
model,
|
| 155 |
max_steps=args.max_steps,
|
| 156 |
+
lr=args.lr,
|
|
|
|
| 157 |
use_compile=use_compile,
|
| 158 |
use_ipex=True,
|
| 159 |
)
|
| 160 |
model.train()
|
| 161 |
+
|
| 162 |
+
# ── Progressive looping: 1→2→3 Parcae loops ──
|
| 163 |
+
loop_sched = ProgressiveLoopScheduler(args.max_steps, max_loops=3)
|
| 164 |
+
cur_loops = 1
|
| 165 |
+
print(f"[LOOP] Progressive looping: 1→2→3 over {args.max_steps} steps")
|
| 166 |
+
|
| 167 |
print(f"[P5] Train mode: BitLinear STE (clamp-aware, NaN-safe)")
|
| 168 |
use_bf16 = bool(args.bf16)
|
| 169 |
|
|
|
|
| 171 |
log_f = open(os.path.join(args.output_dir, "log_hyper.jsonl"), "w")
|
| 172 |
step = 0
|
| 173 |
total_loss = 0.0
|
| 174 |
+
valid_loss_count = 0
|
| 175 |
best_loss = float("inf")
|
| 176 |
toks = 0
|
| 177 |
t0 = time.time()
|
|
|
|
| 183 |
data_iter = iter(loader)
|
| 184 |
|
| 185 |
print(f"\n{'=' * 65}")
|
| 186 |
+
print(f"Training eff_batch={eff_batch} seq={cur_seq} loops={cur_loops}")
|
| 187 |
print(f"{'=' * 65}\n")
|
| 188 |
|
| 189 |
while step < args.max_steps:
|
| 190 |
+
# ── GrowLength seq scheduling ──
|
| 191 |
if grow:
|
| 192 |
ns = grow.get_seq_len(step)
|
| 193 |
if ns != cur_seq:
|
|
|
|
| 199 |
)
|
| 200 |
data_iter = iter(loader)
|
| 201 |
print(f" [P1] seq → {cur_seq} batch → {eff_batch}")
|
| 202 |
+
|
| 203 |
+
# ── Progressive loop scheduling ──
|
| 204 |
+
new_loops = loop_sched.get_loops(step)
|
| 205 |
+
if new_loops != cur_loops:
|
| 206 |
+
cur_loops = new_loops
|
| 207 |
+
if hasattr(model, "loop_controller"):
|
| 208 |
+
model.loop_controller.loop_default = cur_loops
|
| 209 |
+
elif hasattr(model, "_orig_mod") and hasattr(model._orig_mod, "loop_controller"):
|
| 210 |
+
model._orig_mod.loop_controller.loop_default = cur_loops
|
| 211 |
+
print(f" [LOOP] loops → {cur_loops}")
|
| 212 |
+
|
| 213 |
+
# ── Progressive unfreeze (if enabled) ──
|
| 214 |
if unfreezer:
|
| 215 |
unfreezer.update(step)
|
| 216 |
+
|
| 217 |
try:
|
| 218 |
batch = next(data_iter)
|
| 219 |
except StopIteration:
|
|
|
|
| 227 |
)
|
| 228 |
|
| 229 |
cur_lr = optimizer.param_groups[0]["lr"]
|
| 230 |
+
if math.isfinite(loss_val):
|
| 231 |
+
total_loss += loss_val
|
| 232 |
+
valid_loss_count += 1
|
| 233 |
toks += batch["input_ids"].numel()
|
| 234 |
step += 1
|
| 235 |
|
| 236 |
if step % args.log_every == 0:
|
| 237 |
dt = time.time() - t0
|
| 238 |
+
avg = total_loss / max(1, valid_loss_count)
|
| 239 |
ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
|
| 240 |
tps = toks / dt if dt > 0 else 0
|
| 241 |
eta = (args.max_steps - step) / (step / dt) / 3600 if dt > 0 else 0
|
| 242 |
log_f.write(
|
| 243 |
json.dumps({
|
| 244 |
+
"step": step,
|
| 245 |
+
"loss": round(avg, 4) if math.isfinite(avg) else None,
|
| 246 |
"ppl": round(ppl, 2) if math.isfinite(ppl) else None,
|
| 247 |
"lr": round(cur_lr, 6), "tok/s": round(tps),
|
| 248 |
"seq_len": cur_seq, "eff_batch": eff_batch,
|
| 249 |
+
"loops": cur_loops,
|
| 250 |
}) + "\n"
|
| 251 |
)
|
| 252 |
log_f.flush()
|
| 253 |
print(
|
| 254 |
f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | ppl {ppl:>8.2f} "
|
| 255 |
+
f"| lr {cur_lr:.2e} | {tps:,.0f} tok/s | seq {cur_seq} | L{cur_loops} | ETA {eta:.1f}h"
|
| 256 |
)
|
| 257 |
best_loss = min(best_loss, avg) if math.isfinite(avg) else best_loss
|
| 258 |
total_loss = 0.0
|
| 259 |
+
valid_loss_count = 0
|
| 260 |
toks = 0
|
| 261 |
t0 = time.time()
|
| 262 |
|