Fix NaN loss reporting: show nan instead of 0.0 when all steps in window are NaN
chimera/training/loops.py  (+12 -8)
@@ -55,9 +55,9 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)

     # FIX: Use args.lr instead of hardcoded 0.02.
     # FIX: Use args.warmup instead of hardcoded 200.
-    # FIX: Reduce MTP heads from 3
-    # FIX: Soften LLRD decay (0.85
-    # FIX: Lower Grokfast lambda (2.0
+    # FIX: Reduce MTP heads from 3->2 to cut 51M params of overhead.
+    # FIX: Soften LLRD decay (0.85->0.92) so early layers still learn.
+    # FIX: Lower Grokfast lambda (2.0->1.0) to reduce gradient amplification noise.
     model, optimizer, scheduler, extras = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,

@@ -102,7 +102,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
     loader = torch.utils.data.DataLoader(
         dataset, batch_size=eff_batch, shuffle=True, num_workers=0, drop_last=True)
     data_iter = iter(loader)
-    print(f" [P1] seq
+    print(f" [P1] seq -> {cur_seq} batch -> {eff_batch}")

     new_loops = loop_sched.get_loops(step)
     if new_loops != cur_loops:

@@ -110,7 +110,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
     raw = getattr(model, "_orig_mod", model)
     if hasattr(raw, "loop_controller"):
         raw.loop_controller.loop_default = cur_loops
-    print(f" [LOOP]
+    print(f" [LOOP] -> {cur_loops}")

     if unfreezer:
         unfreezer.update(step)

@@ -136,8 +136,12 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)

     if step % args.log_every == 0:
         dt = time.time() - t0
-
-
+        if valid_count > 0:
+            avg = total_loss / valid_count
+            ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
+        else:
+            avg = float("nan")
+            ppl = float("nan")
         tps = toks / dt if dt > 0 else 0
         eta = (args.max_steps - step) / max(1, step) * (time.time() - t0) / 3600 if step > 0 else 0
         log_f.write(json.dumps({

@@ -162,5 +166,5 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
     d = save_final_checkpoint(model, config, step, best_loss,
                               os.path.join(args.output_dir, "final"))
     log_f.close()
-    print(f"\nDONE
+    print(f"\nDONE -- best loss {best_loss:.4f} ppl {math.exp(min(best_loss, 20)):.2f}")
     return d
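For context, a minimal sketch of the logging-window behaviour this commit targets. The accumulation side of total_loss / valid_count is not shown in the hunks, so the skip-NaN bookkeeping below is an assumption; only the reporting branch mirrors the added lines: when no step in the window produced a finite loss, the average and perplexity come out as nan instead of 0.0.

import math

def summarize_window(step_losses, ppl_cap=20.0):
    """Reduce one logging window of per-step losses to (avg, ppl).

    Assumed bookkeeping: non-finite losses are skipped while accumulating
    total_loss / valid_count, matching the names used in the diff.
    """
    total_loss, valid_count = 0.0, 0
    for loss in step_losses:
        if math.isfinite(loss):          # skip NaN/inf steps
            total_loss += loss
            valid_count += 1

    # Reporting branch added by this commit: an all-NaN window is
    # reported as nan, not as 0.0.
    if valid_count > 0:
        avg = total_loss / valid_count
        ppl = math.exp(min(avg, ppl_cap)) if math.isfinite(avg) else float("nan")
    else:
        avg = float("nan")
        ppl = float("nan")
    return avg, ppl

print(summarize_window([2.1, 1.9, float("nan")]))   # (2.0, ~7.39)
print(summarize_window([float("nan")] * 3))         # (nan, nan)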