Lgr54HFi committed on
Commit
8e41f12
·
verified ·
1 Parent(s): 0e7327a

Fix NaN loss reporting: show nan instead of 0.0 when all steps in window are NaN

Browse files
Files changed (1) hide show
  1. chimera/training/loops.py +12 -8
chimera/training/loops.py CHANGED
@@ -55,9 +55,9 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
55
 
56
  # FIX: Use args.lr instead of hardcoded 0.02.
57
  # FIX: Use args.warmup instead of hardcoded 200.
58
- # FIX: Reduce MTP heads from 3→2 to cut 51M params of overhead.
59
- # FIX: Soften LLRD decay (0.85→0.92) so early layers still learn.
60
- # FIX: Lower Grokfast lambda (2.0→1.0) to reduce gradient amplification noise.
61
  model, optimizer, scheduler, extras = chimera_turbo.apply(
62
  model,
63
  max_steps=args.max_steps,
@@ -102,7 +102,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
102
  loader = torch.utils.data.DataLoader(
103
  dataset, batch_size=eff_batch, shuffle=True, num_workers=0, drop_last=True)
104
  data_iter = iter(loader)
105
- print(f" [P1] seq → {cur_seq} batch → {eff_batch}")
106
 
107
  new_loops = loop_sched.get_loops(step)
108
  if new_loops != cur_loops:
@@ -110,7 +110,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
110
  raw = getattr(model, "_orig_mod", model)
111
  if hasattr(raw, "loop_controller"):
112
  raw.loop_controller.loop_default = cur_loops
113
- print(f" [LOOP] → {cur_loops}")
114
 
115
  if unfreezer:
116
  unfreezer.update(step)
@@ -136,8 +136,12 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
136
 
137
  if step % args.log_every == 0:
138
  dt = time.time() - t0
139
- avg = total_loss / max(1, valid_count)
140
- ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
 
 
 
 
141
  tps = toks / dt if dt > 0 else 0
142
  eta = (args.max_steps - step) / max(1, step) * (time.time() - t0) / 3600 if step > 0 else 0
143
  log_f.write(json.dumps({
@@ -162,5 +166,5 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
162
  d = save_final_checkpoint(model, config, step, best_loss,
163
  os.path.join(args.output_dir, "final"))
164
  log_f.close()
165
- print(f"\nDONE — best loss {best_loss:.4f} ppl {math.exp(min(best_loss, 20)):.2f}")
166
  return d
 
55
 
56
  # FIX: Use args.lr instead of hardcoded 0.02.
57
  # FIX: Use args.warmup instead of hardcoded 200.
58
+ # FIX: Reduce MTP heads from 3->2 to cut 51M params of overhead.
59
+ # FIX: Soften LLRD decay (0.85->0.92) so early layers still learn.
60
+ # FIX: Lower Grokfast lambda (2.0->1.0) to reduce gradient amplification noise.
61
  model, optimizer, scheduler, extras = chimera_turbo.apply(
62
  model,
63
  max_steps=args.max_steps,
 
102
  loader = torch.utils.data.DataLoader(
103
  dataset, batch_size=eff_batch, shuffle=True, num_workers=0, drop_last=True)
104
  data_iter = iter(loader)
105
+ print(f" [P1] seq -> {cur_seq} batch -> {eff_batch}")
106
 
107
  new_loops = loop_sched.get_loops(step)
108
  if new_loops != cur_loops:
 
110
  raw = getattr(model, "_orig_mod", model)
111
  if hasattr(raw, "loop_controller"):
112
  raw.loop_controller.loop_default = cur_loops
113
+ print(f" [LOOP] -> {cur_loops}")
114
 
115
  if unfreezer:
116
  unfreezer.update(step)
 
136
 
137
  if step % args.log_every == 0:
138
  dt = time.time() - t0
139
+ if valid_count > 0:
140
+ avg = total_loss / valid_count
141
+ ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
142
+ else:
143
+ avg = float("nan")
144
+ ppl = float("nan")
145
  tps = toks / dt if dt > 0 else 0
146
  eta = (args.max_steps - step) / max(1, step) * (time.time() - t0) / 3600 if step > 0 else 0
147
  log_f.write(json.dumps({
 
166
  d = save_final_checkpoint(model, config, step, best_loss,
167
  os.path.join(args.output_dir, "final"))
168
  log_f.close()
169
+ print(f"\nDONE -- best loss {best_loss:.4f} ppl {math.exp(min(best_loss, 20)):.2f}")
170
  return d