Lgr54HFi committed
Commit b6bcd75 · verified · 1 Parent(s): 945c5bf

feat: train_hyper_loop with progressive looping, evolution loss feedback, no progressive_unfreeze default

Activates dormant ch1mera paradigms:
1. Progressive looping: 1→2→3 Parcae loops during training
2. Evolution receives prev_loss for surprise-based memory writes
3. progressive_unfreeze disabled by default (all layers train from start)
4. Logs loop count and NaN-safe averaging

Files changed (1)
  1. chimera/training/loops.py +33 -10
chimera/training/loops.py CHANGED
@@ -10,6 +10,7 @@ import torch
 import chimera_turbo
 
 from .common import cosine_lr, save_final_checkpoint, save_training_checkpoint
+from .hyper import ProgressiveLoopScheduler
 
 
 def train_fast_loop(args, model, config, loader, compute_loss) -> str:
@@ -149,18 +150,20 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
     use_compile = getattr(args, "compile", False)
 
-    # Let chimera_turbo.apply() use its v8 BitNet-aligned defaults:
-    # lr=1.5e-3, wd=0.01, β=(0.9,0.98), warmup=750
-    # Only override lr if the user explicitly set it via CLI
     model, optimizer, scheduler = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
-        lr=args.lr,  # CLI default is 1e-3; chimera_turbo default is 1.5e-3
-        # weight_decay and warmup_steps use chimera_turbo defaults (0.01, 750)
+        lr=args.lr,
         use_compile=use_compile,
         use_ipex=True,
     )
     model.train()
+
+    # ── Progressive looping: 1→2→3 Parcae loops ──
+    loop_sched = ProgressiveLoopScheduler(args.max_steps, max_loops=3)
+    cur_loops = 1
+    print(f"[LOOP] Progressive looping: 1→2→3 over {args.max_steps} steps")
+
     print(f"[P5] Train mode: BitLinear STE (clamp-aware, NaN-safe)")
     use_bf16 = bool(args.bf16)
 
@@ -168,6 +171,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
     log_f = open(os.path.join(args.output_dir, "log_hyper.jsonl"), "w")
     step = 0
     total_loss = 0.0
+    valid_loss_count = 0
     best_loss = float("inf")
     toks = 0
     t0 = time.time()
@@ -179,10 +183,11 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
     data_iter = iter(loader)
 
     print(f"\n{'=' * 65}")
-    print(f"Training eff_batch={eff_batch} seq={cur_seq}")
+    print(f"Training eff_batch={eff_batch} seq={cur_seq} loops={cur_loops}")
     print(f"{'=' * 65}\n")
 
     while step < args.max_steps:
+        # ── GrowLength seq scheduling ──
         if grow:
             ns = grow.get_seq_len(step)
             if ns != cur_seq:
@@ -194,8 +199,21 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
                 )
                 data_iter = iter(loader)
                 print(f" [P1] seq → {cur_seq} batch → {eff_batch}")
+
+        # ── Progressive loop scheduling ──
+        new_loops = loop_sched.get_loops(step)
+        if new_loops != cur_loops:
+            cur_loops = new_loops
+            if hasattr(model, "loop_controller"):
+                model.loop_controller.loop_default = cur_loops
+            elif hasattr(model, "_orig_mod") and hasattr(model._orig_mod, "loop_controller"):
+                model._orig_mod.loop_controller.loop_default = cur_loops
+            print(f" [LOOP] loops → {cur_loops}")
+
+        # ── Progressive unfreeze (if enabled) ──
         if unfreezer:
             unfreezer.update(step)
+
         try:
             batch = next(data_iter)
         except StopIteration:
@@ -209,31 +227,36 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
             )
 
         cur_lr = optimizer.param_groups[0]["lr"]
-        total_loss += loss_val if math.isfinite(loss_val) else 0.0
+        if math.isfinite(loss_val):
+            total_loss += loss_val
+            valid_loss_count += 1
         toks += batch["input_ids"].numel()
         step += 1
 
         if step % args.log_every == 0:
             dt = time.time() - t0
-            avg = total_loss / args.log_every
+            avg = total_loss / max(1, valid_loss_count)
             ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
             tps = toks / dt if dt > 0 else 0
             eta = (args.max_steps - step) / (step / dt) / 3600 if dt > 0 else 0
             log_f.write(
                 json.dumps({
-                    "step": step, "loss": round(avg, 4) if math.isfinite(avg) else None,
+                    "step": step,
+                    "loss": round(avg, 4) if math.isfinite(avg) else None,
                     "ppl": round(ppl, 2) if math.isfinite(ppl) else None,
                     "lr": round(cur_lr, 6), "tok/s": round(tps),
                     "seq_len": cur_seq, "eff_batch": eff_batch,
+                    "loops": cur_loops,
                 }) + "\n"
             )
             log_f.flush()
             print(
                 f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | ppl {ppl:>8.2f} "
-                f"| lr {cur_lr:.2e} | {tps:,.0f} tok/s | seq {cur_seq} | ETA {eta:.1f}h"
+                f"| lr {cur_lr:.2e} | {tps:,.0f} tok/s | seq {cur_seq} | L{cur_loops} | ETA {eta:.1f}h"
             )
             best_loss = min(best_loss, avg) if math.isfinite(avg) else best_loss
             total_loss = 0.0
+            valid_loss_count = 0
             toks = 0
             t0 = time.time()
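Note: the diff imports ProgressiveLoopScheduler from .hyper but does not include its implementation. Below is a minimal sketch of what such a scheduler could look like, inferred only from its usage above (a constructor taking max_steps and max_loops=3, and a get_loops(step) method that advances 1→2→3 as training progresses). The equal-phase boundaries are an assumption, not the repository's actual schedule.

class ProgressiveLoopScheduler:
    """Hypothetical sketch: split training into max_loops equal phases;
    phase i runs i + 1 Parcae loops."""

    def __init__(self, max_steps: int, max_loops: int = 3):
        self.max_steps = max_steps
        self.max_loops = max_loops

    def get_loops(self, step: int) -> int:
        # Integer phase index in [0, max_loops - 1], monotonic in step.
        phase = min(self.max_loops - 1,
                    step * self.max_loops // max(1, self.max_steps))
        return phase + 1

# Usage matching the training loop above (max_steps value is illustrative):
# sched = ProgressiveLoopScheduler(max_steps=90_000, max_loops=3)
# sched.get_loops(0) -> 1, sched.get_loops(45_000) -> 2, sched.get_loops(89_999) -> 3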
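Item 4 of the commit message ("NaN-safe averaging") corresponds to the last hunk: the old code skipped non-finite losses when accumulating but still divided by args.log_every, so every skipped step dragged the reported average toward zero. A toy illustration with made-up loss values:

import math

losses = [2.5, float("nan"), 2.4, 2.6, float("nan")]  # one logging window, log_every = 5

total, valid = 0.0, 0
for loss_val in losses:
    if math.isfinite(loss_val):  # skip NaN/inf steps, as the training loop does
        total += loss_val
        valid += 1

old_avg = total / len(losses)    # 1.5 -- biased low: NaN steps counted as zero loss
new_avg = total / max(1, valid)  # 2.5 -- mean over finite losses only
print(old_avg, new_avg)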