Lgr54HFi committed on
Commit e2f5e25 · verified · 1 Parent(s): 64db48c

fix: loops.py — use chimera_turbo v8 defaults (wd=0.01, warmup=750, β2=0.98) instead of hardcoded values

Files changed (1)
  1. chimera/training/loops.py +18 -24
chimera/training/loops.py CHANGED
@@ -13,7 +13,7 @@ from .common import cosine_lr, save_final_checkpoint, save_training_checkpoint
 
 
 def train_fast_loop(args, model, config, loader, compute_loss) -> str:
-    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.95))
+    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.98))
     os.makedirs(args.output_dir, exist_ok=True)
     log_f = open(os.path.join(args.output_dir, "log.jsonl"), "w", encoding="utf-8")
 
@@ -147,19 +147,21 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 
 
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
-    # use_compile=True now works: STE uses detach() trick = zero graph breaks
-    use_compile = getattr(args, "compile", True)
+    use_compile = getattr(args, "compile", False)
+
+    # Let chimera_turbo.apply() use its v8 BitNet-aligned defaults:
+    #   lr=1.5e-3, wd=0.01, β=(0.9,0.98), warmup=750
+    # Only override lr if the user explicitly set it via CLI
     model, optimizer, scheduler = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
-        lr=args.lr,
-        weight_decay=0.05,
-        warmup_steps=min(500, args.max_steps // 10),
+        lr=args.lr,  # CLI default is 1e-3; chimera_turbo default is 1.5e-3
+        # weight_decay and warmup_steps use chimera_turbo defaults (0.01, 750)
         use_compile=use_compile,
         use_ipex=True,
     )
     model.train()
-    print(f"[P5] Train mode: BitLinear STE path (detach trick, compile-friendly)")
+    print(f"[P5] Train mode: BitLinear STE (clamp-aware, NaN-safe)")
     use_bf16 = bool(args.bf16)
 
     os.makedirs(args.output_dir, exist_ok=True)
@@ -200,37 +202,29 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
         data_iter = iter(loader)
         batch = next(data_iter)
 
-        # grad_accum_steps=1: DataLoader already provides eff_batch items.
         loss_val = chimera_turbo.training_step(
-            model,
-            batch,
-            optimizer,
-            scheduler,
-            grad_accum_steps=1,
-            step=step,
+            model, batch, optimizer, scheduler,
+            grad_accum_steps=1, step=step,
             autocast_dtype=torch.bfloat16 if use_bf16 else None,
         )
 
         cur_lr = optimizer.param_groups[0]["lr"]
-        total_loss += loss_val
+        total_loss += loss_val if math.isfinite(loss_val) else 0.0
        toks += batch["input_ids"].numel()
         step += 1
 
         if step % args.log_every == 0:
             dt = time.time() - t0
             avg = total_loss / args.log_every
-            ppl = math.exp(min(avg, 20))
+            ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
             tps = toks / dt if dt > 0 else 0
             eta = (args.max_steps - step) / (step / dt) / 3600 if dt > 0 else 0
             log_f.write(
                 json.dumps({
-                    "step": step,
-                    "loss": round(avg, 4),
-                    "ppl": round(ppl, 2),
-                    "lr": round(cur_lr, 6),
-                    "tok/s": round(tps),
-                    "seq_len": cur_seq,
-                    "eff_batch": eff_batch,
+                    "step": step, "loss": round(avg, 4) if math.isfinite(avg) else None,
+                    "ppl": round(ppl, 2) if math.isfinite(ppl) else None,
+                    "lr": round(cur_lr, 6), "tok/s": round(tps),
+                    "seq_len": cur_seq, "eff_batch": eff_batch,
                 }) + "\n"
             )
             log_f.flush()
@@ -238,7 +232,7 @@ def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
                 f" step {step:>6}/{args.max_steps} | loss {avg:.4f} | ppl {ppl:>8.2f} "
                 f"| lr {cur_lr:.2e} | {tps:,.0f} tok/s | seq {cur_seq} | ETA {eta:.1f}h"
             )
-            best_loss = min(best_loss, avg)
+            best_loss = min(best_loss, avg) if math.isfinite(avg) else best_loss
            total_loss = 0.0
            toks = 0
            t0 = time.time()
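Note on the new defaults: the values quoted in the commit message and in the comments above (lr=1.5e-3, weight_decay=0.01, betas=(0.9, 0.98), warmup=750) are applied inside chimera_turbo.apply(), whose internals are not part of this diff. As a rough sketch only, assuming a linear-warmup plus cosine-decay schedule, that combination corresponds to something like the plain-PyTorch setup below; build_optimizer_and_schedule is a hypothetical helper, not chimera_turbo's API.

# Sketch only: plausible plain-PyTorch analogue of the v8 defaults named above.
# Not chimera_turbo.apply(); that helper's implementation is not shown in this diff.
import math
import torch

def build_optimizer_and_schedule(model, max_steps, lr=1.5e-3, weight_decay=0.01,
                                 betas=(0.9, 0.98), warmup_steps=750):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr,
                                  betas=betas, weight_decay=weight_decay)

    def lr_lambda(step):
        # Linear warmup for the first warmup_steps, then cosine decay to zero.
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    return optimizer, scheduler

Under this reading, passing lr=args.lr (CLI default 1e-3) is the one value the loop still pins explicitly, overriding the 1.5e-3 library default, as the inline comment in the diff notes.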
 
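The isfinite guards added in train_hyper_loop keep a single divergent step from contaminating the logged window. A minimal, self-contained illustration of the same pattern, using toy loss values rather than anything from a real run:

# Toy illustration of the NaN-safe aggregation pattern used in train_hyper_loop.
import math

losses = [2.31, float("nan"), 2.27, 2.25]   # one step produced a non-finite loss
total_loss, best_loss = 0.0, float("inf")

for loss_val in losses:
    # Skip non-finite losses so a single bad step does not corrupt the window sum.
    total_loss += loss_val if math.isfinite(loss_val) else 0.0

avg = total_loss / len(losses)
ppl = math.exp(min(avg, 20)) if math.isfinite(avg) else float("nan")
best_loss = min(best_loss, avg) if math.isfinite(avg) else best_loss
print(f"avg={avg:.4f} ppl={ppl:.2f} best={best_loss:.4f}")

One consequence of this approach: a skipped step still counts in the divisor (args.log_every in the real loop), so a window containing non-finite losses reads slightly low; that is the trade-off accepted here in exchange for never writing NaN into log.jsonl.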