Lgr54HFi committed on
Commit edcdcb3 · verified · 1 Parent(s): f9d237b

Upload chimera/training/loops.py

Files changed (1)
  1. chimera/training/loops.py +10 -5
chimera/training/loops.py CHANGED
@@ -53,17 +53,22 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
     use_compile = getattr(args, "compile", False)
 
+    # FIX: Use args.lr instead of hardcoded 0.02.
+    # FIX: Use args.warmup instead of hardcoded 200.
+    # FIX: Reduce MTP heads from 3→2 to cut 51M params of overhead.
+    # FIX: Soften LLRD decay (0.85→0.92) so early layers still learn.
+    # FIX: Lower Grokfast lambda (2.0→1.0) to reduce gradient amplification noise.
     model, optimizer, scheduler, extras = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
-        lr=0.02,
+        lr=args.lr,
         weight_decay=0.01,
-        warmup_steps=200,
+        warmup_steps=args.warmup,
         use_compile=use_compile,
-        mtp_heads=3,
-        llrd_decay=0.85,
+        mtp_heads=2,
+        llrd_decay=0.92,
         grokfast_alpha=0.98,
-        grokfast_lambda=2.0,
+        grokfast_lambda=1.0,
     )
     model.train()
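
A note on the mtp_heads reduction: multi-token-prediction heads are often implemented as full vocabulary projections, so each extra head costs roughly hidden_size × vocab_size parameters. The 51M figure in the FIX comment is consistent with, for example, a 1024-wide model and a ~50k vocabulary (illustrative numbers only; the real dimensions live in config):

import torch.nn as nn

# Hypothetical sizes chosen so one head ≈ 51M params, matching the FIX comment.
hidden_size, vocab_size = 1024, 50_000
head = nn.Linear(hidden_size, vocab_size, bias=False)  # one MTP head
print(sum(p.numel() for p in head.parameters()))  # 51_200_000 ≈ 51M saved by 3→2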
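
The llrd_decay change is easier to judge with the usual layer-wise learning-rate decay rule in view: layer i of n gets base_lr * decay**(n - 1 - i), so earlier layers train geometrically slower. A sketch under the assumption that the model exposes an ordered model.layers list (hypothetical attribute; the actual grouping lives inside chimera_turbo):

def llrd_param_groups(model, base_lr, decay=0.92):
    # Layer-wise LR decay sketch: earlier layers get geometrically smaller LRs.
    # With decay=0.85 and a 24-layer stack, layer 0 runs at 0.85**23 ≈ 0.024x
    # of base_lr and barely learns; decay=0.92 keeps it near 0.92**23 ≈ 0.147x.
    n = len(model.layers)
    return [
        {"params": layer.parameters(), "lr": base_lr * decay ** (n - 1 - i)}
        for i, layer in enumerate(model.layers)
    ]

The resulting groups plug straight into an optimizer, e.g. torch.optim.AdamW(llrd_param_groups(model, args.lr)).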
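
For context on the grokfast_lambda change: chimera_turbo.apply is not shown in this commit, but the grokfast_alpha/grokfast_lambda pair matches the Grokfast-EMA gradient filter, where each parameter's gradient is augmented with a lambda-scaled exponential moving average of its past gradients. A minimal sketch of that filter, applied between backward() and optimizer.step() (the helper name is illustrative, not the repo's actual internals):

import torch

def grokfast_ema(model, ema, alpha=0.98, lam=1.0):
    # Grokfast-EMA sketch: amplify the slow (low-frequency) component of the
    # gradient by adding back a running EMA of past gradients, scaled by lam.
    # `ema` is a dict mapping parameter name -> EMA tensor, kept across steps.
    for name, p in model.named_parameters():
        if p.grad is None:
            continue
        if name not in ema:
            ema[name] = torch.zeros_like(p.grad)
        ema[name].mul_(alpha).add_(p.grad, alpha=1.0 - alpha)  # update EMA
        p.grad.add_(ema[name], alpha=lam)  # g <- g + lam * EMA(g)
    return ema

Since lam multiplies the amplified component directly, dropping it from 2.0 to 1.0 halves the extra signal injected into every update, which is the "gradient amplification noise" the FIX comment refers to.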