Upload chimera/training/loops.py
chimera/training/loops.py  +10 -5  CHANGED
@@ -53,17 +53,22 @@ def train_standard_loop(args, model, config, loader, compute_loss, optimizer, us
 def train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer):
     use_compile = getattr(args, "compile", False)
 
+    # FIX: Use args.lr instead of hardcoded 0.02.
+    # FIX: Use args.warmup instead of hardcoded 200.
+    # FIX: Reduce MTP heads from 3→2 to cut 51M params of overhead.
+    # FIX: Soften LLRD decay (0.85→0.92) so early layers still learn.
+    # FIX: Lower Grokfast lambda (2.0→1.0) to reduce gradient amplification noise.
     model, optimizer, scheduler, extras = chimera_turbo.apply(
         model,
         max_steps=args.max_steps,
-        lr=0.02,
+        lr=args.lr,
         weight_decay=0.01,
-        warmup_steps=200,
+        warmup_steps=args.warmup,
         use_compile=use_compile,
-        mtp_heads=3,
-        llrd_decay=0.85,
+        mtp_heads=2,
+        llrd_decay=0.92,
         grokfast_alpha=0.98,
-        grokfast_lambda=2.0,
+        grokfast_lambda=1.0,
     )
     model.train()
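A note on the mtp_heads change: if each multi-token-prediction head carries its own unembedding, one head costs roughly d_model × vocab_size parameters. The dims below are assumptions (the model config is not part of this diff), chosen only to show how a figure like the quoted 51M can arise:

```python
# Hypothetical dims -- not taken from this repo's config.
d_model, vocab_size = 1024, 50_000
params_per_head = d_model * vocab_size  # 51_200_000 ≈ 51M per extra MTP head
```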
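On llrd_decay: layer-wise learning-rate decay multiplies the base LR by the decay factor once per layer from the top down, so deep stacks shrink it geometrically. A minimal sketch of how such param groups are typically built; chimera_turbo.apply's actual grouping isn't visible in this diff, and model.blocks is a stand-in attribute:

```python
def llrd_param_groups(model, base_lr, decay=0.92, weight_decay=0.01):
    """Build optimizer param groups with layer-wise LR decay.

    Block 0 (closest to the embeddings) gets the smallest LR; the top
    block gets base_lr. `model.blocks` is a stand-in for the real layout.
    """
    n = len(model.blocks)
    return [
        {
            "params": list(model.blocks[i].parameters()),
            "lr": base_lr * decay ** (n - 1 - i),  # geometric decay toward the bottom
            "weight_decay": weight_decay,
        }
        for i in range(n)
    ]
```

With 24 blocks, the bottom multiplier is 0.85**23 ≈ 0.024 versus 0.92**23 ≈ 0.147, about 6× more learning rate for the earliest layers, which is what the FIX comment means by early layers "still learn".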
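On the Grokfast knobs: the standard Grokfast-EMA filter keeps an exponential moving average of each parameter's gradient and adds it back scaled by lambda, amplifying the slow-varying component. A sketch of that recipe, assuming chimera_turbo wires grokfast_alpha/grokfast_lambda into something like the reference implementation:

```python
import torch

@torch.no_grad()
def gradfilter_ema(model, grads=None, alpha=0.98, lamb=1.0):
    """One Grokfast-EMA step: call after loss.backward(), before optimizer.step().

    `grads` holds the per-parameter EMA state and is threaded between calls.
    """
    if grads is None:  # first step: seed the EMA with the current gradients
        grads = {n: p.grad.detach().clone()
                 for n, p in model.named_parameters() if p.grad is not None}
    for n, p in model.named_parameters():
        if p.grad is None:
            continue
        # mu <- alpha * mu + (1 - alpha) * g
        grads[n].mul_(alpha).add_(p.grad, alpha=1.0 - alpha)
        # g <- g + lambda * mu; at lambda=2.0 the filtered term can swamp the
        # raw gradient, which is the amplification the FIX comment dials back.
        p.grad.add_(grads[n], alpha=lamb)
    return grads
```

Typical call site: `grads = gradfilter_ema(model, grads, alpha=0.98, lamb=1.0)` once per step; with alpha=0.98 the EMA averages over roughly the last 50 steps.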