Upload train_hyper.py
Browse files - train_hyper.py (+7 -3)
train_hyper.py
CHANGED
|
@@ -102,10 +102,14 @@ def train_hyper(args):
|
|
| 102 |
|
| 103 |
# ── P1: GrowLength
|
| 104 |
if args.growlength:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
stages = [
|
| 106 |
-
(max(
|
| 107 |
-
(
|
| 108 |
-
(args.seq_len, 0.40),
|
| 109 |
]
|
| 110 |
grow = GrowLengthScheduler(stages, args.max_steps)
|
| 111 |
initial_seq = stages[0][0]
|
|
|
|
| 102 |
|
| 103 |
# ── P1: GrowLength
|
| 104 |
if args.growlength:
|
| 105 |
+
# FIX: The old schedule spent 30% of training at seq=16 (seq_len//4) —
|
| 106 |
+
# far too short for the model to learn any language structure.
|
| 107 |
+
# New schedule: 10% at half-length, floored at 16 tokens (warmup), 90% at full length.
|
| 108 |
+
# This preserves the GrowLength throughput benefit during warmup
|
| 109 |
+
# while giving the model real sentences for the bulk of training.
|
| 110 |
stages = [
|
| 111 |
+
(max(16, args.seq_len // 2), 0.10),
|
| 112 |
+
(args.seq_len, 0.90),
|
|
|
|
| 113 |
]
|
| 114 |
grow = GrowLengthScheduler(stages, args.max_steps)
|
| 115 |
initial_seq = stages[0][0]
|