Lgr54HFi
/

chomera

Lgr54HFi committed 12 days ago

Commit

0e64e3a

verified ·

1 Parent(s): 310c416

Upload train_hyper.py

Files changed (1) hide show

train_hyper.py CHANGED Viewed

@@ -102,10 +102,14 @@ def train_hyper(args):
     # ── P1: GrowLength
     if args.growlength:
         stages = [
-            (max(8, args.seq_len // 4), 0.30),
-            (max(16, args.seq_len // 2), 0.30),
-            (args.seq_len, 0.40),
         ]
         grow = GrowLengthScheduler(stages, args.max_steps)
         initial_seq = stages[0][0]

     # ── P1: GrowLength
     if args.growlength:
+        # FIX: The old schedule spent 30% of training at max(8, seq_len // 4)
+        # tokens (e.g. seq=16 when seq_len=64) — far too short for the model
+        # to learn any language structure.
+        # New schedule: 10% at half-length (warmup), 90% at full length.
+        # This preserves the GrowLength throughput benefit during warmup
+        # while giving the model real sentences for the bulk of training.
         stages = [
+            (max(16, args.seq_len // 2), 0.10),
+            (args.seq_len, 0.90),
         ]
         grow = GrowLengthScheduler(stages, args.max_steps)
         initial_seq = stages[0][0]