Lgr54HFi committed on
Commit
0e64e3a
·
verified ·
1 Parent(s): 310c416

Upload train_hyper.py

Browse files
Files changed (1) hide show
  1. train_hyper.py +7 -3
train_hyper.py CHANGED
@@ -102,10 +102,14 @@ def train_hyper(args):
102
 
103
  # ── P1: GrowLength
104
  if args.growlength:
 
 
 
 
 
105
  stages = [
106
- (max(8, args.seq_len // 4), 0.30),
107
- (max(16, args.seq_len // 2), 0.30),
108
- (args.seq_len, 0.40),
109
  ]
110
  grow = GrowLengthScheduler(stages, args.max_steps)
111
  initial_seq = stages[0][0]
 
102
 
103
  # ── P1: GrowLength
104
  if args.growlength:
105
+ # FIX: The old schedule spent 30% of training at seq=16 (seq_len//4) —
106
+ # far too short for the model to learn any language structure.
107
+ # New schedule: 10% at half-length (warmup), 90% at full length.
108
+ # This preserves the GrowLength throughput benefit during warmup
109
+ # while giving the model real sentences for the bulk of training.
110
  stages = [
111
+ (max(16, args.seq_len // 2), 0.10),
112
+ (args.seq_len, 0.90),
 
113
  ]
114
  grow = GrowLengthScheduler(stages, args.max_steps)
115
  initial_seq = stages[0][0]