Lgr54HFi committed on
Commit
acc06f5
·
verified ·
1 Parent(s): b6bcd75

fix: --all no longer enables progressive_unfreeze (counterproductive with backprop)

Browse files
Files changed (1) hide show
  1. train_hyper.py +26 -40
train_hyper.py CHANGED
@@ -1,27 +1,19 @@
1
  #!/usr/bin/env python3
2
  """
3
- Chimera 5.3 — HYPER CPU Training v3 (10,000+ tok/s target)
4
- ============================================================
5

6
- ALL features preserved: 28 layers, MoE, Parcae looping, SelfEvolution,
7
- SpanInference, Grammar, EntropyValve, DebtLedger — nothing disabled.

8

9
- Speed comes from optimizing HOW the forward+MeZO runs, not WHAT it runs:
10
-
11
- P1 GrowLength Curriculum — seq 8→target, huge batch at short lengths
12
- P2 Reservoir Freezing — freeze recurrent gates (fewer params to perturb)
13
- P3 In-Place Seed MeZO — no randn allocation, seed-replay perturbation
14
- P4 torch.compile — fuse ops, eliminate Python overhead
15
- P5 Train-Mode STE Path — BitLinear uses STE (no invalidate_packed)
16
  P6 Aggressive Token Packing — zero padding waste
17
- P7 Progressive Unfreeze — fewer params early = faster perturbation
18
- P8 Vocab Projection Cache — cache lm_head weight for 200K vocab
19
- P9 Loop-1 Training — force num_loops=1 during training (full arch)
20
-
21
- Key insight: MeZO's bottleneck is not the forward pass — it's
22
- generating+applying random perturbations to 227M params 3× per step.
23
- Seed-replay MeZO eliminates this entirely: perturb in-place using a
24
- single seed, replay the same seed to restore/update.
25
  """
26
 
27
  from __future__ import annotations
@@ -72,16 +64,12 @@ def build_model(args):
72
  return build_model_from_args(args)
73
 
74
 
75
- # ═══════════════════════════════════════════════════════════════════════════
76
- # MAIN HYPER TRAIN
77
- # ═══════════════════════════════════════════════════════════════════════════
78
-
79
  def train_hyper(args):
80
  model, config = build_model(args)
81
  counts = model.count_parameters()
82
 
83
  print("=" * 65)
84
- print(f"CHIMERA 5.3 HYPER v3 — scale={args.scale} bf16={args.bf16}")
85
  print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
86
  f"vocab={config['vocab_size']} target_seq={args.seq_len}")
87
  print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
@@ -91,12 +79,11 @@ def train_hyper(args):
91
  f"span={model.span_engine is not None}")
92
  print("=" * 65)
93
 
94
- # ── P9: Force loop=1 during training ─────────────────────────────
95
- # Architecture intact, but save 1 full pass through layers 4-23
96
  patch_training_loops(model, num_loops=1)
97
- print(f"[P9] Training loops=1 (arch intact, Parcae wired)")
98
 
99
- # ── P2: Reservoir Freezing ───────────────────────────────────────
100
  if args.reservoir:
101
  frozen = apply_reservoir_freezing(model)
102
  print(f"[P2] Reservoir: froze {frozen:,} gate params")
@@ -104,14 +91,16 @@ def train_hyper(args):
104
  trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
105
  print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")
106
 
107
- # ── P7: Progressive Unfreezing ───────────────────────────────────
108
  unfreezer = None
109
  if args.progressive_unfreeze:
110
  unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
111
  active = sum(p.numel() for p in model.parameters() if p.requires_grad)
112
  print(f"[P7] Progressive unfreeze: {active:,} initially trainable")
 
 
113
 
114
- # ── P1: GrowLength ───────────────────────────────────────────────
115
  if args.growlength:
116
  stages = [
117
  (max(8, args.seq_len // 4), 0.30),
@@ -125,7 +114,7 @@ def train_hyper(args):
125
  grow = None
126
  initial_seq = args.seq_len
127
 
128
- # ── Data ─────────────────────────────────────────────────────────
129
  tok_budget = args.max_tokens or max(500_000,
130
  args.max_steps * args.batch_size * (args.seq_len + 1) * 4)
131
  token_buf = build_token_buffer(
@@ -137,18 +126,14 @@ def train_hyper(args):
137
  train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
138
 
139
 
140
- # ═══════════════════════════════════════════════════════════════════════════
141
- # CLI
142
- # ═══════════════════════════════════════════════════════════════════════════
143
-
144
  def cli():
145
- p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v3")
146
  p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
147
  p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
148
  p.add_argument("--seq_len", type=int, default=64)
149
  p.add_argument("--batch_size", type=int, default=8)
150
- p.add_argument("--lr", type=float, default=1e-3)
151
- p.add_argument("--warmup", type=int, default=100)
152
  p.add_argument("--max_steps", type=int, default=5000)
153
  p.add_argument("--max_tokens", type=int, default=None)
154
  p.add_argument("--max_samples", type=int, default=None)
@@ -168,6 +153,7 @@ def cli():
168
  g.add_argument("--growlength", action="store_true", default=False)
169
  g.add_argument("--reservoir", action="store_true", default=False)
170
  g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
 
171
  g.add_argument("--progressive-unfreeze", action="store_true", default=False,
172
  dest="progressive_unfreeze")
173
  g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")
@@ -182,11 +168,11 @@ if __name__ == "__main__":
182
  if args.all:
183
  args.growlength = True
184
  args.reservoir = True
185
- args.progressive_unfreeze = True
 
186
  if args.benchmark:
187
  args.growlength = True
188
  args.reservoir = True
189
- args.progressive_unfreeze = True
190
  benchmark_hyper(args)
191
  else:
192
  train_hyper(args)
 
1
  #!/usr/bin/env python3
2
  """
3
+ Chimera 5.3 — HYPER CPU Training v4
4
+ =====================================
5

6
+ All ch1mera features active: 28 layers (GD/XM/TM/SK pattern), MoE,
7
+ Parcae looping (progressive 1→2→3), SelfEvolution, SpanInference,
8
+ Grammar, EntropyValve, DebtLedger.
9

10
+ Training paradigms:
11
+ P1 GrowLength Curriculum — seq 8→target, huge batch at short seq
12
+ P2 Reservoir Freezing — freeze recurrent gates as random ternary
13
+ P5 STE + AdamW — BitNet-paper training (replaces MeZO)



14
  P6 Aggressive Token Packing — zero padding waste
15
+ P10 Progressive Looping — Parcae loops 1→2→3 during training
16
+ P11 NaN-safe training — skip + recover on gradient explosion
 
 
 
 
 
 
17
  """
18
 
19
  from __future__ import annotations
 
64
  return build_model_from_args(args)
65
 
66
 
 
 
 
 
67
  def train_hyper(args):
68
  model, config = build_model(args)
69
  counts = model.count_parameters()
70
 
71
  print("=" * 65)
72
+ print(f"CHIMERA 5.3 HYPER v4 — scale={args.scale} bf16={args.bf16}")
73
  print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
74
  f"vocab={config['vocab_size']} target_seq={args.seq_len}")
75
  print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
 
79
  f"span={model.span_engine is not None}")
80
  print("=" * 65)
81
 
82
+ # ── Parcae: start at 1 loop, progressive scheduler will increase to 2→3
 
83
  patch_training_loops(model, num_loops=1)
84
+ print(f"[P10] Progressive looping enabled (1→2→3)")
85
 
86
+ # ── P2: Reservoir Freezing
87
  if args.reservoir:
88
  frozen = apply_reservoir_freezing(model)
89
  print(f"[P2] Reservoir: froze {frozen:,} gate params")
 
91
  trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
92
  print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")
93
 
94
+ # ── P7: Progressive Unfreezing (OFF by default — counterproductive with backprop)
95
  unfreezer = None
96
  if args.progressive_unfreeze:
97
  unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
98
  active = sum(p.numel() for p in model.parameters() if p.requires_grad)
99
  print(f"[P7] Progressive unfreeze: {active:,} initially trainable")
100
+ else:
101
+ print(f"[P7] Progressive unfreeze: OFF (all layers train from start)")
102
 
103
+ # ── P1: GrowLength
104
  if args.growlength:
105
  stages = [
106
  (max(8, args.seq_len // 4), 0.30),
 
114
  grow = None
115
  initial_seq = args.seq_len
116
 
117
+ # ── Data
118
  tok_budget = args.max_tokens or max(500_000,
119
  args.max_steps * args.batch_size * (args.seq_len + 1) * 4)
120
  token_buf = build_token_buffer(
 
126
  train_hyper_loop(args, model, config, dataset, initial_seq, grow, unfreezer)
127
 
128
 
 
 
 
 
129
  def cli():
130
+ p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4")
131
  p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
132
  p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
133
  p.add_argument("--seq_len", type=int, default=64)
134
  p.add_argument("--batch_size", type=int, default=8)
135
+ p.add_argument("--lr", type=float, default=1.5e-3) # ← BitNet-interpolated default
136
+ p.add_argument("--warmup", type=int, default=750) # ← BitNet paper-exact
137
  p.add_argument("--max_steps", type=int, default=5000)
138
  p.add_argument("--max_tokens", type=int, default=None)
139
  p.add_argument("--max_samples", type=int, default=None)
 
153
  g.add_argument("--growlength", action="store_true", default=False)
154
  g.add_argument("--reservoir", action="store_true", default=False)
155
  g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
156
+ # Progressive unfreeze: OFF by default (counterproductive with backprop)
157
  g.add_argument("--progressive-unfreeze", action="store_true", default=False,
158
  dest="progressive_unfreeze")
159
  g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")
 
168
  if args.all:
169
  args.growlength = True
170
  args.reservoir = True
171
+ # NOTE: progressive_unfreeze deliberately NOT set by --all
172
+ # It was designed for MeZO and is counterproductive with STE+AdamW
173
  if args.benchmark:
174
  args.growlength = True
175
  args.reservoir = True
 
176
  benchmark_hyper(args)
177
  else:
178
  train_hyper(args)