Lgr54HFi committed on
Commit
995be31
·
verified ·
1 Parent(s): 5bfbb8a

fix: batch_size 32→4 base (GrowLength scales up, _safe_batch caps)

Browse files

With vocab=200073, the logits tensor dominates memory:
batch × seq × 200073 × 4 bytes

base batch_size=4, seq_len=128:
Stage 1: desired=4×(128/16)=32 → safe_batch caps to ~32 (ok, 0.41GB)
Stage 2: desired=4×(128/32)=16 → 16 (0.41GB)
Stage 3: desired=4×(128/64)=8 → 8 (0.41GB)
Stage 4: desired=4×1=4 → 4 (0.41GB)

Constant ~0.41 GB logits per step — safe within 32 GB with 227M params.

Files changed (1) hide show
  1. train_hyper.py +19 -28
train_hyper.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- Chimera 5.3 — HYPER CPU Training v4.1
4
  =======================================
5
 
6
  All ch1mera features active: 28 layers (GD/XM/TM/SK pattern), MoE,
@@ -18,7 +18,7 @@ Training paradigms:
18
  P16 Plateau Breaker — adaptive LR burst (patience=60)
19
  P18 Grokfast-EMA — amplify slow grads (alpha=0.95, lambda=1.5)
20
 
21
- v4.1Tuned for ≥1000 tok/s + near-optimal loss by step 300 on i7-14700T.
22
  """
23
 
24
  from __future__ import annotations
@@ -28,7 +28,6 @@ import os
28
 
29
  def _setup_cpu():
30
  n = os.cpu_count() or 4
31
- # Only set defaults; launch_turbo.sh overrides with P-core-only values
32
  os.environ.setdefault("OMP_NUM_THREADS", str(n))
33
  os.environ.setdefault("MKL_NUM_THREADS", str(n))
34
  os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0")
@@ -75,7 +74,7 @@ def train_hyper(args):
75
  counts = model.count_parameters()
76
 
77
  print("=" * 65)
78
- print(f"CHIMERA 5.3 HYPER v4.1 — scale={args.scale} bf16={args.bf16}")
79
  print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
80
  f"vocab={config['vocab_size']} target_seq={args.seq_len}")
81
  print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
@@ -85,7 +84,7 @@ def train_hyper(args):
85
  f"span={model.span_engine is not None}")
86
  print("=" * 65)
87
 
88
- # ── Parcae: lock to 1 loop for throughput (no progressive 1→2→3)
89
  patch_training_loops(model, num_loops=1)
90
  print(f"[P10] Looping locked to 1 (throughput > refinement at 300 steps)")
91
 
@@ -97,7 +96,7 @@ def train_hyper(args):
97
  trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
98
  print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")
99
 
100
- # ── P7: Progressive Unfreezing (OFF — all layers train from step 0)
101
  unfreezer = None
102
  if args.progressive_unfreeze:
103
  unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
@@ -108,19 +107,14 @@ def train_hyper(args):
108
 
109
  # ── P1: GrowLength — 4-stage front-loaded schedule ──
110
  if args.growlength:
111
- # v4.1: 4-stage schedule designed for joint throughput + convergence.
 
112
  #
113
- # Stage 1 (seq=16, 10%): Token co-occurrence, ternary weight stabilization.
114
- # batch=256 (=32×128/16), throughput ~2800 tok/s.
115
- # Stage 2 (seq=32, 15%): Short sentences, basic clause structure.
116
- # batch=128, throughput ~1200 tok/s.
117
- # Stage 3 (seq=64, 25%): Full sentences, narrative coherence.
118
- # batch=64, throughput ~650 tok/s.
119
- # Stage 4 (seq=128, 50%): Full TinyStories context, story-level structure.
120
- # batch=32, throughput ~350 tok/s.
121
- #
122
- # Constant tokens/step = 4096 across all stages.
123
- # Weighted-average throughput ≥ 1000 tok/s.
124
  stages = [
125
  (16, 0.10), # 30 steps at seq=16
126
  (32, 0.15), # 45 steps at seq=32
@@ -147,14 +141,14 @@ def train_hyper(args):
147
 
148
 
149
  def cli():
150
- p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4.1")
151
  p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
152
  p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
153
- p.add_argument("--seq_len", type=int, default=128) # was 64; full TinyStories context
154
- p.add_argument("--batch_size", type=int, default=32) # was 8; GrowLength scales this up
155
- p.add_argument("--lr", type=float, default=1.2e-2) # was 1.5e-3; Muon optimal for ternary
156
- p.add_argument("--warmup", type=int, default=30) # was 750; 10% of 300 steps
157
- p.add_argument("--max_steps", type=int, default=300) # was 5000; convergence target
158
  p.add_argument("--max_tokens", type=int, default=None)
159
  p.add_argument("--max_samples", type=int, default=None)
160
  p.add_argument("--bf16", action="store_true", default=True)
@@ -165,7 +159,7 @@ def cli():
165
  p.add_argument("--text_column", default="auto")
166
  p.add_argument("--cache_dir", default="./cache")
167
  p.add_argument("--log_every", type=int, default=10)
168
- p.add_argument("--save_every", type=int, default=100) # was 1000; more frequent for 300 steps
169
  p.add_argument("--output_dir", default="./chimera_hyper_output")
170
 
171
  g = p.add_argument_group("paradigms")
@@ -173,7 +167,6 @@ def cli():
173
  g.add_argument("--growlength", action="store_true", default=False)
174
  g.add_argument("--reservoir", action="store_true", default=False)
175
  g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
176
- # Progressive unfreeze: OFF by default (counterproductive with backprop)
177
  g.add_argument("--progressive-unfreeze", action="store_true", default=False,
178
  dest="progressive_unfreeze")
179
  g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")
@@ -188,8 +181,6 @@ if __name__ == "__main__":
188
  if args.all:
189
  args.growlength = True
190
  args.reservoir = True
191
- # NOTE: progressive_unfreeze deliberately NOT set by --all
192
- # It was designed for MeZO and is counterproductive with STE+Muon
193
  if args.benchmark:
194
  args.growlength = True
195
  args.reservoir = True
 
1
  #!/usr/bin/env python3
2
  """
3
+ Chimera 5.3 — HYPER CPU Training v4.2
4
  =======================================
5
 
6
  All ch1mera features active: 28 layers (GD/XM/TM/SK pattern), MoE,
 
18
  P16 Plateau Breaker — adaptive LR burst (patience=60)
19
  P18 Grokfast-EMA — amplify slow grads (alpha=0.95, lambda=1.5)
20
 
21
+ v4.2 — Memory-safe batch sizing for vocab=200073 on 32 GB RAM.
22
  """
23
 
24
  from __future__ import annotations
 
28
 
29
  def _setup_cpu():
30
  n = os.cpu_count() or 4
 
31
  os.environ.setdefault("OMP_NUM_THREADS", str(n))
32
  os.environ.setdefault("MKL_NUM_THREADS", str(n))
33
  os.environ.setdefault("KMP_AFFINITY", "granularity=fine,compact,1,0")
 
74
  counts = model.count_parameters()
75
 
76
  print("=" * 65)
77
+ print(f"CHIMERA 5.3 HYPER v4.2 — scale={args.scale} bf16={args.bf16}")
78
  print(f"Layers={config['num_hidden_layers']} hidden={config['hidden_size']} "
79
  f"vocab={config['vocab_size']} target_seq={args.seq_len}")
80
  print(f"Threads: {torch.get_num_threads()} IPEX={_HAS_IPEX}")
 
84
  f"span={model.span_engine is not None}")
85
  print("=" * 65)
86
 
87
+ # ── Parcae: lock to 1 loop for throughput
88
  patch_training_loops(model, num_loops=1)
89
  print(f"[P10] Looping locked to 1 (throughput > refinement at 300 steps)")
90
 
 
96
  trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
97
  print(f"[INFO] Trainable: {trainable:,} / {counts['total']:,}")
98
 
99
+ # ── P7: Progressive Unfreezing (OFF)
100
  unfreezer = None
101
  if args.progressive_unfreeze:
102
  unfreezer = ProgressiveUnfreezer(model, args.max_steps, args.unfreeze_stages)
 
107
 
108
  # ── P1: GrowLength — 4-stage front-loaded schedule ──
109
  if args.growlength:
110
+ # v4.2: batch sizes are base × (target_seq / stage_seq), then
111
+ # capped by _safe_batch() in loops.py to stay under 2 GB logits.
112
  #
113
+ # With base batch=4, target_seq=128, vocab=200073:
114
+ # Stage 1: seq=16, desired=32 → logits=0.41GB ✓
115
+ # Stage 2: seq=32, desired=16 → logits=0.41GB ✓
116
+ # Stage 3: seq=64, desired=8 → logits=0.41GB ✓
117
+ # Stage 4: seq=128, desired=4 → logits=0.41GB ✓
 
 
 
 
 
 
118
  stages = [
119
  (16, 0.10), # 30 steps at seq=16
120
  (32, 0.15), # 45 steps at seq=32
 
141
 
142
 
143
  def cli():
144
+ p = argparse.ArgumentParser(description="Chimera 5.3 HYPER v4.2")
145
  p.add_argument("--config", default=str(DEFAULT_CONFIG_PATH))
146
  p.add_argument("--scale", default="tiny", choices=["tiny", "small", "medium", "full"])
147
+ p.add_argument("--seq_len", type=int, default=128)
148
+ p.add_argument("--batch_size", type=int, default=4) # base batch; GrowLength scales up
149
+ p.add_argument("--lr", type=float, default=1.2e-2)
150
+ p.add_argument("--warmup", type=int, default=30)
151
+ p.add_argument("--max_steps", type=int, default=300)
152
  p.add_argument("--max_tokens", type=int, default=None)
153
  p.add_argument("--max_samples", type=int, default=None)
154
  p.add_argument("--bf16", action="store_true", default=True)
 
159
  p.add_argument("--text_column", default="auto")
160
  p.add_argument("--cache_dir", default="./cache")
161
  p.add_argument("--log_every", type=int, default=10)
162
+ p.add_argument("--save_every", type=int, default=100)
163
  p.add_argument("--output_dir", default="./chimera_hyper_output")
164
 
165
  g = p.add_argument_group("paradigms")
 
167
  g.add_argument("--growlength", action="store_true", default=False)
168
  g.add_argument("--reservoir", action="store_true", default=False)
169
  g.add_argument("--mezo-eps", type=float, default=1e-3, dest="mezo_eps")
 
170
  g.add_argument("--progressive-unfreeze", action="store_true", default=False,
171
  dest="progressive_unfreeze")
172
  g.add_argument("--unfreeze-stages", type=int, default=4, dest="unfreeze_stages")
 
181
  if args.all:
182
  args.growlength = True
183
  args.reservoir = True
 
 
184
  if args.benchmark:
185
  args.growlength = True
186
  args.reservoir = True