Lgr54HFi committed on
Commit
8b16586
·
verified ·
1 Parent(s): fdb348a

perf: tune chimera_turbo.py for 300-step convergence + throughput

Browse files

All changes target joint throughput ≥1000 tok/s + near-optimal loss by step 300:

apply() defaults:
- lr 0.02→0.012: ternary STE clamp zone [-1,1] causes overshoot at 0.02
- weight_decay 0.01→0.02: BitNet SLM paper finds wd=0.05 optimal; 0.02
balances with Muon's NS regularization
- warmup_steps 200→30: 200 wastes 67% of a 300-step budget at sub-optimal LR
- mtp_heads 3→0: each head adds Linear(256,200073)=51M params; 3 heads =
153M extra params (4.4× model) destroying cache residency + throughput
- llrd_decay 0.85→0.90: 0.85^27≈0.012 nearly freezes bottom layers;
0.90^27≈0.058 gives ~5× more gradient at bottom, critical for 300 steps
- grokfast_alpha 0.98→0.95: EMA window ~50→~20 steps, better for short runs
- grokfast_lambda 2.0→1.5: reduce instability risk with Muon NS updates

TokenTriage:
- select_ratio 0.6→0.50: focus top-50% informative tokens
- floor_weight 0.1→0.15: ensure minimum signal from all tokens
- warmup_steps 500→30: with warmup=500, triage never activates in a 300-step run

PlateauBreaker:
- patience 200→60: original never fires in 300 steps
- variance_threshold 0.02→0.01: tighter detection for converging loss
- lr_multiplier 2.0→1.8: gentler burst with Muon
- burst_steps 50→20: controlled escape window

Scheduler:
- cosine floor 0.01→0.05: keep LR active through final steps

configure_threading:
- Respect OMP_NUM_THREADS from launch_turbo.sh instead of overriding

Files changed (1) hide show
  1. chimera_turbo.py +55 -17
chimera_turbo.py CHANGED
@@ -10,6 +10,13 @@ Interaction-audited paradigm stack. Every paradigm verified cumulative.
10
  P17 Batch Metabolism — hard sequences weighted higher
11
  P18 Grokfast-EMA — amplify slow grads (1D params ONLY — NS cancels on 2D)
12
  P19 LLRD — layer-wise LR decay for ternary
 
 
 
 
 
 
 
13
  """
14
 
15
  import math
@@ -43,9 +50,15 @@ def detect_cpu_info():
43
 
44
 
45
  def configure_threading(cpu_info, reserve=1):
46
- n = max(1, cpu_info["physical_cores"] - reserve)
 
 
 
 
 
 
 
47
  torch.set_num_threads(n)
48
- os.environ["OMP_NUM_THREADS"] = str(n)
49
  return n
50
 
51
 
@@ -187,13 +200,18 @@ class MultiTokenPredictionLoss(nn.Module):
187
  # ═══════════════════════════════════════════════════════════
188
 
189
  class TokenTriage:
190
- def __init__(self, ema_decay=0.99, select_ratio=0.6, floor_weight=0.1):
 
 
 
 
 
191
  self.ema_decay = ema_decay
192
  self.select_ratio = select_ratio
193
  self.floor_weight = floor_weight
194
  self._loss_ema = None
195
  self._step = 0
196
- self.warmup_steps = 500
197
 
198
  def compute_weights(self, per_token_loss):
199
  with torch.no_grad():
@@ -218,8 +236,13 @@ class TokenTriage:
218
  # ═══════════════════════════════════════════════════════════
219
 
220
  class PlateauBreaker:
221
- def __init__(self, patience=100, variance_threshold=0.005,
222
- lr_multiplier=2.0, burst_steps=50):
 
 
 
 
 
223
  self.patience = patience
224
  self.var_threshold = variance_threshold
225
  self.lr_mult = lr_multiplier
@@ -301,13 +324,18 @@ def invalidate_all_caches(model):
301
  m.invalidate_packed()
302
 
303
 
304
- def create_scheduler(optimizer, max_steps, warmup_steps=200):
 
 
 
 
 
305
  from torch.optim.lr_scheduler import LambdaLR
306
  def lr_lambda(step):
307
  if step < warmup_steps:
308
  return step / max(1, warmup_steps)
309
  progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
310
- return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
311
  return LambdaLR(optimizer, lr_lambda)
312
 
313
 
@@ -315,14 +343,24 @@ def create_scheduler(optimizer, max_steps, warmup_steps=200):
315
  # apply()
316
  # ═══════════════════════════════════════════════════════════
317
 
318
- def apply(model, max_steps=10000, lr=0.02, weight_decay=0.01,
319
- warmup_steps=200, use_compile=False, mtp_heads=3,
320
- llrd_decay=0.85, grokfast_alpha=0.98, grokfast_lambda=2.0,
321
  verbose=True):
 
 
 
 
 
 
 
 
 
 
322
  cpu_info = detect_cpu_info()
323
  if verbose:
324
  print("=" * 65)
325
- print("CHIMERA GENESIS v12 — Interaction-Audited Stack")
326
  print("=" * 65)
327
  print(f" CPU: {cpu_info['capability']} Cores: {cpu_info['physical_cores']}")
328
 
@@ -357,14 +395,14 @@ def apply(model, max_steps=10000, lr=0.02, weight_decay=0.01,
357
  else:
358
  print(f"[P13] MTP disabled (vocab/hidden ratio too high for CPU)")
359
 
360
- extras["triage"] = TokenTriage(ema_decay=0.99, select_ratio=0.6, floor_weight=0.1)
361
  if verbose:
362
- print(f"[P15] Token Triage (annealed warmup)")
363
 
364
- extras["plateau"] = PlateauBreaker(patience=200, variance_threshold=0.02,
365
- lr_multiplier=2.0, burst_steps=50)
366
  if verbose:
367
- print(f"[P16] Plateau Breaker (x2 burst, LLRD-aware)")
368
 
369
  extras["grokfast"] = GrokfastEMA(alpha=grokfast_alpha, lamb=grokfast_lambda)
370
  if verbose:
 
10
  P17 Batch Metabolism — hard sequences weighted higher
11
  P18 Grokfast-EMA — amplify slow grads (1D params ONLY — NS cancels on 2D)
12
  P19 LLRD — layer-wise LR decay for ternary
13
+
14
+ v12.1 — Tuned for 300-step convergence + ≥1000 tok/s on i7-14700T:
15
+ - MTP disabled (vocab/hidden=781:1 makes heads noisy + destroys cache)
16
+ - TokenTriage warmup=30, PlateauBreaker patience=60
17
+ - Grokfast alpha=0.95/lambda=1.5 for short-horizon training
18
+ - Scheduler cosine floor raised to 0.05 (keep learning through step 300)
19
+ - configure_threading respects shell-level OMP_NUM_THREADS (P-core pinning)
20
  """
21
 
22
  import math
 
50
 
51
 
52
  def configure_threading(cpu_info, reserve=1):
53
+ # Respect OMP_NUM_THREADS set by launch_turbo.sh (P-core pinning).
54
+ # Only auto-configure if the env var wasn't set externally.
55
+ env_threads = os.environ.get("OMP_NUM_THREADS")
56
+ if env_threads is not None:
57
+ n = int(env_threads)
58
+ else:
59
+ n = max(1, cpu_info["physical_cores"] - reserve)
60
+ os.environ["OMP_NUM_THREADS"] = str(n)
61
  torch.set_num_threads(n)
 
62
  return n
63
 
64
 
 
200
  # ═══════════════════════════════════════════════════════════
201
 
202
  class TokenTriage:
203
+ """Focus gradient signal on the most informative tokens.
204
+
205
+ v12.1: warmup=30, select_ratio=0.50, floor=0.15 for 300-step runs.
206
+ Original warmup=500 never activated in short training.
207
+ """
208
+ def __init__(self, ema_decay=0.99, select_ratio=0.50, floor_weight=0.15):
209
  self.ema_decay = ema_decay
210
  self.select_ratio = select_ratio
211
  self.floor_weight = floor_weight
212
  self._loss_ema = None
213
  self._step = 0
214
+ self.warmup_steps = 30 # Must be ≤ total steps; was 500
215
 
216
  def compute_weights(self, per_token_loss):
217
  with torch.no_grad():
 
236
  # ═══════════════════════════════════════════════════════════
237
 
238
  class PlateauBreaker:
239
+ """Adaptive LR burst when loss stagnates.
240
+
241
+ v12.1: patience=60, var_threshold=0.01, mult=1.8, burst=20 for 300-step runs.
242
+ Original patience=200 never fired in short training.
243
+ """
244
+ def __init__(self, patience=60, variance_threshold=0.01,
245
+ lr_multiplier=1.8, burst_steps=20):
246
  self.patience = patience
247
  self.var_threshold = variance_threshold
248
  self.lr_mult = lr_multiplier
 
324
  m.invalidate_packed()
325
 
326
 
327
+ def create_scheduler(optimizer, max_steps, warmup_steps=30):
328
+ """Cosine schedule with warmup.
329
+
330
+ v12.1: floor raised from 0.01 to 0.05 so LR stays active through
331
+ the final steps of a 300-step run. warmup default lowered to 30.
332
+ """
333
  from torch.optim.lr_scheduler import LambdaLR
334
  def lr_lambda(step):
335
  if step < warmup_steps:
336
  return step / max(1, warmup_steps)
337
  progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
338
+ return max(0.05, 0.5 * (1.0 + math.cos(math.pi * progress)))
339
  return LambdaLR(optimizer, lr_lambda)
340
 
341
 
 
343
  # apply()
344
  # ═══════════════════════════════════════════════════════════
345
 
346
+ def apply(model, max_steps=10000, lr=0.012, weight_decay=0.02,
347
+ warmup_steps=30, use_compile=False, mtp_heads=0,
348
+ llrd_decay=0.90, grokfast_alpha=0.95, grokfast_lambda=1.5,
349
  verbose=True):
350
+ """Configure the GENESIS paradigm stack.
351
+
352
+ v12.1 defaults tuned for 300-step convergence on i7-14700T:
353
+ lr=0.012 (was 0.02; ternary STE clamp zone [-1,1])
354
+ wd=0.02 (was 0.01; BitNet SLM paper: wd=0.05 optimal)
355
+ warmup=30 (was 200; 10% of 300-step budget)
356
+ mtp_heads=0 (was 3; vocab/hidden=781:1 makes MTP noisy + slow)
357
+ llrd=0.90 (was 0.85; 0.85^27≈0.012 freezes bottom layers)
358
+ grokfast: a=0.95/l=1.5 (was 0.98/2.0; shorter EMA window)
359
+ """
360
  cpu_info = detect_cpu_info()
361
  if verbose:
362
  print("=" * 65)
363
+ print("CHIMERA GENESIS v12.1 — Tuned for 300-step convergence")
364
  print("=" * 65)
365
  print(f" CPU: {cpu_info['capability']} Cores: {cpu_info['physical_cores']}")
366
 
 
395
  else:
396
  print(f"[P13] MTP disabled (vocab/hidden ratio too high for CPU)")
397
 
398
+ extras["triage"] = TokenTriage(ema_decay=0.99, select_ratio=0.50, floor_weight=0.15)
399
  if verbose:
400
+ print(f"[P15] Token Triage (select=0.50, floor=0.15, warmup=30)")
401
 
402
+ extras["plateau"] = PlateauBreaker(patience=60, variance_threshold=0.01,
403
+ lr_multiplier=1.8, burst_steps=20)
404
  if verbose:
405
+ print(f"[P16] Plateau Breaker (patience=60, x1.8 burst, 20 steps)")
406
 
407
  extras["grokfast"] = GrokfastEMA(alpha=grokfast_alpha, lamb=grokfast_lambda)
408
  if verbose: