perf: tune chimera_turbo.py for 300-step convergence + throughput

All changes jointly target ≥1000 tok/s throughput and near-optimal loss by step 300:
apply() defaults:
- lr 0.02→0.012: the ternary STE clamp zone [-1, 1] causes overshoot at 0.02
- weight_decay 0.01→0.02: the BitNet SLM paper finds wd=0.05 optimal; 0.02
  balances that against Muon's NS regularization
- warmup_steps 200→30: 200 wastes 67% of a 300-step budget at a sub-optimal LR
- mtp_heads 3→0: each head adds Linear(256, 200073) = 51M params; 3 heads =
  153M extra params (4.4× the model), destroying cache residency + throughput
- llrd_decay 0.85→0.90: 0.85^27 ≈ 0.012 nearly freezes the bottom layers;
  0.90^27 ≈ 0.058 gives roughly 5× more gradient at the bottom, critical for
  300 steps (both calculations are checked in the sketch after this list)
- grokfast_alpha 0.98→0.95: EMA window ~50 → ~20 steps, better for short runs
- grokfast_lambda 2.0→1.5: reduce instability risk with Muon NS updates
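
The mtp_heads and llrd_decay numbers above can be sanity-checked directly; the
snippet below uses only the figures quoted in this list (hidden=256,
vocab=200073, exponent 27) and is not code from chimera_turbo.py:

```python
# Sanity check of the mtp_heads and llrd_decay figures quoted above.
hidden, vocab, depth = 256, 200073, 27

per_head = hidden * vocab                  # one MTP head: 51,218,688 ≈ 51M params
print(per_head, 3 * per_head)              # 3 heads ≈ 153M extra params

print(0.85 ** depth, 0.90 ** depth)        # ≈ 0.012 vs ≈ 0.058 bottom-layer LR factor
print(round(0.90 ** depth / 0.85 ** depth, 1))  # ≈ 4.7, i.e. roughly 5x more gradient
```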
TokenTriage:
- select_ratio 0.6→0.50: focus on the top 50% most informative tokens
- floor_weight 0.1→0.15: ensure a minimum signal from all tokens
- warmup_steps 500→30: the original never activates in a 300-step run
  (a stand-alone sketch of the weighting follows this list)
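
The weighting scheme itself is not part of this diff; purely as an illustration
of what select_ratio and floor_weight mean, a minimal stand-alone sketch (not
the actual TokenTriage.compute_weights) might look like this:

```python
import torch

def triage_weights(per_token_loss, select_ratio=0.50, floor_weight=0.15):
    # Hypothetical illustration: the top select_ratio of tokens (ranked by
    # per-token loss) keep full weight, everything else is floored.
    with torch.no_grad():
        k = max(1, int(select_ratio * per_token_loss.numel()))
        threshold = torch.topk(per_token_loss.flatten(), k).values.min()
        return torch.where(per_token_loss >= threshold,
                           torch.ones_like(per_token_loss),
                           torch.full_like(per_token_loss, floor_weight))
```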
PlateauBreaker:
- patience 200→60: the original never fires within 300 steps
- variance_threshold 0.02→0.01: tighter detection for a converging loss
- lr_multiplier 2.0→1.8: gentler burst with Muon
- burst_steps 50→20: controlled escape window (intended behavior sketched below)
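
The breaker's internals are also outside this diff; the sketch below is a
hypothetical stand-in for how patience, variance_threshold, lr_multiplier and
burst_steps are intended to interact (class and method names are invented for
illustration only):

```python
from collections import deque

class PlateauBreakerSketch:
    """Hypothetical stand-in showing the patience/variance/burst semantics."""
    def __init__(self, patience=60, variance_threshold=0.01,
                 lr_multiplier=1.8, burst_steps=20):
        self.losses = deque(maxlen=patience)
        self.var_threshold = variance_threshold
        self.lr_mult = lr_multiplier
        self.burst_steps = burst_steps
        self.burst_remaining = 0

    def step(self, loss):
        """Return the LR scale to apply this step (1.0 = no burst active)."""
        self.losses.append(loss)
        if self.burst_remaining > 0:
            self.burst_remaining -= 1
            return self.lr_mult
        if len(self.losses) == self.losses.maxlen:
            mean = sum(self.losses) / len(self.losses)
            var = sum((x - mean) ** 2 for x in self.losses) / len(self.losses)
            if var < self.var_threshold:      # loss has flat-lined
                self.burst_remaining = self.burst_steps
                self.losses.clear()
        return 1.0
```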
Scheduler:
- cosine floor 0.01→0.05: keep the LR active through the final steps
  (numeric check below)
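
A quick numeric check of what the higher floor changes at the tail of a
300-step run, using the same lr_lambda as in the diff below:

```python
import math

def lr_lambda(step, max_steps=300, warmup_steps=30):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
    return max(0.05, 0.5 * (1.0 + math.cos(math.pi * progress)))

print(lr_lambda(150))   # ≈ 0.59: mid-run, the floor has no effect yet
print(lr_lambda(290))   # 0.05: floor already active near the end
print(lr_lambda(300))   # 0.05 instead of 0.01 under the old floor
```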
configure_threading:
- Respect OMP_NUM_THREADS from launch_turbo.sh instead of overriding
chimera_turbo.py  +55 -17

@@ -10,6 +10,13 @@ Interaction-audited paradigm stack. Every paradigm verified cumulative.
 P17 Batch Metabolism — hard sequences weighted higher
 P18 Grokfast-EMA — amplify slow grads (1D params ONLY — NS cancels on 2D)
 P19 LLRD — layer-wise LR decay for ternary
+
+v12.1 — Tuned for 300-step convergence + ≥1000 tok/s on i7-14700T:
+  - MTP disabled (vocab/hidden=781:1 makes heads noisy + destroys cache)
+  - TokenTriage warmup=30, PlateauBreaker patience=60
+  - Grokfast alpha=0.95/lambda=1.5 for short-horizon training
+  - Scheduler cosine floor raised to 0.05 (keep learning through step 300)
+  - configure_threading respects shell-level OMP_NUM_THREADS (P-core pinning)
 """
 
 import math
@@ -43,9 +50,15 @@ def detect_cpu_info():
 
 
 def configure_threading(cpu_info, reserve=1):
-    n = max(1, cpu_info["physical_cores"] - reserve)
+    # Respect OMP_NUM_THREADS set by launch_turbo.sh (P-core pinning).
+    # Only auto-configure if the env var wasn't set externally.
+    env_threads = os.environ.get("OMP_NUM_THREADS")
+    if env_threads is not None:
+        n = int(env_threads)
+    else:
+        n = max(1, cpu_info["physical_cores"] - reserve)
+    os.environ["OMP_NUM_THREADS"] = str(n)
     torch.set_num_threads(n)
-    os.environ["OMP_NUM_THREADS"] = str(n)
     return n
 
 
@@ -187,13 +200,18 @@ class MultiTokenPredictionLoss(nn.Module):
 # ───────────────────────────────────────────────────────────
 
 class TokenTriage:
-    def __init__(self, ema_decay=0.99, select_ratio=0.6, floor_weight=0.1):
+    """Focus gradient signal on the most informative tokens.
+
+    v12.1: warmup=30, select_ratio=0.50, floor=0.15 for 300-step runs.
+    Original warmup=500 never activated in short training.
+    """
+    def __init__(self, ema_decay=0.99, select_ratio=0.50, floor_weight=0.15):
         self.ema_decay = ema_decay
         self.select_ratio = select_ratio
         self.floor_weight = floor_weight
         self._loss_ema = None
         self._step = 0
-        self.warmup_steps = 500
+        self.warmup_steps = 30  # Must be ≤ total steps; was 500
 
     def compute_weights(self, per_token_loss):
         with torch.no_grad():
@@ -218,8 +236,13 @@ class TokenTriage:
 # ───────────────────────────────────────────────────────────
 
 class PlateauBreaker:
-    def __init__(self, patience=200, variance_threshold=0.02,
-                 lr_multiplier=2.0, burst_steps=50):
+    """Adaptive LR burst when loss stagnates.
+
+    v12.1: patience=60, var_threshold=0.01, mult=1.8, burst=20 for 300-step runs.
+    Original patience=200 never fired in short training.
+    """
+    def __init__(self, patience=60, variance_threshold=0.01,
+                 lr_multiplier=1.8, burst_steps=20):
        self.patience = patience
        self.var_threshold = variance_threshold
        self.lr_mult = lr_multiplier
@@ -301,13 +324,18 @@ def invalidate_all_caches(model):
             m.invalidate_packed()
 
 
-def create_scheduler(optimizer, max_steps, warmup_steps=200):
+def create_scheduler(optimizer, max_steps, warmup_steps=30):
+    """Cosine schedule with warmup.
+
+    v12.1: floor raised from 0.01 to 0.05 so LR stays active through
+    the final steps of a 300-step run. warmup default lowered to 30.
+    """
     from torch.optim.lr_scheduler import LambdaLR
     def lr_lambda(step):
         if step < warmup_steps:
             return step / max(1, warmup_steps)
         progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
-        return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
+        return max(0.05, 0.5 * (1.0 + math.cos(math.pi * progress)))
     return LambdaLR(optimizer, lr_lambda)
 
 
@@ -315,14 +343,24 @@ def create_scheduler(optimizer, max_steps, warmup_steps=200):
 # apply()
 # ───────────────────────────────────────────────────────────
 
-def apply(model, max_steps=10000, lr=0.02, weight_decay=0.01,
-          warmup_steps=200, use_compile=False, mtp_heads=3,
-          llrd_decay=0.85, grokfast_alpha=0.98, grokfast_lambda=2.0,
+def apply(model, max_steps=10000, lr=0.012, weight_decay=0.02,
+          warmup_steps=30, use_compile=False, mtp_heads=0,
+          llrd_decay=0.90, grokfast_alpha=0.95, grokfast_lambda=1.5,
           verbose=True):
+    """Configure the GENESIS paradigm stack.
+
+    v12.1 defaults tuned for 300-step convergence on i7-14700T:
+      lr=0.012    (was 0.02; ternary STE clamp zone [-1,1])
+      wd=0.02     (was 0.01; BitNet SLM paper: wd=0.05 optimal)
+      warmup=30   (was 200; 10% of 300-step budget)
+      mtp_heads=0 (was 3; vocab/hidden=781:1 makes MTP noisy + slow)
+      llrd=0.90   (was 0.85; 0.85^27≈0.012 freezes bottom layers)
+      grokfast: a=0.95/l=1.5 (was 0.98/2.0; shorter EMA window)
+    """
     cpu_info = detect_cpu_info()
     if verbose:
         print("=" * 65)
-        print("CHIMERA GENESIS v12 —
+        print("CHIMERA GENESIS v12.1 — Tuned for 300-step convergence")
         print("=" * 65)
         print(f" CPU: {cpu_info['capability']} Cores: {cpu_info['physical_cores']}")
 
@@ -357,14 +395,14 @@ def apply(model, max_steps=10000, lr=0.02, weight_decay=0.01,
     else:
         print(f"[P13] MTP disabled (vocab/hidden ratio too high for CPU)")
 
-    extras["triage"] = TokenTriage(ema_decay=0.99, select_ratio=0.6, floor_weight=0.1)
+    extras["triage"] = TokenTriage(ema_decay=0.99, select_ratio=0.50, floor_weight=0.15)
     if verbose:
-        print(f"[P15] Token Triage (
+        print(f"[P15] Token Triage (select=0.50, floor=0.15, warmup=30)")
 
-    extras["plateau"] = PlateauBreaker(patience=200, variance_threshold=0.02,
-                                       lr_multiplier=2.0, burst_steps=50)
+    extras["plateau"] = PlateauBreaker(patience=60, variance_threshold=0.01,
+                                       lr_multiplier=1.8, burst_steps=20)
     if verbose:
-        print(f"[P16] Plateau Breaker (
+        print(f"[P16] Plateau Breaker (patience=60, x1.8 burst, 20 steps)")
 
     extras["grokfast"] = GrokfastEMA(alpha=grokfast_alpha, lamb=grokfast_lambda)
     if verbose: