Lgr54HFi committed
Commit 64db48c · verified · 1 Parent(s): 58f6f80

perf: BitNet-paper hyperparams — β2=0.98, wd=0.01, warmup=750, grad_clip=1.0, NaN-safe

Aligned with BitNet training recipe (2310.11453 Table 5-6):
- β2: 0.95→0.98 (all BitNet papers use 0.98, critical for ternary noise)
- wd: 0.05→0.01 (original BitNet; Reloaded uses 0.05, but 0.01 is more stable)
- warmup: 500→750 fixed steps (paper-exact)
- grad_clip: 0.5→1.0 (papers use none, but we keep a light clip for safety)
- Default LR: 1.5e-3 (interpolated 125M→2.4e-3, 350M→1.2e-3)
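For reference, a minimal call into the patched apply() using this commit's new defaults (a usage sketch only; model is assumed to be an already-constructed ch1mera 5.3 nn.Module, and every keyword below simply restates the new default value):

    import chimera_turbo

    model, optimizer, scheduler = chimera_turbo.apply(
        model,              # assumed: existing ch1mera nn.Module
        max_steps=10_000,   # horizon for the cosine decay
        lr=1.5e-3,          # interpolated between BitNet 125M (2.4e-3) and 350M (1.2e-3)
        weight_decay=0.01,  # original BitNet value
        warmup_steps=750,   # fixed warmup, paper-exact
    )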

Files changed (1)
  1. chimera_turbo.py +31 -33
chimera_turbo.py CHANGED
@@ -2,7 +2,7 @@
 chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
 Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
 
-v7: NaN-safe training step — skip optimizer on NaN loss, sanitize grads
+v8: BitNet-paper aligned hyperparams — β2=0.98, wd=0.01, warmup=750
 """
 
 import math
@@ -39,7 +39,6 @@ def detect_cpu_info() -> Dict[str, Any]:
     try:
         import intel_extension_for_pytorch
         info["ipex_available"] = True
-        info["ipex_version"] = intel_extension_for_pytorch.__version__
     except Exception:
         info["ipex_available"] = False
     info["tcmalloc"] = "tcmalloc" in os.environ.get("LD_PRELOAD", "")
@@ -55,9 +54,19 @@ def configure_threading(cpu_info: Dict[str, Any], reserve_for_io: int = 1):
 
 
 def create_optimizer(
-    model: nn.Module, lr: float = 1e-3, weight_decay: float = 0.05,
-    use_lion: bool = False, betas: Tuple[float, float] = (0.9, 0.95),
+    model: nn.Module,
+    lr: float = 1.5e-3,  # BitNet interpolated: 125M→2.4e-3, 350M→1.2e-3
+    weight_decay: float = 0.01,  # ← BitNet original (2310.11453 Table 5)
+    use_lion: bool = False,
+    betas: Tuple[float, float] = (0.9, 0.98),  # ← BitNet: β2=0.98, NOT 0.95/0.999
 ) -> torch.optim.Optimizer:
+    """AdamW with BitNet-paper hyperparameters.
+
+    Key differences from standard:
+    - β2=0.98 (not 0.999): faster variance adaptation for ternary noise
+    - wd=0.01: original BitNet paper value, more stable than 0.05 for from-scratch training
+    - lr=1.5e-3: interpolated from BitNet Table 5 (125M→2.4e-3, 350M→1.2e-3)
+    """
     decay_params, no_decay_params = [], []
     for name, param in model.named_parameters():
         if not param.requires_grad:
@@ -75,11 +84,12 @@ def create_optimizer(
             from lion_pytorch import Lion
             return Lion(param_groups, lr=lr * 0.3, betas=(0.95, 0.98))
         except ImportError:
-            warnings.warn("lion-pytorch not installed, falling back to AdamW")
-    return torch.optim.AdamW(param_groups, lr=lr, betas=betas, fused=False)
+            pass
+    return torch.optim.AdamW(param_groups, lr=lr, betas=betas, eps=1e-8, fused=False)
 
 
-def create_scheduler(optimizer, max_steps: int, warmup_steps: int = 500):
+def create_scheduler(optimizer, max_steps: int, warmup_steps: int = 750):
+    """Cosine decay with 750-step warmup (BitNet paper-exact)."""
     from torch.optim.lr_scheduler import LambdaLR
     def lr_lambda(step):
         if step < warmup_steps:
@@ -121,27 +131,26 @@ def try_compile_model(model: nn.Module, mode: str = "default") -> nn.Module:
 
 
 def apply(
-    model: nn.Module, max_steps: int = 10000, lr: float = 1e-3,
-    weight_decay: float = 0.05, warmup_steps: int = 500,
+    model: nn.Module, max_steps: int = 10000, lr: float = 1.5e-3,
+    weight_decay: float = 0.01, warmup_steps: int = 750,
     use_compile: bool = True, use_ipex: bool = True,
     use_lion: bool = False, verbose: bool = True,
 ) -> Tuple[nn.Module, torch.optim.Optimizer, Any]:
     cpu_info = detect_cpu_info()
     if verbose:
         print("=" * 65)
-        print("CHIMERA TURBO v7 — CPU Acceleration Layer")
+        print("CHIMERA TURBO v8 — BitNet-aligned hyperparams")
         print("=" * 65)
         print(f" Cores: {cpu_info['physical_cores']} CPU: {cpu_info['capability']}")
-        print(f" AMX: {cpu_info['has_amx']} AVX-512: {cpu_info['has_avx512']} BF16 hw: {cpu_info['has_avx512_bf16']}")
         print(f" IPEX: {cpu_info['ipex_available']} tcmalloc: {cpu_info['tcmalloc']}")
     n_threads = configure_threading(cpu_info)
     if verbose:
-        print(f"[TURBO-3] Compute threads: {n_threads}")
-    optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay, use_lion=use_lion)
+        print(f"[TURBO-3] Threads: {n_threads}")
+    optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay)
     scheduler = create_scheduler(optimizer, max_steps=max_steps, warmup_steps=warmup_steps)
     if verbose:
         n_params = sum(p.numel() for g in optimizer.param_groups for p in g["params"])
-        print(f"[TURBO-1] AdamW (lr={lr}, wd={weight_decay}) — {n_params:,} params")
+        print(f"[TURBO-1] AdamW (lr={lr}, β=(0.9,0.98), wd={weight_decay}) — {n_params:,} params")
     if use_ipex:
         model, optimizer = try_ipex_optimize(model, optimizer, cpu_info)
     if use_compile:
@@ -155,7 +164,6 @@ def apply(
     return model, optimizer, scheduler
 
 
-# Track consecutive NaN count for emergency recovery
 _nan_count = 0
 _MAX_CONSECUTIVE_NAN = 5
 
@@ -163,16 +171,13 @@ _MAX_CONSECUTIVE_NAN = 5
 def training_step(
     model: nn.Module, batch, optimizer: torch.optim.Optimizer, scheduler,
     grad_accum_steps: int = 1, step: int = 0,
-    max_grad_norm: float = 0.5,
+    max_grad_norm: float = 1.0,  # ← raised back to 1.0 (papers use none, this is light)
    autocast_dtype: Optional[torch.dtype] = torch.bfloat16,
 ) -> float:
-    """Training step with NaN detection and recovery.
+    """NaN-safe training step with BitNet-aligned grad clipping.
 
-    If loss is NaN/Inf:
-    - Zero all gradients (prevent NaN from contaminating weights)
-    - Skip optimizer.step() entirely
-    - Return previous valid loss value
-    - After 5 consecutive NaN: halve the learning rate as emergency fix
+    BitNet papers use NO grad clipping. We keep a light clip (1.0) as a safety
+    net for the evolution engine side-effects, but it should rarely activate.
     """
     global _nan_count
 
@@ -192,28 +197,21 @@ def training_step(
     # ── NaN detection ──
     if not math.isfinite(loss_val):
         _nan_count += 1
-        # Don't backward NaN — it would poison all gradients
         optimizer.zero_grad(set_to_none=True)
-
         if _nan_count >= _MAX_CONSECUTIVE_NAN:
-            # Emergency: halve LR to try to recover
             for pg in optimizer.param_groups:
                 pg["lr"] *= 0.5
-            new_lr = optimizer.param_groups[0]["lr"]
-            print(f" [NaN] {_nan_count} consecutive — emergency LR halved to {new_lr:.2e}")
+            print(f" [NaN] {_nan_count}x — LR halved to {optimizer.param_groups[0]['lr']:.2e}")
             _nan_count = 0
-
-        return loss_val  # Return NaN so logging shows it, but weights are safe
-
-    # ── Normal path ──
-    _nan_count = 0  # Reset counter on valid loss
+        return loss_val
+
+    _nan_count = 0
 
     if grad_accum_steps > 1:
         loss = loss / grad_accum_steps
-
     loss.backward()
 
-    # Sanitize gradients: replace any NaN/Inf grads with zero
+    # Sanitize any NaN grads from the evolution engine
     for p in model.parameters():
         if p.grad is not None and not torch.isfinite(p.grad).all():
             p.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)
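Note: the create_scheduler hunk above shows only the warmup branch of lr_lambda, while the docstring states cosine decay after the 750-step warmup. A minimal sketch of that standard warmup-plus-cosine shape, assuming the unshown decay branch follows the usual form (not the file's exact code):

    import math
    from torch.optim.lr_scheduler import LambdaLR

    def cosine_with_warmup(optimizer, max_steps: int, warmup_steps: int = 750):
        # Linear warmup to the base LR, then cosine decay toward zero.
        def lr_lambda(step):
            if step < warmup_steps:
                return (step + 1) / max(1, warmup_steps)
            progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
            return 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
        return LambdaLR(optimizer, lr_lambda)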