Lgr54HFi committed on
Commit
58f6f80
·
verified ·
1 Parent(s): a97a233

fix: NaN skip + grad sanitization — detect NaN loss, zero corrupted grads, skip optimizer step

When a rare batch produces a NaN loss (step 380/500), the backward pass
contaminates all gradients with NaN. Without detection, optimizer.step()
pushes all weights to NaN → irrecoverable.

Fix: check the loss for NaN/Inf before backward. If detected, zero the grads
and skip the optimizer step. Training recovers on the next batch.

Browse files
Files changed (1) hide show
  1. chimera_turbo.py +51 -31
chimera_turbo.py CHANGED
@@ -2,15 +2,7 @@
2
  chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
3
  Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
4
 
5
- Paradigmes intégrés:
6
- P-TURBO-1: STE + AdamW (remplace MeZO → fix convergence + 50x moins de forwards)
7
- P-TURBO-2: torch.compile mode=default (CPU-safe, no CUDA graph pool)
8
- P-TURBO-3: Threading optimal + tcmalloc detection
9
- P-TURBO-4: IPEX bf16/AMX si disponible
10
- P-TURBO-5: Invalidate BitLinear packed caches after optimizer step
11
- P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
12
-
13
- v6: lower max_grad_norm 1.0→0.5, clamp-aware STE in quantization.py
14
  """
15
 
16
  import math
@@ -106,23 +98,13 @@ def invalidate_all_caches(model: nn.Module):
106
 
107
  def try_ipex_optimize(model, optimizer, cpu_info, dtype=None):
108
  if not cpu_info.get("ipex_available"):
109
- print("[TURBO-4] IPEX not available — skipping")
110
  return model, optimizer
111
  try:
112
  import intel_extension_for_pytorch as ipex
113
  except Exception:
114
- print("[TURBO-4] IPEX import failed — skipping")
115
  return model, optimizer
116
  if dtype is None:
117
- if cpu_info["has_amx"]:
118
- dtype = torch.bfloat16
119
- print("[TURBO-4] IPEX + AMX bf16 enabled")
120
- elif cpu_info["has_avx512"]:
121
- dtype = torch.bfloat16
122
- print("[TURBO-4] IPEX + AVX-512 bf16 enabled")
123
- else:
124
- dtype = torch.float32
125
- print("[TURBO-4] IPEX fp32")
126
  model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=dtype, level="O1", inplace=True)
127
  return model, optimizer
128
 
@@ -134,8 +116,7 @@ def try_compile_model(model: nn.Module, mode: str = "default") -> nn.Module:
134
  compiled = torch.compile(model, backend="inductor", mode=mode, fullgraph=False)
135
  print(f"[TURBO-2] torch.compile enabled (mode={mode})")
136
  return compiled
137
- except Exception as e:
138
- warnings.warn(f"torch.compile failed: {e}. Eager mode.")
139
  return model
140
 
141
 
@@ -148,45 +129,56 @@ def apply(
148
  cpu_info = detect_cpu_info()
149
  if verbose:
150
  print("=" * 65)
151
- print("CHIMERA TURBO v6 — CPU Acceleration Layer")
152
  print("=" * 65)
153
  print(f" Cores: {cpu_info['physical_cores']} CPU: {cpu_info['capability']}")
154
  print(f" AMX: {cpu_info['has_amx']} AVX-512: {cpu_info['has_avx512']} BF16 hw: {cpu_info['has_avx512_bf16']}")
155
  print(f" IPEX: {cpu_info['ipex_available']} tcmalloc: {cpu_info['tcmalloc']}")
156
-
157
  n_threads = configure_threading(cpu_info)
158
  if verbose:
159
  print(f"[TURBO-3] Compute threads: {n_threads}")
160
-
161
  optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay, use_lion=use_lion)
162
  scheduler = create_scheduler(optimizer, max_steps=max_steps, warmup_steps=warmup_steps)
163
  if verbose:
164
  n_params = sum(p.numel() for g in optimizer.param_groups for p in g["params"])
165
  print(f"[TURBO-1] AdamW (lr={lr}, wd={weight_decay}) — {n_params:,} params")
166
-
167
  if use_ipex:
168
  model, optimizer = try_ipex_optimize(model, optimizer, cpu_info)
169
  if use_compile:
170
  model = try_compile_model(model, mode="default")
171
-
172
  if verbose:
173
  if not cpu_info["has_avx512_bf16"]:
174
  print(" ⚠️ No BF16 hw — use --no-bf16")
175
  if not cpu_info["tcmalloc"]:
176
  print(" ⚠️ No tcmalloc — LD_PRELOAD=...libtcmalloc.so.4 for +15%")
177
  print("=" * 65)
178
-
179
  return model, optimizer, scheduler
180
 
181
 
 
 
 
 
 
182
  def training_step(
183
  model: nn.Module, batch, optimizer: torch.optim.Optimizer, scheduler,
184
  grad_accum_steps: int = 1, step: int = 0,
185
- max_grad_norm: float = 0.5, # ← lowered from 1.0 to prevent NaN
186
  autocast_dtype: Optional[torch.dtype] = torch.bfloat16,
187
  ) -> float:
 
 
 
 
 
 
 
 
 
 
188
  is_accum_step = (step + 1) % grad_accum_steps == 0
189
  ctx = torch.autocast(device_type="cpu", dtype=autocast_dtype) if autocast_dtype else nullcontext()
 
190
  with ctx:
191
  if isinstance(batch, dict):
192
  outputs = model(batch["input_ids"], labels=batch.get("labels"))
@@ -196,13 +188,41 @@ def training_step(
196
  outputs = model(batch)
197
  loss = outputs if isinstance(outputs, torch.Tensor) else outputs.loss
198
  loss_val = loss.item()
199
- if grad_accum_steps > 1:
200
- loss = loss / grad_accum_steps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  loss.backward()
 
 
 
 
 
 
202
  if is_accum_step:
203
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
204
  optimizer.step()
205
  scheduler.step()
206
  optimizer.zero_grad(set_to_none=True)
207
  invalidate_all_caches(model)
 
208
  return loss_val
 
2
  chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
3
  Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
4
 
5
+ v7: NaN-safe training step — skip optimizer on NaN loss, sanitize grads
 
 
 
 
 
 
 
 
6
  """
7
 
8
  import math
 
98
 
99
  def try_ipex_optimize(model, optimizer, cpu_info, dtype=None):
100
  if not cpu_info.get("ipex_available"):
 
101
  return model, optimizer
102
  try:
103
  import intel_extension_for_pytorch as ipex
104
  except Exception:
 
105
  return model, optimizer
106
  if dtype is None:
107
+ dtype = torch.bfloat16 if (cpu_info["has_amx"] or cpu_info["has_avx512"]) else torch.float32
 
 
 
 
 
 
 
 
108
  model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=dtype, level="O1", inplace=True)
109
  return model, optimizer
110
 
 
116
  compiled = torch.compile(model, backend="inductor", mode=mode, fullgraph=False)
117
  print(f"[TURBO-2] torch.compile enabled (mode={mode})")
118
  return compiled
119
+ except Exception:
 
120
  return model
121
 
122
 
 
129
  cpu_info = detect_cpu_info()
130
  if verbose:
131
  print("=" * 65)
132
+ print("CHIMERA TURBO v7 — CPU Acceleration Layer")
133
  print("=" * 65)
134
  print(f" Cores: {cpu_info['physical_cores']} CPU: {cpu_info['capability']}")
135
  print(f" AMX: {cpu_info['has_amx']} AVX-512: {cpu_info['has_avx512']} BF16 hw: {cpu_info['has_avx512_bf16']}")
136
  print(f" IPEX: {cpu_info['ipex_available']} tcmalloc: {cpu_info['tcmalloc']}")
 
137
  n_threads = configure_threading(cpu_info)
138
  if verbose:
139
  print(f"[TURBO-3] Compute threads: {n_threads}")
 
140
  optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay, use_lion=use_lion)
141
  scheduler = create_scheduler(optimizer, max_steps=max_steps, warmup_steps=warmup_steps)
142
  if verbose:
143
  n_params = sum(p.numel() for g in optimizer.param_groups for p in g["params"])
144
  print(f"[TURBO-1] AdamW (lr={lr}, wd={weight_decay}) — {n_params:,} params")
 
145
  if use_ipex:
146
  model, optimizer = try_ipex_optimize(model, optimizer, cpu_info)
147
  if use_compile:
148
  model = try_compile_model(model, mode="default")
 
149
  if verbose:
150
  if not cpu_info["has_avx512_bf16"]:
151
  print(" ⚠️ No BF16 hw — use --no-bf16")
152
  if not cpu_info["tcmalloc"]:
153
  print(" ⚠️ No tcmalloc — LD_PRELOAD=...libtcmalloc.so.4 for +15%")
154
  print("=" * 65)
 
155
  return model, optimizer, scheduler
156
 
157
 
158
+ # Track consecutive NaN count for emergency recovery
159
+ _nan_count = 0
160
+ _MAX_CONSECUTIVE_NAN = 5
161
+
162
+
163
  def training_step(
164
  model: nn.Module, batch, optimizer: torch.optim.Optimizer, scheduler,
165
  grad_accum_steps: int = 1, step: int = 0,
166
+ max_grad_norm: float = 0.5,
167
  autocast_dtype: Optional[torch.dtype] = torch.bfloat16,
168
  ) -> float:
169
+ """Training step with NaN detection and recovery.
170
+
171
+ If loss is NaN/Inf:
172
+ - Zero all gradients (prevent NaN from contaminating weights)
173
+ - Skip optimizer.step() entirely
174
+ - Return the non-finite loss value as-is (so logging shows it; weights stay untouched)
175
+ - After 5 consecutive NaN: halve the learning rate as emergency fix
176
+ """
177
+ global _nan_count
178
+
179
  is_accum_step = (step + 1) % grad_accum_steps == 0
180
  ctx = torch.autocast(device_type="cpu", dtype=autocast_dtype) if autocast_dtype else nullcontext()
181
+
182
  with ctx:
183
  if isinstance(batch, dict):
184
  outputs = model(batch["input_ids"], labels=batch.get("labels"))
 
188
  outputs = model(batch)
189
  loss = outputs if isinstance(outputs, torch.Tensor) else outputs.loss
190
  loss_val = loss.item()
191
+
192
+ # ── NaN detection ──
193
+ if not math.isfinite(loss_val):
194
+ _nan_count += 1
195
+ # Don't backward NaN — it would poison all gradients
196
+ optimizer.zero_grad(set_to_none=True)
197
+
198
+ if _nan_count >= _MAX_CONSECUTIVE_NAN:
199
+ # Emergency: halve LR to try to recover
200
+ for pg in optimizer.param_groups:
201
+ pg["lr"] *= 0.5
202
+ new_lr = optimizer.param_groups[0]["lr"]
203
+ print(f" [NaN] {_nan_count} consecutive — emergency LR halved to {new_lr:.2e}")
204
+ _nan_count = 0
205
+
206
+ return loss_val # Return NaN so logging shows it, but weights are safe
207
+
208
+ # ── Normal path ──
209
+ _nan_count = 0 # Reset counter on valid loss
210
+
211
+ if grad_accum_steps > 1:
212
+ loss = loss / grad_accum_steps
213
+
214
  loss.backward()
215
+
216
+ # Sanitize gradients: replace any NaN/Inf grads with zero
217
+ for p in model.parameters():
218
+ if p.grad is not None and not torch.isfinite(p.grad).all():
219
+ p.grad.nan_to_num_(nan=0.0, posinf=0.0, neginf=0.0)
220
+
221
  if is_accum_step:
222
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
223
  optimizer.step()
224
  scheduler.step()
225
  optimizer.zero_grad(set_to_none=True)
226
  invalidate_all_caches(model)
227
+
228
  return loss_val