Lgr54HFi committed on
Commit bb2d3d5 · verified
1 Parent(s): fc678ef

fix: torch.compile mode='default' (reduce-overhead crashes on CPU with glibc heap corruption)

Files changed (1):
  chimera_turbo.py  +20 -23

chimera_turbo.py CHANGED
@@ -4,18 +4,17 @@ Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
 
 Integrated paradigms:
   P-TURBO-1: STE + AdamW (replaces MeZO → fixes convergence + 50x fewer forwards)
-  P-TURBO-2: torch.compile (now possible: STE uses detach() trick, zero graph breaks)
+  P-TURBO-2: torch.compile mode=default (CPU-safe, no CUDA graph pool)
   P-TURBO-3: Optimal threading + tcmalloc detection
   P-TURBO-4: IPEX bf16/AMX if available
   P-TURBO-5: Invalidate BitLinear packed caches after optimizer step
   P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
 
-v3 changes (after quantization.py STE migration):
-- torch.compile RE-ENABLED: _RoundTernarySTE replaced with detach() trick
-  in quantization.py: zero graph breaks, Inductor can fuse quantize+linear.
-- Compile uses fullgraph=False as safety net for any remaining breaks
-  in non-BitLinear modules (evolution engine, loop controller, etc.)
-- grad_accum_steps fix from v2 preserved.
+v4 changes:
+- torch.compile mode changed from 'reduce-overhead' to 'default'.
+  reduce-overhead uses CUDA graph capture + a memory pool, which corrupts
+  the glibc heap on CPU ('corrupted double-linked list' abort).
+  mode='default' is the stable choice for CPU with graph breaks.
 """
 
 import math
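
Note: the detach() trick referenced in the removed v3 notes is the standard straight-through-estimator (STE) formulation. A minimal sketch, assuming a BitNet-style absmean ternary quantizer; the function name and scaling are illustrative, not taken from quantization.py:

    import torch

    def ternary_ste(w: torch.Tensor) -> torch.Tensor:
        # Forward pass: ternarize weights to {-1, 0, +1} * scale.
        scale = w.abs().mean().clamp(min=1e-8)
        w_q = torch.round(w / scale).clamp(-1, 1) * scale
        # detach() trick: the output equals w_q in the forward pass, but the
        # gradient flows through the identity term w, so no custom autograd
        # Function is needed and Dynamo traces it without graph breaks.
        return w + (w_q - w).detach()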
@@ -186,16 +185,18 @@ def try_ipex_optimize(
 # P-TURBO-2 : torch.compile
 # ═══════════════════════════════════════════════════════════
 
-def try_compile_model(model: nn.Module, mode: str = "reduce-overhead") -> nn.Module:
+def try_compile_model(model: nn.Module, mode: str = "default") -> nn.Module:
     """
     Compile model with torch.compile for kernel fusion.
 
-    Now possible because STE uses the detach() trick (zero graph breaks
-    in BitLinear). Uses fullgraph=False as a safety net for any remaining
-    breaks in non-BitLinear modules (evolution engine, grammar FST, etc.).
+    Uses mode='default' for CPU stability. Do NOT use 'reduce-overhead'
+    on CPU: it uses CUDA graph capture internals that corrupt the glibc
+    heap allocator ('corrupted double-linked list' crash).
 
-    Expected speedup: 1.5-3x from fusing quantize + linear + activation.
-    First call is slow (compilation); subsequent calls are fast.
+    mode='default': safe, fuses kernels via Inductor, ~1.3-2x speedup.
+    mode='max-autotune': slower compile, better code, ~1.5-2.5x speedup.
+
+    Expected: first ~10 steps slow (compilation), then steady speedup.
     """
     if not hasattr(torch, "compile"):
         warnings.warn("torch.compile not available (PyTorch < 2.0)")
@@ -206,10 +207,10 @@ def try_compile_model(model: nn.Module, mode: str = "reduce-overhead") -> nn.Mod
             model,
             backend="inductor",
             mode=mode,
-            fullgraph=False,  # safety net for non-BitLinear graph breaks
+            fullgraph=False,  # safety net for evolution.py graph breaks
         )
         print(f"[TURBO-2] torch.compile enabled (backend=inductor, mode={mode})")
-        print(f"          First few steps will be slow (compilation). Then 1.5-3x speedup.")
+        print(f"          First few steps will be slow (compilation). Then ~1.5-2x speedup.")
         return compiled
     except Exception as e:
         warnings.warn(f"torch.compile failed: {e}. Running in eager mode.")
@@ -253,7 +254,7 @@ def apply(
     lr: float = 1e-3,
     weight_decay: float = 0.05,
     warmup_steps: int = 500,
-    use_compile: bool = True,  # ← RE-ENABLED (STE detach trick = zero graph breaks)
+    use_compile: bool = True,
     use_ipex: bool = True,
     use_lion: bool = False,
     verbose: bool = True,
@@ -267,7 +268,7 @@
 
     if verbose:
         print("=" * 65)
-        print("CHIMERA TURBO v3 - CPU Acceleration Layer")
+        print("CHIMERA TURBO v4 - CPU Acceleration Layer")
         print("=" * 65)
         print(f"  Physical cores: {cpu_info['physical_cores']}")
         print(f"  CPU capability: {cpu_info['capability']}")
@@ -295,7 +296,7 @@
 
     # ── torch.compile ──
     if use_compile:
-        model = try_compile_model(model)
+        model = try_compile_model(model, mode="default")
 
     # ── Warnings ──
     if verbose:
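
Per the Usage line in the module docstring, wiring this into a training script looks roughly like the sketch below; the max_steps value is illustrative, and whether apply() also returns the model is not visible in these hunks:

    import chimera_turbo

    # model: any nn.Module (this repo's BitLinear-based model).
    chimera_turbo.apply(model, max_steps=1000, use_compile=True, use_ipex=True)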
@@ -393,11 +394,7 @@ def profile_model(model: nn.Module, dummy_input: torch.Tensor, steps: int = 5):
 
 
 def count_compile_graph_breaks(model: nn.Module, dummy_input: torch.Tensor):
-    """Count how many graph breaks torch.compile would produce.
-
-    After the STE detach() migration, BitLinear should produce ZERO breaks.
-    Remaining breaks come from non-BitLinear modules (evolution, grammar, etc.)
-    """
+    """Count how many graph breaks torch.compile would produce."""
    try:
        import torch._dynamo as dynamo
        explanation = dynamo.explain(model)(dummy_input)
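
The ExplainOutput that dynamo.explain returns carries the break count and per-break reasons; a usage sketch against the torch._dynamo API (PyTorch >= 2.1 attribute names, not repo code), reusing this function's model and dummy_input parameters:

    import torch._dynamo as dynamo

    explanation = dynamo.explain(model)(dummy_input)
    print(f"graph breaks: {explanation.graph_break_count}")
    for reason in explanation.break_reasons:  # one entry per break site
        print(reason)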
 