fix: torch.compile mode='default' (reduce-overhead crashes on CPU with glibc heap corruption)
Browse files- chimera_turbo.py +20 -23
chimera_turbo.py
CHANGED
|
@@ -4,18 +4,17 @@ Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
|
|
| 4 |
|
| 5 |
Paradigmes intégrés:
|
| 6 |
P-TURBO-1: STE + AdamW (remplace MeZO → fix convergence + 50x moins de forwards)
|
| 7 |
-
P-TURBO-2: torch.compile
|
| 8 |
P-TURBO-3: Threading optimal + tcmalloc detection
|
| 9 |
P-TURBO-4: IPEX bf16/AMX si disponible
|
| 10 |
P-TURBO-5: Invalidate BitLinear packed caches after optimizer step
|
| 11 |
P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
|
| 12 |
|
| 13 |
-
|
| 14 |
-
- torch.compile
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
- grad_accum_steps fix from v2 preserved.
|
| 19 |
"""
|
| 20 |
|
| 21 |
import math
|
|
@@ -186,16 +185,18 @@ def try_ipex_optimize(
|
|
| 186 |
# P-TURBO-2 : torch.compile
|
| 187 |
# ───────────────────────────────────────────────────────────
|
| 188 |
|
| 189 |
-
def try_compile_model(model: nn.Module, mode: str = "
|
| 190 |
"""
|
| 191 |
Compile model with torch.compile for kernel fusion.
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
| 199 |
"""
|
| 200 |
if not hasattr(torch, "compile"):
|
| 201 |
warnings.warn("torch.compile not available (PyTorch < 2.0)")
|
|
@@ -206,10 +207,10 @@ def try_compile_model(model: nn.Module, mode: str = "reduce-overhead") -> nn.Mod
|
|
| 206 |
model,
|
| 207 |
backend="inductor",
|
| 208 |
mode=mode,
|
| 209 |
-
fullgraph=False, # safety net for
|
| 210 |
)
|
| 211 |
print(f"[TURBO-2] torch.compile enabled (backend=inductor, mode={mode})")
|
| 212 |
-
print(f" First few steps will be slow (compilation). Then 1.5-
|
| 213 |
return compiled
|
| 214 |
except Exception as e:
|
| 215 |
warnings.warn(f"torch.compile failed: {e}. Running in eager mode.")
|
|
@@ -253,7 +254,7 @@ def apply(
|
|
| 253 |
lr: float = 1e-3,
|
| 254 |
weight_decay: float = 0.05,
|
| 255 |
warmup_steps: int = 500,
|
| 256 |
-
use_compile: bool = True,
|
| 257 |
use_ipex: bool = True,
|
| 258 |
use_lion: bool = False,
|
| 259 |
verbose: bool = True,
|
|
@@ -267,7 +268,7 @@ def apply(
|
|
| 267 |
|
| 268 |
if verbose:
|
| 269 |
print("=" * 65)
|
| 270 |
-
print("CHIMERA TURBO
|
| 271 |
print("=" * 65)
|
| 272 |
print(f" Physical cores: {cpu_info['physical_cores']}")
|
| 273 |
print(f" CPU capability: {cpu_info['capability']}")
|
|
@@ -295,7 +296,7 @@ def apply(
|
|
| 295 |
|
| 296 |
# ── torch.compile ──
|
| 297 |
if use_compile:
|
| 298 |
-
model = try_compile_model(model)
|
| 299 |
|
| 300 |
# ββ Warnings ββ
|
| 301 |
if verbose:
|
|
@@ -393,11 +394,7 @@ def profile_model(model: nn.Module, dummy_input: torch.Tensor, steps: int = 5):
|
|
| 393 |
|
| 394 |
|
| 395 |
def count_compile_graph_breaks(model: nn.Module, dummy_input: torch.Tensor):
|
| 396 |
-
"""Count how many graph breaks torch.compile would produce.
|
| 397 |
-
|
| 398 |
-
After the STE detach() migration, BitLinear should produce ZERO breaks.
|
| 399 |
-
Remaining breaks come from non-BitLinear modules (evolution, grammar, etc.)
|
| 400 |
-
"""
|
| 401 |
try:
|
| 402 |
import torch._dynamo as dynamo
|
| 403 |
explanation = dynamo.explain(model)(dummy_input)
|
|
|
|
| 4 |
|
| 5 |
Paradigmes intégrés:
|
| 6 |
P-TURBO-1: STE + AdamW (remplace MeZO → fix convergence + 50x moins de forwards)
|
| 7 |
+
P-TURBO-2: torch.compile mode=default (CPU-safe, no CUDA graph pool)
|
| 8 |
P-TURBO-3: Threading optimal + tcmalloc detection
|
| 9 |
P-TURBO-4: IPEX bf16/AMX si disponible
|
| 10 |
P-TURBO-5: Invalidate BitLinear packed caches after optimizer step
|
| 11 |
P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
|
| 12 |
|
| 13 |
+
v4 changes:
|
| 14 |
+
- torch.compile mode changed from 'reduce-overhead' to 'default'.
|
| 15 |
+
reduce-overhead uses CUDA graph capture + memory pool which corrupts
|
| 16 |
+
glibc heap on CPU ('corrupted double-linked list' abort).
|
| 17 |
+
mode='default' is the stable choice for CPU with graph breaks.
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
import math
|
|
|
|
| 185 |
# P-TURBO-2 : torch.compile
|
| 186 |
# ───────────────────────────────────────────────────────────
|
| 187 |
|
| 188 |
+
def try_compile_model(model: nn.Module, mode: str = "default") -> nn.Module:
|
| 189 |
"""
|
| 190 |
Compile model with torch.compile for kernel fusion.
|
| 191 |
|
| 192 |
+
Uses mode='default' for CPU stability. Do NOT use 'reduce-overhead'
|
| 193 |
+
on CPU — it uses CUDA graph capture internals that corrupt the glibc
|
| 194 |
+
heap allocator ('corrupted double-linked list' crash).
|
| 195 |
|
| 196 |
+
mode='default': safe, fuses kernels via Inductor, ~1.3-2x speedup.
|
| 197 |
+
mode='max-autotune': slower compile, better code, ~1.5-2.5x speedup.
|
| 198 |
+
|
| 199 |
+
Expected: first ~10 steps slow (compilation), then steady speedup.
|
| 200 |
"""
|
| 201 |
if not hasattr(torch, "compile"):
|
| 202 |
warnings.warn("torch.compile not available (PyTorch < 2.0)")
|
|
|
|
| 207 |
model,
|
| 208 |
backend="inductor",
|
| 209 |
mode=mode,
|
| 210 |
+
fullgraph=False, # safety net for evolution.py graph breaks
|
| 211 |
)
|
| 212 |
print(f"[TURBO-2] torch.compile enabled (backend=inductor, mode={mode})")
|
| 213 |
+
print(f" First few steps will be slow (compilation). Then ~1.5-2x speedup.")
|
| 214 |
return compiled
|
| 215 |
except Exception as e:
|
| 216 |
warnings.warn(f"torch.compile failed: {e}. Running in eager mode.")
|
|
|
|
| 254 |
lr: float = 1e-3,
|
| 255 |
weight_decay: float = 0.05,
|
| 256 |
warmup_steps: int = 500,
|
| 257 |
+
use_compile: bool = True,
|
| 258 |
use_ipex: bool = True,
|
| 259 |
use_lion: bool = False,
|
| 260 |
verbose: bool = True,
|
|
|
|
| 268 |
|
| 269 |
if verbose:
|
| 270 |
print("=" * 65)
|
| 271 |
+
print("CHIMERA TURBO v4 — CPU Acceleration Layer")
|
| 272 |
print("=" * 65)
|
| 273 |
print(f" Physical cores: {cpu_info['physical_cores']}")
|
| 274 |
print(f" CPU capability: {cpu_info['capability']}")
|
|
|
|
| 296 |
|
| 297 |
# ── torch.compile ──
|
| 298 |
if use_compile:
|
| 299 |
+
model = try_compile_model(model, mode="default")
|
| 300 |
|
| 301 |
# ββ Warnings ββ
|
| 302 |
if verbose:
|
|
|
|
| 394 |
|
| 395 |
|
| 396 |
def count_compile_graph_breaks(model: nn.Module, dummy_input: torch.Tensor):
|
| 397 |
+
"""Count how many graph breaks torch.compile would produce."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
try:
|
| 399 |
import torch._dynamo as dynamo
|
| 400 |
explanation = dynamo.explain(model)(dummy_input)
|