fix: torch.compile mode='default' (reduce-overhead crashes on CPU with glibc heap corruption)
Browse files- chimera_turbo.py +20 -23
chimera_turbo.py
CHANGED
|
@@ -4,18 +4,17 @@ Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
|
|
| 4 |
|
| 5 |
Paradigmes intégrés:
|
| 6 |
P-TURBO-1: STE + AdamW (remplace MeZO → fix convergence + 50x moins de forwards)
|
| 7 |
-
P-TURBO-2: torch.compile
|
| 8 |
P-TURBO-3: Threading optimal + tcmalloc detection
|
| 9 |
P-TURBO-4: IPEX bf16/AMX si disponible
|
| 10 |
P-TURBO-5: Invalidate BitLinear packed caches after optimizer step
|
| 11 |
P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
|
| 12 |
|
| 13 |
-
|
| 14 |
-
- torch.compile
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
- grad_accum_steps fix from v2 preserved.
|
| 19 |
"""
|
| 20 |
|
| 21 |
import math
|
|
@@ -186,16 +185,18 @@ def try_ipex_optimize(
|
|
| 186 |
# P-TURBO-2 : torch.compile
|
| 187 |
# ───────────────────────────────────────────────────────────
|
| 188 |
|
| 189 |
-
def try_compile_model(model: nn.Module, mode: str = "
|
| 190 |
"""
|
| 191 |
Compile model with torch.compile for kernel fusion.
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
| 199 |
"""
|
| 200 |
if not hasattr(torch, "compile"):
|
| 201 |
warnings.warn("torch.compile not available (PyTorch < 2.0)")
|
|
@@ -206,10 +207,10 @@ def try_compile_model(model: nn.Module, mode: str = "reduce-overhead") -> nn.Mod
|
|
| 206 |
model,
|
| 207 |
backend="inductor",
|
| 208 |
mode=mode,
|
| 209 |
-
fullgraph=False, # safety net for
|
| 210 |
)
|
| 211 |
print(f"[TURBO-2] torch.compile enabled (backend=inductor, mode={mode})")
|
| 212 |
-
print(f" First few steps will be slow (compilation). Then 1.5-
|
| 213 |
return compiled
|
| 214 |
except Exception as e:
|
| 215 |
warnings.warn(f"torch.compile failed: {e}. Running in eager mode.")
|
|
@@ -253,7 +254,7 @@ def apply(
|
|
| 253 |
lr: float = 1e-3,
|
| 254 |
weight_decay: float = 0.05,
|
| 255 |
warmup_steps: int = 500,
|
| 256 |
-
use_compile: bool = True,
|
| 257 |
use_ipex: bool = True,
|
| 258 |
use_lion: bool = False,
|
| 259 |
verbose: bool = True,
|
|
@@ -267,7 +268,7 @@ def apply(
|
|
| 267 |
|
| 268 |
if verbose:
|
| 269 |
print("=" * 65)
|
| 270 |
-
print("CHIMERA TURBO
|
| 271 |
print("=" * 65)
|
| 272 |
print(f" Physical cores: {cpu_info['physical_cores']}")
|
| 273 |
print(f" CPU capability: {cpu_info['capability']}")
|
|
@@ -295,7 +296,7 @@ def apply(
|
|
| 295 |
|
| 296 |
# ── torch.compile ──
|
| 297 |
if use_compile:
|
| 298 |
-
model = try_compile_model(model)
|
| 299 |
|
| 300 |
# ββ Warnings ββ
|
| 301 |
if verbose:
|
|
@@ -393,11 +394,7 @@ def profile_model(model: nn.Module, dummy_input: torch.Tensor, steps: int = 5):
|
|
| 393 |
|
| 394 |
|
| 395 |
def count_compile_graph_breaks(model: nn.Module, dummy_input: torch.Tensor):
|
| 396 |
-
"""Count how many graph breaks torch.compile would produce.
|
| 397 |
-
|
| 398 |
-
After the STE detach() migration, BitLinear should produce ZERO breaks.
|
| 399 |
-
Remaining breaks come from non-BitLinear modules (evolution, grammar, etc.)
|
| 400 |
-
"""
|
| 401 |
try:
|
| 402 |
import torch._dynamo as dynamo
|
| 403 |
explanation = dynamo.explain(model)(dummy_input)
|
|
|
|
| 4 |
|
| 5 |
Paradigmes intégrés:
|
| 6 |
P-TURBO-1: STE + AdamW (remplace MeZO → fix convergence + 50x moins de forwards)
|
| 7 |
+
P-TURBO-2: torch.compile mode=default (CPU-safe, no CUDA graph pool)
|
| 8 |
P-TURBO-3: Threading optimal + tcmalloc detection
|
| 9 |
P-TURBO-4: IPEX bf16/AMX si disponible
|
| 10 |
P-TURBO-5: Invalidate BitLinear packed caches after optimizer step
|
| 11 |
P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
|
| 12 |
|
| 13 |
+
v4 changes:
|
| 14 |
+
- torch.compile mode changed from 'reduce-overhead' to 'default'.
|
| 15 |
+
reduce-overhead uses CUDA graph capture + memory pool which corrupts
|
| 16 |
+
glibc heap on CPU ('corrupted double-linked list' abort).
|
| 17 |
+
mode='default' is the stable choice for CPU with graph breaks.
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
import math
|
|
|
|
| 185 |
# P-TURBO-2 : torch.compile
|
| 186 |
# ───────────────────────────────────────────────────────────
|
| 187 |
|
| 188 |
+
def try_compile_model(model: nn.Module, mode: str = "default") -> nn.Module:
|
| 189 |
"""
|
| 190 |
Compile model with torch.compile for kernel fusion.
|
| 191 |
|
| 192 |
+
Uses mode='default' for CPU stability. Do NOT use 'reduce-overhead'
|
| 193 |
+
on CPU — it uses CUDA graph capture internals that corrupt the glibc
|
| 194 |
+
heap allocator ('corrupted double-linked list' crash).
|
| 195 |
|
| 196 |
+
mode='default': safe, fuses kernels via Inductor, ~1.3-2x speedup.
|
| 197 |
+
mode='max-autotune': slower compile, better code, ~1.5-2.5x speedup.
|
| 198 |
+
|
| 199 |
+
Expected: first ~10 steps slow (compilation), then steady speedup.
|
| 200 |
"""
|
| 201 |
if not hasattr(torch, "compile"):
|
| 202 |
warnings.warn("torch.compile not available (PyTorch < 2.0)")
|
|
|
|
| 207 |
model,
|
| 208 |
backend="inductor",
|
| 209 |
mode=mode,
|
| 210 |
+
fullgraph=False, # safety net for evolution.py graph breaks
|
| 211 |
)
|
| 212 |
print(f"[TURBO-2] torch.compile enabled (backend=inductor, mode={mode})")
|
| 213 |
+
print(f" First few steps will be slow (compilation). Then ~1.5-2x speedup.")
|
| 214 |
return compiled
|
| 215 |
except Exception as e:
|
| 216 |
warnings.warn(f"torch.compile failed: {e}. Running in eager mode.")
|
|
|
|
| 254 |
lr: float = 1e-3,
|
| 255 |
weight_decay: float = 0.05,
|
| 256 |
warmup_steps: int = 500,
|
| 257 |
+
use_compile: bool = True,
|
| 258 |
use_ipex: bool = True,
|
| 259 |
use_lion: bool = False,
|
| 260 |
verbose: bool = True,
|
|
|
|
| 268 |
|
| 269 |
if verbose:
|
| 270 |
print("=" * 65)
|
| 271 |
+
print("CHIMERA TURBO v4 — CPU Acceleration Layer")
|
| 272 |
print("=" * 65)
|
| 273 |
print(f" Physical cores: {cpu_info['physical_cores']}")
|
| 274 |
print(f" CPU capability: {cpu_info['capability']}")
|
|
|
|
| 296 |
|
| 297 |
# ── torch.compile ──
|
| 298 |
if use_compile:
|
| 299 |
+
model = try_compile_model(model, mode="default")
|
| 300 |
|
| 301 |
# ββ Warnings ββ
|
| 302 |
if verbose:
|
|
|
|
| 394 |
|
| 395 |
|
| 396 |
def count_compile_graph_breaks(model: nn.Module, dummy_input: torch.Tensor):
|
| 397 |
+
"""Count how many graph breaks torch.compile would produce."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
try:
|
| 399 |
import torch._dynamo as dynamo
|
| 400 |
explanation = dynamo.explain(model)(dummy_input)
|