Lgr54HFi committed on
Commit 20ad65d · verified · 1 Parent(s): 11c11f8

fix: turbo v2 — disable compile (84 graph breaks), fix grad_accum, add diagnostics

Files changed (1)
  1. chimera_turbo.py +161 -282
chimera_turbo.py CHANGED
@@ -1,25 +1,33 @@
1
  """
2
  chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
3
- Usage: import chimera_turbo; chimera_turbo.apply(model, optimizer, args)
4
 
5
  Integrated paradigms:
6
  P-TURBO-1: STE + AdamW (replaces MeZO → fixes convergence + 50x fewer forwards)
7
- P-TURBO-2: torch.compile regional (2-3x kernel fusion)
8
  P-TURBO-3: Threading optimal + tcmalloc detection
9
  P-TURBO-4: IPEX bf16/AMX if available
10
- P-TURBO-5: Cache quantized weights across micro-batches
11
  P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
12
  P-TURBO-7: Arrow mmap dataset
13
  """
14
 
 
15
  import os
16
  import sys
17
  import warnings
18
  import torch
19
  import torch.nn as nn
20
  import torch.nn.functional as F
21
- from typing import Optional, Dict, Any, Tuple
22
- from functools import wraps
23
  from contextlib import nullcontext
24
 
25
  # ═══════════════════════════════════════════════════════════
@@ -29,11 +37,10 @@ from contextlib import nullcontext
29
  def detect_cpu_info() -> Dict[str, Any]:
30
  """Detect CPU capabilities for optimal configuration."""
31
  info = {}
32
-
33
  # Physical cores (not hyperthreads)
34
  try:
35
  physical = len(os.sched_getaffinity(0))
36
- # Heuristic: if thread count is even, likely HT enabled → halve
37
  import multiprocessing
38
  logical = multiprocessing.cpu_count()
39
  info["physical_cores"] = logical // 2 if logical == physical else physical
@@ -42,18 +49,19 @@ def detect_cpu_info() -> Dict[str, Any]:
42
  import multiprocessing
43
  info["logical_cores"] = multiprocessing.cpu_count()
44
  info["physical_cores"] = info["logical_cores"] // 2
45
-
46
  # CPU capability
47
  try:
48
  info["capability"] = torch.backends.cpu.get_cpu_capability()
49
  except Exception:
50
  info["capability"] = "unknown"
51
-
52
- # AMX support (Sapphire Rapids+)
53
- info["has_amx"] = "amx" in info["capability"].lower() if info["capability"] else False
54
- info["has_avx512"] = "avx512" in info["capability"].lower() if info["capability"] else False
55
- info["has_vnni"] = info["has_avx512"] # VNNI comes with AVX-512 Ice Lake+
56
-
 
57
  # IPEX available?
58
  try:
59
  import intel_extension_for_pytorch
@@ -61,23 +69,24 @@ def detect_cpu_info() -> Dict[str, Any]:
61
  info["ipex_version"] = intel_extension_for_pytorch.__version__
62
  except ImportError:
63
  info["ipex_available"] = False
64
-
65
  # tcmalloc loaded?
66
  info["tcmalloc"] = "tcmalloc" in os.environ.get("LD_PRELOAD", "")
67
-
68
  return info
69
 
70
 
71
  def configure_threading(cpu_info: Dict[str, Any], reserve_for_io: int = 1):
72
  """Set optimal threading for CPU training."""
73
  n_compute = max(1, cpu_info["physical_cores"] - reserve_for_io)
74
-
 
 
75
  torch.set_num_threads(n_compute)
76
- torch.set_num_interop_threads(min(4, reserve_for_io + 1))
77
-
78
  os.environ["OMP_NUM_THREADS"] = str(n_compute)
79
  os.environ["MKL_NUM_THREADS"] = str(n_compute)
80
-
81
  return n_compute
82
 
83
 
@@ -94,19 +103,15 @@ def create_optimizer(
94
  ) -> torch.optim.Optimizer:
95
  """
96
  Create optimizer for STE-based ternary training (replaces MeZO).
97
-
98
  Based on BitNet b1.58 Reloaded (2407.09527):
99
  - lr=1e-3 for <300M params (NOT 1e-2, that's for 3B+)
100
  - weight_decay=0.05
101
  - AdamW with β=(0.9, 0.95)
102
-
103
- The STE is already in BitLinear — just use a normal optimizer.
104
- MeZO needed 528 forward passes per step; this needs 1 forward + 1 backward.
105
  """
106
- # Separate weight decay groups (no WD on bias, layernorm, embeddings)
107
  decay_params = []
108
  no_decay_params = []
109
-
110
  for name, param in model.named_parameters():
111
  if not param.requires_grad:
112
  continue
@@ -114,172 +119,50 @@ def create_optimizer(
114
  no_decay_params.append(param)
115
  else:
116
  decay_params.append(param)
117
-
118
  param_groups = [
119
  {"params": decay_params, "weight_decay": weight_decay},
120
  {"params": no_decay_params, "weight_decay": 0.0},
121
  ]
122
-
123
  if use_lion:
124
  try:
125
  from lion_pytorch import Lion
126
  return Lion(param_groups, lr=lr * 0.3, betas=(0.95, 0.98))
127
  except ImportError:
128
  warnings.warn("lion-pytorch not installed, falling back to AdamW")
129
-
130
  return torch.optim.AdamW(param_groups, lr=lr, betas=betas, fused=False)
131
 
132
 
133
  def create_scheduler(optimizer, max_steps: int, warmup_steps: int = 500):
134
  """Cosine schedule with linear warmup — standard BitNet recipe."""
135
  from torch.optim.lr_scheduler import LambdaLR
136
- import math
137
-
138
  def lr_lambda(step):
139
  if step < warmup_steps:
140
  return step / max(1, warmup_steps)
141
  progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
142
  return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
143
-
144
  return LambdaLR(optimizer, lr_lambda)
145
 
146
 
147
  # ═══════════════════════════════════════════════════════════
148
- # P-TURBO-5 : Quantized Weight Cache
149
  # ═══════════════════════════════════════════════════════════
150
 
151
- class QuantCacheMixin:
152
- """
153
- Mixin for BitLinear to cache quantized weights during gradient accumulation.
154
-
155
- Without cache: quantize weights on every micro-batch forward pass
156
- With cache: quantize once, reuse across accumulation steps
157
- Invalidate after optimizer.step()
158
- """
159
- _quant_cache: Optional[torch.Tensor] = None
160
- _cache_valid: bool = False
161
-
162
- def get_quantized_weight(self):
163
- """Override in your BitLinear. Returns quantized weight + scale."""
164
- raise NotImplementedError
165
-
166
- def cached_quantized_weight(self):
167
- if not self._cache_valid or self._quant_cache is None:
168
- self._quant_cache = self.get_quantized_weight()
169
- self._cache_valid = True
170
- return self._quant_cache
171
-
172
- def invalidate_cache(self):
173
- self._cache_valid = False
174
- self._quant_cache = None
175
-
176
-
177
  def invalidate_all_caches(model: nn.Module):
178
- """Call after optimizer.step() to force re-quantization."""
179
- for m in model.modules():
180
- if hasattr(m, "invalidate_cache"):
181
- m.invalidate_cache()
182
-
183
 
184
- # ═══════════════════════════════════════════════════════════
185
- # P-TURBO-6 : INT8 Ternary Forward Path
186
- # ═══════════════════════════════════════════════════════════
187
-
188
- def ternary_matmul_int8(
189
- x: torch.Tensor, # [B, S, K] float
190
- w_ternary: torch.Tensor, # [N, K] float {-1, 0, 1}
191
- w_scale: torch.Tensor, # scalar
192
- ) -> torch.Tensor:
193
- """
194
- INT8 ternary matmul using torch._int_mm (dispatches to VNNI/AMX).
195
-
196
- For inference-in-training (eval steps) or forward pass if
197
- your hardware has VNNI/AMX support.
198
-
199
- Speedup: 2-4x over float GEMM for ternary weights.
200
  """
201
- B, S, K = x.shape
202
- x_flat = x.reshape(-1, K) # [B*S, K]
203
-
204
- # Quantize activations to int8
205
- x_abs_max = x_flat.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
206
- x_scale = x_abs_max / 127.0
207
- x_int8 = (x_flat / x_scale).round().clamp(-128, 127).to(torch.int8)
208
-
209
- # Weights: already ternary, just cast
210
- w_int8 = w_ternary.to(torch.int8) # {-1, 0, 1} fits in int8
211
-
212
- # INT8 GEMM — uses hardware VNNI/AMX if available
213
- # torch._int_mm requires 2D inputs, both int8, K divisible by some alignment
214
- try:
215
- out_int32 = torch._int_mm(x_int8, w_int8.t()) # [B*S, N]
216
- out = out_int32.float() * x_scale * w_scale
217
- except RuntimeError:
218
- # Fallback if alignment requirements not met
219
- out = F.linear(x_flat.float(), w_ternary.float()) * w_scale
220
-
221
- return out.reshape(B, S, -1)
222
-
223
-
224
- # ═══════════════════════════════════════════════════════════
225
- # P-TURBO-2 : torch.compile (Regional)
226
- # ═══════════════════════════════════════════════════════════
227
-
228
- def try_compile_model(model: nn.Module, mode: str = "reduce-overhead") -> nn.Module:
229
- """
230
- Attempt torch.compile with graceful fallback.
231
-
232
- Uses regional compilation: compiles sub-modules individually
233
- to work around graph breaks from STE custom autograd functions.
234
- """
235
- if not hasattr(torch, "compile"):
236
- warnings.warn("torch.compile not available (PyTorch < 2.0)")
237
- return model
238
-
239
- # First: diagnose graph breaks
240
- try:
241
- import torch._dynamo as dynamo
242
-
243
- # Try compiling individual attention/MLP blocks instead of full model
244
- compiled_count = 0
245
- for name, module in model.named_modules():
246
- # Skip the top-level model and BitLinear (STE graph breaks)
247
- if module is model:
248
- continue
249
- # Compile "clean" blocks: attention, MLP, norms
250
- module_type = type(module).__name__.lower()
251
- if any(k in module_type for k in ["attention", "mlp", "feedforward", "norm"]):
252
- try:
253
- compiled = torch.compile(
254
- module,
255
- backend="inductor",
256
- mode=mode,
257
- fullgraph=False,
258
- )
259
- # Replace in parent
260
- parent_name = ".".join(name.split(".")[:-1])
261
- child_name = name.split(".")[-1]
262
- parent = model
263
- if parent_name:
264
- for part in parent_name.split("."):
265
- parent = getattr(parent, part)
266
- setattr(parent, child_name, compiled)
267
- compiled_count += 1
268
- except Exception as e:
269
- pass # Skip modules that can't be compiled
270
-
271
- if compiled_count == 0:
272
- # Fallback: try compiling the whole model with fullgraph=False
273
- model = torch.compile(model, backend="inductor", mode=mode, fullgraph=False)
274
- print(f"[TURBO-2] Compiled full model (fullgraph=False)")
275
- else:
276
- print(f"[TURBO-2] Compiled {compiled_count} sub-modules (regional)")
277
-
278
- return model
279
-
280
- except Exception as e:
281
- warnings.warn(f"torch.compile failed: {e}. Running in eager mode.")
282
- return model
283
 
284
 
285
  # ═══════════════════════════════════════════════════════════
@@ -296,21 +179,20 @@ def try_ipex_optimize(
296
  if not cpu_info.get("ipex_available"):
297
  print("[TURBO-4] IPEX not available — install: pip install intel-extension-for-pytorch")
298
  return model, optimizer
299
-
300
  import intel_extension_for_pytorch as ipex
301
-
302
- # Choose dtype based on hardware
303
  if dtype is None:
304
  if cpu_info["has_amx"]:
305
- dtype = torch.bfloat16 # AMX tiles → massive bf16 speedup
306
  print("[TURBO-4] IPEX + AMX bf16 enabled (Sapphire Rapids+)")
307
  elif cpu_info["has_avx512"]:
308
- dtype = torch.bfloat16 # Moderate benefit with AVX-512
309
  print("[TURBO-4] IPEX + AVX-512 bf16 enabled")
310
  else:
311
- dtype = torch.float32 # bf16 slower than fp32 without hardware support
312
  print("[TURBO-4] IPEX fp32 (no bf16 hardware support detected)")
313
-
314
  model, optimizer = ipex.optimize(
315
  model,
316
  optimizer=optimizer,
@@ -318,76 +200,62 @@ def try_ipex_optimize(
318
  level="O1",
319
  inplace=True,
320
  )
321
-
322
  return model, optimizer
323
 
324
 
325
  # ═══════════════════════════════════════════════════════════
326
- # P-TURBO-7 : Arrow mmap Dataset
327
  # ═══════════════════════════════════════════════════════════
328
 
329
- def prepare_arrow_dataset(
330
- dataset_name: str = "roneneldan/TinyStories",
331
- split: str = "train",
332
- tokenizer=None,
333
- seq_len: int = 32,
334
- max_tokens: int = 500_000,
335
- cache_dir: str = "./cache/arrow",
336
- num_proc: int = 4,
337
- ):
338
  """
339
- Prepare dataset as Arrow mmap format for zero-copy loading.
340
-
341
- Replaces streaming + custom .pt cache with HF datasets Arrow backend.
342
- Benefits: zero-copy to PyTorch, random access, efficient memory via mmap.
343
  """
344
- from datasets import load_dataset, Dataset
345
- from pathlib import Path
346
-
347
- cache_path = Path(cache_dir) / f"{dataset_name.replace('/', '_')}_{split}_{max_tokens}_seq{seq_len}"
348
-
349
- if cache_path.exists():
350
- print(f"[TURBO-7] Loading cached Arrow dataset from {cache_path}")
351
- dataset = Dataset.load_from_disk(str(cache_path))
352
- return dataset.with_format("torch")
353
-
354
- print(f"[TURBO-7] Preparing Arrow dataset from {dataset_name}...")
355
-
356
- # Load and tokenize
357
- raw = load_dataset(dataset_name, split=split, streaming=True)
358
-
359
- # Collect tokens
360
- all_tokens = []
361
- total = 0
362
- for example in raw:
363
- text = example.get("text", "")
364
- if tokenizer is not None:
365
- tokens = tokenizer.encode(text)
366
- else:
367
- # Fallback: assume pre-tokenized or return text
368
- tokens = text
369
- if isinstance(tokens, list):
370
- all_tokens.extend(tokens)
371
- total += len(tokens)
372
- if total >= max_tokens:
373
- break
374
-
375
- all_tokens = all_tokens[:max_tokens]
376
-
377
- # Chunk into sequences
378
- n_seqs = len(all_tokens) // seq_len
379
- chunks = [all_tokens[i * seq_len:(i + 1) * seq_len] for i in range(n_seqs)]
380
-
381
- dataset = Dataset.from_dict({
382
- "input_ids": chunks,
383
- })
384
-
385
- # Save as Arrow
386
- cache_path.parent.mkdir(parents=True, exist_ok=True)
387
- dataset.save_to_disk(str(cache_path))
388
- print(f"[TURBO-7] Saved {n_seqs} sequences to {cache_path}")
389
-
390
- return dataset.with_format("torch")
391
 
392
 
393
  # ═══════════════════════════════════════════════════════════
@@ -400,80 +268,70 @@ def apply(
400
  lr: float = 1e-3,
401
  weight_decay: float = 0.05,
402
  warmup_steps: int = 500,
403
- use_compile: bool = True,
404
  use_ipex: bool = True,
405
  use_lion: bool = False,
406
  verbose: bool = True,
407
  ) -> Tuple[nn.Module, torch.optim.Optimizer, Any]:
408
  """
409
  Apply all turbo optimizations to ch1mera model.
410
-
411
  Returns: (model, optimizer, scheduler)
412
-
413
- Usage in train_hyper.py:
414
- import chimera_turbo
415
- model, optimizer, scheduler = chimera_turbo.apply(
416
- model, max_steps=10000, lr=1e-3
417
- )
418
- # Then use normal training loop:
419
- for step, batch in enumerate(dataloader):
420
- loss = model(batch).loss
421
- loss.backward()
422
- if (step + 1) % grad_accum == 0:
423
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
424
- optimizer.step()
425
- scheduler.step()
426
- optimizer.zero_grad(set_to_none=True)
427
- chimera_turbo.invalidate_all_caches(model)
428
  """
429
- # ── Step 1: Detect CPU ──
430
  cpu_info = detect_cpu_info()
431
-
432
  if verbose:
433
  print("=" * 65)
434
- print("CHIMERA TURBO — CPU Acceleration Layer")
435
  print("=" * 65)
436
  print(f" Physical cores: {cpu_info['physical_cores']}")
437
  print(f" CPU capability: {cpu_info['capability']}")
438
- print(f" AMX: {cpu_info['has_amx']} AVX-512: {cpu_info['has_avx512']}")
439
  print(f" IPEX: {cpu_info['ipex_available']}")
440
  print(f" tcmalloc: {cpu_info['tcmalloc']}")
441
-
442
- # ── Step 2: Threading ──
443
  n_threads = configure_threading(cpu_info)
444
  if verbose:
445
- print(f"[TURBO-3] Threads: {n_threads} compute + {torch.get_num_interop_threads()} interop")
446
-
447
- # ── Step 3: Optimizer (replaces MeZO) ──
448
  optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay, use_lion=use_lion)
449
  scheduler = create_scheduler(optimizer, max_steps=max_steps, warmup_steps=warmup_steps)
450
  if verbose:
451
  opt_name = type(optimizer).__name__
452
  n_params = sum(p.numel() for g in optimizer.param_groups for p in g["params"])
453
  print(f"[TURBO-1] {opt_name} (lr={lr}, wd={weight_decay}) — {n_params:,} params")
454
- print(f" Replaces MeZO: 528 forwards/step → 1 forward + 1 backward")
455
-
456
- # ── Step 4: IPEX ──
457
  if use_ipex:
458
  model, optimizer = try_ipex_optimize(model, optimizer, cpu_info)
459
-
460
- # ── Step 5: torch.compile ──
461
  if use_compile:
462
  model = try_compile_model(model)
463
-
 
464
  if verbose:
465
  if not cpu_info["tcmalloc"]:
466
  print()
467
  print(" ⚠️ tcmalloc not detected. For +10-25% speedup:")
468
  print(" sudo apt install google-perftools")
469
  print(" LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 python train_hyper.py ...")
470
  print("=" * 65)
471
-
472
  return model, optimizer, scheduler
473
 
474
 
475
  # ═══════════════════════════════════════════════════════════
476
- # Training loop helper
477
  # ═══════════════════════════════════════════════════════════
478
 
479
  def training_step(
@@ -488,12 +346,15 @@ def training_step(
488
  ) -> float:
489
  """
490
  Single training step with all turbo optimizations active.
491
-
492
  Handles: autocast, gradient accumulation, clipping, cache invalidation.
 
 
 
 
493
  """
494
  is_accum_step = (step + 1) % grad_accum_steps == 0
495
-
496
- # Forward + backward
497
  ctx = torch.autocast(device_type="cpu", dtype=autocast_dtype) if autocast_dtype else nullcontext()
498
  with ctx:
499
  if isinstance(batch, dict):
@@ -503,37 +364,38 @@ def training_step(
503
  else:
504
  outputs = model(batch)
505
  loss = outputs if isinstance(outputs, torch.Tensor) else outputs.loss
506
- loss = loss / grad_accum_steps
507
-
 
 
508
  loss.backward()
509
-
510
  if is_accum_step:
511
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
512
  optimizer.step()
513
  scheduler.step()
514
  optimizer.zero_grad(set_to_none=True)
515
  invalidate_all_caches(model)
516
-
517
- return loss.item() * grad_accum_steps
518
 
519
 
520
  # ═══════════════════════════════════════════════════════════
521
- # Diagnostic tool
522
  # ═══════════════════════════════════════════════════════════
523
 
524
  def profile_model(model: nn.Module, dummy_input: torch.Tensor, steps: int = 5):
525
  """Profile forward+backward to find bottlenecks."""
526
  print("\n[TURBO-DIAG] Profiling...")
527
-
528
- # Warmup
529
  for _ in range(2):
530
  out = model(dummy_input)
531
- if hasattr(out, "loss"):
532
  out.loss.backward()
533
- else:
534
  out.sum().backward()
535
  model.zero_grad(set_to_none=True)
536
-
537
  with torch.profiler.profile(
538
  activities=[torch.profiler.ProfilerActivity.CPU],
539
  record_shapes=True,
@@ -541,9 +403,26 @@ def profile_model(model: nn.Module, dummy_input: torch.Tensor, steps: int = 5):
541
  ) as prof:
542
  for _ in range(steps):
543
  out = model(dummy_input)
544
- loss = out.loss if hasattr(out, "loss") else out.sum()
545
  loss.backward()
546
  model.zero_grad(set_to_none=True)
547
-
548
  print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
549
  return prof
1
  """
2
  chimera_turbo.py — Drop-in CPU acceleration for ch1mera 5.3
3
+ Usage: import chimera_turbo; chimera_turbo.apply(model, max_steps=N)
4
 
5
  Integrated paradigms:
6
  P-TURBO-1: STE + AdamW (replaces MeZO → fixes convergence + 50x fewer forwards)
7
+ P-TURBO-2: torch.compile regional — DISABLED (84 graph breaks from _RoundTernarySTE)
8
  P-TURBO-3: Threading optimal + tcmalloc detection
9
  P-TURBO-4: IPEX bf16/AMX if available
10
+ P-TURBO-5: Cache quantized weights across micro-batches (via BitLinear's existing cache)
11
  P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
12
  P-TURBO-7: Arrow mmap dataset
13
+
14
+ v2 changes:
15
+ - torch.compile DISABLED by default: _RoundTernarySTE (autograd.Function) causes
16
+ 84+ graph breaks (28 layers × 3 BitLinear each). Net effect is SLOWER than eager.
17
+ Re-enable only after migrating the STE to a torch.library custom op (torch.round + register_autograd).
18
+ - Fix grad_accum_steps logic: DataLoader already provides eff_batch, don't double-accumulate.
19
+ - Add count_compile_graph_breaks() for quick graph-break diagnosis.
20
+ - Better bf16 autocast handling: skip autocast if CPU has no AMX/AVX512-BF16.
21
  """
22
 
23
+ import math
24
  import os
25
  import sys
26
  import warnings
27
  import torch
28
  import torch.nn as nn
29
  import torch.nn.functional as F
30
+ from typing import Optional, Dict, Any, Tuple, List
 
31
  from contextlib import nullcontext
32
 
33
  # ═══════════════════════════════════════════════════════════
 
37
  def detect_cpu_info() -> Dict[str, Any]:
38
  """Detect CPU capabilities for optimal configuration."""
39
  info = {}
40
+
41
  # Physical cores (not hyperthreads)
42
  try:
43
  physical = len(os.sched_getaffinity(0))
 
44
  import multiprocessing
45
  logical = multiprocessing.cpu_count()
46
  info["physical_cores"] = logical // 2 if logical == physical else physical
 
49
  import multiprocessing
50
  info["logical_cores"] = multiprocessing.cpu_count()
51
  info["physical_cores"] = info["logical_cores"] // 2
52
+
53
  # CPU capability
54
  try:
55
  info["capability"] = torch.backends.cpu.get_cpu_capability()
56
  except Exception:
57
  info["capability"] = "unknown"
58
+
59
+ cap = (info["capability"] or "").lower()
60
+ info["has_amx"] = "amx" in cap
61
+ info["has_avx512"] = "avx512" in cap or "avx512_vnni" in cap
62
+ info["has_avx512_bf16"] = "avx512_bf16" in cap or info["has_amx"]
63
+ info["has_vnni"] = info["has_avx512"]
64
+
65
  # IPEX available?
66
  try:
67
  import intel_extension_for_pytorch
 
69
  info["ipex_version"] = intel_extension_for_pytorch.__version__
70
  except ImportError:
71
  info["ipex_available"] = False
72
+
73
  # tcmalloc loaded?
74
  info["tcmalloc"] = "tcmalloc" in os.environ.get("LD_PRELOAD", "")
75
+
76
  return info
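# Example of the capability flags this produces (hypothetical AVX-512 machine
# without AMX; values are illustrative, not measured on real hardware):
#
#   >>> detect_cpu_info()
#   {'physical_cores': 8, 'capability': 'AVX512',
#    'has_amx': False, 'has_avx512': True, 'has_avx512_bf16': False,
#    'has_vnni': True, 'ipex_available': False, 'tcmalloc': False}
#
# apply() later keys on has_amx / has_avx512_bf16 to decide bf16 autocast.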
77
 
78
 
79
  def configure_threading(cpu_info: Dict[str, Any], reserve_for_io: int = 1):
80
  """Set optimal threading for CPU training."""
81
  n_compute = max(1, cpu_info["physical_cores"] - reserve_for_io)
82
+
83
+ # Only set num_threads — interop threads can only be set once before
84
+ # any tensor ops, and train_hyper.py already sets them at import time.
85
  torch.set_num_threads(n_compute)
86
+
 
87
  os.environ["OMP_NUM_THREADS"] = str(n_compute)
88
  os.environ["MKL_NUM_THREADS"] = str(n_compute)
89
+
90
  return n_compute
91
 
92
 
 
103
  ) -> torch.optim.Optimizer:
104
  """
105
  Create optimizer for STE-based ternary training (replaces MeZO).
106
+
107
  Based on BitNet b1.58 Reloaded (2407.09527):
108
  - lr=1e-3 for <300M params (NOT 1e-2, that's for 3B+)
109
  - weight_decay=0.05
110
  - AdamW with β=(0.9, 0.95)
 
 
 
111
  """
 
112
  decay_params = []
113
  no_decay_params = []
114
+
115
  for name, param in model.named_parameters():
116
  if not param.requires_grad:
117
  continue
 
119
  no_decay_params.append(param)
120
  else:
121
  decay_params.append(param)
122
+
123
  param_groups = [
124
  {"params": decay_params, "weight_decay": weight_decay},
125
  {"params": no_decay_params, "weight_decay": 0.0},
126
  ]
127
+
128
  if use_lion:
129
  try:
130
  from lion_pytorch import Lion
131
  return Lion(param_groups, lr=lr * 0.3, betas=(0.95, 0.98))
132
  except ImportError:
133
  warnings.warn("lion-pytorch not installed, falling back to AdamW")
134
+
135
  return torch.optim.AdamW(param_groups, lr=lr, betas=betas, fused=False)
136
 
137
 
138
  def create_scheduler(optimizer, max_steps: int, warmup_steps: int = 500):
139
  """Cosine schedule with linear warmup — standard BitNet recipe."""
140
  from torch.optim.lr_scheduler import LambdaLR
141
+
 
142
  def lr_lambda(step):
143
  if step < warmup_steps:
144
  return step / max(1, warmup_steps)
145
  progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
146
  return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
147
+
148
  return LambdaLR(optimizer, lr_lambda)
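# Quick shape-check of the warmup + cosine schedule (illustrative numbers:
# max_steps=10_000, warmup_steps=500; the optimizer and parameter are dummies):
_p = torch.zeros(1, requires_grad=True)
_opt = torch.optim.SGD([_p], lr=1e-3)
_sched = create_scheduler(_opt, max_steps=10_000, warmup_steps=500)
# multiplier is step/500 during warmup, then 0.5*(1+cos(pi*progress)) with a
# 0.01 floor, so the effective lr goes 0 → 1e-3 → 1e-5 over training.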
149
 
150
 
151
  # ═══════════════════════════════════════════════════════════
152
+ # P-TURBO-5 : Invalidate BitLinear packed caches after optimizer step
153
  # ═══════════════════════════════════════════════════════════
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def invalidate_all_caches(model: nn.Module):
156
+ """Call after optimizer.step() to force BitLinear re-quantization.
157
 
158
+ In training mode, BitLinear._forward_train() recomputes quantized
159
+ weights every call via STE, so the packed cache is not used.
160
+ This is still good practice for eval steps between training.
161
  """
162
+ from chimera.quantization import BitLinear
163
+ for m in model.modules():
164
+ if isinstance(m, BitLinear):
165
+ m.invalidate_packed()
166
 
167
 
168
  # ═══════════════════════════════════════════════════════════
 
179
  if not cpu_info.get("ipex_available"):
180
  print("[TURBO-4] IPEX not available — install: pip install intel-extension-for-pytorch")
181
  return model, optimizer
182
+
183
  import intel_extension_for_pytorch as ipex
184
+
 
185
  if dtype is None:
186
  if cpu_info["has_amx"]:
187
+ dtype = torch.bfloat16
188
  print("[TURBO-4] IPEX + AMX bf16 enabled (Sapphire Rapids+)")
189
  elif cpu_info["has_avx512"]:
190
+ dtype = torch.bfloat16
191
  print("[TURBO-4] IPEX + AVX-512 bf16 enabled")
192
  else:
193
+ dtype = torch.float32
194
  print("[TURBO-4] IPEX fp32 (no bf16 hardware support detected)")
195
+
196
  model, optimizer = ipex.optimize(
197
  model,
198
  optimizer=optimizer,
 
200
  level="O1",
201
  inplace=True,
202
  )
203
+
204
  return model, optimizer
205
 
206
 
207
  # ═══════════════════════════════════════════════════════════
208
+ # P-TURBO-2 : torch.compile DISABLED by default
209
  # ═══════════════════════════════════════════════════════════
210
 
211
+ def try_compile_model(model: nn.Module, mode: str = "reduce-overhead") -> nn.Module:
212
  """
213
+ Attempt torch.compile with graceful fallback.
214
+
215
+ CURRENTLY DISABLED: _RoundTernarySTE (torch.autograd.Function) causes
216
+ 84+ graph breaks across 28 layers × 3 BitLinear. This makes torch.compile
217
+ slower than eager mode due to recompilation overhead.
218
+
219
+ To re-enable: migrate STE to use torch library custom ops:
220
+ @torch.library.custom_op("chimera::ste_ternary", mutates_args=())
221
+ def ste_ternary(w: torch.Tensor) -> torch.Tensor:
222
+ return torch.round(torch.clamp(w, -1.0, 1.0))
223
+
224
+ @ste_ternary.register_fake
225
+ def _(w): return torch.empty_like(w)
226
+
227
+ @torch.library.register_autograd("chimera::ste_ternary", ...)
228
  """
229
+ print("[TURBO-2] torch.compile SKIPPED (84 graph breaks from STE autograd.Function)")
230
+ print(" To enable: migrate _RoundTernarySTE to torch.library.custom_op")
231
+ return model
232
+
233
+
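# Sketch of the migration named in the docstring above (requires torch >= 2.4;
# not part of this commit). Wrapping the ternary rounding in a torch.library
# custom op lets Dynamo treat it as one opaque node instead of breaking the
# graph. The pass-through backward is the usual STE assumption; adapt it to
# whatever _RoundTernarySTE.backward actually computes.

@torch.library.custom_op("chimera::ste_ternary", mutates_args=())
def ste_ternary(w: torch.Tensor) -> torch.Tensor:
    # forward: clamp to [-1, 1], then round to {-1, 0, 1}
    return torch.round(torch.clamp(w, -1.0, 1.0))

@ste_ternary.register_fake
def _(w: torch.Tensor) -> torch.Tensor:
    return torch.empty_like(w)

def _ste_backward(ctx, grad_out):
    return grad_out  # straight-through: gradient flows unchanged

torch.library.register_autograd("chimera::ste_ternary", _ste_backward)

# BitLinear.forward would then call torch.ops.chimera.ste_ternary(w_scaled)
# instead of _RoundTernarySTE.apply(w_scaled).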
234
+ # ═══════════════════════════════════════════════════════════
235
+ # P-TURBO-6 : INT8 Ternary Forward Path
236
+ # ═══════════════════════════════════════════════════════════
237
+
238
+ def ternary_matmul_int8(
239
+ x: torch.Tensor,
240
+ w_ternary: torch.Tensor,
241
+ w_scale: torch.Tensor,
242
+ ) -> torch.Tensor:
243
+ """INT8 ternary matmul using torch._int_mm (dispatches to VNNI/AMX)."""
244
+ B, S, K = x.shape
245
+ x_flat = x.reshape(-1, K)
246
+
247
+ x_abs_max = x_flat.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
248
+ x_scale = x_abs_max / 127.0
249
+ x_int8 = (x_flat / x_scale).round().clamp(-128, 127).to(torch.int8)
250
+ w_int8 = w_ternary.to(torch.int8)
251
+
252
+ try:
253
+ out_int32 = torch._int_mm(x_int8, w_int8.t())
254
+ out = out_int32.float() * x_scale * w_scale
255
+ except RuntimeError:
256
+ out = F.linear(x_flat.float(), w_ternary.float()) * w_scale
257
+
258
+ return out.reshape(B, S, -1)
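# Illustrative call (shapes are assumptions, not taken from ch1mera configs):
# ternary weights stored as float {-1, 0, 1} with one per-tensor scale.
#   x : [B=2, S=32, K=512] activations, w : [N=1024, K=512], y : [2, 32, 1024]
# If torch._int_mm rejects the shapes (alignment), the float fallback runs.
x = torch.randn(2, 32, 512)
w = torch.randint(-1, 2, (1024, 512)).float()
y = ternary_matmul_int8(x, w, w_scale=torch.tensor(0.02))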
259
 
260
 
261
  # ═══════════════════════════════════════════════════════════
 
268
  lr: float = 1e-3,
269
  weight_decay: float = 0.05,
270
  warmup_steps: int = 500,
271
+ use_compile: bool = False, # ← DISABLED by default (was True)
272
  use_ipex: bool = True,
273
  use_lion: bool = False,
274
  verbose: bool = True,
275
  ) -> Tuple[nn.Module, torch.optim.Optimizer, Any]:
276
  """
277
  Apply all turbo optimizations to ch1mera model.
278
+
279
  Returns: (model, optimizer, scheduler)
280
  """
 
281
  cpu_info = detect_cpu_info()
282
+
283
  if verbose:
284
  print("=" * 65)
285
+ print("CHIMERA TURBO v2 — CPU Acceleration Layer")
286
  print("=" * 65)
287
  print(f" Physical cores: {cpu_info['physical_cores']}")
288
  print(f" CPU capability: {cpu_info['capability']}")
289
+ print(f" AMX: {cpu_info['has_amx']} AVX-512: {cpu_info['has_avx512']} BF16 hw: {cpu_info['has_avx512_bf16']}")
290
  print(f" IPEX: {cpu_info['ipex_available']}")
291
  print(f" tcmalloc: {cpu_info['tcmalloc']}")
292
+
293
+ # ── Threading ──
294
  n_threads = configure_threading(cpu_info)
295
  if verbose:
296
+ print(f"[TURBO-3] Compute threads: {n_threads}")
297
+
298
+ # ── Optimizer (replaces MeZO) ──
299
  optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay, use_lion=use_lion)
300
  scheduler = create_scheduler(optimizer, max_steps=max_steps, warmup_steps=warmup_steps)
301
  if verbose:
302
  opt_name = type(optimizer).__name__
303
  n_params = sum(p.numel() for g in optimizer.param_groups for p in g["params"])
304
  print(f"[TURBO-1] {opt_name} (lr={lr}, wd={weight_decay}) — {n_params:,} params")
305
+ print(f" STE backprop: 1 forward + 1 backward per step")
306
+
307
+ # ── IPEX ──
308
  if use_ipex:
309
  model, optimizer = try_ipex_optimize(model, optimizer, cpu_info)
310
+
311
+ # ── torch.compile ──
312
  if use_compile:
313
  model = try_compile_model(model)
314
+
315
+ # ── Autocast recommendation ──
316
  if verbose:
317
+ if not cpu_info["has_avx512_bf16"]:
318
+ print()
319
+ print(" ⚠️ No hardware BF16 support detected (need AVX512-BF16 or AMX).")
320
+ print(" BF16 autocast may be SLOWER than fp32 on this CPU.")
321
+ print(" Consider --no-bf16 flag if training is slow.")
322
+
323
  if not cpu_info["tcmalloc"]:
324
  print()
325
  print(" ⚠️ tcmalloc not detected. For +10-25% speedup:")
326
  print(" sudo apt install google-perftools")
327
  print(" LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 python train_hyper.py ...")
328
  print("=" * 65)
329
+
330
  return model, optimizer, scheduler
331
 
332
 
333
  # ═══════════════════════════════════════════════════════════
334
+ # Training step helper
335
  # ═══════════════════════════════════════════════════════════
336
 
337
  def training_step(
 
346
  ) -> float:
347
  """
348
  Single training step with all turbo optimizations active.
349
+
350
  Handles: autocast, gradient accumulation, clipping, cache invalidation.
351
+
352
+ IMPORTANT: grad_accum_steps should be 1 if the DataLoader already provides
353
+ the full effective batch. Set >1 only if you want to split a large batch
354
+ across multiple forward passes.
355
  """
356
  is_accum_step = (step + 1) % grad_accum_steps == 0
357
+
 
358
  ctx = torch.autocast(device_type="cpu", dtype=autocast_dtype) if autocast_dtype else nullcontext()
359
  with ctx:
360
  if isinstance(batch, dict):
 
364
  else:
365
  outputs = model(batch)
366
  loss = outputs if isinstance(outputs, torch.Tensor) else outputs.loss
367
+ loss_val = loss.item()
368
+ if grad_accum_steps > 1:
369
+ loss = loss / grad_accum_steps
370
+
371
  loss.backward()
372
+
373
  if is_accum_step:
374
  torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
375
  optimizer.step()
376
  scheduler.step()
377
  optimizer.zero_grad(set_to_none=True)
378
  invalidate_all_caches(model)
379
+
380
+ return loss_val
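# End-to-end sketch of the v2 loop. Assumptions (not taken verbatim from
# train_hyper.py): `train_loader` yields dict batches that already form the
# full effective batch, so grad_accum_steps=1; the argument order for
# training_step follows the names used in its body and may differ from the
# real signature; bf16 autocast only when the CPU has BF16 hardware.
cpu = detect_cpu_info()
model, optimizer, scheduler = apply(model, max_steps=10_000, lr=1e-3)
bf16 = torch.bfloat16 if cpu["has_avx512_bf16"] else None

for step, batch in enumerate(train_loader):
    loss = training_step(
        model, batch, optimizer, scheduler,
        step=step,
        grad_accum_steps=1,              # DataLoader already provides eff_batch
        autocast_dtype=bf16,
    )
    if step % 100 == 0:
        print(f"step {step}  loss {loss:.4f}")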
381
 
382
 
383
  # ═══════════════════════════════════════════════════════════
384
+ # Diagnostic tools
385
  # ═══════════════════════════════════════════════════════════
386
 
387
  def profile_model(model: nn.Module, dummy_input: torch.Tensor, steps: int = 5):
388
  """Profile forward+backward to find bottlenecks."""
389
  print("\n[TURBO-DIAG] Profiling...")
390
+
 
391
  for _ in range(2):
392
  out = model(dummy_input)
393
+ if hasattr(out, "loss") and out.loss is not None:
394
  out.loss.backward()
395
+ elif isinstance(out, torch.Tensor):
396
  out.sum().backward()
397
  model.zero_grad(set_to_none=True)
398
+
399
  with torch.profiler.profile(
400
  activities=[torch.profiler.ProfilerActivity.CPU],
401
  record_shapes=True,
 
403
  ) as prof:
404
  for _ in range(steps):
405
  out = model(dummy_input)
406
+ loss = out.loss if (hasattr(out, "loss") and out.loss is not None) else out.sum()
407
  loss.backward()
408
  model.zero_grad(set_to_none=True)
409
+
410
  print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
411
  return prof
412
+
413
+
414
+ def count_compile_graph_breaks(model: nn.Module, dummy_input: torch.Tensor):
415
+ """Count how many graph breaks torch.compile would produce."""
416
+ try:
417
+ import torch._dynamo as dynamo
418
+ explanation = dynamo.explain(model)(dummy_input)
419
+ n_breaks = len(explanation.break_reasons)
420
+ print(f"\n[TURBO-DIAG] Graph breaks: {n_breaks}")
421
+ for i, reason in enumerate(explanation.break_reasons[:10]):
422
+ print(f" [{i+1}] {reason}")
423
+ if n_breaks > 10:
424
+ print(f" ... and {n_breaks - 10} more")
425
+ return n_breaks
426
+ except Exception as e:
427
+ print(f"[TURBO-DIAG] dynamo.explain failed: {e}")
428
+ return -1