Lgr54HFi committed on
Commit a97a233 · verified · 1 Parent(s): ec200d2

fix: lower max_grad_norm 1.0→0.5 to prevent NaN with ternary STE training

Files changed (1): chimera_turbo.py (+24 -230)
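
Context for the fix: with ternary weights trained through a straight-through estimator (STE), the backward pass treats the quantizer as identity, so saturated weights can keep accumulating gradient — one plausible mechanism for the NaN spikes the commit message cites under bf16 autocast. A minimal sketch of a clamp-aware ternary STE of the kind the v6 note mentions (illustrative only; the class name is hypothetical, and the project's real implementation lives in chimera/quantization.py, which this diff does not show):

    import torch

    class ClampAwareTernarySTE(torch.autograd.Function):
        # Forward: quantize to {-1, 0, +1} * scale (BitNet-style per-tensor scale).
        # Backward: straight-through, but zeroed where the quantizer saturated;
        # a plain STE passes gradients through unconditionally.
        @staticmethod
        def forward(ctx, w: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
            ctx.save_for_backward(w, scale)
            return (w / scale).round().clamp(-1, 1) * scale

        @staticmethod
        def backward(ctx, grad_out: torch.Tensor):
            w, scale = ctx.saved_tensors
            inside = (w.abs() <= scale).to(grad_out.dtype)  # the clamp-aware gate
            return grad_out * inside, None  # no gradient w.r.t. scale

    # Example call, with a BitNet-style scale:
    # w_q = ClampAwareTernarySTE.apply(w, w.abs().mean().clamp(min=1e-8))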
chimera_turbo.py CHANGED
@@ -10,10 +10,7 @@ Paradigmes intégrés:
 P-TURBO-5: Invalidate BitLinear packed caches after optimizer step
 P-TURBO-6: INT8 ternary forward path (VNNI/AMX dispatch)
 
-v5 changes:
-- Fix IPEX version mismatch crash: IPEX for PyTorch 2.8 installed with
-  PyTorch 2.11 calls os.exit(127) which doesn't exist → AttributeError.
-  Now catches Exception (not just ImportError) on IPEX import.
+v6: lower max_grad_norm 1.0→0.5, clamp-aware STE in quantization.py
 """
 
 import math
@@ -25,14 +22,9 @@ import torch.nn.functional as F
 from typing import Optional, Dict, Any, Tuple
 from contextlib import nullcontext
 
-# ═══════════════════════════════════════════════════════════
-# P-TURBO-3 : Threading + Environment
-# ═══════════════════════════════════════════════════════════
 
 def detect_cpu_info() -> Dict[str, Any]:
-    """Detect CPU capabilities for optimal configuration."""
     info = {}
-
     try:
         physical = len(os.sched_getaffinity(0))
         import multiprocessing
@@ -43,35 +35,26 @@ def detect_cpu_info() -> Dict[str, Any]:
         import multiprocessing
         info["logical_cores"] = multiprocessing.cpu_count()
         info["physical_cores"] = info["logical_cores"] // 2
-
     try:
         info["capability"] = torch.backends.cpu.get_cpu_capability()
     except Exception:
         info["capability"] = "unknown"
-
     cap = (info["capability"] or "").lower()
     info["has_amx"] = "amx" in cap
     info["has_avx512"] = "avx512" in cap or "avx512_vnni" in cap
     info["has_avx512_bf16"] = "avx512_bf16" in cap or info["has_amx"]
     info["has_vnni"] = info["has_avx512"]
-
-    # IPEX import can crash in many ways: ImportError (not installed),
-    # SystemExit (version mismatch), AttributeError (buggy os.exit in IPEX),
-    # RuntimeError, etc. Catch broadly.
     try:
         import intel_extension_for_pytorch
         info["ipex_available"] = True
         info["ipex_version"] = intel_extension_for_pytorch.__version__
     except Exception:
         info["ipex_available"] = False
-
     info["tcmalloc"] = "tcmalloc" in os.environ.get("LD_PRELOAD", "")
-
     return info
 
 
 def configure_threading(cpu_info: Dict[str, Any], reserve_for_io: int = 1):
-    """Set optimal threading for CPU training."""
     n_compute = max(1, cpu_info["physical_cores"] - reserve_for_io)
     torch.set_num_threads(n_compute)
     os.environ["OMP_NUM_THREADS"] = str(n_compute)
@@ -79,28 +62,11 @@ def configure_threading(cpu_info: Dict[str, Any], reserve_for_io: int = 1):
     return n_compute
 
 
-# ═══════════════════════════════════════════════════════════
-# P-TURBO-1 : STE + AdamW (remplace MeZO)
-# ═══════════════════════════════════════════════════════════
-
 def create_optimizer(
-    model: nn.Module,
-    lr: float = 1e-3,
-    weight_decay: float = 0.05,
-    use_lion: bool = False,
-    betas: Tuple[float, float] = (0.9, 0.95),
+    model: nn.Module, lr: float = 1e-3, weight_decay: float = 0.05,
+    use_lion: bool = False, betas: Tuple[float, float] = (0.9, 0.95),
 ) -> torch.optim.Optimizer:
-    """
-    Create optimizer for STE-based ternary training (replaces MeZO).
-
-    Based on BitNet b1.58 Reloaded (2407.09527):
-    - lr=1e-3 for <300M params
-    - weight_decay=0.05
-    - AdamW with β=(0.9, 0.95)
-    """
-    decay_params = []
-    no_decay_params = []
-
+    decay_params, no_decay_params = [], []
     for name, param in model.named_parameters():
         if not param.requires_grad:
             continue
@@ -108,237 +74,118 @@ def create_optimizer(
             no_decay_params.append(param)
         else:
             decay_params.append(param)
-
     param_groups = [
         {"params": decay_params, "weight_decay": weight_decay},
         {"params": no_decay_params, "weight_decay": 0.0},
     ]
-
     if use_lion:
         try:
             from lion_pytorch import Lion
             return Lion(param_groups, lr=lr * 0.3, betas=(0.95, 0.98))
         except ImportError:
             warnings.warn("lion-pytorch not installed, falling back to AdamW")
-
     return torch.optim.AdamW(param_groups, lr=lr, betas=betas, fused=False)
 
 
 def create_scheduler(optimizer, max_steps: int, warmup_steps: int = 500):
-    """Cosine schedule with linear warmup — standard BitNet recipe."""
     from torch.optim.lr_scheduler import LambdaLR
-
     def lr_lambda(step):
         if step < warmup_steps:
             return step / max(1, warmup_steps)
         progress = (step - warmup_steps) / max(1, max_steps - warmup_steps)
         return max(0.01, 0.5 * (1.0 + math.cos(math.pi * progress)))
-
     return LambdaLR(optimizer, lr_lambda)
 
 
-# ═══════════════════════════════════════════════════════════
-# P-TURBO-5 : Invalidate BitLinear packed caches
-# ═══════════════════════════════════════════════════════════
-
 def invalidate_all_caches(model: nn.Module):
-    """Call after optimizer.step() to force BitLinear re-quantization."""
     from chimera.quantization import BitLinear
     for m in model.modules():
         if isinstance(m, BitLinear):
             m.invalidate_packed()
 
 
-# ═══════════════════════════════════════════════════════════
-# P-TURBO-4 : IPEX Integration
-# ═══════════════════════════════════════════════════════════
-
-def try_ipex_optimize(
-    model: nn.Module,
-    optimizer: torch.optim.Optimizer,
-    cpu_info: Dict[str, Any],
-    dtype: Optional[torch.dtype] = None,
-) -> Tuple[nn.Module, torch.optim.Optimizer]:
-    """Apply IPEX optimization if available and beneficial."""
+def try_ipex_optimize(model, optimizer, cpu_info, dtype=None):
     if not cpu_info.get("ipex_available"):
         print("[TURBO-4] IPEX not available — skipping")
         return model, optimizer
-
     try:
         import intel_extension_for_pytorch as ipex
     except Exception:
         print("[TURBO-4] IPEX import failed — skipping")
         return model, optimizer
-
     if dtype is None:
         if cpu_info["has_amx"]:
             dtype = torch.bfloat16
-            print("[TURBO-4] IPEX + AMX bf16 enabled (Sapphire Rapids+)")
+            print("[TURBO-4] IPEX + AMX bf16 enabled")
         elif cpu_info["has_avx512"]:
             dtype = torch.bfloat16
             print("[TURBO-4] IPEX + AVX-512 bf16 enabled")
         else:
             dtype = torch.float32
-            print("[TURBO-4] IPEX fp32 (no bf16 hardware support detected)")
-
-    model, optimizer = ipex.optimize(
-        model, optimizer=optimizer, dtype=dtype, level="O1", inplace=True,
-    )
+            print("[TURBO-4] IPEX fp32")
+    model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=dtype, level="O1", inplace=True)
     return model, optimizer
 
 
-# ═══════════════════════════════════════════════════════════
-# P-TURBO-2 : torch.compile
-# ═══════════════════════════════════════════════════════════
-
 def try_compile_model(model: nn.Module, mode: str = "default") -> nn.Module:
-    """
-    Compile model with torch.compile for kernel fusion.
-
-    Uses mode='default' for CPU stability. Do NOT use 'reduce-overhead'
-    on CPU — it corrupts the glibc heap allocator.
-
-    Expected: first ~10 steps slow (compilation), then ~1.5-2x speedup.
-    """
     if not hasattr(torch, "compile"):
-        warnings.warn("torch.compile not available (PyTorch < 2.0)")
         return model
-
     try:
-        compiled = torch.compile(
-            model,
-            backend="inductor",
-            mode=mode,
-            fullgraph=False,
-        )
-        print(f"[TURBO-2] torch.compile enabled (backend=inductor, mode={mode})")
-        print(f"   First few steps will be slow (compilation). Then ~1.5-2x speedup.")
+        compiled = torch.compile(model, backend="inductor", mode=mode, fullgraph=False)
+        print(f"[TURBO-2] torch.compile enabled (mode={mode})")
        return compiled
     except Exception as e:
-        warnings.warn(f"torch.compile failed: {e}. Running in eager mode.")
+        warnings.warn(f"torch.compile failed: {e}. Eager mode.")
         return model
 
 
-# ═══════════════════════════════════════════════════════════
-# P-TURBO-6 : INT8 Ternary Forward Path
-# ═══════════════════════════════════════════════════════════
-
-def ternary_matmul_int8(
-    x: torch.Tensor,
-    w_ternary: torch.Tensor,
-    w_scale: torch.Tensor,
-) -> torch.Tensor:
-    """INT8 ternary matmul using torch._int_mm (dispatches to VNNI/AMX)."""
-    B, S, K = x.shape
-    x_flat = x.reshape(-1, K)
-
-    x_abs_max = x_flat.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
-    x_scale = x_abs_max / 127.0
-    x_int8 = (x_flat / x_scale).round().clamp(-128, 127).to(torch.int8)
-    w_int8 = w_ternary.to(torch.int8)
-
-    try:
-        out_int32 = torch._int_mm(x_int8, w_int8.t())
-        out = out_int32.float() * x_scale * w_scale
-    except RuntimeError:
-        out = F.linear(x_flat.float(), w_ternary.float()) * w_scale
-
-    return out.reshape(B, S, -1)
-
-
-# ═══════════════════════════════════════════════════════════
-# MAIN: apply()
-# ═══════════════════════════════════════════════════════════
-
 def apply(
-    model: nn.Module,
-    max_steps: int = 10000,
-    lr: float = 1e-3,
-    weight_decay: float = 0.05,
-    warmup_steps: int = 500,
-    use_compile: bool = True,
-    use_ipex: bool = True,
-    use_lion: bool = False,
-    verbose: bool = True,
+    model: nn.Module, max_steps: int = 10000, lr: float = 1e-3,
+    weight_decay: float = 0.05, warmup_steps: int = 500,
+    use_compile: bool = True, use_ipex: bool = True,
+    use_lion: bool = False, verbose: bool = True,
 ) -> Tuple[nn.Module, torch.optim.Optimizer, Any]:
-    """
-    Apply all turbo optimizations to ch1mera model.
-
-    Returns: (model, optimizer, scheduler)
-    """
     cpu_info = detect_cpu_info()
-
     if verbose:
         print("=" * 65)
-        print("CHIMERA TURBO v5 — CPU Acceleration Layer")
+        print("CHIMERA TURBO v6 — CPU Acceleration Layer")
         print("=" * 65)
-        print(f"   Physical cores: {cpu_info['physical_cores']}")
-        print(f"   CPU capability: {cpu_info['capability']}")
+        print(f"   Cores: {cpu_info['physical_cores']}   CPU: {cpu_info['capability']}")
         print(f"   AMX: {cpu_info['has_amx']}   AVX-512: {cpu_info['has_avx512']}   BF16 hw: {cpu_info['has_avx512_bf16']}")
-        print(f"   IPEX: {cpu_info['ipex_available']}")
-        print(f"   tcmalloc: {cpu_info['tcmalloc']}")
+        print(f"   IPEX: {cpu_info['ipex_available']}   tcmalloc: {cpu_info['tcmalloc']}")
 
-    # ── Threading ──
     n_threads = configure_threading(cpu_info)
     if verbose:
         print(f"[TURBO-3] Compute threads: {n_threads}")
 
-    # ── Optimizer (replaces MeZO) ──
     optimizer = create_optimizer(model, lr=lr, weight_decay=weight_decay, use_lion=use_lion)
     scheduler = create_scheduler(optimizer, max_steps=max_steps, warmup_steps=warmup_steps)
     if verbose:
-        opt_name = type(optimizer).__name__
         n_params = sum(p.numel() for g in optimizer.param_groups for p in g["params"])
-        print(f"[TURBO-1] {opt_name} (lr={lr}, wd={weight_decay}) — {n_params:,} params")
-        print(f"   STE backprop: 1 forward + 1 backward per step")
+        print(f"[TURBO-1] AdamW (lr={lr}, wd={weight_decay}) — {n_params:,} params")
 
-    # ── IPEX ──
     if use_ipex:
         model, optimizer = try_ipex_optimize(model, optimizer, cpu_info)
-
-    # ── torch.compile ──
     if use_compile:
         model = try_compile_model(model, mode="default")
 
-    # ── Warnings ──
     if verbose:
         if not cpu_info["has_avx512_bf16"]:
-            print()
-            print("   ⚠️  No hardware BF16 support detected (need AVX512-BF16 or AMX).")
-            print("   BF16 autocast may be SLOWER than fp32 on this CPU.")
-            print("   Consider --no-bf16 flag if training is slow.")
+            print("   ⚠️  No BF16 hw — use --no-bf16")
         if not cpu_info["tcmalloc"]:
-            print()
-            print("   ⚠️  tcmalloc not detected. For +10-25% speedup:")
-            print("      sudo apt install google-perftools")
-            print("      LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 python train_hyper.py ...")
+            print("   ⚠️  No tcmalloc — LD_PRELOAD=...libtcmalloc.so.4 for +15%")
         print("=" * 65)
 
     return model, optimizer, scheduler
 
 
-# ═══════════════════════════════════════════════════════════
-# Training step helper
-# ═══════════════════════════════════════════════════════════
-
 def training_step(
-    model: nn.Module,
-    batch,
-    optimizer: torch.optim.Optimizer,
-    scheduler,
-    grad_accum_steps: int = 1,
-    step: int = 0,
-    max_grad_norm: float = 1.0,
+    model: nn.Module, batch, optimizer: torch.optim.Optimizer, scheduler,
+    grad_accum_steps: int = 1, step: int = 0,
+    max_grad_norm: float = 0.5,  # ← lowered from 1.0 to prevent NaN
     autocast_dtype: Optional[torch.dtype] = torch.bfloat16,
 ) -> float:
-    """
-    Single training step with all turbo optimizations active.
-
-    IMPORTANT: grad_accum_steps should be 1 if the DataLoader already provides
-    the full effective batch. Set >1 only for memory-constrained scenarios.
-    """
     is_accum_step = (step + 1) % grad_accum_steps == 0
-
     ctx = torch.autocast(device_type="cpu", dtype=autocast_dtype) if autocast_dtype else nullcontext()
     with ctx:
         if isinstance(batch, dict):
@@ -351,64 +198,11 @@ def training_step(
         loss_val = loss.item()
         if grad_accum_steps > 1:
             loss = loss / grad_accum_steps
-
     loss.backward()
-
     if is_accum_step:
         torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
         optimizer.step()
         scheduler.step()
         optimizer.zero_grad(set_to_none=True)
         invalidate_all_caches(model)
-
     return loss_val
-
-
-# ═══════════════════════════════════════════════════════════
-# Diagnostic tools
-# ═══════════════════════════════════════════════════════════
-
-def profile_model(model: nn.Module, dummy_input: torch.Tensor, steps: int = 5):
-    """Profile forward+backward to find bottlenecks."""
-    print("\n[TURBO-DIAG] Profiling...")
-
-    for _ in range(2):
-        out = model(dummy_input)
-        if hasattr(out, "loss") and out.loss is not None:
-            out.loss.backward()
-        elif isinstance(out, torch.Tensor):
-            out.sum().backward()
-        model.zero_grad(set_to_none=True)
-
-    with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CPU],
-        record_shapes=True,
-        with_stack=True,
-    ) as prof:
-        for _ in range(steps):
-            out = model(dummy_input)
-            loss = out.loss if (hasattr(out, "loss") and out.loss is not None) else out.sum()
-            loss.backward()
-            model.zero_grad(set_to_none=True)
-
-    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
-    return prof
-
-
-def count_compile_graph_breaks(model: nn.Module, dummy_input: torch.Tensor):
-    """Count how many graph breaks torch.compile would produce."""
-    try:
-        import torch._dynamo as dynamo
-        explanation = dynamo.explain(model)(dummy_input)
-        n_breaks = len(explanation.break_reasons)
-        print(f"\n[TURBO-DIAG] Graph breaks: {n_breaks}")
-        for i, reason in enumerate(explanation.break_reasons[:10]):
-            print(f"   [{i+1}] {reason}")
-        if n_breaks > 10:
-            print(f"   ... and {n_breaks - 10} more")
-        if n_breaks == 0:
-            print("   ✅ Zero graph breaks — full model is compilable!")
-        return n_breaks
-    except Exception as e:
-        print(f"[TURBO-DIAG] dynamo.explain failed: {e}")
-        return -1
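
For orientation, a minimal usage sketch of the module as it stands after this commit — `model` and `dataloader` are stand-ins, and the import assumes chimera_turbo.py is importable as a top-level module:

    import chimera_turbo as turbo

    # apply() wires up threading, AdamW + cosine schedule, IPEX and torch.compile.
    model, optimizer, scheduler = turbo.apply(model, max_steps=10000, lr=1e-3)

    for step, batch in enumerate(dataloader):
        # training_step() now clips at max_grad_norm=0.5 by default and
        # invalidates BitLinear packed caches after each optimizer step.
        loss = turbo.training_step(model, batch, optimizer, scheduler, step=step)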
 
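The cosine schedule in create_scheduler is untouched by this commit; for reference, the multiplier lr_lambda produces at a few checkpoints (computed from the code above, with warmup_steps=500 and max_steps=10000):

    step 0      -> 0.00   (linear warmup from zero)
    step 250    -> 0.50
    step 500    -> 1.00   (warmup done; cosine decay begins)
    step 5250   -> 0.50   (cosine midpoint)
    step 10000  -> 0.01   (clamped by the max(0.01, ...) floor)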
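
If NaNs persist even at max_grad_norm=0.5, a diagnostic sketch (not part of this repo): clip_grad_norm_ returns the total norm computed *before* clipping, so it can double as a spike probe and a step-skip guard:

    import torch
    from torch import nn

    def clip_and_guard(model: nn.Module, max_norm: float = 0.5) -> float:
        # Pre-clip total gradient norm; log it to see how often clipping fires.
        total = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        if not torch.isfinite(total):
            # Backward already produced inf/NaN: drop these grads and skip the step.
            for p in model.parameters():
                p.grad = None
            return float("nan")
        return float(total)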