Commit ec200d2 (verified)
Lgr54HFi committed · 1 Parent(s): f1fa72a

fix: NaN at step 150 — add gradient clamping to STE detach trick + lower max_grad_norm to 0.5

The pure detach() STE passes gradients through unbounded, causing
gradient explosion around step 140-150 when loss is still high.

Fix: clamp the gradient contribution within the detach trick:

    w_q = clamp(w_scaled, -1, 1) + (round(clamped) - clamped).detach()

This ensures gradients are zero outside [-1, 1] (weights already at the
quantization boundary get no gradient push) while keeping the STE
identity pass-through inside the valid range.

Also reduces max_grad_norm from 1.0 to 0.5 for additional stability.

Ref: 4-bit CPU training paper (2603.13931) uses tanh soft clipping
for the same reason.
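A minimal standalone sketch of the behaviour change described above, mirroring the old and new ste_ternary bodies shown in the diff below (the test tensor is illustrative only, not taken from the training run):

import torch

def ste_ternary_old(w: torch.Tensor) -> torch.Tensor:
    # Pre-fix identity STE: backward gradient is 1 everywhere, even for |w| > 1.
    w_q = torch.round(torch.clamp(w, -1.0, 1.0))
    return w + (w_q - w).detach()

def ste_ternary_new(w: torch.Tensor) -> torch.Tensor:
    # Clamp-aware STE from this commit: gradient is 1 inside [-1, 1], 0 outside.
    clamped = torch.clamp(w, -1.0, 1.0)
    return clamped + (torch.round(clamped) - clamped).detach()

w_old = torch.tensor([-2.0, -0.4, 0.3, 1.7], requires_grad=True)
ste_ternary_old(w_old).sum().backward()
print(w_old.grad)  # tensor([1., 1., 1., 1.]): unbounded pass-through (pre-fix)

w_new = torch.tensor([-2.0, -0.4, 0.3, 1.7], requires_grad=True)
ste_ternary_new(w_new).sum().backward()
print(w_new.grad)  # tensor([0., 1., 1., 0.]): no push on out-of-range weights

Both variants produce the same forward output; only the backward pass changes, so the fix does not alter the quantized weights themselves.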

Files changed (1)
  1. chimera/quantization.py +35 -95
chimera/quantization.py CHANGED
@@ -37,9 +37,7 @@ import torch.nn.functional as F
 
 
 # ---------------------------------------------------------------------------
-# Lazy C++ kernel. We never compile it during ``import``; it is only built
-# when explicitly requested via :func:`enable_native_kernel` or the env var
-# ``CHIMERA_NATIVE=1``. All public APIs work with the pure-PyTorch path.
+# Lazy C++ kernel.
 # ---------------------------------------------------------------------------
 
 _NATIVE_LOCK = threading.Lock()
@@ -55,7 +53,6 @@ _CPP_SOURCE = r"""
 #include <omp.h>
 #endif
 
-// Encoding: -1->0b10, 0->0b00, +1->0b01
 static const float LUT[4] = {0.0f, 1.0f, -1.0f, 0.0f};
 
 torch::Tensor pack_ternary_cpu(torch::Tensor w) {
@@ -108,8 +105,6 @@ torch::Tensor unpack_ternary_cpu(torch::Tensor packed, int64_t K) {
     return out;
 }
 
-// Fused "unpack and scale" -> bf16/fp32 dense weight. Saves a pass over memory
-// and a temporary FP32 tensor when running under bf16 autocast.
 torch::Tensor dequantize_cpu(torch::Tensor packed, torch::Tensor alpha, int64_t K) {
     auto p = packed.contiguous();
     auto a = alpha.contiguous().to(torch::kFloat32);
@@ -144,7 +139,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 
 
 def _try_load_native() -> Optional[object]:
-    """Compile/load the optional native helper. Idempotent and thread-safe."""
     global _NATIVE_EXT, _NATIVE_TRIED
     if _NATIVE_TRIED:
         return _NATIVE_EXT
@@ -154,7 +148,6 @@ def _try_load_native() -> Optional[object]:
     _NATIVE_TRIED = True
     try:
         from torch.utils.cpp_extension import load_inline
-
         build_dir = os.path.join(
             os.path.dirname(os.path.abspath(__file__)), "..", ".ternary_build"
         )
@@ -167,17 +160,13 @@ def _try_load_native() -> Optional[object]:
             build_directory=build_dir,
             verbose=False,
         )
-    except Exception as exc:  # pragma: no cover - best-effort.
+    except Exception as exc:
         os.environ.setdefault("CHIMERA_NATIVE_DISABLED", str(exc)[:200])
         _NATIVE_EXT = None
     return _NATIVE_EXT
 
 
 def enable_native_kernel(force: bool = False) -> bool:
-    """Eagerly try to compile the native kernel.
-
-    Returns ``True`` if the kernel is loaded and available.
-    """
     global _NATIVE_TRIED
     if force:
         _NATIVE_TRIED = False
@@ -188,28 +177,20 @@ def native_kernel_available() -> bool:
     return _NATIVE_EXT is not None
 
 
-# Allow opt-in from the environment without code changes.
 if os.environ.get("CHIMERA_NATIVE", "0") == "1":
     enable_native_kernel()
 
 
 # ---------------------------------------------------------------------------
-# Pure PyTorch ternary primitives (always available).
+# Pure PyTorch ternary primitives.
 # ---------------------------------------------------------------------------
 
-# Lookup tables compiled once. Casting to a registered buffer is overkill –
-# they live on CPU and broadcast naturally.
 _TERNARY_LUT_F32 = torch.tensor([0.0, 1.0, -1.0, 0.0], dtype=torch.float32)
 _TERNARY_LUT_I8 = torch.tensor([0, 1, -1, 0], dtype=torch.int8)
 _SHIFTS = torch.tensor([6, 4, 2, 0], dtype=torch.uint8)
 
 
 def pack_ternary(q: torch.Tensor) -> torch.Tensor:
-    """Pack a ternary {-1,0,1} tensor into a 2-bit uint8 tensor.
-
-    Vectorised pure-PyTorch implementation — no Python loops over rows.
-    Trailing positions that don't divide by four are zero-padded.
-    """
     q = q.detach()
     if q.dim() == 1:
         q = q.unsqueeze(0)
@@ -219,7 +200,6 @@ def pack_ternary(q: torch.Tensor) -> torch.Tensor:
     pad = K4 * 4 - K
     if pad:
         flat = F.pad(flat, (0, pad))
-    # codes: 0 / 1 / 2 (uint8)
     codes = torch.where(flat == 1, torch.full_like(flat, 1),
                         torch.where(flat == -1, torch.full_like(flat, 2), torch.zeros_like(flat))).to(torch.uint8)
     codes = codes.view(M, K4, 4)
@@ -231,19 +211,13 @@ def pack_ternary(q: torch.Tensor) -> torch.Tensor:
 def unpack_ternary(packed: torch.Tensor, k: int,
                    alpha: Optional[torch.Tensor] = None,
                    dtype: torch.dtype = torch.float32) -> torch.Tensor:
-    """Vectorised inverse of :func:`pack_ternary`.
-
-    Returns ``out`` with last dim ``k``; optionally pre-multiplied by
-    ``alpha`` (per-row scale, broadcastable on the leading axes).
-    """
     packed = packed.to(torch.uint8)
     if packed.dim() == 1:
         packed = packed.unsqueeze(0)
     flat = packed.reshape(-1, packed.shape[-1])
     M, K4 = flat.shape
-    # Gather all 4 sub-positions in one vectorised op.
     shifts = _SHIFTS.to(packed.device)
-    codes = (flat.unsqueeze(-1) >> shifts).bitwise_and_(3).to(torch.long)  # [M, K4, 4]
+    codes = (flat.unsqueeze(-1) >> shifts).bitwise_and_(3).to(torch.long)
     lut = _TERNARY_LUT_F32.to(device=packed.device, dtype=dtype)
     out = lut[codes].reshape(M, K4 * 4)[:, :k]
     if alpha is not None:
@@ -252,33 +226,25 @@ def unpack_ternary(packed: torch.Tensor, k: int,
 
 
 def _absmean_alpha(weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
-    """Per-output-channel scale (``\\alpha = mean|w|`` clamped)."""
     return weight.detach().abs().mean(dim=-1, keepdim=False).clamp_min(eps).to(torch.float32)
 
 
 def ternarize_weight(weight: torch.Tensor, group_size: int = 128
                      ) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Quantise FP32 weights to ternary using BitNet's abs-mean rule.
-
-    ``group_size`` is kept for API compatibility but every row is its own
-    group in this slim implementation. Returns ``(w_ternary, alpha)``.
-    """
     alpha = _absmean_alpha(weight)
     w_q = torch.round(torch.clamp(weight / alpha.unsqueeze(-1), -1.0, 1.0)).to(torch.int8)
     return w_q, alpha
 
 
-_quantize_weights_ternary = ternarize_weight  # legacy alias used elsewhere
+_quantize_weights_ternary = ternarize_weight
 
 
 def apply_2_4_sparsity_(weight: torch.Tensor) -> torch.Tensor:
-    """In-place N:M 2:4 pruning. Vectorised — no Python row loops."""
     with torch.no_grad():
         last = weight.shape[-1]
         pad = (-last) % 4
         target = F.pad(weight, (0, pad)) if pad else weight
         view = target.view(*target.shape[:-1], -1, 4)
-        # Keep the two largest in absolute value, zero the rest.
         idx = view.abs().argsort(dim=-1)[..., :2]
         view.scatter_(-1, idx, 0.0)
         if pad:
@@ -290,29 +256,25 @@ def apply_2_4_sparsity_(weight: torch.Tensor) -> torch.Tensor:
 # Straight-Through Estimator for ternary quantization.
 # ---------------------------------------------------------------------------
 #
-# COMPILE-FRIENDLY STE using the detach() identity trick:
+# CLAMP-AWARE STE using the detach() trick:
 #
-#     w + (round(clamp(w, -1, 1)) - w).detach()
+#     clamped = clamp(w, -1, 1)
+#     w_q = clamped + (round(clamped) - clamped).detach()
 #
-# Forward: evaluates to round(clamp(w, -1, 1)) because +w and -w cancel.
-# Backward: ∂/∂w [w + constant] = 1 (identity / pass-through).
+# Forward: evaluates to round(clamp(w, -1, 1)) same as before.
+# Backward: ∂/∂w [clamp(w, -1, 1)] = 1 if |w| <= 1 else 0.
+#   → Gradients are ZERO for weights outside [-1, 1] (at quantization boundary).
+#   → Gradients pass through unchanged inside [-1, 1] (STE identity).
 #
-# This replaces the old _RoundTernarySTE(torch.autograd.Function) which
-# caused 84+ graph breaks under torch.compile (one per BitLinear.apply()).
-# The detach() trick uses only standard aten ops Inductor can fuse the
-# entire quantize+linear sequence into a single optimized kernel.
+# This prevents gradient explosion that caused NaN at step ~150 with the
+# pure identity STE (w + (quant - w).detach()). The clamp derivative acts
+# as a natural gradient gate: weights that have drifted beyond the ternary
+# range get no gradient push, preventing runaway accumulation.
 #
-# Pattern from official BitNet b1.58 (arxiv:2402.17764, 1bitLLM/bitnet_b1_58-large).
-#
-# Note: the old STE also clipped gradients to [-1, 1]. The detach trick
-# passes gradients through unclipped, which is actually better for convergence
-# (see BitNet b1.58 Reloaded, arxiv:2407.09527). If you need grad clipping,
-# use torch.nn.utils.clip_grad_norm_() at the optimizer step instead.
+# Ref: 4-bit CPU training (arxiv:2603.13931) uses tanh soft clipping for
+# the same stabilization purpose.
 # ---------------------------------------------------------------------------
 
-# Keep the old class around for backward compatibility (MeZOOptimizer uses it
-# indirectly through ternary_nonzero_mask), but it is no longer called in the
-# training forward path.
 class _RoundTernarySTE(torch.autograd.Function):
     """LEGACY — kept for backward compat. Use ste_ternary() instead."""
     @staticmethod
@@ -328,28 +290,24 @@ def ste_ternary(w: torch.Tensor) -> torch.Tensor:
     """Straight-through estimator for ternary quantization.
 
     Forward: round(clamp(w, -1, 1))
-    Backward: identity (gradient passes through unchanged)
+    Backward: clamp derivative (zero outside [-1, 1], identity inside)
 
     Uses the detach() trick for zero graph breaks under torch.compile.
     """
-    w_q = torch.round(torch.clamp(w, -1.0, 1.0))
-    return w + (w_q - w).detach()
+    clamped = torch.clamp(w, -1.0, 1.0)
+    w_q = torch.round(clamped)
+    return clamped + (w_q - clamped).detach()
 
 
 # ---------------------------------------------------------------------------
-# BitLinear — single class, single fast path.
+# BitLinear
 # ---------------------------------------------------------------------------
 
 class BitLinear(nn.Module):
     """Linear layer with ternary {-1, 0, 1} weights and per-row absmean scale.
 
-    *Training (grad-enabled)*: STE ternarisation on the latent weight, dense
-    fp32/bf16 matmul. Backward flows to the latent weight via STE.
-    Uses detach() trick — fully torch.compile compatible (zero graph breaks).
-
-    *Inference / no-grad*: weights are quantised once and cached as packed
-    2-bit uint8 + fp32 alpha. Each forward unpacks (vectorised PyTorch or
-    optional C++ kernel) into a reusable buffer and calls a single matmul.
+    *Training*: STE ternarisation with clamp-aware gradient gating.
+    *Inference*: cached packed 2-bit uint8 weights.
     """
 
     __constants__ = ["in_features", "out_features", "use_2_4"]
@@ -368,7 +326,6 @@ class BitLinear(nn.Module):
         else:
             self.register_parameter("bias", None)
 
-        # Caches for inference path.
         self.register_buffer("_packed", torch.zeros(0, dtype=torch.uint8), persistent=False)
         self.register_buffer("_alpha", torch.zeros(0, dtype=torch.float32), persistent=False)
         self.register_buffer("_dense_w", torch.zeros(0, dtype=torch.float32), persistent=False)
@@ -378,25 +335,19 @@ class BitLinear(nn.Module):
 
         self.reset_parameters()
 
-    # -- init ------------------------------------------------------------------
-
     def reset_parameters(self) -> None:
         nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
         if self.bias is not None:
             nn.init.zeros_(self.bias)
         self._cache_version += 1
 
-    # -- helpers ---------------------------------------------------------------
-
     def invalidate_packed(self) -> None:
-        """Mark the packed cache stale. Called after weight mutations."""
         self._cache_version += 1
         if self._dense_w.numel() > 0:
             self._dense_w = torch.zeros(0, dtype=torch.float32, device=self._dense_w.device)
             self._dense_version = -1
 
     def _quantize_latent(self) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Quantise the FP32 latent weight to ternary (no-grad, no copy)."""
         with torch.no_grad():
             w = self.weight
             alpha = _absmean_alpha(w)
@@ -421,13 +372,11 @@ class BitLinear(nn.Module):
 
     @torch.no_grad()
     def prepare_for_inference(self) -> None:
-        """Materialise the packed cache so the next forward is allocation-free."""
         self.invalidate_packed()
         self._ensure_packed()
 
     @torch.no_grad()
     def ternary_nonzero_mask(self) -> torch.Tensor:
-        """Boolean mask of currently non-zero ternary positions (cached)."""
         self._ensure_packed()
         ext = _NATIVE_EXT
         if ext is not None:
@@ -436,24 +385,21 @@ class BitLinear(nn.Module):
         w = unpack_ternary(self._packed, self.in_features)
         return w.ne(0)
 
-    # -- forward ---------------------------------------------------------------
-
     def _forward_train(self, x: torch.Tensor) -> torch.Tensor:
-        """STE forward: differentiable, fp32/bf16 dense matmul.
-
-        Uses detach() trick for torch.compile compatibility:
-            w_scaled = w / alpha
-            w_q = w_scaled + (round(clamp(w_scaled)) - w_scaled).detach()
-            output = F.linear(x, w_q * alpha)
+        """STE forward with clamp-aware gradient gating.
 
-        Forward: w_q evaluates to round(clamp(w/alpha, -1, 1))
-        Backward: grad flows through w_scaled unchanged (STE identity)
+        The clamp on w_scaled ensures:
+        - Forward: round(clamp(w/alpha, -1, 1)) * alpha → correct ternary
+        - Backward: gradient is ZERO for w_scaled outside [-1, 1],
+          preventing gradient explosion from weights at the boundary.
         """
         w = self.weight
         alpha = w.detach().abs().mean(dim=-1, keepdim=True).clamp_min(1e-5)
         w_scaled = w / alpha
-        # STE via detach trick zero graph breaks under torch.compile
-        w_q = w_scaled + (torch.round(torch.clamp(w_scaled, -1.0, 1.0)) - w_scaled).detach()
+        # Clamp FIRST, then detach the rounding residual.
+        # Gradient of clamp: 1 inside [-1,1], 0 outside → natural gradient gate
+        clamped = torch.clamp(w_scaled, -1.0, 1.0)
+        w_q = clamped + (torch.round(clamped) - clamped).detach()
         w_q = w_q * alpha
         if self.use_2_4:
             with torch.no_grad():
@@ -462,7 +408,6 @@ class BitLinear(nn.Module):
         return F.linear(x, w_q.to(x.dtype), self.bias)
 
     def _ensure_dense(self) -> torch.Tensor:
-        """Materialise (and cache) the fp32 dense ternary weight."""
         self._ensure_packed()
         if self._dense_version == self._cache_version and self._dense_w.numel() > 0:
             return self._dense_w
@@ -476,7 +421,6 @@ class BitLinear(nn.Module):
         return self._dense_w
 
     def _forward_packed(self, x: torch.Tensor) -> torch.Tensor:
-        """No-grad fast path that uses the cached dequantised weights."""
        w = self._ensure_dense()
        if x.dtype != w.dtype:
            w_used = w.to(x.dtype)
@@ -489,8 +433,6 @@ class BitLinear(nn.Module):
             return self._forward_train(x)
         return self._forward_packed(x)
 
-    # -- introspection ---------------------------------------------------------
-
     def extra_repr(self) -> str:
         return (f"in_features={self.in_features}, out_features={self.out_features}, "
                 f"bias={self.bias is not None}, nm_2_4={self.use_2_4}, "
@@ -498,12 +440,10 @@ class BitLinear(nn.Module):
 
 
 # ---------------------------------------------------------------------------
-# RMSNorm.
+# RMSNorm
 # ---------------------------------------------------------------------------
 
 class RMSNorm(nn.Module):
-    """Numerically-stable Root Mean Square LayerNorm (no bias, no centering)."""
-
    __constants__ = ["dim", "eps"]

    def __init__(self, dim: int, eps: float = 1e-6):
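A quick numerical check of the gated backward path, mirroring the new _forward_train above (a standalone sketch, not part of the commit; the shapes, seed, and squared loss are arbitrary placeholders):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
w = (torch.randn(8, 16) * 3.0).requires_grad_(True)  # latent weight with many |w / alpha| > 1
x = torch.randn(4, 16)

alpha = w.detach().abs().mean(dim=-1, keepdim=True).clamp_min(1e-5)
w_scaled = w / alpha
clamped = torch.clamp(w_scaled, -1.0, 1.0)
w_q = (clamped + (torch.round(clamped) - clamped).detach()) * alpha

F.linear(x, w_q).pow(2).mean().backward()

# Positions whose scaled weight sits outside the ternary range get exactly zero gradient.
outside = w_scaled.detach().abs() > 1.0
print(w.grad[outside].abs().max())  # tensor(0.)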