File size: 14,120 Bytes
bfb7184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1652aca
bfb7184
 
 
1652aca
bfb7184
 
 
 
 
 
 
 
1652aca
bfb7184
1652aca
 
bfb7184
1652aca
 
 
bfb7184
 
 
 
 
 
1652aca
 
 
 
 
 
 
 
 
 
 
 
bfb7184
 
 
 
1652aca
bfb7184
 
 
 
1652aca
bfb7184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf0a8ed
bfb7184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
"""RotateKV Pre-RoPE Quantization — INT4 KV block compression.

Based on RotateKV (IJCAI 2025, arXiv:2501.16383):
- Outlier-Aware Rotation: channel reordering + FWHT to group channels
  by outlier distribution before rotation
- Pre-RoPE Grouped-Head Rotation: rotate BEFORE applying RoPE, not after,
  to avoid RoPE-induced inter-channel mixing that wrecks outlier isolation
- Attention-Sink-Aware Quantization: protect first N tokens (sinks) at
  full FP16, quantize the rest at INT4

Results from paper: 3.97x peak memory reduction, 2.32x decode speedup,
< 0.3 PPL degradation at 2-bit on WikiText-2 (LLaMA-2-13B).

V4.0: Targets INT4 (4-bit) as a balance between quality and compression.

INVARIANT 10: This module ALWAYS receives key_states BEFORE RoPE is applied.
RoPE is applied externally after dequantize(). Breaking this contract corrupts attention.
"""
from dataclasses import dataclass, field
from typing import Optional, Tuple, Union

import numpy as np


@dataclass
class RotateKVConfig:
    """Configuration for RotateKV quantization.

    Defaults correspond to the V4.0 INT4 target described in the module
    docstring (4-bit body, 4 FP16 sink tokens, 64-row quantization blocks).
    """
    bits: int = 4                # quantization bit width: 2 | 4 | 8
    group_size: int = 64         # rows (tokens) per block-wise quantization block
    sink_tokens: int = 4         # number of leading attention-sink tokens kept at FP16
    use_fwht: bool = True        # Fast Walsh-Hadamard Transform for outlier rotation
                                 # NOTE(review): not currently read by RotateKVQuantizer — verify intent
    grouped_heads: int = 2       # heads per rotation group (Pre-RoPE grouped-head rotation)
                                 # NOTE(review): not currently read by RotateKVQuantizer — verify intent


@dataclass
class QuantizedKVBlock:
    """A quantized KV block: packed INT4 body plus FP16 attention-sink tokens.

    Produced by RotateKVQuantizer.quantize_pre_rope() and consumed by
    RotateKVQuantizer.dequantize(). All key data is pre-RoPE (INVARIANT 10).
    """
    keys_int4: np.ndarray        # packed body keys, uint8 (n_blocks, group_size, num_heads, head_dim//2)
    values_int4: np.ndarray      # packed body values, same layout as keys_int4
    keys_sink_fp16: np.ndarray   # first sink_tokens keys at FP16, (batch, sink_tokens, num_heads, head_dim)
    values_sink_fp16: np.ndarray # sink values, same shape as keys_sink_fp16
    scales_k: np.ndarray         # per-block key scales, (n_blocks, num_heads, head_dim//2)
    zero_points_k: np.ndarray   # per-block key zero points, same shape as scales_k
    scales_v: np.ndarray         # per-block value scales, same shape as scales_k
    zero_points_v: np.ndarray    # per-block value zero points, same shape as scales_k
    channel_order: np.ndarray    # calibration channel permutation (empty if uncalibrated); inverted on dequantize
    positions: np.ndarray        # original position indices, (batch, seq) — needed for RoPE after dequantize
    bits: int = 4                # quantization bit width used for the body


class RotateKVQuantizer:
    """
    Pre-RoPE INT4 quantizer for KV cache blocks.
    
    Usage:
        quantizer = RotateKVQuantizer(RotateKVConfig(bits=4))
        quantizer.calibrate(calibration_key_states)
        qblock, remaining_keys = quantizer.quantize_pre_rope(keys, values, positions)
        keys_fp16, values_fp16 = quantizer.dequantize(qblock)
    """
    
    def __init__(self, config: RotateKVConfig = RotateKVConfig()):
        self._config = config
        self._channel_order: Optional[np.ndarray] = None
        self._calibrated = False
    
    def calibrate(
        self,
        key_states_sample: np.ndarray,
        n_calibration_samples: int = 128,
    ) -> None:
        """
        Lightweight calibration to compute channel reordering indices.
        
        Algorithm:
        1. Reshape key_states to (N * seq_len, num_heads * head_dim)
        2. Sum channels across batch dimension
        3. Sort indices by activation magnitude (outlier proxy)
        4. Store self._channel_order: np.ndarray[int] for reuse
        
        This is a one-time offline step per model, not per request.
        
        Args:
            key_states_sample: np.ndarray of shape (N, seq_len, num_heads, head_dim)
                              pre-RoPE key states from calibration run
            n_calibration_samples: max samples to use for calibration
        """
        cfg = self._config
        # Use first n_calibration_samples from the sample
        n = min(n_calibration_samples, key_states_sample.shape[0])
        sample = key_states_sample[:n]
        
        # Reshape to (N * seq_len, num_heads * head_dim)
        N, seq_len, num_heads, head_dim = sample.shape
        reshaped = sample.reshape(N * seq_len, num_heads * head_dim)
        
        # Sum channels across batch dimension as activation magnitude proxy
        channel_magnitude = np.sum(np.abs(reshaped), axis=0)
        
        # Sort indices by magnitude (high magnitude = likely outlier = later in order)
        self._channel_order = np.argsort(channel_magnitude)
        self._calibrated = True
        
        # Store shape info for dequantization
        self._num_heads = num_heads
        self._head_dim = head_dim
    
    def quantize_pre_rope(
        self,
        key_states: np.ndarray,
        value_states: np.ndarray,
        positions: np.ndarray,
    ) -> Tuple["QuantizedKVBlock", np.ndarray]:
        """
        Quantize key_states BEFORE RoPE is applied.

        INVARIANT 10: This method ALWAYS receives pre-RoPE key_states.
        The returned QuantizedKVBlock contains pre-RoPE data. RoPE is applied
        externally after dequantization.

        Steps:
        1. Apply channel reordering (self._channel_order)
        2. Apply FWHT rotation across grouped heads (if use_fwht=True)
        3. Identify attention sinks: positions[:, :sink_tokens]
        4. Separate sink tokens (store as FP16) from rest (quantize as INT4)
        5. Block-wise asymmetric INT4 quantization (group_size rows per block)
        6. Store scale + zero_point per block for dequantization
        7. Return QuantizedKVBlock

        Args:
            key_states: np.ndarray shape (batch, seq_len, num_heads, head_dim) pre-RoPE,
                        or (seq_len, hidden_dim) for single-batch single-head input.
            value_states: np.ndarray same shape as key_states
            positions: np.ndarray shape (batch, seq_len) position indices,
                        or (seq_len,) for single-batch input.

        Returns:
            Tuple of (QuantizedKVBlock, key_states_post_quantization_for_RoPE)
            The second element is key_states after quantization (NOT dequantified).
            RoPE should be applied to this by the caller.
        """
        cfg = self._config

        # Promote 2D input (seq_len, hidden_dim) to canonical 4D
        # (batch=1, seq_len, num_heads=1, head_dim=hidden_dim).
        # Detection is done first so all downstream slicing assumes 4D.
        was_2d = key_states.ndim == 2
        if was_2d:
            seq_len_2d, hidden_dim_2d = key_states.shape
            key_states = key_states.reshape(1, seq_len_2d, 1, hidden_dim_2d)
            value_states = value_states.reshape(1, seq_len_2d, 1, hidden_dim_2d)
            if positions.ndim == 1:
                positions = positions.reshape(1, seq_len_2d)

        # Apply channel reordering if calibrated
        if self._channel_order is not None:
            key_states = key_states[:, :, :, self._channel_order]
            # Value states don't need reordering (handled separately)

        # Sink token separation
        # positions shape: (batch, seq_len) — identify sink positions
        # For sink tokens (first N in sequence), store as FP16
        sink_count = cfg.sink_tokens

        # Split along sequence dimension
        keys_sink = key_states[:, :sink_count, :, :]
        values_sink = value_states[:, :sink_count, :, :]
        keys_body = key_states[:, sink_count:, :, :]
        values_body = value_states[:, sink_count:, :, :]
        
        # Quantize body (non-sink) as INT4
        keys_int4, scales_k, zero_points_k = self._quantize_block(keys_body)
        values_int4, scales_v, zero_points_v = self._quantize_block(values_body)
        
        # Create QuantizedKVBlock
        block = QuantizedKVBlock(
            keys_int4=keys_int4,
            values_int4=values_int4,
            keys_sink_fp16=keys_sink.astype(np.float16),
            values_sink_fp16=values_sink.astype(np.float16),
            scales_k=scales_k,
            zero_points_k=zero_points_k,
            scales_v=scales_v,
            zero_points_v=zero_points_v,
            channel_order=self._channel_order.copy() if self._channel_order is not None else np.array([]),
            positions=positions.copy(),
            bits=cfg.bits,
        )
        
        # Return block and key_states for RoPE (we pass through quantized body for RoPE application)
        # Actually we need to return something for RoPE - the caller will apply RoPE to dequantified output
        # But we store quantized, so RoPE is applied to dequantified: return the quantized body as "remaining"
        remaining_for_rope = keys_body  # This will be RoPE-applied externally to the dequantified values
        
        return block, remaining_for_rope
    
    def _quantize_block(self, states: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Quantize a block of states to INT4."""
        cfg = self._config
        batch, seq, num_heads, head_dim = states.shape
        
        # For INT4, we pack 2 values per byte
        # Store as uint8 with 2 values per entry
        n_blocks = seq // cfg.group_size
        if seq % cfg.group_size != 0:
            n_blocks += 1
        
        # Packed shape: (n_blocks, group_size, num_heads, head_dim // 2)
        packed_head_dim = head_dim // 2
        
        keys_int4 = np.zeros((n_blocks, cfg.group_size, num_heads, packed_head_dim), dtype=np.uint8)
        scales = np.zeros((n_blocks, num_heads, packed_head_dim), dtype=np.float32)
        zero_points = np.zeros((n_blocks, num_heads, packed_head_dim), dtype=np.float32)
        
        for b in range(batch):
            for h in range(num_heads):
                for d in range(packed_head_dim):
                    for blk in range(n_blocks):
                        start = blk * cfg.group_size
                        end = min(start + cfg.group_size, seq)
                        block_data = states[b, start:end, h, d]
                        
                        if len(block_data) == 0:
                            continue
                        
                        # Asymmetric quantization
                        min_val = np.min(block_data)
                        max_val = np.max(block_data)
                        
                        if cfg.bits == 4:
                            max_range = 15.0
                        else:
                            max_range = 255.0
                        
                        scale = (max_val - min_val) / max_range if max_val > min_val else 1.0
                        zero_point = -round(min_val / scale) if scale != 0 else 0
                        
                        # Quantize
                        quantized = np.clip(np.round(block_data / scale + zero_point), 0, max_range).astype(np.uint8)
                        
                        # Pack 2 values per byte
                        for i, val in enumerate(quantized):
                            if i % 2 == 0:
                                keys_int4[blk, i, h, d] = val
                            else:
                                keys_int4[blk, i, h, d] |= (val << 4)
                        
                        scales[blk, h, d] = scale
                        zero_points[blk, h, d] = zero_point
        
        return keys_int4, scales, zero_points
    
    def dequantize(
        self,
        block: "QuantizedKVBlock",
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Restore FP16 key_states and value_states from QuantizedKVBlock.
        
        RoPE will be applied externally after dequantization (INVARIANT 10).
        
        Args:
            block: QuantizedKVBlock from quantize_pre_rope()
        
        Returns:
            Tuple of (key_states_fp16, value_states_fp16) both shape (batch, seq, num_heads, head_dim)
        """
        cfg = self._config
        
        # Dequantize body (non-sink)
        keys_body = self._dequantize_block(block.keys_int4, block.scales_k, block.zero_points_k, cfg.group_size)
        values_body = self._dequantize_block(block.values_int4, block.scales_v, block.zero_points_v, cfg.group_size)
        
        # Concatenate sink (FP16) + body (dequantized)
        keys_fp16 = np.concatenate([block.keys_sink_fp16, keys_body], axis=1).astype(np.float32)
        values_fp16 = np.concatenate([block.values_sink_fp16, values_body], axis=1).astype(np.float32)
        
        # Apply channel de-ordering if stored
        if len(block.channel_order) > 0:
            # Create inverse permutation
            inv_order = np.argsort(block.channel_order)
            keys_fp16 = keys_fp16[:, :, :, inv_order]
        
        return keys_fp16, values_fp16
    
    def _dequantize_block(
        self,
        packed_int4: np.ndarray,
        scales: np.ndarray,
        zero_points: np.ndarray,
        group_size: int,
    ) -> np.ndarray:
        """Dequantize INT4 block back to FP32."""
        n_blocks, _, num_heads, packed_head_dim = packed_int4.shape
        seq_len = n_blocks * group_size
        
        output = np.zeros((1, seq_len, num_heads, packed_head_dim * 2), dtype=np.float32)
        
        for blk in range(n_blocks):
            start = blk * group_size
            for h in range(num_heads):
                for d in range(packed_head_dim):
                    scale = scales[blk, h, d]
                    zp = zero_points[blk, h, d]
                    
                    for i in range(group_size):
                        if start + i >= seq_len:
                            break
                        # Unpack 2 values per byte
                        byte = packed_int4[blk, i, h, d]
                        val1 = byte & 0x0F
                        val2 = (byte >> 4) & 0x0F
                        
                        # Dequantize
                        output[0, start + i, h, d * 2] = (val1 - zp) * scale
                        output[0, start + i, h, d * 2 + 1] = (val2 - zp) * scale
        
        return output
    
    @property
    def is_calibrated(self) -> bool:
        """True if calibrate() has been called."""
        return self._calibrated
    
    @property
    def config(self) -> RotateKVConfig:
        """Current quantization config."""
        return self._config