""" DKM Layer: Differentiable K-Means Clustering Layer Implements the core algorithm from Section 3.2 and 3.3 of the paper: 1. Compute distance matrix D between weights W and centroids C 2. Apply softmax with temperature τ to get attention matrix A 3. Update centroids: c_j = Σ_i(a_ij * w_i) / Σ_i(a_ij) 4. Iterate until convergence or max iterations 5. Compute compressed weights: W_tilde = A @ C Supports both 1D and multi-dimensional clustering (Section 3.3). """ import torch import torch.nn as nn import torch.nn.functional as F import math class DKMLayer(nn.Module): """ Differentiable K-Means Clustering Layer. This layer performs differentiable weight clustering by casting k-means as an attention problem. During training, soft assignment via attention allows gradients to flow through the clustering process. During inference, weights are snapped to nearest centroids (hard assignment). Args: weight_tensor: The weight parameter to cluster (nn.Parameter) n_clusters: Number of cluster centroids (k = 2^bits) tau: Temperature for softmax attention (controls hardness of assignment) dim: Dimension for multi-dimensional clustering (default=1 for scalar) max_iter: Maximum number of DKM iterations per forward pass epsilon: Convergence threshold for centroid updates init_method: Centroid initialization method ('kmeans++' or 'random') """ def __init__( self, weight_tensor: nn.Parameter, n_clusters: int = 16, tau: float = 2e-5, dim: int = 1, max_iter: int = 5, epsilon: float = 1e-4, init_method: str = "kmeans++", ): super().__init__() self.n_clusters = n_clusters self.tau = tau self.dim = dim self.max_iter = max_iter self.epsilon = epsilon self.init_method = init_method # Store reference to the weight parameter (not a copy) self.weight = weight_tensor self.original_shape = weight_tensor.shape # Validate dimensions n_elements = weight_tensor.numel() if n_elements % dim != 0: raise ValueError( f"Weight tensor has {n_elements} elements, which is not " f"divisible by dim={dim}. Choose a different dim." ) self.n_vectors = n_elements // dim # Initialize centroids as a buffer (not a parameter - updated by DKM iterations) # Shape: (n_clusters, dim) for multi-dim, (n_clusters, 1) for scalar centroids = self._initialize_centroids(weight_tensor) self.register_buffer("centroids", centroids) # Track whether this is the first forward pass self.register_buffer("initialized", torch.tensor(False)) def _initialize_centroids(self, weight_tensor: torch.Tensor) -> torch.Tensor: """ Initialize centroids using k-means++ or random selection. For multi-dimensional clustering, weights are reshaped into (N/d, d) sub-vectors before selecting initial centroids. """ with torch.no_grad(): # Flatten and reshape for multi-dim clustering flat_weights = weight_tensor.detach().reshape(-1) if self.dim == 1: vectors = flat_weights.unsqueeze(1) # (N, 1) else: vectors = flat_weights.reshape(self.n_vectors, self.dim) # (N/d, d) if self.init_method == "kmeans++": centroids = self._kmeans_plus_plus_init(vectors) else: # Random initialization: select k random weight vectors indices = torch.randperm(vectors.shape[0])[:self.n_clusters] centroids = vectors[indices].clone() return centroids # (n_clusters, dim) def _kmeans_plus_plus_init(self, vectors: torch.Tensor) -> torch.Tensor: """ K-means++ initialization for better centroid starting positions. 

        Args:
            vectors: (N, d) weight vectors
        Returns:
            centroids: (k, d) initial centroids
        """
        n = vectors.shape[0]
        k = self.n_clusters
        d = vectors.shape[1]

        # If fewer vectors than clusters, duplicate with small noise
        if n < k:
            repeats = (k // n) + 1
            vectors_expanded = vectors.repeat(repeats, 1)[:k]
            noise = torch.randn_like(vectors_expanded) * 1e-6
            return (vectors_expanded + noise).clone()

        # Choose first centroid randomly
        idx = torch.randint(0, n, (1,)).item()
        centroids = [vectors[idx].clone()]

        for _ in range(1, k):
            # Compute distances to nearest existing centroid
            stacked = torch.stack(centroids, dim=0)  # (current_k, d)
            # distances: (N, current_k)
            dists = torch.cdist(vectors.unsqueeze(0), stacked.unsqueeze(0)).squeeze(0)
            min_dists = dists.min(dim=1).values  # (N,)

            # Choose next centroid with probability proportional to distance squared
            probs = min_dists ** 2
            prob_sum = probs.sum()
            if prob_sum < 1e-30 or torch.isnan(prob_sum) or torch.isinf(prob_sum):
                # Fallback: all distances are (near) zero (e.g., uniform weights).
                # Pick a random index; identical centroids are perturbed after the loop.
                idx = torch.randint(0, n, (1,)).item()
            else:
                probs = probs / prob_sum
                # Clamp to avoid tiny negative values from float error
                probs = probs.clamp(min=0.0)
                idx = torch.multinomial(probs, 1).item()
            centroids.append(vectors[idx].clone())

        result = torch.stack(centroids, dim=0)  # (k, d)

        # Add tiny noise to break ties if centroids are identical
        if result.unique(dim=0).shape[0] < k:
            noise = torch.randn_like(result) * (result.abs().mean() * 1e-4 + 1e-8)
            result = result + noise

        return result

    def _compute_distance_matrix(
        self, weights: torch.Tensor, centroids: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute negative squared Euclidean distance matrix D.

        D[i,j] = -||w_i - c_j||^2

        Per the paper: d_ij = -f(w_i, c_j) where f is the Euclidean distance.
        We use squared Euclidean for efficiency; the nearest-centroid (argmax)
        assignment is identical, although the soft attention values differ
        slightly from the unsquared form.

        Args:
            weights: (N/d, d) weight sub-vectors
            centroids: (k, d) centroid vectors
        Returns:
            D: (N/d, k) negative distance matrix
        """
        # Efficient computation: ||w - c||^2 = ||w||^2 - 2*w.c + ||c||^2
        w_sq = (weights ** 2).sum(dim=1, keepdim=True)  # (N/d, 1)
        c_sq = (centroids ** 2).sum(dim=1, keepdim=True).t()  # (1, k)
        wc = weights @ centroids.t()  # (N/d, k)
        D = -(w_sq - 2 * wc + c_sq)  # (N/d, k) — negative squared Euclidean
        return D

    def _compute_attention(self, D: torch.Tensor) -> torch.Tensor:
        """
        Compute attention matrix A from distance matrix D using softmax
        with temperature τ.

        a_ij = exp(d_ij / τ) / Σ_j' exp(d_ij' / τ)

        This is the key differentiable component that enables gradient flow
        through the clustering assignment.

        Args:
            D: (N/d, k) negative distance matrix
        Returns:
            A: (N/d, k) attention matrix (rows sum to 1)
        """
        # Scale by temperature and apply softmax along cluster dimension
        A = F.softmax(D / self.tau, dim=1)  # (N/d, k)
        return A

    def _update_centroids(
        self, A: torch.Tensor, weights: torch.Tensor
    ) -> torch.Tensor:
        """
        Update centroids using attention-weighted average of weights.

        c_j_new = Σ_i(a_ij * w_i) / Σ_i(a_ij)

        This is the M-step equivalent in the EM interpretation (Appendix G).

        Args:
            A: (N/d, k) attention matrix
            weights: (N/d, d) weight sub-vectors
        Returns:
            new_centroids: (k, d) updated centroids
        """
        # Numerator: Σ_i(a_ij * w_i) for each centroid j
        # A.t() @ weights: (k, N/d) @ (N/d, d) = (k, d)
        numerator = A.t() @ weights  # (k, d)

        # Denominator: Σ_i(a_ij) for each centroid j
        denominator = A.sum(dim=0, keepdim=True).t()  # (k, 1)

        # Avoid division by zero
        denominator = denominator.clamp(min=1e-10)

        new_centroids = numerator / denominator  # (k, d)
        return new_centroids

    def forward(self, weight_override: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass: perform differentiable k-means clustering.

        The iterative process (Fig. 2 of the paper):
        1. Compute distance matrix D between weights and centroids
        2. Compute attention matrix A = softmax(D/τ)
        3. Update centroids: C_new = (A^T @ W) / sum(A)
        4. Repeat until convergence or max_iter reached
        5. Return compressed weights: W_tilde = A @ C

        During training: returns soft-assigned weights (differentiable)
        During eval: returns hard-assigned weights (nearest centroid)

        Args:
            weight_override: Optional weight tensor to use instead of self.weight
        Returns:
            compressed_weight: Tensor with same shape as original weight
        """
        weight_tensor = weight_override if weight_override is not None else self.weight

        # Reshape weights into sub-vectors for multi-dim clustering
        flat_weights = weight_tensor.reshape(-1)
        if self.dim == 1:
            W = flat_weights.unsqueeze(1)  # (N, 1)
        else:
            W = flat_weights.reshape(self.n_vectors, self.dim)  # (N/d, d)

        # Re-initialize centroids on first call (ensures correct device/dtype)
        if not self.initialized:
            self.centroids = self._initialize_centroids(weight_tensor).to(
                weight_tensor.device
            )
            self.initialized = torch.tensor(True, device=weight_tensor.device)

        # Current centroids (detached from previous iteration's graph)
        C = self.centroids.clone()

        # Iterative DKM clustering (Section 3.2, Fig. 2)
        for iteration in range(self.max_iter):
            # Step 1: Compute distance matrix
            D = self._compute_distance_matrix(W, C)

            # Step 2: Compute attention matrix with temperature τ
            A = self._compute_attention(D)

            # Step 3: Update centroids
            C_new = self._update_centroids(A, W)

            # Step 4: Check convergence |C - C_new| ≤ ε
            delta = (C - C_new).abs().max().item()
            C = C_new

            if delta <= self.epsilon:
                break

        # Store converged centroids for next batch (warm start)
        self.centroids = C.detach().clone()

        if self.training:
            # Training: use soft assignment (differentiable)
            # W_tilde = A @ C (attention-weighted centroids)
            W_tilde = A @ C  # (N/d, d)
        else:
            # Inference: snap to nearest centroid (hard assignment)
            # Find nearest centroid for each weight vector
            D_final = self._compute_distance_matrix(W, C)
            assignments = D_final.argmax(dim=1)  # (N/d,) — argmax because D is negative distance
            W_tilde = C[assignments]  # (N/d, d)

        # Reshape back to original weight shape
        compressed_weight = W_tilde.reshape(self.original_shape)
        return compressed_weight

    def get_assignments(self) -> torch.Tensor:
        """
        Get hard cluster assignments for each weight (for inference/analysis).

        Returns:
            assignments: (N/d,) tensor of cluster indices
        """
        with torch.no_grad():
            flat_weights = self.weight.detach().reshape(-1)
            if self.dim == 1:
                W = flat_weights.unsqueeze(1)
            else:
                W = flat_weights.reshape(self.n_vectors, self.dim)
            D = self._compute_distance_matrix(W, self.centroids)
            assignments = D.argmax(dim=1)
        return assignments

    def get_codebook(self) -> torch.Tensor:
        """
        Get the current centroid codebook.

        Returns:
            centroids: (k, d) centroid values
        """
        return self.centroids.clone()

    def extra_repr(self) -> str:
        bits = math.log2(self.n_clusters)
        bpw = bits / self.dim
        return (
            f"n_clusters={self.n_clusters}, tau={self.tau}, dim={self.dim}, "
            f"max_iter={self.max_iter}, eps={self.epsilon}, "
            f"bits={bits:.1f}, bits_per_weight={bpw:.2f}, "
            f"weight_shape={list(self.original_shape)}"
        )
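
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the layer API above):
# wraps an nn.Linear weight in a DKMLayer and shows soft (training) vs. hard
# (eval) compression. The layer sizes, tau, and the stand-in loss below are
# assumptions for demonstration; real training loops and hyperparameters
# (n_clusters, tau, dim) are model- and paper-specific.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(0)

    linear = nn.Linear(64, 32)
    # 16 clusters on scalar weights (dim=1) corresponds to 4-bit palettization
    dkm = DKMLayer(linear.weight, n_clusters=16, tau=1e-4, dim=1)

    # Training mode: soft assignment; gradients flow back to linear.weight
    dkm.train()
    w_soft = dkm()
    loss = w_soft.pow(2).mean()  # stand-in for a task loss
    loss.backward()
    print("grad flows to original weight:", linear.weight.grad is not None)

    # Eval mode: each weight snaps to its nearest centroid
    dkm.eval()
    with torch.no_grad():
        w_hard = dkm()
    n_unique = w_hard.unique().numel()
    print(f"unique values after hard assignment: {n_unique} (<= {dkm.n_clusters})")
    print(dkm)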