lorenzovaquero committed on
Commit 2f74fc4 · verified · 1 Parent(s): 96cd4af

Add UniSITH source code: weight_extraction, concept_pool, comp, unisith

unimodal_sith/__pycache__/comp.cpython-312.pyc ADDED
Binary file (5.88 kB)
 
unimodal_sith/__pycache__/concept_pool.cpython-312.pyc ADDED
Binary file (8.44 kB)
 
unimodal_sith/__pycache__/unisith.cpython-312.pyc ADDED
Binary file (18.6 kB)
 
unimodal_sith/__pycache__/weight_extraction.cpython-312.pyc ADDED
Binary file (16.6 kB)
 
unimodal_sith/comp.py ADDED
@@ -0,0 +1,158 @@
"""
COMP: Coherent Orthogonal Matching Pursuit

Adapted from SITH (Vaquero et al., 2025), Algorithm 1.

Given a singular vector v_hat and a concept dictionary Gamma_hat, COMP finds
a sparse, semantically coherent combination of K concepts that best
approximates v_hat.

This implementation works with both text concept embeddings (original SITH)
and image concept embeddings (UniSITH).
"""

import torch
import numpy as np
from scipy.optimize import nnls
from typing import List, Tuple, Optional


def comp(
    v_hat: torch.Tensor,
    Gamma_hat: torch.Tensor,
    K: int = 5,
    lambda_coh: float = 0.3,
) -> Tuple[torch.Tensor, List[int]]:
    """
    Coherent Orthogonal Matching Pursuit (COMP).

    Extends Non-Negative Orthogonal Matching Pursuit (NNOMP) by incorporating
    a coherence term that encourages semantically coherent concept selections.

    Args:
        v_hat: [d] projected singular vector (L2-normalized)
        Gamma_hat: [C, d] concept embedding matrix (L2-normalized rows)
        K: Sparsity level (number of concepts to select)
        lambda_coh: Coherence weight (λ in the paper, default 0.3)

    Returns:
        c: [C] sparse coefficient vector (non-negative)
        support: List of K selected concept indices
    """
    C, d = Gamma_hat.shape
    device = v_hat.device

    # Move to CPU for scipy nnls
    v_hat_np = v_hat.cpu().numpy().astype(np.float64)
    Gamma_np = Gamma_hat.cpu().numpy().astype(np.float64)

    # Initialize
    r = v_hat_np.copy()  # Residual
    S = []               # Support set (selected concept indices)

    # The concept-concept similarities needed for the coherence term are
    # computed on the fly, since C can be very large.
    for k in range(K):
        # Step 1: Correlation of every concept with the current residual
        s_res = Gamma_np @ r  # [C]

        # Step 2: Coherence score = average similarity of each candidate
        # to the already-selected concepts
        s_coh = np.zeros(C)
        if len(S) > 0:
            S_embeddings = Gamma_np[S]                    # [|S|, d]
            sim_to_selected = Gamma_np @ S_embeddings.T   # [C, |S|]
            s_coh = sim_to_selected.mean(axis=1)          # [C]

        # Step 3: Combined score, masking already-selected concepts
        s_final = s_res + lambda_coh * s_coh
        for idx in S:
            s_final[idx] = -np.inf

        # Step 4: Greedy selection
        j_k = int(np.argmax(s_final))
        S.append(j_k)

        # Step 5: Non-negative least squares on the current support
        G_S = Gamma_np[S].T           # [d, |S|] - columns are selected concept embeddings
        c_S, _ = nnls(G_S, v_hat_np)  # min ||v_hat - G_S @ c_S||^2, c_S >= 0

        # Step 6: Update residual
        r = v_hat_np - G_S @ c_S

    # Construct final coefficient vector
    c = np.zeros(C)
    for i, j in enumerate(S):
        c[j] = c_S[i]

    return torch.tensor(c, dtype=torch.float32, device=device), S


def comp_batch(
    V_hat: torch.Tensor,
    Gamma_hat: torch.Tensor,
    K: int = 5,
    lambda_coh: float = 0.3,
) -> Tuple[torch.Tensor, List[List[int]]]:
    """
    Apply COMP to multiple singular vectors.

    Args:
        V_hat: [n, d] batch of projected singular vectors
        Gamma_hat: [C, d] concept embedding matrix
        K: Sparsity level
        lambda_coh: Coherence weight

    Returns:
        C_mat: [n, C] coefficient matrix
        supports: List of n support sets
    """
    n = V_hat.shape[0]
    C = Gamma_hat.shape[0]

    C_mat = torch.zeros(n, C, device=V_hat.device)
    supports = []

    for i in range(n):
        c_i, support_i = comp(V_hat[i], Gamma_hat, K=K, lambda_coh=lambda_coh)
        C_mat[i] = c_i
        supports.append(support_i)

    return C_mat, supports


def top_k_selection(
    v_hat: torch.Tensor,
    Gamma_hat: torch.Tensor,
    K: int = 5,
) -> Tuple[torch.Tensor, List[int]]:
    """
    Simple top-K selection baseline: pick the K most similar concepts.

    Args:
        v_hat: [d] projected singular vector
        Gamma_hat: [C, d] concept embedding matrix
        K: Number of concepts to select

    Returns:
        c: [C] coefficient vector (similarity scores for top-K, 0 elsewhere)
        support: List of K selected concept indices
    """
    similarities = Gamma_hat @ v_hat  # [C]
    top_k_vals, top_k_idx = torch.topk(similarities, K)

    c = torch.zeros(Gamma_hat.shape[0], device=v_hat.device)
    support = top_k_idx.tolist()
    for i, idx in enumerate(support):
        c[idx] = max(0.0, top_k_vals[i].item())  # Non-negative

    return c, support
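A minimal usage sketch for `comp` (hypothetical, not part of this commit), assuming an L2-normalized concept matrix and singular vector; the shapes and random inputs are illustrative only:

    import torch
    from unimodal_sith.comp import comp

    C, d = 1000, 512
    Gamma_hat = torch.nn.functional.normalize(torch.randn(C, d), dim=-1)  # concept dictionary
    v_hat = torch.nn.functional.normalize(torch.randn(d), dim=-1)         # projected singular vector

    coeffs, support = comp(v_hat, Gamma_hat, K=5, lambda_coh=0.3)
    # coeffs is a [C] non-negative vector with at most 5 non-zero entries;
    # support lists the indices of the selected concepts in selection order.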
unimodal_sith/concept_pool.py ADDED
@@ -0,0 +1,194 @@
"""
Visual Concept Pool for UniSITH.

Instead of text concepts (ConceptNet strings + CLIP text encoder),
we use captioned images as the concept pool.

Each concept is an image from a captioned dataset, and the corresponding
caption provides human-interpretable meaning.

The concept embeddings are computed by encoding each image through the
same unimodal vision model being analyzed.
"""

import torch
import torch.nn.functional as F
import numpy as np
from typing import Dict, List, Optional, Tuple
from PIL import Image
from tqdm import tqdm
import os
import json


class VisualConceptPool:
    """
    A pool of visual concepts, each represented by:
    - An image embedding (computed by the model being analyzed)
    - A caption (for human interpretability)
    - Optionally, the original image

    Analogous to Γ = {γ_1, ..., γ_C} in SITH, but each γ_i is an image
    embedding rather than a text embedding.
    """

    def __init__(
        self,
        embeddings: torch.Tensor,
        captions: List[str],
        image_ids: Optional[List[int]] = None,
        metadata: Optional[Dict] = None,
    ):
        """
        Args:
            embeddings: [C, d] tensor of L2-normalized concept embeddings
            captions: List of C caption strings
            image_ids: Optional list of C image IDs for retrieval
            metadata: Optional metadata dict
        """
        assert embeddings.shape[0] == len(captions), \
            f"Embeddings ({embeddings.shape[0]}) and captions ({len(captions)}) must match"

        self.embeddings = embeddings  # [C, d]
        self.captions = captions
        self.image_ids = image_ids
        self.metadata = metadata or {}
        self.num_concepts = len(captions)
        self.embed_dim = embeddings.shape[1]

    @classmethod
    def from_dataset(
        cls,
        dataset,
        model,
        processor,
        architecture: str,
        image_column: str = "image",
        caption_column: str = "caption",
        image_id_column: str = "image_id",
        batch_size: int = 64,
        max_concepts: Optional[int] = None,
        device: str = "cpu",
        cache_path: Optional[str] = None,
    ) -> "VisualConceptPool":
        """
        Build a concept pool from a HuggingFace dataset.

        Args:
            dataset: HF dataset with image and caption columns
            model: Vision model (HuggingFace transformers)
            processor: Image processor/transform
            architecture: Model architecture type ("dinov2", "clip", or "vit")
            image_column: Column name for images
            caption_column: Column name for captions
            image_id_column: Column name for image IDs
            batch_size: Batch size for encoding
            max_concepts: Max number of concepts to use
            device: Device for computation
            cache_path: If set, cache embeddings to/from this path
        """
        # Check for cached embeddings
        if cache_path and os.path.exists(cache_path):
            print(f"Loading cached concept pool from {cache_path}")
            return cls.load(cache_path)

        if max_concepts is not None:
            dataset = dataset.select(range(min(max_concepts, len(dataset))))

        captions = dataset[caption_column]
        image_ids = None
        if image_id_column in dataset.column_names:
            image_ids = dataset[image_id_column]

        # Encode all images
        model = model.to(device)
        model.eval()

        all_embeddings = []

        print(f"Encoding {len(dataset)} concept images...")
        for i in tqdm(range(0, len(dataset), batch_size)):
            batch_end = min(i + batch_size, len(dataset))
            batch_images = [dataset[j][image_column] for j in range(i, batch_end)]

            # Ensure images are RGB
            batch_images = [img.convert("RGB") if img.mode != "RGB" else img for img in batch_images]

            # Process images
            inputs = processor(images=batch_images, return_tensors="pt").to(device)

            with torch.no_grad():
                if architecture == "dinov2":
                    outputs = model(**inputs)
                    embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
                elif architecture == "clip":
                    # For CLIP, get the vision features
                    outputs = model.vision_model(**inputs)
                    # Pooled CLS token, already post-layernorm
                    pooled = outputs.pooler_output
                    # Apply the visual projection into the output embedding space
                    embeddings = model.visual_projection(pooled)
                elif architecture == "vit":
                    outputs = model(**inputs)
                    embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
                else:
                    raise ValueError(f"Unknown architecture: {architecture}")

            # L2 normalize
            embeddings = F.normalize(embeddings, dim=-1)
            all_embeddings.append(embeddings.cpu())

        embeddings = torch.cat(all_embeddings, dim=0)

        pool = cls(
            embeddings=embeddings,
            captions=captions,
            image_ids=image_ids,
            metadata={
                "architecture": architecture,
                "num_concepts": len(captions),
                "embed_dim": embeddings.shape[1],
            },
        )

        # Cache if requested
        if cache_path:
            pool.save(cache_path)

        return pool

    def get_centered_embeddings(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return the embeddings after mean-centering and re-normalization,
        together with the mean that was removed.

        This is analogous to the modality gap correction in SITH (Eq. 18-19),
        but for unimodal models we center within the image embedding distribution
        so that the concept embeddings are centered around the origin.

        This is important for matching with singular vectors, which themselves
        are zero-centered directions.
        """
        mu = self.embeddings.mean(dim=0, keepdim=True)  # [1, d]
        centered = self.embeddings - mu                 # [C, d]
        centered = F.normalize(centered, dim=-1)        # Re-normalize
        return centered, mu

    def save(self, path: str):
        """Save concept pool to disk."""
        os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
        torch.save({
            "embeddings": self.embeddings,
            "captions": self.captions,
            "image_ids": self.image_ids,
            "metadata": self.metadata,
        }, path)
        print(f"Saved concept pool to {path}")

    @classmethod
    def load(cls, path: str) -> "VisualConceptPool":
        """Load concept pool from disk."""
        data = torch.load(path, weights_only=False)
        return cls(
            embeddings=data["embeddings"],
            captions=data["captions"],
            image_ids=data.get("image_ids"),
            metadata=data.get("metadata", {}),
        )
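A hypothetical build-and-cache sketch for `VisualConceptPool.from_dataset` (not part of this commit); the dataset call, cache path, and column names are placeholder assumptions:

    import torch
    from datasets import load_dataset
    from transformers import AutoModel, AutoImageProcessor
    from unimodal_sith.concept_pool import VisualConceptPool

    model = AutoModel.from_pretrained("facebook/dinov2-base")
    processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
    dataset = load_dataset(...)  # any captioned dataset with "image" and "caption" columns

    pool = VisualConceptPool.from_dataset(
        dataset, model, processor,
        architecture="dinov2",
        max_concepts=5000,
        device="cuda" if torch.cuda.is_available() else "cpu",
        cache_path="cache/concept_pool_dinov2.pt",  # assumed path
    )
    centered, mu = pool.get_centered_embeddings()  # [C, d] centered concepts, [1, d] mean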
unimodal_sith/unisith.py ADDED
@@ -0,0 +1,450 @@
"""
UniSITH: Unimodal Semantic Inspection of Transformer Heads

Main analysis class that orchestrates:
1. Weight extraction (W_VO matrices from attention heads)
2. SVD decomposition (finding principal directions)
3. Projection to feature space
4. Concept attribution via COMP (matching to visual concepts)
5. Model editing (amplifying/suppressing concepts)

Key differences from the original SITH:
- Works with ANY ViT (not just CLIP)
- Uses captioned images as the concept pool (not text from ConceptNet)
- Captions provide human interpretability
- No cross-modal projection is needed (the same model encodes both the weights and the concepts)
"""

import torch
import torch.nn.functional as F
import numpy as np
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, field
import json
import os

from .weight_extraction import WeightExtractor
from .concept_pool import VisualConceptPool
from .comp import comp, comp_batch, top_k_selection


@dataclass
class SingularVectorInterpretation:
    """Interpretation of a single singular vector."""
    layer_idx: int
    head_idx: int
    sv_idx: int
    singular_value: float
    concepts: List[str]        # Captions of matched concepts
    concept_indices: List[int] # Indices into concept pool
    coefficients: List[float]  # COMP coefficients
    fidelity: float            # Cosine similarity between original and reconstruction
    image_ids: Optional[List[int]] = None  # IDs for retrieving original images

    def to_dict(self) -> Dict:
        return {
            "layer": self.layer_idx,
            "head": self.head_idx,
            "sv_index": self.sv_idx,
            "singular_value": self.singular_value,
            "concepts": [
                {"caption": c, "coefficient": w, "concept_idx": idx}
                for c, w, idx in zip(self.concepts, self.coefficients, self.concept_indices)
            ],
            "fidelity": self.fidelity,
            "image_ids": self.image_ids,
        }

    def __repr__(self) -> str:
        lines = [f"Layer {self.layer_idx}, Head {self.head_idx}, SV {self.sv_idx} "
                 f"(σ={self.singular_value:.4f}, fidelity={self.fidelity:.4f})"]
        for c, w in zip(self.concepts, self.coefficients):
            lines.append(f"  [{w:.4f}] {c}")
        return "\n".join(lines)


@dataclass
class HeadInterpretation:
    """Full interpretation of an attention head."""
    layer_idx: int
    head_idx: int
    singular_vectors: List[SingularVectorInterpretation]

    def to_dict(self) -> Dict:
        return {
            "layer": self.layer_idx,
            "head": self.head_idx,
            "singular_vectors": [sv.to_dict() for sv in self.singular_vectors],
        }

    def __repr__(self) -> str:
        lines = [f"=== Layer {self.layer_idx}, Head {self.head_idx} ==="]
        for sv in self.singular_vectors:
            lines.append(str(sv))
            lines.append("")
        return "\n".join(lines)


class UniSITH:
    """
    Unimodal Semantic Inspection of Transformer Heads.

    Analyzes the internal representations of ViT attention heads by:
    1. Decomposing W_VO matrices via SVD
    2. Projecting singular vectors to the model's feature space
    3. Attributing visual concepts from a captioned image pool
    """

    def __init__(
        self,
        model: torch.nn.Module,
        architecture: str,
        n_heads: int,
        d_model: int,
        concept_pool: VisualConceptPool,
        device: str = "cpu",
    ):
        """
        Args:
            model: Vision transformer model
            architecture: One of "dinov2", "clip", "vit"
            n_heads: Number of attention heads
            d_model: Hidden dimension
            concept_pool: Visual concept pool with embeddings and captions
            device: Computation device
        """
        self.model = model
        self.architecture = architecture
        self.device = device
        self.concept_pool = concept_pool

        self.extractor = WeightExtractor(model, architecture, n_heads, d_model)
        self.n_heads = n_heads
        self.d_model = d_model

        # Precompute centered concept embeddings
        self.centered_concepts, self.concept_mean = concept_pool.get_centered_embeddings()
        self.centered_concepts = self.centered_concepts.to(device)
        self.concept_mean = self.concept_mean.to(device)

    def analyze_head(
        self,
        layer_idx: int,
        head_idx: int,
        n_singular_vectors: int = 5,
        K: int = 5,
        lambda_coh: float = 0.3,
        method: str = "comp",
    ) -> HeadInterpretation:
        """
        Analyze a single attention head: decompose its W_VO matrix and
        interpret the top singular vectors.

        Args:
            layer_idx: Transformer layer index
            head_idx: Attention head index
            n_singular_vectors: Number of top singular vectors to interpret
            K: Number of concepts per singular vector
            lambda_coh: COMP coherence weight
            method: "comp" or "top_k"

        Returns:
            HeadInterpretation with concept attributions for each singular vector
        """
        # Step 1: Extract W_VO and decompose via SVD
        W_VO_all = self.extractor.compute_WVO(layer_idx, fold_ln=True, project_ones=True)
        W_VO_h = W_VO_all[head_idx]  # [D, D]

        U, sigma, Vt = self.extractor.svd_decompose(W_VO_h, top_k=n_singular_vectors)
        # U: [D, n_sv], sigma: [n_sv], Vt: [n_sv, D]

        # Step 2: Project right singular vectors to feature space
        V_projected = self.extractor.project_to_feature_space(Vt)  # [n_sv, d_out]

        # Step 3: Center the projected vectors (analogous to modality gap correction)
        V_centered = V_projected - self.concept_mean
        V_centered = F.normalize(V_centered, dim=-1)

        # Step 4: Attribute concepts via COMP (or top-k)
        sv_interpretations = []
        for i in range(n_singular_vectors):
            v_hat = V_centered[i]  # [d_out]

            if method == "comp":
                coeffs, support = comp(
                    v_hat, self.centered_concepts, K=K, lambda_coh=lambda_coh
                )
            elif method == "top_k":
                coeffs, support = top_k_selection(
                    v_hat, self.centered_concepts, K=K
                )
            else:
                raise ValueError(f"Unknown method: {method}")

            # Extract concept captions and coefficients
            concept_captions = [self.concept_pool.captions[idx] for idx in support]
            concept_coeffs = [coeffs[idx].item() for idx in support]
            concept_image_ids = None
            if self.concept_pool.image_ids is not None:
                concept_image_ids = [self.concept_pool.image_ids[idx] for idx in support]

            # Compute fidelity: cosine similarity between original and reconstruction
            reconstruction = torch.zeros_like(v_hat)
            for idx, coeff in zip(support, concept_coeffs):
                reconstruction += coeff * self.centered_concepts[idx]
            fidelity = F.cosine_similarity(
                v_hat.unsqueeze(0), reconstruction.unsqueeze(0)
            ).item()

            sv_interpretations.append(SingularVectorInterpretation(
                layer_idx=layer_idx,
                head_idx=head_idx,
                sv_idx=i,
                singular_value=sigma[i].item(),
                concepts=concept_captions,
                concept_indices=support,
                coefficients=concept_coeffs,
                fidelity=fidelity,
                image_ids=concept_image_ids,
            ))

        return HeadInterpretation(
            layer_idx=layer_idx,
            head_idx=head_idx,
            singular_vectors=sv_interpretations,
        )

    def analyze_layer(
        self,
        layer_idx: int,
        n_singular_vectors: int = 5,
        K: int = 5,
        lambda_coh: float = 0.3,
        method: str = "comp",
    ) -> List[HeadInterpretation]:
        """Analyze all heads in a layer."""
        results = []
        for h in range(self.n_heads):
            print(f"  Analyzing head {h}/{self.n_heads}...")
            result = self.analyze_head(
                layer_idx, h, n_singular_vectors, K, lambda_coh, method
            )
            results.append(result)
        return results

    def analyze_model(
        self,
        layers: Optional[List[int]] = None,
        n_singular_vectors: int = 5,
        K: int = 5,
        lambda_coh: float = 0.3,
        method: str = "comp",
    ) -> Dict[int, List[HeadInterpretation]]:
        """
        Analyze multiple layers of the model.

        Args:
            layers: List of layer indices. If None, analyzes the last 4 layers.
            n_singular_vectors: Number of top singular vectors per head
            K: Concepts per singular vector
            lambda_coh: COMP coherence weight
            method: "comp" or "top_k"

        Returns:
            Dict mapping layer_idx -> list of HeadInterpretations
        """
        if layers is None:
            n_layers = self.extractor._get_num_layers()
            layers = list(range(max(0, n_layers - 4), n_layers))

        results = {}
        for layer_idx in layers:
            print(f"Analyzing layer {layer_idx}...")
            results[layer_idx] = self.analyze_layer(
                layer_idx, n_singular_vectors, K, lambda_coh, method
            )

        return results

    def edit_model(
        self,
        layer_idx: int,
        head_idx: int,
        sv_indices: List[int],
        scale_factors: List[float],
    ) -> None:
        """
        Edit the model by scaling specific singular values.

        This enables:
        - Suppressing concepts (scale -> 0): remove spurious features
        - Amplifying concepts (scale > 1): enhance task-relevant features

        Args:
            layer_idx: Layer to edit
            head_idx: Head to edit
            sv_indices: Indices of singular vectors to modify
            scale_factors: Scaling factor for each (0 = suppress, >1 = amplify)
        """
        # Get the original (non-folded) W_VO
        W_VO_all = self.extractor.compute_WVO(layer_idx, fold_ln=False, project_ones=False)
        W_VO_h = W_VO_all[head_idx]

        # SVD decompose
        U, sigma, Vt = torch.linalg.svd(W_VO_h, full_matrices=False)

        # Scale selected singular values
        for sv_idx, scale in zip(sv_indices, scale_factors):
            sigma[sv_idx] *= scale

        # Reconstruct
        W_VO_edited = U @ torch.diag(sigma) @ Vt

        # Write back to the model.
        # W_VO_h = W_V_h^T @ W_O_h^T, so the edit must be pushed into W_V and W_O.
        # W_O_h only has rank d_h, so it cannot be inverted in D x D space to solve
        # directly for a new W_V_h. Instead, W_VO_edited (rank <= d_h) is re-factorized
        # into a new per-head W_V_h / W_O_h pair via a truncated SVD.
        self._write_WVO_to_model(layer_idx, head_idx, W_VO_edited)

    def _write_WVO_to_model(
        self,
        layer_idx: int,
        head_idx: int,
        W_VO_edited: torch.Tensor,
    ):
        """
        Write an edited W_VO back to the model weights.

        Since W_VO = W_V_h^T @ W_O_h^T has rank at most d_h, we can use SVD
        to factorize W_VO_edited into new W_V_h and W_O_h:

            W_VO_edited = U_e @ S_e @ V_e^T
            W_V_h_new^T = U_e[:, :d_h] @ sqrt(S_e[:d_h])
            W_O_h_new^T = sqrt(S_e[:d_h]) @ V_e[:d_h, :]
        """
        d_h = self.extractor.head_dim

        # SVD of the edited W_VO
        U_e, S_e, Vt_e = torch.linalg.svd(W_VO_edited, full_matrices=False)

        # Keep the top d_h components
        sqrt_S = torch.sqrt(S_e[:d_h])

        # W_V_h_new^T = U_e[:, :d_h] @ diag(sqrt_S)  => [D, d_h]
        # so W_V_h_new = diag(sqrt_S) @ U_e[:, :d_h]^T => [d_h, D]
        W_V_h_new = sqrt_S.unsqueeze(1) * U_e[:, :d_h].T  # [d_h, D]

        # W_O_h_new^T = diag(sqrt_S) @ Vt_e[:d_h, :]  => [d_h, D]
        # so W_O_h_new = Vt_e[:d_h, :]^T @ diag(sqrt_S) => [D, d_h]
        W_O_h_new = Vt_e[:d_h, :].T * sqrt_S.unsqueeze(0)  # [D, d_h]

        # Write the new per-head slices back into the model weights
        _, _, W_V = self.extractor._get_qkv_weights(layer_idx)
        W_O = self.extractor._get_output_weight(layer_idx)

        h = head_idx

        # W_V is [d_model, d_model]; head h occupies rows [h*d_h : (h+1)*d_h]
        W_V[h * d_h : (h + 1) * d_h, :] = W_V_h_new

        # W_O is [d_model, d_model]; head h occupies columns [h*d_h : (h+1)*d_h]
        W_O[:, h * d_h : (h + 1) * d_h] = W_O_h_new

    def find_concept_heads(
        self,
        target_concepts: List[str],
        concept_embeddings: torch.Tensor,
        layers: Optional[List[int]] = None,
        n_singular_vectors: int = 10,
        K: int = 5,
        lambda_coh: float = 0.3,
        threshold: float = 0.3,
    ) -> List[Dict]:
        """
        Find attention heads that encode specific concepts.

        Useful for targeted model editing: find which heads encode
        "background" features, "texture" features, etc.

        Args:
            target_concepts: List of target concept descriptions
            concept_embeddings: [n_targets, d] embeddings of target concepts
            layers: Layers to search
            n_singular_vectors: SVs per head to check
            K: Concepts per SV
            lambda_coh: COMP coherence weight
            threshold: Minimum similarity to consider a match

        Returns:
            List of dicts with head locations and matching info
        """
        results = self.analyze_model(
            layers=layers,
            n_singular_vectors=n_singular_vectors,
            K=K,
            lambda_coh=lambda_coh,
        )

        matches = []
        concept_embeddings = F.normalize(concept_embeddings.to(self.device), dim=-1)

        for layer_idx, heads in results.items():
            for head_interp in heads:
                for sv_interp in head_interp.singular_vectors:
                    # Check whether any of the attributed concepts match the targets
                    for ci, concept_idx in enumerate(sv_interp.concept_indices):
                        concept_emb = self.centered_concepts[concept_idx]
                        sims = (concept_embeddings @ concept_emb).tolist()
                        max_sim = max(sims)
                        if max_sim > threshold:
                            matches.append({
                                "layer": layer_idx,
                                "head": head_interp.head_idx,
                                "sv_index": sv_interp.sv_idx,
                                "concept": sv_interp.concepts[ci],
                                "coefficient": sv_interp.coefficients[ci],
                                "target_similarity": max_sim,
                                "singular_value": sv_interp.singular_value,
                            })

        # Sort by relevance (target_similarity * singular_value * coefficient)
        matches.sort(
            key=lambda x: x["target_similarity"] * x["singular_value"] * x["coefficient"],
            reverse=True,
        )

        return matches

    @staticmethod
    def save_results(
        results: Dict[int, List[HeadInterpretation]],
        path: str,
    ):
        """Save analysis results to JSON."""
        serialized = {}
        for layer_idx, heads in results.items():
            serialized[str(layer_idx)] = [h.to_dict() for h in heads]

        os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
        with open(path, "w") as f:
            json.dump(serialized, f, indent=2)
        print(f"Results saved to {path}")

    @staticmethod
    def load_results(path: str) -> Dict:
        """Load analysis results from JSON."""
        with open(path) as f:
            return json.load(f)
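A hypothetical end-to-end sketch of the analyze-then-edit flow (not part of this commit), assuming a DINOv2-base backbone (12 heads, hidden size 768) and a previously cached concept pool at an assumed path:

    from transformers import AutoModel
    from unimodal_sith.concept_pool import VisualConceptPool
    from unimodal_sith.unisith import UniSITH

    model = AutoModel.from_pretrained("facebook/dinov2-base")
    pool = VisualConceptPool.load("cache/concept_pool_dinov2.pt")  # assumed cache path

    sith = UniSITH(
        model=model,
        architecture="dinov2",
        n_heads=12,
        d_model=768,
        concept_pool=pool,
    )

    # Interpret one head: top-5 singular vectors, 5 concepts each via COMP
    interp = sith.analyze_head(layer_idx=11, head_idx=3, n_singular_vectors=5, K=5)
    print(interp)

    # Suppress whatever the first singular vector of that head encodes
    sith.edit_model(layer_idx=11, head_idx=3, sv_indices=[0], scale_factors=[0.0])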
unimodal_sith/weight_extraction.py ADDED
@@ -0,0 +1,378 @@
"""
Weight extraction utilities for various ViT architectures.

Supports:
- DINOv2 (facebook/dinov2-*)
- CLIP ViT (openai/clip-vit-* via HuggingFace transformers)
- Any HuggingFace ViT (google/vit-*)

For each architecture, extracts:
- W_V (value projection) and W_O (output projection) per attention head
- W_VO = W_V^T @ W_O^T (the value-output matrix, as in SITH)
- LayerNorm parameters for folding
- Final projection matrix W_p (if present)
"""

import torch
import torch.nn as nn
from typing import Dict, List, Optional, Tuple


def fold_layernorm_into_weights(
    W: torch.Tensor,
    ln_weight: torch.Tensor,
    ln_bias: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Fold LayerNorm affine parameters into a weight matrix.

    LN(x) = (x - mean) / std * w + b, whose affine part is x * w + b (element-wise).
    For a PyTorch Linear layer, y = x @ W^T, so applying the affine part first gives

        (x * w + b) @ W^T = x @ (W * w)^T + b @ W^T

    i.e. the LN weight scales the input columns of W, and the LN bias is absorbed
    into an output bias term.

    Args:
        W: Weight matrix [out_dim, in_dim] (applied as x @ W^T)
        ln_weight: LayerNorm weight [in_dim]
        ln_bias: LayerNorm bias [in_dim]

    Returns:
        W_folded: [out_dim, in_dim]
        b_folded: [out_dim]
    """
    W_folded = W * ln_weight.unsqueeze(0)  # [out, in] * [1, in]: scale each input column
    b_folded = ln_bias @ W.t()             # [in] @ [in, out] = [out]

    return W_folded, b_folded


def project_out_ones(W: torch.Tensor) -> torch.Tensor:
    """
    Project the columns of a weight matrix onto the subspace orthogonal to the
    all-ones direction. This accounts for the centering operation of LayerNorm.

    For a matrix W [D, D] applied on the input side (x @ W), centering the input is
    equivalent to W_proj = W - (1/D) * ones @ ones^T @ W, i.e. subtracting each
    column's mean from that column.
    """
    D = W.shape[0]
    col_means = W.mean(dim=0, keepdim=True)  # [1, D]
    W_proj = W - col_means
    return W_proj


class WeightExtractor:
    """
    Extracts and processes attention head weights for SITH analysis.
    Architecture-agnostic: supports DINOv2, CLIP ViT, standard ViT.
    """

    SUPPORTED_ARCHITECTURES = ["dinov2", "clip", "vit"]

    def __init__(self, model: nn.Module, architecture: str, n_heads: int, d_model: int):
        """
        Args:
            model: The loaded model (HuggingFace transformers model)
            architecture: One of "dinov2", "clip", "vit"
            n_heads: Number of attention heads
            d_model: Hidden dimension
        """
        assert architecture in self.SUPPORTED_ARCHITECTURES, \
            f"Unsupported architecture: {architecture}. Use one of {self.SUPPORTED_ARCHITECTURES}"

        self.model = model
        self.architecture = architecture
        self.n_heads = n_heads
        self.d_model = d_model
        self.head_dim = d_model // n_heads

    def _get_layer(self, layer_idx: int):
        """Get the transformer layer by index."""
        if self.architecture == "dinov2":
            return self.model.encoder.layer[layer_idx]
        elif self.architecture == "clip":
            return self.model.vision_model.encoder.layers[layer_idx]
        elif self.architecture == "vit":
            # AutoModel for ViT doesn't have the .vit prefix
            if hasattr(self.model, 'vit'):
                return self.model.vit.encoder.layer[layer_idx]
            else:
                return self.model.encoder.layer[layer_idx]
        else:
            raise ValueError(f"Unknown architecture: {self.architecture}")

    def _get_num_layers(self) -> int:
        """Get the total number of transformer layers."""
        if self.architecture == "dinov2":
            return len(self.model.encoder.layer)
        elif self.architecture == "clip":
            return len(self.model.vision_model.encoder.layers)
        elif self.architecture == "vit":
            if hasattr(self.model, 'vit'):
                return len(self.model.vit.encoder.layer)
            else:
                return len(self.model.encoder.layer)
        else:
            raise ValueError(f"Unknown architecture: {self.architecture}")

    def _get_qkv_weights(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Extract the Q, K, V weight matrices from a layer."""
        layer = self._get_layer(layer_idx)

        if self.architecture == "dinov2":
            attn = layer.attention.attention
            W_Q = attn.query.weight.data  # [d_model, d_model]
            W_K = attn.key.weight.data
            W_V = attn.value.weight.data
        elif self.architecture == "clip":
            attn = layer.self_attn
            W_Q = attn.q_proj.weight.data
            W_K = attn.k_proj.weight.data
            W_V = attn.v_proj.weight.data
        elif self.architecture == "vit":
            attn = layer.attention.attention
            W_Q = attn.query.weight.data
            W_K = attn.key.weight.data
            W_V = attn.value.weight.data

        return W_Q, W_K, W_V

    def _get_output_weight(self, layer_idx: int) -> torch.Tensor:
        """Extract the output projection weight matrix."""
        layer = self._get_layer(layer_idx)

        if self.architecture == "dinov2":
            return layer.attention.output.dense.weight.data  # [d_model, d_model]
        elif self.architecture == "clip":
            return layer.self_attn.out_proj.weight.data
        elif self.architecture == "vit":
            return layer.attention.output.dense.weight.data

    def _get_pre_attn_layernorm(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Get the LayerNorm weight and bias that precede the attention block."""
        layer = self._get_layer(layer_idx)

        if self.architecture == "dinov2":
            ln = layer.norm1
        elif self.architecture == "clip":
            ln = layer.layer_norm1
        elif self.architecture == "vit":
            ln = layer.layernorm_before

        return ln.weight.data, ln.bias.data

    def _get_final_layernorm(self) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
        """Get the final LayerNorm (applied before the projection, if present)."""
        if self.architecture == "dinov2":
            ln = self.model.layernorm
        elif self.architecture == "clip":
            ln = self.model.vision_model.post_layernorm
        elif self.architecture == "vit":
            if hasattr(self.model, 'vit'):
                ln = self.model.vit.layernorm
            else:
                ln = self.model.layernorm

        return ln.weight.data, ln.bias.data

    def _get_projection_matrix(self) -> Optional[torch.Tensor]:
        """Get the final projection matrix W_p (maps hidden dim to output dim)."""
        if self.architecture == "clip":
            # CLIP has a visual projection with weight [proj_dim, d_model],
            # applied as: features = cls_token @ W_p^T
            try:
                W_p = self.model.visual_projection.weight.data  # [proj_dim, d_model]
                return W_p.t()  # Return as [d_model, proj_dim]
            except AttributeError:
                return None
        elif self.architecture == "dinov2":
            # DINOv2 has no projection matrix
            return None
        elif self.architecture == "vit":
            return None

    def _get_layerscale(self, layer_idx: int) -> Optional[torch.Tensor]:
        """Get the LayerScale parameter (DINOv2 specific)."""
        if self.architecture == "dinov2":
            layer = self._get_layer(layer_idx)
            try:
                return layer.layer_scale1.lambda1.data  # [d_model]
            except AttributeError:
                return None
        return None

    def compute_WVO(
        self,
        layer_idx: int,
        fold_ln: bool = True,
        project_ones: bool = True,
    ) -> torch.Tensor:
        """
        Compute the Value-Output (VO) weight matrix for all heads in a layer.

        Following Eq. (4) of the paper, MHA(X) = sum_h A^h @ X @ W_VO^h with
        W_VO^h = W_V^h @ W_O^h, where W_V^h is [D, d_h] and W_O^h is [d_h, D].

        In PyTorch, Linear(in, out) stores its weight as [out, in] and applies x @ W^T:
        - Head h's value weights are rows [h*d_h : (h+1)*d_h] of W_V.weight, so
          W_V_h is [d_h, D] and is applied as X @ W_V_h^T.
        - Head h's output weights are columns [h*d_h : (h+1)*d_h] of W_O.weight, so
          W_O_h is [D, d_h] and is applied as h_out_h @ W_O_h^T.

        The per-head map is therefore output_h = A_h @ X @ W_V_h^T @ W_O_h^T, and in
        this storage convention

            W_VO_h = W_V_h^T @ W_O_h^T = [D, d_h] @ [d_h, D] = [D, D]  (rank d_h),

        which matches the paper's W_V^h @ W_O^h up to the transposed weight layout.

        Returns:
            W_VO: [n_heads, d_model, d_model]
        """
        _, _, W_V = self._get_qkv_weights(layer_idx)
        W_O = self._get_output_weight(layer_idx)

        # Optionally fold the pre-attention LayerNorm weight into W_V.
        if fold_ln:
            ln_weight, ln_bias = self._get_pre_attn_layernorm(layer_idx)
            # LN acts element-wise on the input before attention:
            # (x * w) @ W_V^T = x @ (W_V * w)^T, so scale the input columns of W_V.
            W_V = W_V * ln_weight.unsqueeze(0)  # [d_model, d_model] * [1, d_model]

        # Fold LayerScale if present (DINOv2)
        ls = self._get_layerscale(layer_idx)
        if ls is not None:
            # LayerScale multiplies the attention output element-wise by ls,
            # so the effective output weight is diag(ls) @ W_O (scale the rows of W_O).
            W_O = W_O * ls.unsqueeze(1)  # [D, D] * [D, 1] = [D, D]

        # Split into per-head matrices
        # W_V: [d_model, d_model] -> W_V_h: [d_h, d_model] for head h
        W_V_per_head = W_V.view(self.n_heads, self.head_dim, self.d_model)  # [H, d_h, D]

        # W_O: [d_model, d_model] -> W_O_h: [d_model, d_h] for head h
        W_O_per_head = W_O.view(self.d_model, self.n_heads, self.head_dim)  # [D, H, d_h]
        W_O_per_head = W_O_per_head.permute(1, 0, 2)                        # [H, D, d_h]

        # W_VO_h = W_V_h^T @ W_O_h^T = [D, d_h] @ [d_h, D] = [D, D]
        W_VO = torch.bmm(
            W_V_per_head.transpose(1, 2),  # [H, D, d_h]
            W_O_per_head.transpose(1, 2),  # [H, d_h, D]
        )  # [H, D, D]

        # Project out the all-ones direction (the centering part of LN)
        if project_ones:
            for h in range(self.n_heads):
                W_VO[h] = project_out_ones(W_VO[h])

        return W_VO

    def svd_decompose(
        self,
        W_VO_h: torch.Tensor,
        top_k: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Decompose a per-head W_VO matrix via SVD.

        W_VO = U @ diag(sigma) @ V^T

        Args:
            W_VO_h: [d_model, d_model] VO matrix for a single head
            top_k: If set, only return the top-k singular vectors

        Returns:
            U: [d_model, r] left singular vectors (reading directions)
            sigma: [r] singular values
            Vt: [r, d_model] right singular vectors (writing directions)
        """
        U, sigma, Vt = torch.linalg.svd(W_VO_h, full_matrices=False)

        if top_k is not None:
            U = U[:, :top_k]
            sigma = sigma[:top_k]
            Vt = Vt[:top_k, :]

        return U, sigma, Vt

    def project_to_feature_space(
        self,
        vectors: torch.Tensor,
    ) -> torch.Tensor:
        """
        Project singular vectors from the residual stream to the model's output feature space.

        For CLIP: apply the final LN, then the W_p projection.
        For DINOv2/ViT: apply the final LN (there is no projection matrix).

        Args:
            vectors: [n, d_model] singular vectors in residual stream space

        Returns:
            projected: [n, d_out] vectors in the output feature space, L2-normalized
        """
        # Get the final LayerNorm
        ln_w, ln_b = self._get_final_layernorm()

        # Apply the LN affine transformation (without the data-dependent normalization):
        # these are abstract directions rather than activations, so only the affine part applies.
        vectors_ln = vectors * ln_w.unsqueeze(0) + ln_b.unsqueeze(0)

        # Apply the projection if present
        W_p = self._get_projection_matrix()
        if W_p is not None:
            # W_p is [d_model, proj_dim]
            vectors_proj = vectors_ln @ W_p  # [n, proj_dim]
        else:
            vectors_proj = vectors_ln

        # L2 normalize
        vectors_proj = torch.nn.functional.normalize(vectors_proj, dim=-1)

        return vectors_proj
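A minimal shape-check sketch for `WeightExtractor` (hypothetical, not part of this commit), again assuming a DINOv2-base backbone (12 heads, hidden size 768):

    from transformers import AutoModel
    from unimodal_sith.weight_extraction import WeightExtractor

    model = AutoModel.from_pretrained("facebook/dinov2-base")
    extractor = WeightExtractor(model, architecture="dinov2", n_heads=12, d_model=768)

    # Per-head VO matrices for the last layer, with LN and LayerScale folded in
    W_VO = extractor.compute_WVO(layer_idx=11, fold_ln=True, project_ones=True)
    print(W_VO.shape)  # torch.Size([12, 768, 768]); each head's matrix has rank <= 64

    # Top-5 singular values / directions of head 0
    U, sigma, Vt = extractor.svd_decompose(W_VO[0], top_k=5)
    print(sigma)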