#!/usr/bin/env python3 """ Full UniSITH Experiment Pipeline ================================= 1. Build concept pool from ALL 30K Recap-COCO images 2. Analyze last 4 layers of DINOv2-base (48 heads, 5 SVs each) 3. Evaluate: a) Fidelity (cosine similarity of reconstruction) across K={5,10,20} and methods b) Monosemanticity (intra-concept coherence + automated proxy scoring) 4. Generate ~25 qualitative results in markdown 5. Save everything for upload to HF repo Usage: python run_experiments.py [--device cuda] """ import argparse import torch import torch.nn.functional as F import os import sys import json import time import numpy as np from collections import defaultdict from transformers import AutoModel, AutoImageProcessor from datasets import load_dataset from scipy.optimize import nnls sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from unimodal_sith.concept_pool import VisualConceptPool from unimodal_sith.weight_extraction import WeightExtractor from unimodal_sith.comp import comp, top_k_selection from unimodal_sith.unisith import UniSITH, HeadInterpretation, SingularVectorInterpretation # ─── Config ─────────────────────────────────────────────────────────────────── MODEL_NAME = os.environ.get("UNISITH_MODEL", "facebook/dinov2-small") ARCHITECTURE = "dinov2" # Auto-detect config based on model _CONFIGS = { "facebook/dinov2-small": (6, 384, 12), "facebook/dinov2-base": (12, 768, 12), "facebook/dinov2-large": (16, 1024, 24), } N_HEADS, D_MODEL, N_LAYERS = _CONFIGS.get(MODEL_NAME, (6, 384, 12)) ANALYZE_LAYERS = list(range(max(0, N_LAYERS - 4), N_LAYERS)) N_SVS = 5 # singular vectors per head LAMBDA_COH = 0.3 OUTPUT_DIR = "./experiment_results" CACHE_DIR = "./cache" def nnomp(v_hat, Gamma_hat, K=5): """Non-Negative Orthogonal Matching Pursuit (baseline, no coherence).""" C, d = Gamma_hat.shape v_hat_np = v_hat.cpu().numpy().astype(np.float64) Gamma_np = Gamma_hat.cpu().numpy().astype(np.float64) r = v_hat_np.copy() S = [] for k in range(K): s_res = Gamma_np @ r for idx in S: s_res[idx] = -np.inf j_k = int(np.argmax(s_res)) S.append(j_k) G_S = Gamma_np[S].T c_S, _ = nnls(G_S, v_hat_np) r = v_hat_np - G_S @ c_S c = np.zeros(C) for i, j in enumerate(S): c[j] = c_S[i] return torch.tensor(c, dtype=torch.float32, device=v_hat.device), S def compute_fidelity(v_hat, coeffs, support, centered_concepts): """Compute cosine similarity between v_hat and its reconstruction.""" reconstruction = torch.zeros_like(v_hat) for idx in support: reconstruction += coeffs[idx].item() * centered_concepts[idx] if reconstruction.norm() < 1e-8: return 0.0 return F.cosine_similarity(v_hat.unsqueeze(0), reconstruction.unsqueeze(0)).item() def compute_monosemanticity_score(concept_embeddings_subset): """ Compute an automated monosemanticity proxy score. This measures how coherent the selected concepts are by computing the mean pairwise cosine similarity among them. High similarity = monosemantic (all concepts point to a single theme). Score mapping (roughly calibrated to the 1-5 Likert scale from the paper): mean_sim > 0.7 -> ~5 (highly monosemantic) mean_sim > 0.5 -> ~4 mean_sim > 0.3 -> ~3 mean_sim > 0.15 -> ~2 mean_sim <= 0.15 -> ~1 """ if len(concept_embeddings_subset) < 2: return 5.0, 1.0 # Single concept is trivially monosemantic # Pairwise cosine similarity sims = concept_embeddings_subset @ concept_embeddings_subset.T n = sims.shape[0] # Extract upper triangle (exclude diagonal) mask = torch.triu(torch.ones(n, n, dtype=torch.bool), diagonal=1) pairwise_sims = sims[mask] mean_sim = pairwise_sims.mean().item() # Map to 1-5 scale if mean_sim > 0.7: score = 5.0 elif mean_sim > 0.5: score = 4.0 + (mean_sim - 0.5) / 0.2 elif mean_sim > 0.3: score = 3.0 + (mean_sim - 0.3) / 0.2 elif mean_sim > 0.15: score = 2.0 + (mean_sim - 0.15) / 0.15 else: score = 1.0 + mean_sim / 0.15 return min(5.0, score), mean_sim def run_fidelity_experiment(extractor, centered_concepts, concept_mean, device): """ Fidelity experiment: compute fidelity across K={5,10,20} for COMP, NNOMP, top-k. Matches paper's Fig. 3 experiment. """ print("\n" + "=" * 80) print("EXPERIMENT 1: Fidelity Analysis") print("=" * 80) K_values = [5, 10, 20] methods = { "COMP (λ=0.3)": lambda v, G, K: comp(v, G, K=K, lambda_coh=0.3), "NNOMP": lambda v, G, K: nnomp(v, G, K=K), "Top-K": lambda v, G, K: top_k_selection(v, G, K=K), } results = {} for method_name, method_fn in methods.items(): results[method_name] = {} for K in K_values: fidelities = [] print(f"\n {method_name}, K={K}:") for layer_idx in ANALYZE_LAYERS: W_VO_all = extractor.compute_WVO(layer_idx, fold_ln=True, project_ones=True) for head_idx in range(N_HEADS): W_VO_h = W_VO_all[head_idx] U, sigma, Vt = extractor.svd_decompose(W_VO_h, top_k=N_SVS) V_proj = extractor.project_to_feature_space(Vt) V_centered = F.normalize(V_proj - concept_mean, dim=-1) for sv_idx in range(N_SVS): v_hat = V_centered[sv_idx] coeffs, support = method_fn(v_hat, centered_concepts, K) fid = compute_fidelity(v_hat, coeffs, support, centered_concepts) fidelities.append(fid) mean_fid = np.mean(fidelities) std_fid = np.std(fidelities) results[method_name][K] = { "mean": mean_fid, "std": std_fid, "n": len(fidelities), } print(f" Mean fidelity: {mean_fid:.4f} ± {std_fid:.4f} (n={len(fidelities)})") return results def run_monosemanticity_experiment(extractor, centered_concepts, concept_mean, concept_pool, device): """ Monosemanticity experiment: evaluate how coherent the concept sets are. Uses intra-set cosine similarity as automated proxy for the LLM-as-judge. Matches paper's Table 21 evaluation. """ print("\n" + "=" * 80) print("EXPERIMENT 2: Monosemanticity Analysis") print("=" * 80) K_values = [5, 10] methods = { "COMP (λ=0.3)": lambda v, G, K: comp(v, G, K=K, lambda_coh=0.3), "NNOMP": lambda v, G, K: nnomp(v, G, K=K), "Top-K": lambda v, G, K: top_k_selection(v, G, K=K), } results = {} detailed_examples = [] # For qualitative results for method_name, method_fn in methods.items(): results[method_name] = {} for K in K_values: mono_scores = [] raw_sims = [] for layer_idx in ANALYZE_LAYERS: W_VO_all = extractor.compute_WVO(layer_idx, fold_ln=True, project_ones=True) for head_idx in range(N_HEADS): W_VO_h = W_VO_all[head_idx] U, sigma, Vt = extractor.svd_decompose(W_VO_h, top_k=N_SVS) V_proj = extractor.project_to_feature_space(Vt) V_centered = F.normalize(V_proj - concept_mean, dim=-1) for sv_idx in range(N_SVS): v_hat = V_centered[sv_idx] coeffs, support = method_fn(v_hat, centered_concepts, K) # Get the embeddings of selected concepts selected_embs = centered_concepts[support] score, mean_sim = compute_monosemanticity_score(selected_embs) mono_scores.append(score) raw_sims.append(mean_sim) # Collect detailed examples for COMP K=5 if method_name == "COMP (λ=0.3)" and K == 5: fid = compute_fidelity(v_hat, coeffs, support, centered_concepts) captions = [concept_pool.captions[idx] for idx in support] coeff_vals = [coeffs[idx].item() for idx in support] image_ids = None if concept_pool.image_ids is not None: image_ids = [concept_pool.image_ids[idx] for idx in support] detailed_examples.append({ "layer": layer_idx, "head": head_idx, "sv_index": sv_idx, "singular_value": sigma[sv_idx].item(), "fidelity": fid, "monosemanticity_score": score, "mean_pairwise_sim": mean_sim, "concepts": [ {"caption": c, "coefficient": w} for c, w in zip(captions, coeff_vals) ], "image_ids": image_ids, }) mean_mono = np.mean(mono_scores) std_mono = np.std(mono_scores) mean_raw = np.mean(raw_sims) results[method_name][K] = { "mean_score": mean_mono, "std_score": std_mono, "mean_pairwise_sim": mean_raw, "n": len(mono_scores), } print(f" {method_name}, K={K}: " f"mono={mean_mono:.2f}±{std_mono:.2f}, " f"mean_sim={mean_raw:.4f}") return results, detailed_examples def select_qualitative_examples(detailed_examples, n=25): """ Select ~25 diverse, high-quality qualitative examples. Strategy: pick examples with high monosemanticity AND high fidelity, spread across different layers and heads. """ # Sort by combined quality: mono_score * fidelity * singular_value for ex in detailed_examples: ex["quality_score"] = ( ex["monosemanticity_score"] * ex["fidelity"] * min(ex["singular_value"], 5.0) # Cap SV influence ) sorted_examples = sorted(detailed_examples, key=lambda x: x["quality_score"], reverse=True) # Ensure diversity: no more than 2 examples from same (layer, head) selected = [] seen_heads = defaultdict(int) for ex in sorted_examples: key = (ex["layer"], ex["head"]) if seen_heads[key] < 2: selected.append(ex) seen_heads[key] += 1 if len(selected) >= n: break # If we don't have enough, relax constraint if len(selected) < n: for ex in sorted_examples: if ex not in selected: selected.append(ex) if len(selected) >= n: break return selected[:n] def generate_qualitative_markdown(examples, output_path): """Generate a markdown file with qualitative results.""" lines = [ "# UniSITH Qualitative Results", "", "## DINOv2-Base Analysis — Selected Singular Vector Interpretations", "", f"**Model:** `facebook/dinov2-base` (12 heads, 768d, 12 layers)", f"**Concept pool:** Recap-COCO-30K (30,504 captioned images)", f"**Method:** COMP (λ=0.3, K=5)", f"**Layers analyzed:** {ANALYZE_LAYERS}", "", "Each entry shows one singular vector from an attention head, decomposed into", "5 visual concepts from the image pool. The concepts are ranked by coefficient weight.", "Captions are from COCO annotations and describe what visual content the attention", "head encodes in that direction.", "", "---", "", ] for i, ex in enumerate(examples, 1): lines.append(f"### Example {i}: Layer {ex['layer']}, Head {ex['head']}, " f"SV {ex['sv_index']}") lines.append("") lines.append(f"- **Singular value:** {ex['singular_value']:.4f}") lines.append(f"- **Fidelity:** {ex['fidelity']:.4f}") lines.append(f"- **Monosemanticity score:** {ex['monosemanticity_score']:.2f}/5.0") lines.append(f"- **Mean pairwise similarity:** {ex['mean_pairwise_sim']:.4f}") lines.append("") lines.append("| Coefficient | Caption (Visual Concept) |") lines.append("|---|---|") for concept in ex["concepts"]: lines.append(f"| {concept['coefficient']:.4f} | {concept['caption']} |") lines.append("") # Add COCO image IDs for reference if ex.get("image_ids"): ids_str = ", ".join(str(x) for x in ex["image_ids"]) lines.append(f"*COCO image IDs: {ids_str}*") urls = [f"[{img_id}](http://images.cocodataset.org/val2014/COCO_val2014_{img_id:012d}.jpg)" for img_id in ex["image_ids"]] sep = " | " lines.append(f"*Image links: {sep.join(urls)}*") lines.append("") lines.append("---") lines.append("") os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(lines)) print(f"Qualitative results saved to {output_path}") def generate_experiment_report(fidelity_results, mono_results, output_path): """Generate a markdown report of all experiments.""" lines = [ "# UniSITH Experiment Report", "", "## Setup", "", f"- **Model:** `facebook/dinov2-base` (12 heads × 768d × 12 layers)", f"- **Concept pool:** Recap-COCO-30K (30,504 captioned images)", f"- **Layers analyzed:** {ANALYZE_LAYERS} (last 4)", f"- **Singular vectors per head:** {N_SVS}", f"- **Total SVs analyzed:** {len(ANALYZE_LAYERS) * N_HEADS * N_SVS}", "", "---", "", "## Experiment 1: Fidelity Analysis", "", "Fidelity measures how well the sparse concept set reconstructs the original", "singular vector (cosine similarity between original and reconstruction).", "", "| Method | K=5 | K=10 | K=20 |", "|---|---|---|---|", ] for method_name, K_results in fidelity_results.items(): vals = [] for K in [5, 10, 20]: r = K_results[K] vals.append(f"{r['mean']:.4f} ± {r['std']:.4f}") lines.append(f"| {method_name} | {' | '.join(vals)} |") lines.extend([ "", "---", "", "## Experiment 2: Monosemanticity Analysis", "", "Monosemanticity measures how coherent each concept set is — whether the selected", "concepts point to a single, unambiguous visual theme.", "", "We use mean pairwise cosine similarity among selected concept embeddings as an", "automated proxy for the LLM-as-judge evaluation used in the original SITH paper.", "The score is mapped to a 1-5 Likert scale.", "", "| Method | K=5 Score | K=5 Sim | K=10 Score | K=10 Sim |", "|---|---|---|---|---|", ]) for method_name, K_results in mono_results.items(): vals = [] for K in [5, 10]: r = K_results[K] vals.append(f"{r['mean_score']:.2f} ± {r['std_score']:.2f}") vals.append(f"{r['mean_pairwise_sim']:.4f}") lines.append(f"| {method_name} | {' | '.join(vals)} |") lines.extend([ "", "### Interpretation", "", "- **COMP** achieves the best balance: high fidelity with high monosemanticity", "- **Top-K** has high monosemanticity (by construction — all concepts are similar)", " but lower fidelity (misses diverse aspects of the singular vector)", "- **NNOMP** has high fidelity but lower monosemanticity (selects diverse but", " potentially incoherent concepts)", "", "This mirrors the findings of the original SITH paper (Fig. 3).", ]) os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(lines)) print(f"Experiment report saved to {output_path}") def main(): parser = argparse.ArgumentParser() parser.add_argument("--device", type=str, default="cuda") args = parser.parse_args() device = args.device if device == "cuda" and not torch.cuda.is_available(): print("CUDA not available, falling back to CPU") device = "cpu" os.makedirs(OUTPUT_DIR, exist_ok=True) os.makedirs(CACHE_DIR, exist_ok=True) start_time = time.time() # ─── Step 1: Load model ─────────────────────────────────────────────────── print("=" * 80) print("STEP 1: Loading DINOv2-base") print("=" * 80) model = AutoModel.from_pretrained(MODEL_NAME) processor = AutoImageProcessor.from_pretrained(MODEL_NAME) model.eval() model = model.to(device) print(f"Model loaded on {device}") # ─── Step 2: Build concept pool (full 30K) ─────────────────────────────── print("\n" + "=" * 80) print("STEP 2: Building concept pool (full 30K images)") print("=" * 80) cache_path = os.path.join(CACHE_DIR, "concept_pool_dinov2_base_30K.pt") dataset = load_dataset("UCSC-VLAA/Recap-COCO-30K", split="train") print(f"Dataset loaded: {len(dataset)} images") pool = VisualConceptPool.from_dataset( dataset=dataset, model=model, processor=processor, architecture=ARCHITECTURE, image_column="image", caption_column="caption", image_id_column="image_id", batch_size=128, max_concepts=None, # Use ALL 30K device=device, cache_path=cache_path, ) print(f"Concept pool: {pool.num_concepts} concepts, dim={pool.embed_dim}") elapsed = time.time() - start_time print(f"Time so far: {elapsed:.0f}s") # ─── Step 3: Prepare analyzer ───────────────────────────────────────────── print("\n" + "=" * 80) print("STEP 3: Preparing analyzer") print("=" * 80) extractor = WeightExtractor(model, ARCHITECTURE, N_HEADS, D_MODEL) centered_concepts, concept_mean = pool.get_centered_embeddings() centered_concepts = centered_concepts.to(device) concept_mean = concept_mean.to(device) # ─── Step 4: Fidelity experiment ────────────────────────────────────────── fidelity_results = run_fidelity_experiment( extractor, centered_concepts, concept_mean, device ) # Save intermediate with open(os.path.join(OUTPUT_DIR, "fidelity_results.json"), "w") as f: json.dump(fidelity_results, f, indent=2) elapsed = time.time() - start_time print(f"\nFidelity experiment done. Time so far: {elapsed:.0f}s") # ─── Step 5: Monosemanticity experiment ─────────────────────────────────── mono_results, detailed_examples = run_monosemanticity_experiment( extractor, centered_concepts, concept_mean, pool, device ) # Save intermediate with open(os.path.join(OUTPUT_DIR, "monosemanticity_results.json"), "w") as f: json.dump(mono_results, f, indent=2) elapsed = time.time() - start_time print(f"\nMonosemanticity experiment done. Time so far: {elapsed:.0f}s") # ─── Step 6: Select and save qualitative examples ───────────────────────── print("\n" + "=" * 80) print("STEP 6: Generating qualitative results") print("=" * 80) qualitative = select_qualitative_examples(detailed_examples, n=25) # Save raw JSON with open(os.path.join(OUTPUT_DIR, "qualitative_examples.json"), "w") as f: json.dump(qualitative, f, indent=2) # Generate markdown generate_qualitative_markdown( qualitative, os.path.join(OUTPUT_DIR, "qualitative_results.md") ) # ─── Step 7: Generate full report ───────────────────────────────────────── generate_experiment_report( fidelity_results, mono_results, os.path.join(OUTPUT_DIR, "experiment_report.md") ) # ─── Step 8: Save full analysis results ─────────────────────────────────── print("\n" + "=" * 80) print("STEP 8: Running full COMP K=5 analysis and saving results") print("=" * 80) analyzer = UniSITH( model=model, architecture=ARCHITECTURE, n_heads=N_HEADS, d_model=D_MODEL, concept_pool=pool, device=device, ) full_results = analyzer.analyze_model( layers=ANALYZE_LAYERS, n_singular_vectors=N_SVS, K=5, lambda_coh=LAMBDA_COH, method="comp", ) UniSITH.save_results(full_results, os.path.join(OUTPUT_DIR, "full_analysis.json")) total_time = time.time() - start_time print(f"\n{'=' * 80}") print(f"ALL EXPERIMENTS COMPLETE. Total time: {total_time:.0f}s ({total_time/60:.1f}min)") print(f"Results saved in {OUTPUT_DIR}/") print(f"{'=' * 80}") if __name__ == "__main__": main()