Add CLI runner script
run_unisith.py (ADDED, +270 −0)
#!/usr/bin/env python3
"""
UniSITH Demo: Analyze a unimodal ViT (e.g., DINOv2) using captioned images
as the concept pool.

This script demonstrates the full UniSITH pipeline:
1. Load a vision transformer (default: facebook/dinov2-base)
2. Build a visual concept pool from Recap-COCO-30K
3. Analyze attention heads via SVD + COMP
4. Display human-interpretable concept attributions

Usage:
    python run_unisith.py --model facebook/dinov2-large --max-concepts 1000
    python run_unisith.py --model openai/clip-vit-large-patch14 --architecture clip
"""

import argparse
import os
import sys

import torch
from datasets import load_dataset
from transformers import AutoImageProcessor, AutoModel, CLIPModel, CLIPProcessor

# Make the unimodal_sith package importable when running from this directory
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from unimodal_sith.concept_pool import VisualConceptPool
from unimodal_sith.unisith import UniSITH


# Architecture type and attention dimensions for each supported checkpoint
MODEL_CONFIGS = {
    "facebook/dinov2-large": {
        "architecture": "dinov2",
        "n_heads": 16,
        "d_model": 1024,
    },
    "facebook/dinov2-base": {
        "architecture": "dinov2",
        "n_heads": 12,
        "d_model": 768,
    },
    "facebook/dinov2-small": {
        "architecture": "dinov2",
        "n_heads": 6,
        "d_model": 384,
    },
    "openai/clip-vit-large-patch14": {
        "architecture": "clip",
        "n_heads": 16,
        "d_model": 1024,
    },
    "openai/clip-vit-base-patch16": {
        "architecture": "clip",
        "n_heads": 12,
        "d_model": 768,
    },
    "google/vit-large-patch16-224": {
        "architecture": "vit",
        "n_heads": 16,
        "d_model": 1024,
    },
    "google/vit-base-patch16-224": {
        "architecture": "vit",
        "n_heads": 12,
        "d_model": 768,
    },
}
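# To analyze a checkpoint not listed above, add an entry with its attention
# layout. Hypothetical example (values must match the checkpoint's actual
# hidden_size and num_attention_heads):
# MODEL_CONFIGS["facebook/dinov2-giant"] = {
#     "architecture": "dinov2",
#     "n_heads": 24,
#     "d_model": 1536,
# }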


def load_model_and_processor(model_name: str, architecture: str):
    """Load model and processor based on architecture type."""
    print(f"Loading model: {model_name}")

    if architecture == "clip":
        model = CLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)
    elif architecture in ("dinov2", "vit"):
        model = AutoModel.from_pretrained(model_name)
        processor = AutoImageProcessor.from_pretrained(model_name)
    else:
        raise ValueError(f"Unknown architecture: {architecture}")

    model.eval()
    return model, processor


def build_concept_pool(
    model,
    processor,
    architecture: str,
    max_concepts: int = 1000,
    cache_path: str = None,
    device: str = "cpu",
):
    """Build a visual concept pool from Recap-COCO-30K."""
    print(f"Building concept pool with {max_concepts} concepts...")

    # Load dataset
    dataset = load_dataset("UCSC-VLAA/Recap-COCO-30K", split="train")

    pool = VisualConceptPool.from_dataset(
        dataset=dataset,
        model=model,
        processor=processor,
        architecture=architecture,
        image_column="image",
        caption_column="caption",  # Short COCO captions for readability
        image_id_column="image_id",
        batch_size=32,
        max_concepts=max_concepts,
        device=device,
        cache_path=cache_path,
    )

    return pool


def print_results(results, max_sv=3, max_heads=4):
    """Pretty-print analysis results."""
    print("\n" + "=" * 80)
    print("UniSITH Analysis Results")
    print("=" * 80)

    for layer_idx in sorted(results.keys()):
        heads = results[layer_idx]
        print(f"\n{'─' * 80}")
        print(f"LAYER {layer_idx}")
        print(f"{'─' * 80}")

        for head in heads[:max_heads]:
            print(f"\n  Head {head.head_idx}:")
            for sv in head.singular_vectors[:max_sv]:
                print(f"    SV {sv.sv_idx} (σ={sv.singular_value:.4f}, "
                      f"fidelity={sv.fidelity:.4f}):")
                for caption, coeff in zip(sv.concepts, sv.coefficients):
                    print(f"      [{coeff:.4f}] {caption}")


def main():
    parser = argparse.ArgumentParser(description="UniSITH: Unimodal SITH Analysis")
    parser.add_argument(
        "--model", type=str, default="facebook/dinov2-base",
        help="Model name/path"
    )
    parser.add_argument(
        "--architecture", type=str, default=None,
        help="Architecture type (auto-detected from model name if not set)"
    )
    parser.add_argument(
        "--max-concepts", type=int, default=1000,
        help="Maximum concepts in the pool"
    )
    parser.add_argument(
        "--layers", type=int, nargs="+", default=None,
        help="Layers to analyze (default: last 4)"
    )
    parser.add_argument(
        "--n-sv", type=int, default=5,
        help="Number of singular vectors per head"
    )
    parser.add_argument(
        "--K", type=int, default=5,
        help="Concepts per singular vector"
    )
    parser.add_argument(
        "--lambda-coh", type=float, default=0.3,
        help="COMP coherence weight"
    )
    parser.add_argument(
        "--method", type=str, default="comp", choices=["comp", "top_k"],
        help="Concept attribution method"
    )
    parser.add_argument(
        "--device", type=str, default="cpu",
        help="Device (cpu/cuda)"
    )
    parser.add_argument(
        "--cache-dir", type=str, default="./cache",
        help="Cache directory for concept embeddings"
    )
    parser.add_argument(
        "--output", type=str, default="./results/unisith_results.json",
        help="Output JSON path"
    )

    args = parser.parse_args()

    # Resolve architecture and model dimensions. n_heads/d_model always come
    # from MODEL_CONFIGS, so the model must be listed there either way.
    if args.model not in MODEL_CONFIGS:
        raise ValueError(
            f"Unknown model {args.model}. Add it to MODEL_CONFIGS or use "
            f"one of: {list(MODEL_CONFIGS.keys())}"
        )
    config = MODEL_CONFIGS[args.model]
    if args.architecture is None:
        args.architecture = config["architecture"]
    n_heads = config["n_heads"]
    d_model = config["d_model"]

    device = args.device
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        device = "cpu"

    # Load model
    model, processor = load_model_and_processor(args.model, args.architecture)
    model = model.to(device)

    # Build concept pool (cached per model and pool size)
    os.makedirs(args.cache_dir, exist_ok=True)
    cache_path = os.path.join(
        args.cache_dir,
        f"concept_pool_{args.model.replace('/', '_')}_{args.max_concepts}.pt"
    )

    pool = build_concept_pool(
        model=model,
        processor=processor,
        architecture=args.architecture,
        max_concepts=args.max_concepts,
        cache_path=cache_path,
        device=device,
    )

    print(f"Concept pool: {pool.num_concepts} concepts, dim={pool.embed_dim}")

    # Create UniSITH analyzer
    analyzer = UniSITH(
        model=model,
        architecture=args.architecture,
        n_heads=n_heads,
        d_model=d_model,
        concept_pool=pool,
        device=device,
    )

    # Run analysis
    results = analyzer.analyze_model(
        layers=args.layers,
        n_singular_vectors=args.n_sv,
        K=args.K,
        lambda_coh=args.lambda_coh,
        method=args.method,
    )

    # Print results
    print_results(results)

    # Save results, making sure the output directory exists
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
    UniSITH.save_results(results, args.output)

    print(f"\nDone! Results saved to {args.output}")


if __name__ == "__main__":
    main()
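
The same pipeline can also be driven from another script instead of the CLI. The sketch below only reuses the functions defined above with the call signatures shown there; the argument values are illustrative, and it assumes run_unisith.py is importable from the working directory.

from run_unisith import load_model_and_processor, build_concept_pool, print_results
from unimodal_sith.unisith import UniSITH

# A small run for a quick look: base DINOv2, 200-concept pool, CPU
model, processor = load_model_and_processor("facebook/dinov2-base", "dinov2")
pool = build_concept_pool(model, processor, architecture="dinov2",
                          max_concepts=200, device="cpu")

analyzer = UniSITH(model=model, architecture="dinov2", n_heads=12, d_model=768,
                   concept_pool=pool, device="cpu")
results = analyzer.analyze_model(
    layers=None,             # default: last 4 layers, per the --layers help text
    n_singular_vectors=3, K=5, lambda_coh=0.3, method="comp",
)
print_results(results, max_sv=2, max_heads=2)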