File size: 7,989 Bytes

24e41d7

#!/usr/bin/env python3
"""
UniSITH Demo: Analyze a DINOv2 model using captioned images as concept pool.

This script demonstrates the full UniSITH pipeline:
1. Load a unimodal ViT model (DINOv2-large)
2. Build a visual concept pool from Recap-COCO-30K
3. Analyze attention heads via SVD + COMP
4. Display human-interpretable concept attributions

Usage:
    python run_unisith.py --model facebook/dinov2-large --max-concepts 1000
    python run_unisith.py --model openai/clip-vit-large-patch14 --architecture clip
"""

import argparse
import torch
import os
import sys
import json
from transformers import AutoModel, AutoProcessor, AutoImageProcessor
from transformers import CLIPModel, CLIPProcessor
from datasets import load_dataset

# Add parent dir to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from unimodal_sith.concept_pool import VisualConceptPool
from unimodal_sith.unisith import UniSITH


# Supported checkpoints. Each row below is
# (checkpoint name, architecture family, n_heads, d_model) and is expanded
# into the per-model settings dict that main() looks up by checkpoint name.
MODEL_CONFIGS = {
    checkpoint: {"architecture": family, "n_heads": heads, "d_model": width}
    for checkpoint, family, heads, width in (
        ("facebook/dinov2-large", "dinov2", 16, 1024),
        ("facebook/dinov2-base", "dinov2", 12, 768),
        ("facebook/dinov2-small", "dinov2", 6, 384),
        ("openai/clip-vit-large-patch14", "clip", 16, 1024),
        ("openai/clip-vit-base-patch16", "clip", 12, 768),
        ("google/vit-large-patch16-224", "vit", 16, 1024),
        ("google/vit-base-patch16-224", "vit", 12, 768),
    )
}


def load_model_and_processor(model_name: str, architecture: str):
    """Instantiate a pretrained model together with its input processor.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``.
        architecture: One of ``"clip"``, ``"dinov2"`` or ``"vit"``.

    Returns:
        A ``(model, processor)`` tuple. ``model.eval()`` has been called on
        the model before it is returned.

    Raises:
        ValueError: If ``architecture`` is not one of the supported values.
    """
    print(f"Loading model: {model_name}")

    # Reject unsupported families up front, before touching any loader class.
    if architecture not in ("clip", "dinov2", "vit"):
        raise ValueError(f"Unknown architecture: {architecture}")

    # CLIP uses its dedicated classes; DINOv2 and ViT share the Auto* loaders.
    if architecture == "clip":
        model = CLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)
    else:
        model = AutoModel.from_pretrained(model_name)
        processor = AutoImageProcessor.from_pretrained(model_name)

    model.eval()
    return model, processor


def build_concept_pool(
    model,
    processor,
    architecture: str,
    max_concepts: int = 1000,
    cache_path: str = None,
    device: str = "cpu",
):
    """Create the visual concept pool from the Recap-COCO-30K dataset.

    Loads the ``train`` split of ``UCSC-VLAA/Recap-COCO-30K`` and forwards it
    to ``VisualConceptPool.from_dataset`` with a fixed batch size of 32,
    reading the ``image``, ``caption`` and ``image_id`` columns.

    Args:
        model: Model forwarded to ``VisualConceptPool.from_dataset``.
        processor: Processor forwarded alongside the model.
        architecture: Architecture tag forwarded unchanged.
        max_concepts: Forwarded as ``max_concepts``; also echoed in the
            progress message.
        cache_path: Forwarded as ``cache_path`` (may be ``None``).
        device: Forwarded as ``device``.

    Returns:
        Whatever ``VisualConceptPool.from_dataset`` returns.
    """
    print(f"Building concept pool with {max_concepts} concepts...")

    recap_coco = load_dataset("UCSC-VLAA/Recap-COCO-30K", split="train")

    # Collect the keyword arguments first so the call site stays short.
    pool_kwargs = {
        "dataset": recap_coco,
        "model": model,
        "processor": processor,
        "architecture": architecture,
        "image_column": "image",
        "caption_column": "caption",
        "image_id_column": "image_id",
        "batch_size": 32,
        "max_concepts": max_concepts,
        "device": device,
        "cache_path": cache_path,
    }
    return VisualConceptPool.from_dataset(**pool_kwargs)


def print_results(results, max_sv=3, max_heads=4):
    """Write a human-readable report of the analysis to stdout.

    Layers are visited in ascending key order. Per layer at most
    ``max_heads`` heads are shown, and per head at most ``max_sv`` singular
    vectors, each followed by one line per (coefficient, concept) pair.

    Args:
        results: Mapping from layer index to a sliceable sequence of head
            results. Each head exposes ``head_idx`` and ``singular_vectors``;
            each singular vector exposes ``sv_idx``, ``singular_value``,
            ``fidelity``, ``concepts`` and ``coefficients``.
        max_sv: Upper bound on singular vectors printed per head.
        max_heads: Upper bound on heads printed per layer.
    """
    banner = "=" * 80
    divider = "─" * 80

    print(f"\n{banner}")
    print("UniSITH Analysis Results")
    print(banner)

    for layer_idx in sorted(results.keys()):
        layer_heads = results[layer_idx]

        print(f"\n{divider}")
        print(f"LAYER {layer_idx}")
        print(divider)

        for head_result in layer_heads[:max_heads]:
            print(f"\n  Head {head_result.head_idx}:")

            for vec in head_result.singular_vectors[:max_sv]:
                header = (
                    f"    SV {vec.sv_idx} (σ={vec.singular_value:.4f}, "
                    f"fidelity={vec.fidelity:.4f}):"
                )
                print(header)

                # Concepts and coefficients are parallel sequences.
                for label, weight in zip(vec.concepts, vec.coefficients):
                    print(f"      [{weight:.4f}] {label}")


def _infer_head_geometry(model, architecture):
    """Return ``(n_heads, d_model)`` read from the loaded model's config.

    Used only for checkpoints that are not listed in ``MODEL_CONFIGS``.

    Raises:
        ValueError: If the config does not expose both values.
    """
    hf_config = getattr(model, "config", None)
    if architecture == "clip":
        # CLIPModel keeps the image tower's settings under `vision_config`.
        hf_config = getattr(hf_config, "vision_config", hf_config)

    n_heads = getattr(hf_config, "num_attention_heads", None)
    d_model = getattr(hf_config, "hidden_size", None)
    if n_heads is None or d_model is None:
        raise ValueError(
            f"Could not read n_heads/d_model from the model config for "
            f"architecture {architecture!r}. Add the model to MODEL_CONFIGS."
        )
    return n_heads, d_model


def main():
    """Command-line entry point for the UniSITH demo.

    Steps, in order:
      1. Parse the CLI arguments.
      2. Resolve the architecture (from ``--architecture`` or, if omitted,
         from ``MODEL_CONFIGS``).
      3. Load the model/processor and move the model to the chosen device.
      4. Resolve ``n_heads``/``d_model`` from ``MODEL_CONFIGS`` or, for
         models not listed there, from the loaded model's config.
      5. Build the concept pool, run ``UniSITH.analyze_model``, print the
         results and save them to ``--output``.

    Raises:
        ValueError: If the model is not in ``MODEL_CONFIGS`` and no
            ``--architecture`` was given, or if head count / width cannot be
            determined for an unlisted model.
    """
    parser = argparse.ArgumentParser(description="UniSITH: Unimodal SITH Analysis")
    parser.add_argument(
        "--model", type=str, default="facebook/dinov2-base",
        help="Model name/path"
    )
    parser.add_argument(
        "--architecture", type=str, default=None,
        help="Architecture type (auto-detected from model name if not set)"
    )
    parser.add_argument(
        "--max-concepts", type=int, default=1000,
        help="Maximum concepts in the pool"
    )
    parser.add_argument(
        "--layers", type=int, nargs="+", default=None,
        help="Layers to analyze (default: last 4)"
    )
    parser.add_argument(
        "--n-sv", type=int, default=5,
        help="Number of singular vectors per head"
    )
    parser.add_argument(
        "--K", type=int, default=5,
        help="Concepts per singular vector"
    )
    parser.add_argument(
        "--lambda-coh", type=float, default=0.3,
        help="COMP coherence weight"
    )
    parser.add_argument(
        "--method", type=str, default="comp", choices=["comp", "top_k"],
        help="Concept attribution method"
    )
    parser.add_argument(
        "--device", type=str, default="cpu",
        help="Device (cpu/cuda)"
    )
    parser.add_argument(
        "--cache-dir", type=str, default="./cache",
        help="Cache directory for concept embeddings"
    )
    parser.add_argument(
        "--output", type=str, default="./results/unisith_results.json",
        help="Output JSON path"
    )

    args = parser.parse_args()

    # Single lookup; None means the checkpoint is not in the built-in table.
    known_config = MODEL_CONFIGS.get(args.model)

    # Auto-detect architecture. Only an unlisted model with no explicit
    # --architecture is an error; an explicit --architecture is honoured for
    # any model, as the error message promises.
    if args.architecture is None:
        if known_config is None:
            raise ValueError(
                f"Unknown model {args.model}. Specify --architecture manually or use "
                f"one of: {list(MODEL_CONFIGS.keys())}"
            )
        args.architecture = known_config["architecture"]

    device = args.device
    # startswith() so indexed devices such as "cuda:0" also fall back.
    if device.startswith("cuda") and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        device = "cpu"

    # Make sure the output directory exists before the expensive work starts,
    # so a bad path fails early rather than after the analysis has run.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    if args.cache_dir:
        os.makedirs(args.cache_dir, exist_ok=True)

    # Load model
    model, processor = load_model_and_processor(args.model, args.architecture)
    model = model.to(device)

    # Head count and width: table values for listed checkpoints, otherwise
    # read them from the loaded model's own config.
    if known_config is not None:
        n_heads = known_config["n_heads"]
        d_model = known_config["d_model"]
    else:
        n_heads, d_model = _infer_head_geometry(model, args.architecture)
        print(f"Inferred from model config: n_heads={n_heads}, d_model={d_model}")

    # Build concept pool
    cache_path = os.path.join(
        args.cache_dir,
        f"concept_pool_{args.model.replace('/', '_')}_{args.max_concepts}.pt"
    )

    pool = build_concept_pool(
        model=model,
        processor=processor,
        architecture=args.architecture,
        max_concepts=args.max_concepts,
        cache_path=cache_path,
        device=device,
    )

    print(f"Concept pool: {pool.num_concepts} concepts, dim={pool.embed_dim}")

    # Create UniSITH analyzer
    analyzer = UniSITH(
        model=model,
        architecture=args.architecture,
        n_heads=n_heads,
        d_model=d_model,
        concept_pool=pool,
        device=device,
    )

    # Run analysis
    results = analyzer.analyze_model(
        layers=args.layers,
        n_singular_vectors=args.n_sv,
        K=args.K,
        lambda_coh=args.lambda_coh,
        method=args.method,
    )

    # Print results
    print_results(results)

    # Save results
    UniSITH.save_results(results, args.output)

    print(f"\nDone! Results saved to {args.output}")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()