"""
UniSITH Demo: Analyze a DINOv2 model using captioned images as concept pool.

This script demonstrates the full UniSITH pipeline:
1. Load a unimodal ViT model (DINOv2-large)
2. Build a visual concept pool from Recap-COCO-30K
3. Analyze attention heads via SVD + COMP
4. Display human-interpretable concept attributions

Usage:
    python run_unisith.py --model facebook/dinov2-large --max-concepts 1000
    python run_unisith.py --model openai/clip-vit-large-patch14 --architecture clip
"""
|
|
| import argparse |
| import torch |
| import os |
| import sys |
| import json |
| from transformers import AutoModel, AutoProcessor, AutoImageProcessor |
| from transformers import CLIPModel, CLIPProcessor |
| from datasets import load_dataset |
|
|
| |
| sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
|
|
| from unimodal_sith.concept_pool import VisualConceptPool |
| from unimodal_sith.unisith import UniSITH |
|
|
|
|
| |
# Known checkpoints and their transformer geometry, recorded compactly as
# (architecture, n_heads, d_model) triples.
_KNOWN_MODELS = {
    "facebook/dinov2-large": ("dinov2", 16, 1024),
    "facebook/dinov2-base": ("dinov2", 12, 768),
    "facebook/dinov2-small": ("dinov2", 6, 384),
    "openai/clip-vit-large-patch14": ("clip", 16, 1024),
    "openai/clip-vit-base-patch16": ("clip", 12, 768),
    "google/vit-large-patch16-224": ("vit", 16, 1024),
    "google/vit-base-patch16-224": ("vit", 12, 768),
}

# Expanded registry consumed by the rest of the script: model name ->
# {"architecture": ..., "n_heads": ..., "d_model": ...}.
MODEL_CONFIGS = {
    name: {"architecture": arch, "n_heads": heads, "d_model": dim}
    for name, (arch, heads, dim) in _KNOWN_MODELS.items()
}
|
|
|
|
def load_model_and_processor(model_name: str, architecture: str):
    """Load a pretrained model and its matching processor.

    Args:
        model_name: HuggingFace hub name or local path of the checkpoint.
        architecture: One of "clip", "dinov2", or "vit".

    Returns:
        Tuple of (model, processor), with the model switched to eval mode.

    Raises:
        ValueError: If `architecture` is not a supported type.
    """
    print(f"Loading model: {model_name}")

    if architecture == "clip":
        # CLIP ships a multimodal processor (tokenizer + image transform).
        model = CLIPModel.from_pretrained(model_name)
        processor = CLIPProcessor.from_pretrained(model_name)
    elif architecture in ("dinov2", "vit"):
        # Pure vision backbones share the same loading path; the two
        # previously-duplicated branches are merged here.
        model = AutoModel.from_pretrained(model_name)
        processor = AutoImageProcessor.from_pretrained(model_name)
    else:
        raise ValueError(f"Unknown architecture: {architecture}")

    model.eval()
    return model, processor
|
|
|
|
def build_concept_pool(
    model,
    processor,
    architecture: str,
    max_concepts: int = 1000,
    cache_path: "str | None" = None,  # was `str = None`: implicit Optional (PEP 484)
    device: str = "cpu",
):
    """Build a visual concept pool from Recap-COCO-30K.

    Args:
        model: The loaded vision backbone used to embed concept images.
        processor: The matching image processor for `model`.
        architecture: Architecture tag ("clip", "dinov2", or "vit").
        max_concepts: Maximum number of concepts to keep in the pool.
        cache_path: Optional path for cached embeddings; its parent directory
            is created if missing (presumably VisualConceptPool writes the
            cache there — confirm against its implementation).
        device: Device string to run embedding on ("cpu"/"cuda").

    Returns:
        A populated `VisualConceptPool`.
    """
    print(f"Building concept pool with {max_concepts} concepts...")

    # Make sure the cache location is writable before the (expensive) build.
    if cache_path:
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

    dataset = load_dataset("UCSC-VLAA/Recap-COCO-30K", split="train")

    pool = VisualConceptPool.from_dataset(
        dataset=dataset,
        model=model,
        processor=processor,
        architecture=architecture,
        image_column="image",
        caption_column="caption",
        image_id_column="image_id",
        batch_size=32,
        max_concepts=max_concepts,
        device=device,
        cache_path=cache_path,
    )

    return pool
|
|
|
|
def print_results(results, max_sv=3, max_heads=4):
    """Render analysis results as a console report.

    Args:
        results: Mapping of layer index -> list of per-head result objects
            (each exposing `head_idx` and `singular_vectors`).
        max_sv: Singular vectors to show per head.
        max_heads: Heads to show per layer.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("UniSITH Analysis Results")
    print(banner)

    rule = "─" * 80
    for layer_idx in sorted(results):
        print(f"\n{rule}")
        print(f"LAYER {layer_idx}")
        print(f"{rule}")

        for head in results[layer_idx][:max_heads]:
            print(f"\n Head {head.head_idx}:")
            for sv in head.singular_vectors[:max_sv]:
                header = (
                    f" SV {sv.sv_idx} (σ={sv.singular_value:.4f}, "
                    f"fidelity={sv.fidelity:.4f}):"
                )
                print(header)
                for caption, coeff in zip(sv.concepts, sv.coefficients):
                    print(f" [{coeff:.4f}] {caption}")
|
|
|
|
def _resolve_model_config(args):
    """Resolve (architecture, n_heads, d_model) for the requested model.

    The MODEL_CONFIGS registry is consulted first; explicit CLI values
    (--architecture/--n-heads/--d-model) override or, for unknown models,
    fully replace the registry entry.

    Raises:
        ValueError: If the model is unknown and the CLI overrides are
            incomplete.
    """
    config = MODEL_CONFIGS.get(args.model)
    if config is not None:
        return (
            args.architecture or config["architecture"],
            args.n_heads if args.n_heads is not None else config["n_heads"],
            args.d_model if args.d_model is not None else config["d_model"],
        )
    if args.architecture and args.n_heads is not None and args.d_model is not None:
        return args.architecture, args.n_heads, args.d_model
    raise ValueError(
        f"Unknown model {args.model}. Specify --architecture, --n-heads and "
        f"--d-model manually or use one of: {list(MODEL_CONFIGS.keys())}"
    )


def main():
    """CLI entry point: parse args, build the concept pool, run UniSITH."""
    parser = argparse.ArgumentParser(description="UniSITH: Unimodal SITH Analysis")
    parser.add_argument(
        "--model", type=str, default="facebook/dinov2-base",
        help="Model name/path"
    )
    parser.add_argument(
        "--architecture", type=str, default=None,
        help="Architecture type (auto-detected from model name if not set)"
    )
    parser.add_argument(
        "--n-heads", type=int, default=None,
        help="Attention heads per layer (only needed for models not in MODEL_CONFIGS)"
    )
    parser.add_argument(
        "--d-model", type=int, default=None,
        help="Hidden dimension (only needed for models not in MODEL_CONFIGS)"
    )
    parser.add_argument(
        "--max-concepts", type=int, default=1000,
        help="Maximum concepts in the pool"
    )
    parser.add_argument(
        "--layers", type=int, nargs="+", default=None,
        help="Layers to analyze (default: last 4)"
    )
    parser.add_argument(
        "--n-sv", type=int, default=5,
        help="Number of singular vectors per head"
    )
    parser.add_argument(
        "--K", type=int, default=5,
        help="Concepts per singular vector"
    )
    parser.add_argument(
        "--lambda-coh", type=float, default=0.3,
        help="COMP coherence weight"
    )
    parser.add_argument(
        "--method", type=str, default="comp", choices=["comp", "top_k"],
        help="Concept attribution method"
    )
    parser.add_argument(
        "--device", type=str, default="cpu",
        help="Device (cpu/cuda)"
    )
    parser.add_argument(
        "--cache-dir", type=str, default="./cache",
        help="Cache directory for concept embeddings"
    )
    parser.add_argument(
        "--output", type=str, default="./results/unisith_results.json",
        help="Output JSON path"
    )

    args = parser.parse_args()

    # Single resolution point replaces the previously duplicated
    # registry-lookup branches.
    architecture, n_heads, d_model = _resolve_model_config(args)

    device = args.device
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        device = "cpu"

    model, processor = load_model_and_processor(args.model, architecture)
    model = model.to(device)

    # Ensure the cache directory exists before the pool tries to use it.
    os.makedirs(args.cache_dir, exist_ok=True)
    cache_path = os.path.join(
        args.cache_dir,
        f"concept_pool_{args.model.replace('/', '_')}_{args.max_concepts}.pt"
    )

    pool = build_concept_pool(
        model=model,
        processor=processor,
        architecture=architecture,
        max_concepts=args.max_concepts,
        cache_path=cache_path,
        device=device,
    )

    print(f"Concept pool: {pool.num_concepts} concepts, dim={pool.embed_dim}")

    analyzer = UniSITH(
        model=model,
        architecture=architecture,
        n_heads=n_heads,
        d_model=d_model,
        concept_pool=pool,
        device=device,
    )

    results = analyzer.analyze_model(
        layers=args.layers,
        n_singular_vectors=args.n_sv,
        K=args.K,
        lambda_coh=args.lambda_coh,
        method=args.method,
    )

    print_results(results)

    # Create the output directory so save_results doesn't fail on a fresh
    # checkout (default is ./results/, which may not exist yet).
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    UniSITH.save_results(results, args.output)

    print(f"\nDone! Results saved to {args.output}")
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|