Add UniSITH source code
Browse files- unimodal_sith/__init__.py +32 -0
unimodal_sith/__init__.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UniSITH: Unimodal Semantic Inspection of Transformer Heads
|
| 3 |
+
|
| 4 |
+
A framework for interpreting unimodal vision transformer models by decomposing
|
| 5 |
+
attention head weights and attributing visual concepts from a captioned image pool.
|
| 6 |
+
|
| 7 |
+
Adapted from SITH (Vaquero et al., 2025): "From Weights to Concepts: Data-Free
|
| 8 |
+
Interpretability of CLIP via Singular Vector Decomposition" (arXiv:2603.24653)
|
| 9 |
+
|
| 10 |
+
Key differences from the original SITH:
|
| 11 |
+
- Works with ANY ViT (not just CLIP)
|
| 12 |
+
- Uses captioned images as the concept pool (not text from ConceptNet)
|
| 13 |
+
- Captions provide human interpretability
|
| 14 |
+
- No cross-modal alignment needed
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from .unisith import UniSITH, HeadInterpretation, SingularVectorInterpretation
|
| 18 |
+
from .concept_pool import VisualConceptPool
|
| 19 |
+
from .weight_extraction import WeightExtractor
|
| 20 |
+
from .comp import comp, comp_batch, top_k_selection
|
| 21 |
+
|
| 22 |
+
__version__ = "0.1.0"
|
| 23 |
+
__all__ = [
|
| 24 |
+
"UniSITH",
|
| 25 |
+
"HeadInterpretation",
|
| 26 |
+
"SingularVectorInterpretation",
|
| 27 |
+
"VisualConceptPool",
|
| 28 |
+
"WeightExtractor",
|
| 29 |
+
"comp",
|
| 30 |
+
"comp_batch",
|
| 31 |
+
"top_k_selection",
|
| 32 |
+
]
|