Brian Moran
Add failure geometry demo pipeline
09f4a33
"""
TF-IDF + Truncated SVD embedding for failure records.
No network downloads; no GPU required. Runs entirely with scikit-learn.
Feature construction combines:
- TF-IDF on a failure-context string (input text, expected label, prediction, reasoning type, model id)
- TruncatedSVD to produce a dense, low-dimensional representation
Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py
(engineered sparse features) but extended to dense SVD projections for
KMeans and scatter visualisation.
"""
from __future__ import annotations
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
# Default number of SVD components requested by embed_failures (its
# ``n_components`` default); the actual width may be smaller for tiny inputs.
EMBED_DIMS = 32
def _failure_text(failure: dict) -> str:
return (
f"input: {failure['x']} "
f"expected: {failure['y']} "
f"prediction: {failure['prediction']} "
f"type: {failure['reasoning_type']} "
f"model: {failure['model_id']}"
)
def embed_failures(
    failures: list[dict],
    n_components: int = EMBED_DIMS,
) -> np.ndarray:
    """Embed failure records as row-normalised dense vectors.

    Each record is rendered to text with ``_failure_text``, vectorised with
    TF-IDF (unigrams and bigrams, at most 800 features, sublinear TF) and
    projected with ``TruncatedSVD`` (fixed ``random_state=42``).

    Args:
        failures: Failure records; each must carry the keys that
            ``_failure_text`` reads.
        n_components: Requested embedding width.

    Returns:
        A 2-D array with one row per failure, rows scaled by
        ``sklearn.preprocessing.normalize``. For empty input the shape is
        ``(0, n_components)``. Otherwise the width is capped at
        ``min(n_components, vocabulary_size - 1, len(failures) - 1)``, so it
        can be smaller than ``n_components`` for small inputs.

    NOTE(review): when the capped width is below 2 no SVD is run; the result
    is the leading TF-IDF column(s) instead, so very small inputs get a
    different kind of embedding than larger ones.
    """
    if not failures:
        return np.empty((0, n_components))

    documents = [_failure_text(record) for record in failures]
    term_matrix = TfidfVectorizer(
        max_features=800,
        ngram_range=(1, 2),
        sublinear_tf=True,
    ).fit_transform(documents)

    # The projection cannot usefully exceed the vocabulary size or the number
    # of records, so cap the requested width one below each of them.
    vocabulary_size = term_matrix.shape[1]
    width = min(n_components, vocabulary_size - 1, len(documents) - 1)

    if width >= 2:
        projector = TruncatedSVD(n_components=width, random_state=42)
        return normalize(projector.fit_transform(term_matrix))

    # Too little data for an SVD: fall back to the leading raw TF-IDF
    # column(s), always keeping at least one.
    kept_columns = max(width, 1)
    raw = term_matrix.toarray()
    return normalize(raw[:, :kept_columns])
def embed_for_scatter(failures: list[dict]) -> np.ndarray:
    """Return a two-column embedding suitable for a 2-D scatter plot.

    With fewer than three failures no projection is attempted and an
    all-zero array of shape ``(len(failures), 2)`` is returned. Otherwise
    the work is delegated to ``embed_failures`` with ``n_components=2``.

    NOTE(review): ``embed_failures`` row-normalises its output, so in two
    dimensions the plotted points presumably all sit on the unit circle.
    Confirm this is the intended look for the scatter plot.
    """
    count = len(failures)
    if count >= 3:
        return embed_failures(failures, n_components=2)
    # Too few points to project; hand back placeholder coordinates.
    return np.zeros((count, 2))