""" TF-IDF + Truncated SVD embedding for failure records. No network downloads; no GPU required. Runs entirely with scikit-learn. Feature construction combines: - TF-IDF on a failure-context string (input text, expected label, prediction, reasoning type) - TruncatedSVD to produce a dense, low-dimensional representation Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py (engineered sparse features) but extended to dense SVD projections for KMeans and scatter visualisation. """ from __future__ import annotations import numpy as np from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.preprocessing import normalize EMBED_DIMS = 32 def _failure_text(failure: dict) -> str: return ( f"input: {failure['x']} " f"expected: {failure['y']} " f"prediction: {failure['prediction']} " f"type: {failure['reasoning_type']} " f"model: {failure['model_id']}" ) def embed_failures( failures: list[dict], n_components: int = EMBED_DIMS, ) -> np.ndarray: if not failures: return np.empty((0, n_components)) texts = [_failure_text(f) for f in failures] n = len(texts) vectorizer = TfidfVectorizer( max_features=800, ngram_range=(1, 2), sublinear_tf=True, ) tfidf = vectorizer.fit_transform(texts) effective_dims = min(n_components, tfidf.shape[1] - 1, n - 1) if effective_dims < 2: arr = tfidf.toarray() return normalize(arr[:, : max(effective_dims, 1)]) svd = TruncatedSVD(n_components=effective_dims, random_state=42) dense = svd.fit_transform(tfidf) return normalize(dense) def embed_for_scatter(failures: list[dict]) -> np.ndarray: """Return exactly 2 components for PCA-style scatter plots.""" if len(failures) < 3: return np.zeros((len(failures), 2)) return embed_failures(failures, n_components=2)