"""
TF-IDF + Truncated SVD embedding for failure records.

No network downloads; no GPU required. Runs entirely with scikit-learn.

Feature construction combines:
- TF-IDF on a failure-context string (input text, expected label, prediction, reasoning type, model id)
- TruncatedSVD to produce a dense, low-dimensional representation

Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py
(engineered sparse features) but extended to dense SVD projections for
KMeans and scatter visualisation.
"""
from __future__ import annotations

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Default dimensionality of the dense embedding.
EMBED_DIMS = 32


def _failure_text(failure: dict) -> str:
    """Flatten one failure record into a single context string for TF-IDF."""
    return (
        f"input: {failure['x']} "
        f"expected: {failure['y']} "
        f"prediction: {failure['prediction']} "
        f"type: {failure['reasoning_type']} "
        f"model: {failure['model_id']}"
    )
def embed_failures(
    failures: list[dict],
    n_components: int = EMBED_DIMS,
) -> np.ndarray:
    """Embed failure records as L2-normalised dense vectors."""
    if not failures:
        return np.empty((0, n_components))
    texts = [_failure_text(f) for f in failures]
    n = len(texts)
    vectorizer = TfidfVectorizer(
        max_features=800,
        ngram_range=(1, 2),
        sublinear_tf=True,
    )
    tfidf = vectorizer.fit_transform(texts)
    # TruncatedSVD needs n_components strictly below both the number of
    # samples and the number of TF-IDF features, so cap it accordingly.
    effective_dims = min(n_components, tfidf.shape[1] - 1, n - 1)
    if effective_dims < 2:
        # Too few samples/features for a meaningful SVD: fall back to the
        # leading raw TF-IDF columns (at least one) instead.
        arr = tfidf.toarray()
        return normalize(arr[:, : max(effective_dims, 1)])
    svd = TruncatedSVD(n_components=effective_dims, random_state=42)
    dense = svd.fit_transform(tfidf)
    return normalize(dense)
def embed_for_scatter(failures: list[dict]) -> np.ndarray:
    """Return exactly 2 components for PCA-style scatter plots."""
    if len(failures) < 3:
        # Not enough points for a stable 2-D projection; place them at the origin.
        return np.zeros((len(failures), 2))
    return embed_failures(failures, n_components=2)
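

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the failure
    # dicts below are invented purely to illustrate the expected record
    # schema (keys x, y, prediction, reasoning_type, model_id) and the
    # resulting output shapes.
    demo_failures = [
        {"x": "2 + 2 * 3", "y": "8", "prediction": "12",
         "reasoning_type": "arithmetic", "model_id": "model-a"},
        {"x": "All cats are mammals; is Tom the cat a mammal?", "y": "yes",
         "prediction": "no", "reasoning_type": "deduction",
         "model_id": "model-a"},
        {"x": "Reverse the string 'abc'", "y": "cba", "prediction": "abc",
         "reasoning_type": "symbolic", "model_id": "model-b"},
        {"x": "Is 17 prime?", "y": "true", "prediction": "false",
         "reasoning_type": "arithmetic", "model_id": "model-b"},
    ]
    emb = embed_failures(demo_failures)
    xy = embed_for_scatter(demo_failures)
    # With 4 records, effective_dims is capped at n - 1 = 3.
    print(f"embedding shape: {emb.shape}")  # (4, 3)
    print(f"scatter shape:   {xy.shape}")   # (4, 2)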