Spaces:

obversarystudios
/

failure-geometry-demo

Running

Brian Moran

Add failure geometry demo pipeline

09f4a33 13 days ago

1.97 kB

	"""
	TF-IDF + Truncated SVD embedding for failure records.

	No network downloads; no GPU required. Runs entirely with scikit-learn.

	Feature construction combines:
	- TF-IDF on a failure-context string (input text, expected label, prediction, reasoning type, model id)
	- TruncatedSVD to produce a dense, low-dimensional representation

	Adapted from failure-induced-benchmarks/src/failure_geometry/embedding.py
	(engineered sparse features) but extended to dense SVD projections for
	KMeans and scatter visualisation.
	"""

	from __future__ import annotations

	import numpy as np
	from sklearn.decomposition import TruncatedSVD
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.preprocessing import normalize


	EMBED_DIMS = 32


	def _failure_text(failure: dict) -> str:
	return (
	f"input: {failure['x']} "
	f"expected: {failure['y']} "
	f"prediction: {failure['prediction']} "
	f"type: {failure['reasoning_type']} "
	f"model: {failure['model_id']}"
	)


	def embed_failures(
		failures: list[dict],
		n_components: int = EMBED_DIMS,
	) -> np.ndarray:
		"""Embed failure records as row-normalised dense vectors.

		Each record is turned into a context string with ``_failure_text``,
		vectorised with TF-IDF (unigrams and bigrams, at most 800 features,
		sublinear term frequency) and projected with ``TruncatedSVD``.

		Args:
			failures: Failure records; each must satisfy ``_failure_text``.
			n_components: Requested embedding width. The width actually used
				is capped at ``vocabulary size - 1`` and ``len(failures) - 1``,
				so small inputs yield fewer columns than requested.

		Returns:
			A 2-D array with one row per failure, rows scaled by
			``sklearn.preprocessing.normalize``. Empty input gives an array of
			shape ``(0, n_components)``. When fewer than two components are
			available, the SVD is skipped and the first TF-IDF column is
			returned instead, giving shape ``(len(failures), 1)``.
		"""
		if not failures:
			return np.empty((0, n_components))

		texts = [_failure_text(f) for f in failures]

		vectorizer = TfidfVectorizer(
			max_features=800,
			ngram_range=(1, 2),
			sublinear_tf=True,
		)
		tfidf = vectorizer.fit_transform(texts)

		# Cap the projection width by both the vocabulary and the sample count.
		effective_dims = min(n_components, tfidf.shape[1] - 1, len(texts) - 1)
		if effective_dims < 2:
			# Too little data (or too few requested dims) for a useful SVD.
			# Slice the sparse matrix before densifying so only the single
			# kept column is materialised rather than the full n x vocab array.
			return normalize(tfidf[:, :1].toarray())

		# Fixed seed keeps the randomized SVD solver reproducible across runs.
		svd = TruncatedSVD(n_components=effective_dims, random_state=42)
		return normalize(svd.fit_transform(tfidf))


	def embed_for_scatter(failures: list[dict]) -> np.ndarray:
		"""Return a two-column embedding of *failures* for scatter plots.

		Args:
			failures: Failure records accepted by ``embed_failures``.

		Returns:
			With fewer than three failures, an all-zero array of shape
			``(len(failures), 2)``. Otherwise the result of
			``embed_failures(failures, n_components=2)``.
		"""
		# embed_failures caps its width at len(failures) - 1, so at least three
		# records are needed before two components can be produced.
		if len(failures) < 3:
			return np.zeros((len(failures), 2))
		return embed_failures(failures, n_components=2)