| """ |
| Failure Geometry Demo — always runnable, no API key required. |
| |
| Two deliberately weak baselines (always_1, keyword_heuristic) are run against |
| the CARB seed dataset. Failures from both models are pooled, embedded with |
| TF-IDF + SVD, clustered with KMeans, and compared via mutual information. |
| |
The demo addresses three questions:
| |
| 1. Do failure clusters align with reasoning category (transitivity, negation, |
| syllogism, distractor logic)? |
| 2. Do they align with model identity (which baseline failed)? |
| 3. Visually, do the 2-D SVD projections show cluster separation? |
| |
| No network requests, no GPU, no API token. |
| """ |
|
|
from collections import Counter
from pathlib import Path
|
|
| import gradio as gr |
| import pandas as pd |
|
|
| from core.baseline import BASELINES, run_baselines |
| from core.cluster import cluster_embeddings |
| from core.dataset import load_dataset |
| from core.embed import embed_failures, embed_for_scatter |
| from core.metrics import accuracy_by_type, compute_mi_scores |
| from viz.plots import plot_accuracy_by_type, plot_failure_scatter, plot_mi_comparison |
|
|
|
|
| DATA_PATH = Path(__file__).parent / "data" / "carb_demo.json" |
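# DATA_PATH is resolved relative to this file, so the demo works from any
# working directory.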
|
|
| _DESCRIPTION = """\ |
| ## Failure Geometry Demo |
| |
| Structured failure analysis on the CARB reasoning dataset — **no API key required**. |
| |
| Two lightweight baselines are used intentionally: |
| |
| | Baseline | Strategy | Expected failure shape | |
| |----------|----------|------------------------| |
| | `always_1` | Predict 1 for everything | Fails on all false-label items (systematic bias) | |
| | `keyword_heuristic` | Predict 0 when statement contains negation markers | Fails on affirmative-false and negated-true items | |
| |
| Because the baselines fail *differently*, pooling their failures and clustering lets us ask: |
| |
| > **Do clusters separate by reasoning type, by model identity, or both?** |
| |
Mutual information quantifies this. If `MI(cluster, reasoning_type)` exceeds
`MI(cluster, model_identity)`, that supports the hypothesis that failure
structure is organised around reasoning difficulty rather than around which
model produced the failures.
| |
| --- |
| |
| *Part of the [Obversary Studios](https://obversarystudios.org) evaluation systems research.* |
| *Live pipeline version (HF Inference API):* |
| *[carb-observability-space](https://huggingface.co/spaces/obversarystudios/carb-observability-space)* |
| """ |
|
|
|
|
| def run_demo( |
| selected_models: list[str], |
| n_clusters: int, |
| ) -> tuple[str, object, object, object, object]: |
| log_lines: list[str] = [] |
|
|
| def log(msg: str) -> None: |
| log_lines.append(msg) |
|
|
    if not selected_models:
        selected_models = list(BASELINES.keys())
    n_clusters = int(n_clusters)  # Gradio sliders may deliver floats even with step=1
|
|
| log(f"Loading dataset: {DATA_PATH.name}") |
| try: |
| dataset = load_dataset(DATA_PATH) |
    except Exception as exc:
        log(f"Dataset error: {exc}")
        return "\n".join(log_lines), None, None, None, _empty_table()
|
|
| log(f" {len(dataset)} examples / {len({r['reasoning_type'] for r in dataset})} types") |
| log(f"Running baselines: {', '.join(selected_models)}") |
|
|
| all_results, failures = run_baselines(dataset, selected=selected_models) |
| log(f" {len(all_results)} predictions · {len(failures)} failures " |
| f"({100 * len(failures) / max(len(all_results), 1):.0f}% failure rate)") |
|
|
    if len(failures) < n_clusters:
        log(f"Only {len(failures)} failures; KMeans needs at least k={n_clusters}. "
            "Try selecting both baselines or lowering k.")
        return "\n".join(log_lines), None, None, None, _empty_table()
|
|
| log("Embedding failures (TF-IDF + SVD) …") |
| embeddings = embed_failures(failures, n_components=32) |
| scatter_coords = embed_for_scatter(failures) |
| log(f" Embedding shape: {embeddings.shape}") |
|
|
| log(f"Clustering into k={n_clusters} (KMeans) …") |
| cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters) |
| for failure, cid in zip(failures, cluster_ids, strict=True): |
| failure["cluster_id"] = cid |
|
|
    cluster_dist = dict(sorted(Counter(cluster_ids).items()))
    log(f" Sizes: {cluster_dist}")
|
|
| reasoning_types = [f["reasoning_type"] for f in failures] |
| model_ids_list = [f["model_id"] for f in failures] |
|
|
| log("Computing mutual information …") |
| mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list) |
| for label, score in mi_scores.items(): |
| log(f" {label}: {score:.4f}") |
|
|
| acc_table = accuracy_by_type(all_results) |
|
|
| fig_mi = plot_mi_comparison(mi_scores) |
| fig_acc = plot_accuracy_by_type(acc_table) |
| fig_scatter = plot_failure_scatter(scatter_coords, cluster_ids, reasoning_types, model_ids_list) |
| summary_df = _build_failure_table(failures) |
|
|
| return "\n".join(log_lines), fig_mi, fig_acc, fig_scatter, summary_df |
|
|
|
|
| def _build_failure_table(failures: list[dict]) -> pd.DataFrame: |
| rows = [ |
| { |
| "cluster": f["cluster_id"], |
| "reasoning_type": f["reasoning_type"], |
| "model_id": f["model_id"], |
| "expected": f["y"], |
| "predicted": f["prediction"], |
| "input_preview": f["x"][:80] + ("…" if len(f["x"]) > 80 else ""), |
| } |
| for f in sorted(failures, key=lambda r: (r["cluster_id"], r["reasoning_type"])) |
| ] |
| return pd.DataFrame(rows) if rows else _empty_table() |
|
|
|
|
| def _empty_table() -> pd.DataFrame: |
| return pd.DataFrame( |
| columns=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"] |
| ) |
|
|
|
|
| with gr.Blocks(title="Failure Geometry Demo", theme=gr.themes.Soft()) as demo: |
| gr.Markdown(_DESCRIPTION) |
|
|
| with gr.Row(): |
| with gr.Column(scale=1, min_width=260): |
| model_selector = gr.CheckboxGroup( |
| choices=list(BASELINES.keys()), |
| value=list(BASELINES.keys()), |
| label="Baselines to run", |
| info="Select one to isolate a single failure geometry; both to see model-identity MI.", |
| ) |
| n_clusters_slider = gr.Slider( |
| minimum=2, |
| maximum=6, |
| step=1, |
| value=4, |
| label="KMeans clusters (k)", |
| info="4 matches the number of reasoning types in the dataset.", |
| ) |
| run_btn = gr.Button("Run Analysis", variant="primary", size="lg") |
|
|
| with gr.Column(scale=2): |
| status_log = gr.Textbox( |
| label="Pipeline log", |
| lines=10, |
| interactive=False, |
| placeholder="Click 'Run Analysis' to start …", |
| ) |
|
|
| with gr.Row(): |
| mi_plot = gr.Plot(label="Mutual information comparison") |
| acc_plot = gr.Plot(label="Accuracy by reasoning type and baseline") |
|
|
| with gr.Row(): |
| scatter_plot = gr.Plot( |
| label="2-D failure embedding scatter (SVD component 1 vs 2)" |
| ) |
|
|
| with gr.Row(): |
| failure_table = gr.Dataframe( |
| headers=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"], |
| label="Failure records (sorted by cluster → reasoning type)", |
| interactive=False, |
| wrap=True, |
| ) |
|
|
| run_btn.click( |
| fn=run_demo, |
| inputs=[model_selector, n_clusters_slider], |
| outputs=[status_log, mi_plot, acc_plot, scatter_plot, failure_table], |
| ) |
|
|
|
|
| if __name__ == "__main__": |
| demo.launch() |
|
|