"""Failure Geometry Demo — always runnable, no API key required.

Two deliberately weak baselines (always_1, keyword_heuristic) are run against
the CARB seed dataset. Failures from both models are pooled, embedded with
TF-IDF + SVD, clustered with KMeans, and compared via mutual information.

The demo illustrates three questions:

1. Do failure clusters align with reasoning category (transitivity, negation,
   syllogism, distractor logic)?
2. Do they align with model identity (which baseline failed)?
3. Visually, do the 2-D SVD projections show cluster separation?

No network requests, no GPU, no API token.
"""

from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

from core.baseline import BASELINES, run_baselines
from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures, embed_for_scatter
from core.metrics import accuracy_by_type, compute_mi_scores
from viz.plots import plot_accuracy_by_type, plot_failure_scatter, plot_mi_comparison

DATA_PATH = Path(__file__).parent / "data" / "carb_demo.json"

# Single source of truth for the failure-table schema; shared by the row
# builder, the empty placeholder table and the Gradio Dataframe headers.
_FAILURE_COLUMNS = [
    "cluster",
    "reasoning_type",
    "model_id",
    "expected",
    "predicted",
    "input_preview",
]

# Longest input text shown in the table before it is cut and suffixed with "…".
_PREVIEW_CHARS = 80

# Below this many pooled failures the clustering step is skipped entirely.
_MIN_FAILURES = 3

_DESCRIPTION = """\
## Failure Geometry Demo

Structured failure analysis on the CARB reasoning dataset — **no API key required**.

Two lightweight baselines are used intentionally:

| Baseline | Strategy | Expected failure shape |
|----------|----------|------------------------|
| `always_1` | Predict 1 for everything | Fails on all false-label items (systematic bias) |
| `keyword_heuristic` | Predict 0 when statement contains negation markers | Fails on affirmative-false and negated-true items |

Because the baselines fail *differently*, pooling their failures and clustering lets us ask:

> **Do clusters separate by reasoning type, by model identity, or both?**

Mutual information quantifies this. A larger `MI(cluster, reasoning_type)` than
`MI(cluster, model_identity)` supports the hypothesis that failure structure is organised
around reasoning difficulty — not purely around which model ran.

---
*Part of the [Obversary Studios](https://obversarystudios.org) evaluation systems research.*
*Live pipeline version (HF Inference API):*
*[carb-observability-space](https://huggingface.co/spaces/obversarystudios/carb-observability-space)*
"""


def run_demo(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object, object, object]:
    """Run the full failure-analysis pipeline for the Gradio callback.

    Steps: load the dataset, run the selected baselines, embed the pooled
    failures, cluster them, compute mutual-information scores, and build the
    three figures plus the failure table.

    Args:
        selected_models: Baseline names to run. An empty selection falls back
            to every key in ``BASELINES``.
        n_clusters: Requested number of KMeans clusters. Coerced to ``int``
            and capped at the number of failures.

    Returns:
        A 5-tuple ``(log_text, mi_figure, accuracy_figure, scatter_figure,
        failure_table)``. On either early exit (dataset could not be loaded,
        or fewer than ``_MIN_FAILURES`` failures) the three figures are
        ``None`` and the table is empty.
    """
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = list(BASELINES)

    log(f"Loading dataset: {DATA_PATH.name}")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        # UI boundary: surface the problem in the log panel instead of letting
        # the callback crash. Keep the log so far and return the same empty
        # table shape as the other early exit.
        log(f"Dataset error: {exc}")
        return "\n".join(log_lines), None, None, None, _empty_table()
    log(f" {len(dataset)} examples / {len({r['reasoning_type'] for r in dataset})} types")

    log(f"Running baselines: {', '.join(selected_models)}")
    all_results, failures = run_baselines(dataset, selected=selected_models)
    log(
        f" {len(all_results)} predictions · {len(failures)} failures "
        f"({100 * len(failures) / max(len(all_results), 1):.0f}% failure rate)"
    )

    if len(failures) < _MIN_FAILURES:
        log("Too few failures to cluster. Try selecting both baselines.")
        return "\n".join(log_lines), None, None, None, _empty_table()

    log("Embedding failures (TF-IDF + SVD) …")
    embeddings = embed_failures(failures, n_components=32)
    scatter_coords = embed_for_scatter(failures)
    log(f" Embedding shape: {embeddings.shape}")

    # The slider value may arrive as a float, and KMeans cannot form more
    # clusters than there are samples, so normalise k before clustering.
    requested_k = int(n_clusters)
    k = max(1, min(requested_k, len(failures)))
    if k != requested_k:
        log(f" Requested k={requested_k} adjusted to k={k} ({len(failures)} failures available)")

    log(f"Clustering into k={k} (KMeans) …")
    cluster_ids = cluster_embeddings(embeddings, n_clusters=k)
    # strict=True makes a length mismatch between failures and labels an error
    # rather than silently leaving some failures unlabelled.
    for failure, cid in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cid

    cluster_dist = Counter(cluster_ids)
    sizes = {cid: cluster_dist[cid] for cid in sorted(cluster_dist)}
    log(f" Sizes: {sizes}")

    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f" {label}: {score:.4f}")

    acc_table = accuracy_by_type(all_results)
    fig_mi = plot_mi_comparison(mi_scores)
    fig_acc = plot_accuracy_by_type(acc_table)
    fig_scatter = plot_failure_scatter(
        scatter_coords, cluster_ids, reasoning_types, model_ids_list
    )
    summary_df = _build_failure_table(failures)

    return "\n".join(log_lines), fig_mi, fig_acc, fig_scatter, summary_df


def _build_failure_table(failures: list[dict]) -> pd.DataFrame:
    """Build the failure table, sorted by cluster id then reasoning type.

    Each failure dict must carry the keys ``cluster_id``, ``reasoning_type``,
    ``model_id``, ``y``, ``prediction`` and ``x``. Input text longer than
    ``_PREVIEW_CHARS`` characters is cut and suffixed with "…". An empty
    input yields the empty placeholder table.
    """
    ordered = sorted(failures, key=lambda r: (r["cluster_id"], r["reasoning_type"]))
    rows = [
        {
            "cluster": f["cluster_id"],
            "reasoning_type": f["reasoning_type"],
            "model_id": f["model_id"],
            "expected": f["y"],
            "predicted": f["prediction"],
            "input_preview": f["x"][:_PREVIEW_CHARS]
            + ("…" if len(f["x"]) > _PREVIEW_CHARS else ""),
        }
        for f in ordered
    ]
    return pd.DataFrame(rows, columns=_FAILURE_COLUMNS) if rows else _empty_table()


def _empty_table() -> pd.DataFrame:
    """Return a zero-row DataFrame that has the failure-table columns."""
    return pd.DataFrame(columns=_FAILURE_COLUMNS)


# The Blocks app is built at import time and bound to the module-level name
# `demo`, which the __main__ guard below launches.
with gr.Blocks(title="Failure Geometry Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=list(BASELINES.keys()),
                value=list(BASELINES.keys()),
                label="Baselines to run",
                info="Select one to isolate a single failure geometry; both to see model-identity MI.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="4 matches the number of reasoning types in the dataset.",
            )
            run_btn = gr.Button("Run Analysis", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=10,
                interactive=False,
                placeholder="Click 'Run Analysis' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(label="Mutual information comparison")
        acc_plot = gr.Plot(label="Accuracy by reasoning type and baseline")

    with gr.Row():
        scatter_plot = gr.Plot(
            label="2-D failure embedding scatter (SVD component 1 vs 2)"
        )

    with gr.Row():
        failure_table = gr.Dataframe(
            headers=list(_FAILURE_COLUMNS),
            label="Failure records (sorted by cluster → reasoning type)",
            interactive=False,
            wrap=True,
        )

    # Output order must match the 5-tuple returned by run_demo.
    run_btn.click(
        fn=run_demo,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, acc_plot, scatter_plot, failure_table],
    )


if __name__ == "__main__":
    demo.launch()