| """ |
| Failure Geometry Demo — always runnable, no API key required. |
| |
| Two deliberately weak baselines (always_1, keyword_heuristic) are run against |
| the CARB seed dataset. Failures from both models are pooled, embedded with |
| TF-IDF + SVD, clustered with KMeans, and compared via mutual information. |
| |
The demo addresses three questions:
| |
| 1. Do failure clusters align with reasoning category (transitivity, negation, |
| syllogism, distractor logic)? |
| 2. Do they align with model identity (which baseline failed)? |
| 3. Visually, do the 2-D SVD projections show cluster separation? |
| |
| No network requests, no GPU, no API token. |
| """ |
|
|
from collections import Counter
from pathlib import Path
|
|
| import gradio as gr |
| import pandas as pd |
|
|
| from core.baseline import BASELINES, run_baselines |
| from core.cluster import cluster_embeddings |
| from core.dataset import load_dataset |
| from core.embed import embed_failures, embed_for_scatter |
| from core.metrics import accuracy_by_type, compute_mi_scores |
| from viz.plots import plot_accuracy_by_type, plot_failure_scatter, plot_mi_comparison |
|
|
|
|
| DATA_PATH = Path(__file__).parent / "data" / "carb_demo.json" |
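# DATA_PATH is resolved relative to this file, so the demo works from any
# working directory.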
|
|
| _DESCRIPTION = """\ |
| ## Failure Geometry Demo |
| |
| Structured failure analysis on the CARB reasoning dataset — **no API key required**. |
| |
| Two lightweight baselines are used intentionally: |
| |
| | Baseline | Strategy | Expected failure shape | |
| |----------|----------|------------------------| |
| | `always_1` | Predict 1 for everything | Fails on all false-label items (systematic bias) | |
| | `keyword_heuristic` | Predict 0 when statement contains negation markers | Fails on affirmative-false and negated-true items | |
| |
| Because the baselines fail *differently*, pooling their failures and clustering lets us ask: |
| |
| > **Do clusters separate by reasoning type, by model identity, or both?** |
| |
Mutual information quantifies this. If `MI(cluster, reasoning_type)` exceeds
`MI(cluster, model_identity)`, that supports the hypothesis that failure
structure is organised around reasoning difficulty rather than around which
model produced the failures.
| |
| --- |
| |
| *Part of the [Obversary Studios](https://obversarystudios.org) evaluation systems research.* |
| *Live pipeline version (HF Inference API):* |
| *[carb-observability-space](https://huggingface.co/spaces/obversarystudios/carb-observability-space)* |
| """ |
|
|
|
|
| def run_demo( |
| selected_models: list[str], |
| n_clusters: int, |
| ) -> tuple[str, object, object, object, object]: |
| log_lines: list[str] = [] |
|
|
| def log(msg: str) -> None: |
| log_lines.append(msg) |
|
|
    if not selected_models:
        selected_models = list(BASELINES.keys())
    n_clusters = int(n_clusters)  # Gradio sliders may deliver floats even with step=1
|
|
| log(f"Loading dataset: {DATA_PATH.name}") |
| try: |
| dataset = load_dataset(DATA_PATH) |
    except Exception as exc:
        log(f"Dataset error: {exc}")
        return "\n".join(log_lines), None, None, None, _empty_table()
|
|
| log(f" {len(dataset)} examples / {len({r['reasoning_type'] for r in dataset})} types") |
| log(f"Running baselines: {', '.join(selected_models)}") |
|
|
| all_results, failures = run_baselines(dataset, selected=selected_models) |
| log(f" {len(all_results)} predictions · {len(failures)} failures " |
| f"({100 * len(failures) / max(len(all_results), 1):.0f}% failure rate)") |
|
|
    if len(failures) < n_clusters:
        log(f"Only {len(failures)} failures; KMeans needs at least k={n_clusters}. "
            "Try selecting both baselines or lowering k.")
        return "\n".join(log_lines), None, None, None, _empty_table()
|
|
| log("Embedding failures (TF-IDF + SVD) …") |
| embeddings = embed_failures(failures, n_components=32) |
| scatter_coords = embed_for_scatter(failures) |
| log(f" Embedding shape: {embeddings.shape}") |
|
|
| log(f"Clustering into k={n_clusters} (KMeans) …") |
| cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters) |
| for failure, cid in zip(failures, cluster_ids, strict=True): |
| failure["cluster_id"] = cid |
|
|
    cluster_dist = dict(sorted(Counter(cluster_ids).items()))
    log(f" Sizes: {cluster_dist}")
|
|
| reasoning_types = [f["reasoning_type"] for f in failures] |
| model_ids_list = [f["model_id"] for f in failures] |
|
|
| log("Computing mutual information …") |
| mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list) |
| for label, score in mi_scores.items(): |
| log(f" {label}: {score:.4f}") |
|
|
| acc_table = accuracy_by_type(all_results) |
|
|
| fig_mi = plot_mi_comparison(mi_scores) |
| fig_acc = plot_accuracy_by_type(acc_table) |
| fig_scatter = plot_failure_scatter(scatter_coords, cluster_ids, reasoning_types, model_ids_list) |
| summary_df = _build_failure_table(failures) |
|
|
| return "\n".join(log_lines), fig_mi, fig_acc, fig_scatter, summary_df |
|
|
|
|
| def _build_failure_table(failures: list[dict]) -> pd.DataFrame: |
| rows = [ |
| { |
| "cluster": f["cluster_id"], |
| "reasoning_type": f["reasoning_type"], |
| "model_id": f["model_id"], |
| "expected": f["y"], |
| "predicted": f["prediction"], |
| "input_preview": f["x"][:80] + ("…" if len(f["x"]) > 80 else ""), |
| } |
| for f in sorted(failures, key=lambda r: (r["cluster_id"], r["reasoning_type"])) |
| ] |
| return pd.DataFrame(rows) if rows else _empty_table() |
|
|
|
|
| def _empty_table() -> pd.DataFrame: |
| return pd.DataFrame( |
| columns=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"] |
| ) |
|
|
|
|
| with gr.Blocks(title="Failure Geometry Demo", theme=gr.themes.Soft()) as demo: |
| gr.Markdown(_DESCRIPTION) |
|
|
| with gr.Row(): |
| with gr.Column(scale=1, min_width=260): |
| model_selector = gr.CheckboxGroup( |
| choices=list(BASELINES.keys()), |
| value=list(BASELINES.keys()), |
| label="Baselines to run", |
| info="Select one to isolate a single failure geometry; both to see model-identity MI.", |
| ) |
| n_clusters_slider = gr.Slider( |
| minimum=2, |
| maximum=6, |
| step=1, |
| value=4, |
| label="KMeans clusters (k)", |
| info="4 matches the number of reasoning types in the dataset.", |
| ) |
| run_btn = gr.Button("Run Analysis", variant="primary", size="lg") |
|
|
| with gr.Column(scale=2): |
| status_log = gr.Textbox( |
| label="Pipeline log", |
| lines=10, |
| interactive=False, |
| placeholder="Click 'Run Analysis' to start …", |
| ) |
|
|
| with gr.Row(): |
| mi_plot = gr.Plot(label="Mutual information comparison") |
| acc_plot = gr.Plot(label="Accuracy by reasoning type and baseline") |
|
|
| with gr.Row(): |
| scatter_plot = gr.Plot( |
| label="2-D failure embedding scatter (SVD component 1 vs 2)" |
| ) |
|
|
| with gr.Row(): |
| failure_table = gr.Dataframe( |
| headers=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"], |
| label="Failure records (sorted by cluster → reasoning type)", |
| interactive=False, |
| wrap=True, |
| ) |
|
|
| run_btn.click( |
| fn=run_demo, |
| inputs=[model_selector, n_clusters_slider], |
| outputs=[status_log, mi_plot, acc_plot, scatter_plot, failure_table], |
| ) |
|
|
|
|
| if __name__ == "__main__": |
| demo.launch() |
|
|