"""
Failure Geometry Demo — always runnable, no API key required.

Two deliberately weak baselines (always_1, keyword_heuristic) are run against
the CARB seed dataset. Failures from both models are pooled, embedded with
TF-IDF + SVD, and clustered with KMeans; the cluster assignments are then
compared against reasoning type and model identity via mutual information.

The demo addresses three questions:

  1. Do failure clusters align with reasoning category (transitivity, negation,
     syllogism, distractor logic)?
  2. Do they align with model identity (which baseline failed)?
  3. Visually, do the 2-D SVD projections show cluster separation?

No network requests, no GPU, no API token.
"""

from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

from core.baseline import BASELINES, run_baselines
from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures, embed_for_scatter
from core.metrics import accuracy_by_type, compute_mi_scores
from viz.plots import plot_accuracy_by_type, plot_failure_scatter, plot_mi_comparison

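# BASELINES (core.baseline) maps baseline ids to prediction callables. Based on
# the strategy table in _DESCRIPTION below, the two entries plausibly reduce to
# something like this sketch (has_negation is a hypothetical helper, not part
# of this repo's confirmed API):
#
#     BASELINES = {
#         "always_1": lambda ex: 1,
#         "keyword_heuristic": lambda ex: 0 if has_negation(ex["x"]) else 1,
#     }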

DATA_PATH = Path(__file__).parent / "data" / "carb_demo.json"

_DESCRIPTION = """\
## Failure Geometry Demo

Structured failure analysis on the CARB reasoning dataset — **no API key required**.

Two lightweight baselines are used intentionally:

| Baseline | Strategy | Expected failure shape |
|----------|----------|------------------------|
| `always_1` | Predict 1 for everything | Fails on all false-label items (systematic bias) |
| `keyword_heuristic` | Predict 0 when the statement contains negation markers | Fails on affirmative-false and negated-true items |

Because the baselines fail *differently*, pooling their failures and clustering lets us ask:

> **Do clusters separate by reasoning type, by model identity, or both?**

Mutual information quantifies this. A larger `MI(cluster, reasoning_type)` than
`MI(cluster, model_identity)` supports the hypothesis that failure structure is
organised around reasoning difficulty — not purely around which model ran.

---

*Part of the [Obversary Studios](https://obversarystudios.org) evaluation systems research.*
*Live pipeline version (HF Inference API):*
*[carb-observability-space](https://huggingface.co/spaces/obversarystudios/carb-observability-space)*
"""

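# compute_mi_scores(cluster_ids, reasoning_types, model_ids) is assumed to boil
# down to sklearn's mutual_info_score; the exact result keys and any
# normalisation live in core.metrics. A minimal sketch:
#
#     from sklearn.metrics import mutual_info_score
#     return {
#         "MI(cluster, reasoning_type)": mutual_info_score(reasoning_types, cluster_ids),
#         "MI(cluster, model_identity)": mutual_info_score(model_ids, cluster_ids),
#     }
#
# Only the relative magnitudes of the two scores matter for the hypothesis
# stated in _DESCRIPTION above.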

def run_demo(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object, object, object]:
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = list(BASELINES.keys())

    log(f"Loading dataset: {DATA_PATH.name}")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        err = f"Dataset error: {exc}"
        return err, None, None, None, _empty_table()

    log(f"  {len(dataset)} examples / {len({r['reasoning_type'] for r in dataset})} types")
    log(f"Running baselines: {', '.join(selected_models)}")

    all_results, failures = run_baselines(dataset, selected=selected_models)
    log(f"  {len(all_results)} predictions · {len(failures)} failures "
        f"({100 * len(failures) / max(len(all_results), 1):.0f}% failure rate)")

    if len(failures) < 3:
        log("Too few failures to cluster. Try selecting both baselines.")
        return "\n".join(log_lines), None, None, None, _empty_table()

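    # embed_failures: per the module docstring, TF-IDF over the failure text
    # followed by truncated SVD down to n_components dimensions. A minimal
    # equivalent (illustrative, not necessarily core.embed's implementation):
    #
    #     tfidf = TfidfVectorizer().fit_transform(f["x"] for f in failures)
    #     embeddings = TruncatedSVD(n_components=32).fit_transform(tfidf)
    #
    # embed_for_scatter presumably does the same with 2 components for plotting.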
    log("Embedding failures (TF-IDF + SVD) …")
    embeddings = embed_failures(failures, n_components=32)
    scatter_coords = embed_for_scatter(failures)
    log(f"  Embedding shape: {embeddings.shape}")

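    # k is user-chosen; the UI defaults to k=4 to match the four reasoning
    # types, so a one-to-one type-to-cluster mapping is at least representable.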
    log(f"Clustering into k={n_clusters} (KMeans) …")
    cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
    for failure, cid in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cid

    cluster_sizes = {cid: count for cid, count in sorted(Counter(cluster_ids).items())}
    log(f"  Sizes: {cluster_sizes}")

    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f"  {label}: {score:.4f}")

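    # Accuracy is computed over all predictions, not just failures, so the
    # per-type plot reflects each baseline's full behaviour on the dataset.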
    acc_table = accuracy_by_type(all_results)

    fig_mi = plot_mi_comparison(mi_scores)
    fig_acc = plot_accuracy_by_type(acc_table)
    fig_scatter = plot_failure_scatter(scatter_coords, cluster_ids, reasoning_types, model_ids_list)
    summary_df = _build_failure_table(failures)

    return "\n".join(log_lines), fig_mi, fig_acc, fig_scatter, summary_df


def _build_failure_table(failures: list[dict]) -> pd.DataFrame:
    rows = [
        {
            "cluster": f["cluster_id"],
            "reasoning_type": f["reasoning_type"],
            "model_id": f["model_id"],
            "expected": f["y"],
            "predicted": f["prediction"],
            "input_preview": f["x"][:80] + ("…" if len(f["x"]) > 80 else ""),
        }
        for f in sorted(failures, key=lambda r: (r["cluster_id"], r["reasoning_type"]))
    ]
    return pd.DataFrame(rows) if rows else _empty_table()


def _empty_table() -> pd.DataFrame:
    return pd.DataFrame(
        columns=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"]
    )


with gr.Blocks(title="Failure Geometry Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=list(BASELINES.keys()),
                value=list(BASELINES.keys()),
                label="Baselines to run",
                info="Select one to isolate a single failure geometry; both to see model-identity MI.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="4 matches the number of reasoning types in the dataset.",
            )
            run_btn = gr.Button("Run Analysis", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=10,
                interactive=False,
                placeholder="Click 'Run Analysis' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(label="Mutual information comparison")
        acc_plot = gr.Plot(label="Accuracy by reasoning type and baseline")

    with gr.Row():
        scatter_plot = gr.Plot(
            label="2-D failure embedding scatter (SVD component 1 vs 2)"
        )

    with gr.Row():
        failure_table = gr.Dataframe(
            headers=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"],
            label="Failure records (sorted by cluster → reasoning type)",
            interactive=False,
            wrap=True,
        )

    run_btn.click(
        fn=run_demo,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, acc_plot, scatter_plot, failure_table],
    )


if __name__ == "__main__":
    demo.launch()