"""
Failure Geometry Demo — always runnable, no API key required.

Two deliberately weak baselines (always_1, keyword_heuristic) are run against
the CARB seed dataset. Failures from both models are pooled, embedded with
TF-IDF + SVD, and clustered with KMeans; the cluster assignments are then
compared against reasoning type and model identity via mutual information.

The demo addresses three questions:

  1. Do failure clusters align with reasoning category (transitivity, negation,
     syllogism, distractor logic)?
  2. Do they align with model identity (which baseline failed)?
  3. Visually, do the 2-D SVD projections show cluster separation?

No network requests, no GPU, no API token.
"""

from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

from core.baseline import BASELINES, run_baselines
from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures, embed_for_scatter
from core.metrics import accuracy_by_type, compute_mi_scores
from viz.plots import plot_accuracy_by_type, plot_failure_scatter, plot_mi_comparison

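# BASELINES (core.baseline) maps baseline ids to prediction callables. Based on
# the strategy table in _DESCRIPTION below, the two entries plausibly reduce to
# something like this sketch (has_negation is a hypothetical helper, not part
# of this repo's confirmed API):
#
#     BASELINES = {
#         "always_1": lambda ex: 1,
#         "keyword_heuristic": lambda ex: 0 if has_negation(ex["x"]) else 1,
#     }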

DATA_PATH = Path(__file__).parent / "data" / "carb_demo.json"

_DESCRIPTION = """\
## Failure Geometry Demo

Structured failure analysis on the CARB reasoning dataset — **no API key required**.

Two lightweight baselines are used intentionally:

| Baseline | Strategy | Expected failure shape |
|----------|----------|------------------------|
| `always_1` | Predict 1 for everything | Fails on all false-label items (systematic bias) |
| `keyword_heuristic` | Predict 0 when the statement contains negation markers | Fails on affirmative-false and negated-true items |

Because the baselines fail *differently*, pooling their failures and clustering lets us ask:

> **Do clusters separate by reasoning type, by model identity, or both?**

Mutual information quantifies this. A larger `MI(cluster, reasoning_type)` than
`MI(cluster, model_identity)` supports the hypothesis that failure structure is
organised around reasoning difficulty — not purely around which model ran.

---

*Part of the [Obversary Studios](https://obversarystudios.org) evaluation systems research.*
*Live pipeline version (HF Inference API):*
*[carb-observability-space](https://huggingface.co/spaces/obversarystudios/carb-observability-space)*
"""

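# compute_mi_scores(cluster_ids, reasoning_types, model_ids) is assumed to boil
# down to sklearn's mutual_info_score; the exact result keys and any
# normalisation live in core.metrics. A minimal sketch:
#
#     from sklearn.metrics import mutual_info_score
#     return {
#         "MI(cluster, reasoning_type)": mutual_info_score(reasoning_types, cluster_ids),
#         "MI(cluster, model_identity)": mutual_info_score(model_ids, cluster_ids),
#     }
#
# Only the relative magnitudes of the two scores matter for the hypothesis
# stated in _DESCRIPTION above.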

def run_demo(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object, object, object]:
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = list(BASELINES.keys())

    log(f"Loading dataset: {DATA_PATH.name}")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        err = f"Dataset error: {exc}"
        return err, None, None, None, _empty_table()

    log(f"  {len(dataset)} examples / {len({r['reasoning_type'] for r in dataset})} types")
    log(f"Running baselines: {', '.join(selected_models)}")

    all_results, failures = run_baselines(dataset, selected=selected_models)
    log(f"  {len(all_results)} predictions · {len(failures)} failures "
        f"({100 * len(failures) / max(len(all_results), 1):.0f}% failure rate)")

    if len(failures) < 3:
        log("Too few failures to cluster. Try selecting both baselines.")
        return "\n".join(log_lines), None, None, None, _empty_table()

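    # embed_failures: per the module docstring, TF-IDF over the failure text
    # followed by truncated SVD down to n_components dimensions. A minimal
    # equivalent (illustrative, not necessarily core.embed's implementation):
    #
    #     tfidf = TfidfVectorizer().fit_transform(f["x"] for f in failures)
    #     embeddings = TruncatedSVD(n_components=32).fit_transform(tfidf)
    #
    # embed_for_scatter presumably does the same with 2 components for plotting.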
    log("Embedding failures (TF-IDF + SVD) …")
    embeddings = embed_failures(failures, n_components=32)
    scatter_coords = embed_for_scatter(failures)
    log(f"  Embedding shape: {embeddings.shape}")

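    # k is user-chosen; the UI defaults to k=4 to match the four reasoning
    # types, so a one-to-one type-to-cluster mapping is at least representable.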
    log(f"Clustering into k={n_clusters} (KMeans) …")
    cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
    for failure, cid in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cid

    cluster_sizes = {cid: count for cid, count in sorted(Counter(cluster_ids).items())}
    log(f"  Sizes: {cluster_sizes}")

    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f"  {label}: {score:.4f}")

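    # Accuracy is computed over all predictions, not just failures, so the
    # per-type plot reflects each baseline's full behaviour on the dataset.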
    acc_table = accuracy_by_type(all_results)

    fig_mi = plot_mi_comparison(mi_scores)
    fig_acc = plot_accuracy_by_type(acc_table)
    fig_scatter = plot_failure_scatter(scatter_coords, cluster_ids, reasoning_types, model_ids_list)
    summary_df = _build_failure_table(failures)

    return "\n".join(log_lines), fig_mi, fig_acc, fig_scatter, summary_df


def _build_failure_table(failures: list[dict]) -> pd.DataFrame:
    rows = [
        {
            "cluster": f["cluster_id"],
            "reasoning_type": f["reasoning_type"],
            "model_id": f["model_id"],
            "expected": f["y"],
            "predicted": f["prediction"],
            "input_preview": f["x"][:80] + ("…" if len(f["x"]) > 80 else ""),
        }
        for f in sorted(failures, key=lambda r: (r["cluster_id"], r["reasoning_type"]))
    ]
    return pd.DataFrame(rows) if rows else _empty_table()


def _empty_table() -> pd.DataFrame:
    return pd.DataFrame(
        columns=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"]
    )


with gr.Blocks(title="Failure Geometry Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=list(BASELINES.keys()),
                value=list(BASELINES.keys()),
                label="Baselines to run",
                info="Select one to isolate a single failure geometry; both to see model-identity MI.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="4 matches the number of reasoning types in the dataset.",
            )
            run_btn = gr.Button("Run Analysis", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=10,
                interactive=False,
                placeholder="Click 'Run Analysis' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(label="Mutual information comparison")
        acc_plot = gr.Plot(label="Accuracy by reasoning type and baseline")

    with gr.Row():
        scatter_plot = gr.Plot(
            label="2-D failure embedding scatter (SVD component 1 vs 2)"
        )

    with gr.Row():
        failure_table = gr.Dataframe(
            headers=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"],
            label="Failure records (sorted by cluster → reasoning type)",
            interactive=False,
            wrap=True,
        )

    run_btn.click(
        fn=run_demo,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, acc_plot, scatter_plot, failure_table],
    )


if __name__ == "__main__":
    demo.launch()