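"""Gradio entry point for the CARB Failure Observability Space.

Pipeline: load the CARB seed dataset, query the selected models through the
HF Inference API, embed the failing predictions with all-MiniLM-L6-v2,
cluster the embeddings with KMeans, and compare mutual information between
cluster assignments, reasoning types, and model identities.
"""
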
from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures
from core.eval import evaluate
from core.metrics import compute_mi_scores
from core.model import DEFAULT_MODELS, query_model
from viz.plots import plot_mi_comparison


DATA_PATH = Path(__file__).parent / "data" / "carb_seed.json"

_DESCRIPTION = """\
## CARB Failure Observability

Research pipeline for structured failure analysis in language model reasoning.

```
CARB dataset → HF Inference API → failure extraction → MiniLM embeddings → KMeans → mutual information
```

**Central question:** do failure clusters align with *reasoning category* more than with *model identity*?

The MI comparison plot answers this directly: a larger `MI(cluster, reasoning_type)` bar relative to
`MI(cluster, model_identity)` supports the hypothesis that failure structure is organized by reasoning
difficulty, not model choice alone.

> **Requires** `HF_TOKEN` set in Space secrets. See
> [failure-geometry-demo](https://huggingface.co/spaces/obversarystudios/failure-geometry-demo)
> for a fully self-contained version that needs no API key.
>
> Research context: [obversarystudios.org](https://obversarystudios.org)
"""


def run_experiment(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object]:
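    """Run the full failure-analysis pipeline.

    Loads the CARB dataset, queries the selected models, embeds and clusters
    the resulting failures, and compares MI(cluster, reasoning_type) against
    MI(cluster, model_identity).

    Returns:
        A (log text, MI figure, summary DataFrame) tuple; the figure and
        table are None when an early pipeline step fails.
    """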
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = DEFAULT_MODELS[:1]
    # Gradio sliders can deliver floats even with an integer step; coerce for KMeans.
    n_clusters = int(n_clusters)

    log(f"Loading dataset from {DATA_PATH.name} …")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        return f"Dataset error: {exc}", None, None

    log(f"  {len(dataset)} examples across {len({r['reasoning_type'] for r in dataset})} reasoning types.")
    log(f"Querying models: {', '.join(selected_models)} …")

    try:
        failures = evaluate(dataset, query_model, model_ids=selected_models)
    except Exception as exc:
        return f"Evaluation error: {exc}", None, None

    log(f"  Found {len(failures)} failures from {len(dataset) * len(selected_models)} total predictions.")

    if not failures:
        log("No failures detected β€” all predictions were correct.")
        empty_mi = {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
        fig = plot_mi_comparison(empty_mi)
        return "\n".join(log_lines), fig, _empty_summary_table()

    log("Embedding failures with all-MiniLM-L6-v2 …")
    try:
        embeddings = embed_failures(failures)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nEmbed error: {exc}", None, None

    log(f"  Embeddings shape: {embeddings.shape}")
    log(f"Clustering into k={n_clusters} clusters (KMeans) …")

    cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
    for failure, cluster_id in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cluster_id

    counts_per_cluster = Counter(int(cid) for cid in cluster_ids)
    log(f"  Cluster sizes: {dict(sorted(counts_per_cluster.items()))}")

    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f"  {label}: {score:.4f}")

    fig = plot_mi_comparison(mi_scores)
    summary_df = _build_summary_table(failures)

    return "\n".join(log_lines), fig, summary_df


def _build_summary_table(failures: list[dict]) -> pd.DataFrame:
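    """Aggregate failures into one row per (reasoning_type, model_id) pair."""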
    counts: Counter = Counter()
    for f in failures:
        counts[(f["reasoning_type"], f["model_id"])] += 1

    rows = [
        {"reasoning_type": rtype, "model_id": mid, "failure_count": cnt}
        for (rtype, mid), cnt in sorted(counts.items())
    ]
    return pd.DataFrame(rows) if rows else _empty_summary_table()


def _empty_summary_table() -> pd.DataFrame:
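    """Return an empty DataFrame with the expected summary columns."""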
    return pd.DataFrame(columns=["reasoning_type", "model_id", "failure_count"])


with gr.Blocks(title="CARB Failure Observability", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=DEFAULT_MODELS,
                value=DEFAULT_MODELS[:1],
                label="Models to query",
                info="Each model runs on all 50 examples. Multiple models increase failure pool diversity.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="Should be ≀ number of reasoning types (4).",
            )
            run_btn = gr.Button("Run Experiment", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=9,
                interactive=False,
                placeholder="Click 'Run Experiment' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(
            label="Mutual information: cluster vs. reasoning type vs. model identity"
        )

    with gr.Row():
        summary_table = gr.Dataframe(
            headers=["reasoning_type", "model_id", "failure_count"],
            label="Failures by reasoning type and model",
            interactive=False,
        )

    run_btn.click(
        fn=run_experiment,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, summary_table],
    )


if __name__ == "__main__":
    demo.launch()