"""
Failure Geometry Demo: always runnable, no API key required.

Two deliberately weak baselines (always_1, keyword_heuristic) are run against
the CARB seed dataset. Failures from both models are pooled, embedded with
TF-IDF + SVD, and clustered with KMeans; the resulting clusters are then
compared against reasoning-type and model-identity labels via mutual
information. The demo explores three questions:

1. Do failure clusters align with reasoning category (transitivity, negation,
   syllogism, distractor logic)?
2. Do they align with model identity (which baseline failed)?
3. Visually, do the 2-D SVD projections show cluster separation?

No network requests, no GPU, no API token.
"""

from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

from core.baseline import BASELINES, run_baselines
from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures, embed_for_scatter
from core.metrics import accuracy_by_type, compute_mi_scores
from viz.plots import plot_accuracy_by_type, plot_failure_scatter, plot_mi_comparison

DATA_PATH = Path(__file__).parent / "data" / "carb_demo.json"

_DESCRIPTION = """\
## Failure Geometry Demo

Structured failure analysis on the CARB reasoning dataset (**no API key required**).

Two lightweight baselines are used intentionally:

| Baseline | Strategy | Expected failure shape |
|----------|----------|------------------------|
| `always_1` | Predict 1 for everything | Fails on all false-label items (systematic bias) |
| `keyword_heuristic` | Predict 0 when the statement contains negation markers | Fails on affirmative-false and negated-true items |

Because the baselines fail *differently*, pooling their failures and clustering lets us ask:

> **Do clusters separate by reasoning type, by model identity, or both?**

Mutual information quantifies this. A larger `MI(cluster, reasoning_type)` than
`MI(cluster, model_identity)` supports the hypothesis that failure structure is
organised around reasoning difficulty, not merely around which model produced
the errors.

---

*Part of the [Obversary Studios](https://obversarystudios.org) evaluation-systems research.*
*Live pipeline version (HF Inference API):*
*[carb-observability-space](https://huggingface.co/spaces/obversarystudios/carb-observability-space)*
"""

def run_demo(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object, object, object]:
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)
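
    # Fall back to running every registered baseline when the checkbox
    # group comes back empty.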
    if not selected_models:
        selected_models = list(BASELINES.keys())

    log(f"Loading dataset: {DATA_PATH.name}")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        err = f"Dataset error: {exc}"
        return err, None, None, None, None
    log(f" {len(dataset)} examples / {len({r['reasoning_type'] for r in dataset})} types")

    log(f"Running baselines: {', '.join(selected_models)}")
    all_results, failures = run_baselines(dataset, selected=selected_models)
    log(f" {len(all_results)} predictions · {len(failures)} failures "
        f"({100 * len(failures) / max(len(all_results), 1):.0f}% failure rate)")
    if len(failures) < 3:
        log("Too few failures to cluster. Try selecting both baselines.")
        return "\n".join(log_lines), None, None, None, _empty_table()
log("Embedding failures (TF-IDF + SVD) …")
embeddings = embed_failures(failures, n_components=32)
scatter_coords = embed_for_scatter(failures)
log(f" Embedding shape: {embeddings.shape}")
log(f"Clustering into k={n_clusters} (KMeans) …")
cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
for failure, cid in zip(failures, cluster_ids, strict=True):
failure["cluster_id"] = cid
from collections import Counter
cluster_dist = dict(Counter(cluster_ids))
log(f" Sizes: { {k: cluster_dist[k] for k in sorted(cluster_dist)} }")
reasoning_types = [f["reasoning_type"] for f in failures]
model_ids_list = [f["model_id"] for f in failures]
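
    # If MI(cluster, reasoning_type) exceeds MI(cluster, model_identity),
    # cluster structure tracks reasoning difficulty more than which
    # baseline produced the error.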
log("Computing mutual information …")
mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
for label, score in mi_scores.items():
log(f" {label}: {score:.4f}")
acc_table = accuracy_by_type(all_results)
fig_mi = plot_mi_comparison(mi_scores)
fig_acc = plot_accuracy_by_type(acc_table)
fig_scatter = plot_failure_scatter(scatter_coords, cluster_ids, reasoning_types, model_ids_list)
summary_df = _build_failure_table(failures)
return "\n".join(log_lines), fig_mi, fig_acc, fig_scatter, summary_df
def _build_failure_table(failures: list[dict]) -> pd.DataFrame:
    rows = [
        {
            "cluster": f["cluster_id"],
            "reasoning_type": f["reasoning_type"],
            "model_id": f["model_id"],
            "expected": f["y"],
            "predicted": f["prediction"],
            "input_preview": f["x"][:80] + ("…" if len(f["x"]) > 80 else ""),
        }
        for f in sorted(failures, key=lambda r: (r["cluster_id"], r["reasoning_type"]))
    ]
    return pd.DataFrame(rows) if rows else _empty_table()


def _empty_table() -> pd.DataFrame:
    return pd.DataFrame(
        columns=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"]
    )
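

# Gradio UI: controls in a narrow left column, pipeline log and plots on
# the right.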
with gr.Blocks(title="Failure Geometry Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=list(BASELINES.keys()),
                value=list(BASELINES.keys()),
                label="Baselines to run",
                info="Select one to isolate a single failure geometry; both to see model-identity MI.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="4 matches the number of reasoning types in the dataset.",
            )
            run_btn = gr.Button("Run Analysis", variant="primary", size="lg")
        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=10,
                interactive=False,
                placeholder="Click 'Run Analysis' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(label="Mutual information comparison")
        acc_plot = gr.Plot(label="Accuracy by reasoning type and baseline")

    with gr.Row():
        scatter_plot = gr.Plot(
            label="2-D failure embedding scatter (SVD component 1 vs 2)"
        )

    with gr.Row():
        failure_table = gr.Dataframe(
            headers=["cluster", "reasoning_type", "model_id", "expected", "predicted", "input_preview"],
            label="Failure records (sorted by cluster → reasoning type)",
            interactive=False,
            wrap=True,
        )

    run_btn.click(
        fn=run_demo,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, acc_plot, scatter_plot, failure_table],
    )


if __name__ == "__main__":
    demo.launch()