from pathlib import Path
import gradio as gr
import pandas as pd
from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures
from core.eval import evaluate
from core.metrics import compute_mi_scores
from core.model import DEFAULT_MODELS, query_model
from viz.plots import plot_mi_comparison
DATA_PATH = Path(__file__).parent / "data" / "carb_seed.json"
_DESCRIPTION = """\
## CARB Failure Observability
Research pipeline for structured failure analysis in language model reasoning.
```
CARB dataset → HF Inference API → failure extraction → MiniLM embeddings → KMeans → mutual information
```
**Central question:** do failure clusters align with *reasoning category* more than with *model identity*?
The MI comparison plot answers this directly: a larger `MI(cluster, reasoning_type)` bar relative to
`MI(cluster, model_identity)` supports the hypothesis that failure structure is organized by reasoning
difficulty, not model choice alone.
> **Requires** `HF_TOKEN` set in Space secrets. See
> [failure-geometry-demo](https://huggingface.co/spaces/obversarystudios/failure-geometry-demo)
> for a fully self-contained version that needs no API key.
>
> Research context: [obversarystudios.org](https://obversarystudios.org)
"""
def run_experiment(
selected_models: list[str],
n_clusters: int,
) -> tuple[str, object, object]:
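    """Run the full failure pipeline and return (pipeline log, MI figure, summary table).

    If any stage raises, the log string carries the error and the figure/table
    outputs are returned as None.
    """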
log_lines: list[str] = []
def log(msg: str) -> None:
log_lines.append(msg)
if not selected_models:
selected_models = DEFAULT_MODELS[:1]
log(f"Loading dataset from {DATA_PATH.name} β¦")
try:
dataset = load_dataset(DATA_PATH)
except Exception as exc:
return f"Dataset error: {exc}", None, None
log(f" {len(dataset)} examples across {len({r['reasoning_type'] for r in dataset})} reasoning types.")
log(f"Querying models: {', '.join(selected_models)} β¦")
try:
failures = evaluate(dataset, query_model, model_ids=selected_models)
except Exception as exc:
return f"Evaluation error: {exc}", None, None
log(f" Found {len(failures)} failures from {len(dataset) * len(selected_models)} total predictions.")
if not failures:
log("No failures detected β all predictions were correct.")
empty_mi = {
"MI(cluster, reasoning_type)": 0.0,
"MI(cluster, model_identity)": 0.0,
}
fig = plot_mi_comparison(empty_mi)
return "\n".join(log_lines), fig, _empty_summary_table()
log("Embedding failures with all-MiniLM-L6-v2 β¦")
try:
embeddings = embed_failures(failures)
except Exception as exc:
return "\n".join(log_lines) + f"\nEmbed error: {exc}", None, None
log(f" Embeddings shape: {embeddings.shape}")
log(f"Clustering into k={n_clusters} clusters (KMeans) β¦")
cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
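    # Attach each failure's cluster assignment for downstream grouping and logging.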
for failure, cluster_id in zip(failures, cluster_ids, strict=True):
failure["cluster_id"] = cluster_id
counts_per_cluster = {}
for cid in cluster_ids:
counts_per_cluster[cid] = counts_per_cluster.get(cid, 0) + 1
log(f" Cluster sizes: { {k: counts_per_cluster[k] for k in sorted(counts_per_cluster)} }")
reasoning_types = [f["reasoning_type"] for f in failures]
model_ids_list = [f["model_id"] for f in failures]
log("Computing mutual information β¦")
mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
for label, score in mi_scores.items():
log(f" {label}: {score:.4f}")
fig = plot_mi_comparison(mi_scores)
summary_df = _build_summary_table(failures)
return "\n".join(log_lines), fig, summary_df
def _build_summary_table(failures: list[dict]) -> pd.DataFrame:
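    """Aggregate failures into per-(reasoning_type, model_id) counts for the summary table."""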
from collections import Counter
counts: Counter = Counter()
for f in failures:
counts[(f["reasoning_type"], f["model_id"])] += 1
rows = [
{"reasoning_type": rtype, "model_id": mid, "failure_count": cnt}
for (rtype, mid), cnt in sorted(counts.items())
]
return pd.DataFrame(rows) if rows else _empty_summary_table()
def _empty_summary_table() -> pd.DataFrame:
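    """Return an empty DataFrame with the summary-table columns."""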
return pd.DataFrame(columns=["reasoning_type", "model_id", "failure_count"])
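# Gradio UI: model and cluster controls on the left; pipeline log, MI plot and
# summary table on the right.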
with gr.Blocks(title="CARB Failure Observability", theme=gr.themes.Soft()) as demo:
gr.Markdown(_DESCRIPTION)
with gr.Row():
with gr.Column(scale=1, min_width=260):
model_selector = gr.CheckboxGroup(
choices=DEFAULT_MODELS,
value=DEFAULT_MODELS[:1],
label="Models to query",
info="Each model runs on all 50 examples. Multiple models increase failure pool diversity.",
)
n_clusters_slider = gr.Slider(
minimum=2,
maximum=6,
step=1,
value=4,
label="KMeans clusters (k)",
info="Should be β€ number of reasoning types (4).",
)
run_btn = gr.Button("Run Experiment", variant="primary", size="lg")
with gr.Column(scale=2):
status_log = gr.Textbox(
label="Pipeline log",
lines=9,
interactive=False,
placeholder="Click 'Run Experiment' to start β¦",
)
with gr.Row():
mi_plot = gr.Plot(
label="Mutual information: cluster vs. reasoning type vs. model identity"
)
with gr.Row():
summary_table = gr.Dataframe(
headers=["reasoning_type", "model_id", "failure_count"],
label="Failures by reasoning type and model",
interactive=False,
)
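    # Wire the button to the pipeline; outputs map to (log text, MI figure, summary table).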
run_btn.click(
fn=run_experiment,
inputs=[model_selector, n_clusters_slider],
outputs=[status_log, mi_plot, summary_table],
)
if __name__ == "__main__":
demo.launch()