from pathlib import Path

import gradio as gr
import pandas as pd

from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures
from core.eval import evaluate
from core.metrics import compute_mi_scores
from core.model import DEFAULT_MODELS, query_model
from viz.plots import plot_mi_comparison

DATA_PATH = Path(__file__).parent / "data" / "carb_seed.json"

_DESCRIPTION = """\
## CARB Failure Observability

Research pipeline for structured failure analysis in language model reasoning.

```
CARB dataset → HF Inference API → failure extraction → MiniLM embeddings → KMeans → mutual information
```

**Central question:** do failure clusters align with *reasoning category* more than with *model identity*?

The MI comparison plot answers this directly: a larger `MI(cluster, reasoning_type)` bar relative to
`MI(cluster, model_identity)` supports the hypothesis that failure structure is organized by reasoning
difficulty, not model choice alone.

> **Requires** `HF_TOKEN` set in Space secrets. See
> [failure-geometry-demo](https://huggingface.co/spaces/obversarystudios/failure-geometry-demo)
> for a fully self-contained version that needs no API key.
>
> Research context: [obversarystudios.org](https://obversarystudios.org)
"""


def run_experiment(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object]:
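    """Run the full pipeline: load the seed dataset, query each selected model,
    collect failing predictions, embed and cluster them, then compare
    MI(cluster, reasoning_type) against MI(cluster, model_identity).

    Returns a (pipeline log, MI comparison figure, failure summary dataframe) tuple
    matching the Gradio outputs.
    """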
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = DEFAULT_MODELS[:1]

    log(f"Loading dataset from {DATA_PATH.name} …")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nDataset error: {exc}", None, None

    log(f" {len(dataset)} examples across {len({r['reasoning_type'] for r in dataset})} reasoning types.")
    log(f"Querying models: {', '.join(selected_models)} …")

    try:
        failures = evaluate(dataset, query_model, model_ids=selected_models)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nEvaluation error: {exc}", None, None

    log(f" Found {len(failures)} failures from {len(dataset) * len(selected_models)} total predictions.")

    if not failures:
        log("No failures detected: all predictions were correct.")
        empty_mi = {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
        fig = plot_mi_comparison(empty_mi)
        return "\n".join(log_lines), fig, _empty_summary_table()

    log("Embedding failures with all-MiniLM-L6-v2 …")
    try:
        embeddings = embed_failures(failures)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nEmbed error: {exc}", None, None

    log(f" Embeddings shape: {embeddings.shape}")
    log(f"Clustering into k={n_clusters} clusters (KMeans) …")

    cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
    # Attach each failure's cluster label so it can be grouped downstream.
    for failure, cluster_id in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cluster_id

    # Tally cluster sizes for the log.
    counts_per_cluster = {}
    for cid in cluster_ids:
        counts_per_cluster[cid] = counts_per_cluster.get(cid, 0) + 1
    log(f" Cluster sizes: {dict(sorted(counts_per_cluster.items()))}")

    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f" {label}: {score:.4f}")

    fig = plot_mi_comparison(mi_scores)
    summary_df = _build_summary_table(failures)

    return "\n".join(log_lines), fig, summary_df


def _build_summary_table(failures: list[dict]) -> pd.DataFrame:
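    """Count failures per (reasoning_type, model_id) pair for the summary table."""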
    from collections import Counter
    counts: Counter = Counter()
    for f in failures:
        counts[(f["reasoning_type"], f["model_id"])] += 1

    rows = [
        {"reasoning_type": rtype, "model_id": mid, "failure_count": cnt}
        for (rtype, mid), cnt in sorted(counts.items())
    ]
    return pd.DataFrame(rows) if rows else _empty_summary_table()


def _empty_summary_table() -> pd.DataFrame:
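    """Return an empty dataframe with the summary-table column schema."""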
    return pd.DataFrame(columns=["reasoning_type", "model_id", "failure_count"])


with gr.Blocks(title="CARB Failure Observability", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=DEFAULT_MODELS,
                value=DEFAULT_MODELS[:1],
                label="Models to query",
                info="Each model runs on all 50 examples. Multiple models increase failure pool diversity.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="Should be ≤ the number of reasoning types (4).",
            )
            run_btn = gr.Button("Run Experiment", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=9,
                interactive=False,
                placeholder="Click 'Run Experiment' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(
            label="Mutual information: cluster vs. reasoning type vs. model identity"
        )

    with gr.Row():
        summary_table = gr.Dataframe(
            headers=["reasoning_type", "model_id", "failure_count"],
            label="Failures by reasoning type and model",
            interactive=False,
        )

    run_btn.click(
        fn=run_experiment,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, summary_table],
    )


if __name__ == "__main__":
    demo.launch()