from pathlib import Path
from collections import Counter

import gradio as gr
import pandas as pd

from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures
from core.eval import evaluate
from core.metrics import compute_mi_scores
from core.model import DEFAULT_MODELS, query_model
from viz.plots import plot_mi_comparison

DATA_PATH = Path(__file__).parent / "data" / "carb_seed.json"

_DESCRIPTION = """\
## CARB Failure Observability

Research pipeline for structured failure analysis in language model reasoning.

```
CARB dataset → HF Inference API → failure extraction → MiniLM embeddings → KMeans → mutual information
```

**Central question:** do failure clusters align with *reasoning category* more than
with *model identity*? The MI comparison plot answers this directly — a larger
`MI(cluster, reasoning_type)` bar relative to `MI(cluster, model_identity)` supports
the hypothesis that failure structure is organized by reasoning difficulty, not model
choice alone.

> **Requires** `HF_TOKEN` set in Space secrets. See
> [failure-geometry-demo](https://huggingface.co/spaces/obversarystudios/failure-geometry-demo)
> for a fully self-contained version that needs no API key.
>
> Research context: [obversarystudios.org](https://obversarystudios.org)
"""


def run_experiment(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object]:
    """Run the full pipeline: query models, extract failures, embed, cluster, score MI."""
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = DEFAULT_MODELS[:1]

    # 1. Load the CARB seed dataset.
    log(f"Loading dataset from {DATA_PATH.name} …")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        return f"Dataset error: {exc}", None, None
    log(f" {len(dataset)} examples across {len({r['reasoning_type'] for r in dataset})} reasoning types.")

    # 2. Query each selected model and collect incorrect predictions.
    log(f"Querying models: {', '.join(selected_models)} …")
    try:
        failures = evaluate(dataset, query_model, model_ids=selected_models)
    except Exception as exc:
        return f"Evaluation error: {exc}", None, None
    log(f" Found {len(failures)} failures from {len(dataset) * len(selected_models)} total predictions.")

    if not failures:
        log("No failures detected — all predictions were correct.")
        empty_mi = {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
        fig = plot_mi_comparison(empty_mi)
        return "\n".join(log_lines), fig, _empty_summary_table()

    # 3. Embed the failure texts.
    log("Embedding failures with all-MiniLM-L6-v2 …")
    try:
        embeddings = embed_failures(failures)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nEmbed error: {exc}", None, None
    log(f" Embeddings shape: {embeddings.shape}")

    # 4. Cluster the embeddings and attach cluster ids to each failure record.
    log(f"Clustering into k={n_clusters} clusters (KMeans) …")
    cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
    for failure, cluster_id in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cluster_id

    counts_per_cluster = {}
    for cid in cluster_ids:
        counts_per_cluster[cid] = counts_per_cluster.get(cid, 0) + 1
    log(f" Cluster sizes: { {k: counts_per_cluster[k] for k in sorted(counts_per_cluster)} }")

    # 5. Compare cluster assignments against reasoning type and model identity.
    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f" {label}: {score:.4f}")

    fig = plot_mi_comparison(mi_scores)
    summary_df = _build_summary_table(failures)
    return "\n".join(log_lines), fig, summary_df


def _build_summary_table(failures: list[dict]) -> pd.DataFrame:
    """Count failures per (reasoning_type, model_id) pair."""
    counts: Counter = Counter()
    for f in failures:
        counts[(f["reasoning_type"], f["model_id"])] += 1
    rows = [
        {"reasoning_type": rtype, "model_id": mid, "failure_count": cnt}
        for (rtype, mid), cnt in sorted(counts.items())
    ]
    return pd.DataFrame(rows) if rows else _empty_summary_table()


def _empty_summary_table() -> pd.DataFrame:
    return pd.DataFrame(columns=["reasoning_type", "model_id", "failure_count"])


with gr.Blocks(title="CARB Failure Observability", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=DEFAULT_MODELS,
                value=DEFAULT_MODELS[:1],
                label="Models to query",
                info="Each model runs on all 50 examples. Multiple models increase failure pool diversity.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="Should be ≤ number of reasoning types (4).",
            )
            run_btn = gr.Button("Run Experiment", variant="primary", size="lg")
        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=9,
                interactive=False,
                placeholder="Click 'Run Experiment' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(
            label="Mutual information: cluster vs. reasoning type vs. model identity"
        )
    with gr.Row():
        summary_table = gr.Dataframe(
            headers=["reasoning_type", "model_id", "failure_count"],
            label="Failures by reasoning type and model",
            interactive=False,
        )

    run_btn.click(
        fn=run_experiment,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, summary_table],
    )

if __name__ == "__main__":
    demo.launch()
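
# Note on the MI comparison described in _DESCRIPTION: the implementation of
# `core.metrics.compute_mi_scores` is not part of this file. A minimal sketch of the
# idea, assuming scikit-learn is available and that "MI" means the mutual information
# between the discrete label sequences (this is an illustration, not the project's
# actual implementation):
#
#     from sklearn.metrics import mutual_info_score
#
#     def compute_mi_scores_sketch(cluster_ids, reasoning_types, model_ids):
#         # Higher MI(cluster, reasoning_type) than MI(cluster, model_identity)
#         # would support the hypothesis stated in _DESCRIPTION.
#         return {
#             "MI(cluster, reasoning_type)": mutual_info_score(cluster_ids, reasoning_types),
#             "MI(cluster, model_identity)": mutual_info_score(cluster_ids, model_ids),
#         }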