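"""Gradio entry point for the CARB Failure Observability Space.

Pipeline: load the CARB seed dataset, query the selected models through the
HF Inference API, embed the failing predictions with all-MiniLM-L6-v2,
cluster the embeddings with KMeans, and compare mutual information between
cluster assignments, reasoning types, and model identities.
"""
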
from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures
from core.eval import evaluate
from core.metrics import compute_mi_scores
from core.model import DEFAULT_MODELS, query_model
from viz.plots import plot_mi_comparison


DATA_PATH = Path(__file__).parent / "data" / "carb_seed.json"

_DESCRIPTION = """\
## CARB Failure Observability

Research pipeline for structured failure analysis in language model reasoning.

```
CARB dataset → HF Inference API → failure extraction → MiniLM embeddings → KMeans → mutual information
```

**Central question:** do failure clusters align with *reasoning category* more than with *model identity*?

The MI comparison plot answers this directly: a larger `MI(cluster, reasoning_type)` bar relative to
`MI(cluster, model_identity)` supports the hypothesis that failure structure is organized by reasoning
difficulty, not model choice alone.

> **Requires** `HF_TOKEN` set in Space secrets. See
> [failure-geometry-demo](https://huggingface.co/spaces/obversarystudios/failure-geometry-demo)
> for a fully self-contained version that needs no API key.
>
> Research context: [obversarystudios.org](https://obversarystudios.org)
"""


def run_experiment(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object]:
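    """Run the full failure-analysis pipeline.

    Loads the CARB dataset, queries the selected models, embeds and clusters
    the resulting failures, and compares MI(cluster, reasoning_type) against
    MI(cluster, model_identity).

    Returns:
        A (log text, MI figure, summary DataFrame) tuple; the figure and
        table are None when an early pipeline step fails.
    """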
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = DEFAULT_MODELS[:1]
    # Gradio sliders can deliver floats even with an integer step; coerce for KMeans.
    n_clusters = int(n_clusters)

    log(f"Loading dataset from {DATA_PATH.name} …")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        return f"Dataset error: {exc}", None, None

    log(f"  {len(dataset)} examples across {len({r['reasoning_type'] for r in dataset})} reasoning types.")
    log(f"Querying models: {', '.join(selected_models)} …")

    try:
        failures = evaluate(dataset, query_model, model_ids=selected_models)
    except Exception as exc:
        return f"Evaluation error: {exc}", None, None

    log(f"  Found {len(failures)} failures from {len(dataset) * len(selected_models)} total predictions.")

    if not failures:
        log("No failures detected β€” all predictions were correct.")
        empty_mi = {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
        fig = plot_mi_comparison(empty_mi)
        return "\n".join(log_lines), fig, _empty_summary_table()

    log("Embedding failures with all-MiniLM-L6-v2 …")
    try:
        embeddings = embed_failures(failures)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nEmbed error: {exc}", None, None

    log(f"  Embeddings shape: {embeddings.shape}")
    log(f"Clustering into k={n_clusters} clusters (KMeans) …")

    cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
    for failure, cluster_id in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cluster_id

    counts_per_cluster = Counter(int(cid) for cid in cluster_ids)
    log(f"  Cluster sizes: {dict(sorted(counts_per_cluster.items()))}")

    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f"  {label}: {score:.4f}")

    fig = plot_mi_comparison(mi_scores)
    summary_df = _build_summary_table(failures)

    return "\n".join(log_lines), fig, summary_df


def _build_summary_table(failures: list[dict]) -> pd.DataFrame:
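    """Aggregate failures into one row per (reasoning_type, model_id) pair."""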
    counts: Counter = Counter()
    for f in failures:
        counts[(f["reasoning_type"], f["model_id"])] += 1

    rows = [
        {"reasoning_type": rtype, "model_id": mid, "failure_count": cnt}
        for (rtype, mid), cnt in sorted(counts.items())
    ]
    return pd.DataFrame(rows) if rows else _empty_summary_table()


def _empty_summary_table() -> pd.DataFrame:
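    """Return an empty DataFrame with the expected summary columns."""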
    return pd.DataFrame(columns=["reasoning_type", "model_id", "failure_count"])


with gr.Blocks(title="CARB Failure Observability", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=DEFAULT_MODELS,
                value=DEFAULT_MODELS[:1],
                label="Models to query",
                info="Each model runs on all 50 examples. Multiple models increase failure pool diversity.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="Should be ≀ number of reasoning types (4).",
            )
            run_btn = gr.Button("Run Experiment", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=9,
                interactive=False,
                placeholder="Click 'Run Experiment' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(
            label="Mutual information: cluster vs. reasoning type vs. model identity"
        )

    with gr.Row():
        summary_table = gr.Dataframe(
            headers=["reasoning_type", "model_id", "failure_count"],
            label="Failures by reasoning type and model",
            interactive=False,
        )

    run_btn.click(
        fn=run_experiment,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, summary_table],
    )


if __name__ == "__main__":
    demo.launch()