from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures
from core.eval import evaluate
from core.metrics import compute_mi_scores
from core.model import DEFAULT_MODELS, query_model
from viz.plots import plot_mi_comparison
DATA_PATH = Path(__file__).parent / "data" / "carb_seed.json"
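
# Failure-record shape, inferred from usage in this file (the core.* modules own
# the authoritative definitions): dataset records carry a "reasoning_type" field,
# each failure dict returned by evaluate() carries "reasoning_type" and
# "model_id", and run_experiment() attaches a "cluster_id" after clustering.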
_DESCRIPTION = """\
## CARB Failure Observability
Research pipeline for structured failure analysis in language model reasoning.
```
CARB dataset → HF Inference API → failure extraction → MiniLM embeddings → KMeans → mutual information
```
**Central question:** do failure clusters align with *reasoning category* more than with *model identity*?
The MI comparison plot answers this directly: a larger `MI(cluster, reasoning_type)` bar relative to
`MI(cluster, model_identity)` supports the hypothesis that failure structure is organized by reasoning
difficulty, not by model choice alone.
> **Requires** `HF_TOKEN` set in Space secrets. See
> [failure-geometry-demo](https://huggingface.co/spaces/obversarystudios/failure-geometry-demo)
> for a fully self-contained version that needs no API key.
>
> Research context: [obversarystudios.org](https://obversarystudios.org)
"""
def run_experiment(
selected_models: list[str],
n_clusters: int,
) -> tuple[str, object, object]:
log_lines: list[str] = []
def log(msg: str) -> None:
log_lines.append(msg)
if not selected_models:
selected_models = DEFAULT_MODELS[:1]
    log(f"Loading dataset from {DATA_PATH.name} …")
try:
dataset = load_dataset(DATA_PATH)
except Exception as exc:
return f"Dataset error: {exc}", None, None
log(f" {len(dataset)} examples across {len({r['reasoning_type'] for r in dataset})} reasoning types.")
    log(f"Querying models: {', '.join(selected_models)} …")
try:
failures = evaluate(dataset, query_model, model_ids=selected_models)
except Exception as exc:
return f"Evaluation error: {exc}", None, None
log(f" Found {len(failures)} failures from {len(dataset) * len(selected_models)} total predictions.")
if not failures:
        log("No failures detected; all predictions were correct.")
empty_mi = {
"MI(cluster, reasoning_type)": 0.0,
"MI(cluster, model_identity)": 0.0,
}
fig = plot_mi_comparison(empty_mi)
return "\n".join(log_lines), fig, _empty_summary_table()
    log("Embedding failures with all-MiniLM-L6-v2 …")
try:
embeddings = embed_failures(failures)
except Exception as exc:
return "\n".join(log_lines) + f"\nEmbed error: {exc}", None, None
log(f" Embeddings shape: {embeddings.shape}")
    log(f"Clustering into k={n_clusters} clusters (KMeans) …")
cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
for failure, cluster_id in zip(failures, cluster_ids, strict=True):
failure["cluster_id"] = cluster_id
    counts_per_cluster = Counter(int(cid) for cid in cluster_ids)
    log(f" Cluster sizes: {dict(sorted(counts_per_cluster.items()))}")
reasoning_types = [f["reasoning_type"] for f in failures]
model_ids_list = [f["model_id"] for f in failures]
    log("Computing mutual information …")
mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
for label, score in mi_scores.items():
log(f" {label}: {score:.4f}")
fig = plot_mi_comparison(mi_scores)
summary_df = _build_summary_table(failures)
return "\n".join(log_lines), fig, summary_df
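
# The embedding and clustering steps above are delegated to core.embed and
# core.cluster. A minimal sketch of what that stage plausibly looks like,
# assuming sentence-transformers and scikit-learn (hypothetical and unused
# here; the "failure_text" field name is an assumption, not the module's API):
def _embed_and_cluster_sketch(failures: list[dict], n_clusters: int) -> list[int]:
    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import KMeans

    # all-MiniLM-L6-v2 maps each failure description to a 384-dim vector.
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = encoder.encode([f["failure_text"] for f in failures])
    # KMeans partitions the embedding space into n_clusters groups; these
    # labels play the role of cluster_ids in run_experiment().
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(embeddings)
    return labels.tolist()
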
def _build_summary_table(failures: list[dict]) -> pd.DataFrame:
counts: Counter = Counter()
for f in failures:
counts[(f["reasoning_type"], f["model_id"])] += 1
rows = [
{"reasoning_type": rtype, "model_id": mid, "failure_count": cnt}
for (rtype, mid), cnt in sorted(counts.items())
]
return pd.DataFrame(rows) if rows else _empty_summary_table()
def _empty_summary_table() -> pd.DataFrame:
return pd.DataFrame(columns=["reasoning_type", "model_id", "failure_count"])
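
# Model queries go through the HF Inference API (hence the HF_TOKEN requirement
# noted in the description). A minimal sketch of what core.model.query_model
# plausibly does, assuming huggingface_hub's InferenceClient chat-completion
# interface (hypothetical and unused here; the real prompt format and
# generation parameters live in core.model):
def _query_model_sketch(model_id: str, prompt: str) -> str:
    import os

    from huggingface_hub import InferenceClient

    client = InferenceClient(token=os.environ["HF_TOKEN"])
    response = client.chat_completion(
        model=model_id,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=128,
    )
    return response.choices[0].message.content
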
with gr.Blocks(title="CARB Failure Observability", theme=gr.themes.Soft()) as demo:
gr.Markdown(_DESCRIPTION)
with gr.Row():
with gr.Column(scale=1, min_width=260):
model_selector = gr.CheckboxGroup(
choices=DEFAULT_MODELS,
value=DEFAULT_MODELS[:1],
label="Models to query",
info="Each model runs on all 50 examples. Multiple models increase failure pool diversity.",
)
n_clusters_slider = gr.Slider(
minimum=2,
maximum=6,
step=1,
value=4,
label="KMeans clusters (k)",
                info="Should be ≤ the number of reasoning types (4).",
)
run_btn = gr.Button("Run Experiment", variant="primary", size="lg")
with gr.Column(scale=2):
status_log = gr.Textbox(
label="Pipeline log",
lines=9,
interactive=False,
                placeholder="Click 'Run Experiment' to start …",
)
with gr.Row():
mi_plot = gr.Plot(
label="Mutual information: cluster vs. reasoning type vs. model identity"
)
with gr.Row():
summary_table = gr.Dataframe(
headers=["reasoning_type", "model_id", "failure_count"],
label="Failures by reasoning type and model",
interactive=False,
)
run_btn.click(
fn=run_experiment,
inputs=[model_selector, n_clusters_slider],
outputs=[status_log, mi_plot, summary_table],
)
if __name__ == "__main__":
demo.launch()