from pathlib import Path

import gradio as gr
import pandas as pd

from core.cluster import cluster_embeddings
from core.dataset import load_dataset
from core.embed import embed_failures
from core.eval import evaluate
from core.metrics import compute_mi_scores
from core.model import DEFAULT_MODELS, query_model
from viz.plots import plot_mi_comparison

DATA_PATH = Path(__file__).parent / "data" / "carb_seed.json"

_DESCRIPTION = """\
## CARB Failure Observability

Research pipeline for structured failure analysis in language model reasoning.

```
CARB dataset → HF Inference API → failure extraction → MiniLM embeddings → KMeans → mutual information
```

**Central question:** do failure clusters align with *reasoning category* more than with *model identity*?

The MI comparison plot answers this directly: a larger `MI(cluster, reasoning_type)` bar relative to
`MI(cluster, model_identity)` supports the hypothesis that failure structure is organized by reasoning
difficulty, not model choice alone.

> **Requires** `HF_TOKEN` set in Space secrets. See
> [failure-geometry-demo](https://huggingface.co/spaces/obversarystudios/failure-geometry-demo)
> for a fully self-contained version that needs no API key.
>
> Research context: [obversarystudios.org](https://obversarystudios.org)
"""


def run_experiment(
    selected_models: list[str],
    n_clusters: int,
) -> tuple[str, object, object]:
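    """Run the full pipeline: load the seed dataset, query each selected model,
    collect failing predictions, embed and cluster them, then compare
    MI(cluster, reasoning_type) against MI(cluster, model_identity).

    Returns a (pipeline log, MI comparison figure, failure summary dataframe) tuple
    matching the Gradio outputs.
    """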
    log_lines: list[str] = []

    def log(msg: str) -> None:
        log_lines.append(msg)

    if not selected_models:
        selected_models = DEFAULT_MODELS[:1]

    log(f"Loading dataset from {DATA_PATH.name} …")
    try:
        dataset = load_dataset(DATA_PATH)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nDataset error: {exc}", None, None

    log(f" {len(dataset)} examples across {len({r['reasoning_type'] for r in dataset})} reasoning types.")
    log(f"Querying models: {', '.join(selected_models)} …")

    try:
        failures = evaluate(dataset, query_model, model_ids=selected_models)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nEvaluation error: {exc}", None, None

    log(f" Found {len(failures)} failures from {len(dataset) * len(selected_models)} total predictions.")

    if not failures:
        log("No failures detected: all predictions were correct.")
        empty_mi = {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
        fig = plot_mi_comparison(empty_mi)
        return "\n".join(log_lines), fig, _empty_summary_table()

    log("Embedding failures with all-MiniLM-L6-v2 …")
    try:
        embeddings = embed_failures(failures)
    except Exception as exc:
        return "\n".join(log_lines) + f"\nEmbed error: {exc}", None, None

    log(f" Embeddings shape: {embeddings.shape}")
    log(f"Clustering into k={n_clusters} clusters (KMeans) …")

    cluster_ids = cluster_embeddings(embeddings, n_clusters=n_clusters)
    # Attach each failure's cluster label so it can be grouped downstream.
    for failure, cluster_id in zip(failures, cluster_ids, strict=True):
        failure["cluster_id"] = cluster_id

    # Tally cluster sizes for the log.
    counts_per_cluster = {}
    for cid in cluster_ids:
        counts_per_cluster[cid] = counts_per_cluster.get(cid, 0) + 1
    log(f" Cluster sizes: {dict(sorted(counts_per_cluster.items()))}")

    reasoning_types = [f["reasoning_type"] for f in failures]
    model_ids_list = [f["model_id"] for f in failures]

    log("Computing mutual information …")
    mi_scores = compute_mi_scores(cluster_ids, reasoning_types, model_ids_list)
    for label, score in mi_scores.items():
        log(f" {label}: {score:.4f}")

    fig = plot_mi_comparison(mi_scores)
    summary_df = _build_summary_table(failures)

    return "\n".join(log_lines), fig, summary_df


def _build_summary_table(failures: list[dict]) -> pd.DataFrame:
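    """Count failures per (reasoning_type, model_id) pair for the summary table."""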
    from collections import Counter
    counts: Counter = Counter()
    for f in failures:
        counts[(f["reasoning_type"], f["model_id"])] += 1

    rows = [
        {"reasoning_type": rtype, "model_id": mid, "failure_count": cnt}
        for (rtype, mid), cnt in sorted(counts.items())
    ]
    return pd.DataFrame(rows) if rows else _empty_summary_table()


def _empty_summary_table() -> pd.DataFrame:
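    """Return an empty dataframe with the summary-table column schema."""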
    return pd.DataFrame(columns=["reasoning_type", "model_id", "failure_count"])


with gr.Blocks(title="CARB Failure Observability", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1, min_width=260):
            model_selector = gr.CheckboxGroup(
                choices=DEFAULT_MODELS,
                value=DEFAULT_MODELS[:1],
                label="Models to query",
                info="Each model runs on all 50 examples. Multiple models increase failure pool diversity.",
            )
            n_clusters_slider = gr.Slider(
                minimum=2,
                maximum=6,
                step=1,
                value=4,
                label="KMeans clusters (k)",
                info="Should be ≤ the number of reasoning types (4).",
            )
            run_btn = gr.Button("Run Experiment", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_log = gr.Textbox(
                label="Pipeline log",
                lines=9,
                interactive=False,
                placeholder="Click 'Run Experiment' to start …",
            )

    with gr.Row():
        mi_plot = gr.Plot(
            label="Mutual information: cluster vs. reasoning type vs. model identity"
        )

    with gr.Row():
        summary_table = gr.Dataframe(
            headers=["reasoning_type", "model_id", "failure_count"],
            label="Failures by reasoning type and model",
            interactive=False,
        )

    run_btn.click(
        fn=run_experiment,
        inputs=[model_selector, n_clusters_slider],
        outputs=[status_log, mi_plot, summary_table],
    )


if __name__ == "__main__":
    demo.launch()