# Author: Brian Moran
# Commit: Add failure geometry demo pipeline (09f4a33)
from __future__ import annotations
from collections import Counter
from typing import Any
from sklearn.metrics import mutual_info_score
def compute_mi_scores(
    cluster_ids: list[int],
    reasoning_types: list[str],
    model_ids: list[str],
) -> dict[str, float]:
    """Mutual information between cluster assignments and two labelings.

    Returns a dict with one MI score per labeling (reasoning type and
    model identity); both scores are 0.0 when there are no clusters.
    """
    keys = ("MI(cluster, reasoning_type)", "MI(cluster, model_identity)")
    # No data: report zero MI for both labelings rather than calling sklearn.
    if not cluster_ids:
        return dict.fromkeys(keys, 0.0)
    labelings = (reasoning_types, model_ids)
    return {
        key: float(mutual_info_score(cluster_ids, labels))
        for key, labels in zip(keys, labelings)
    }
def accuracy_by_type(results: list[dict[str, Any]]) -> dict[str, dict[str, float]]:
    """Per-model, per-reasoning-type accuracy.

    Each result dict must carry "model_id", "reasoning_type", and
    "is_correct"; output maps model_id -> {reasoning_type: accuracy},
    with keys emitted in sorted (model_id, reasoning_type) order.
    """
    # tallies[(model_id, rtype)] == [number correct, number seen]
    tallies: dict[tuple[str, str], list[int]] = {}
    for record in results:
        pair = (record["model_id"], record["reasoning_type"])
        hit_seen = tallies.setdefault(pair, [0, 0])
        hit_seen[1] += 1
        if record["is_correct"]:
            hit_seen[0] += 1
    accuracy: dict[str, dict[str, float]] = {}
    for model_id, rtype in sorted(tallies):
        hits, seen = tallies[(model_id, rtype)]
        # seen is always >= 1 here; guard kept for defensive parity.
        accuracy.setdefault(model_id, {})[rtype] = hits / seen if seen else 0.0
    return accuracy