from __future__ import annotations

from collections import Counter
from typing import Any

from sklearn.metrics import mutual_info_score


def compute_mi_scores(
    cluster_ids: list[int],
    reasoning_types: list[str],
    model_ids: list[str],
) -> dict[str, float]:
    """Mutual information between cluster assignments and each label set."""
    if not cluster_ids:
        # Avoid passing empty label sequences to sklearn.
        return {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
    return {
        "MI(cluster, reasoning_type)": float(mutual_info_score(cluster_ids, reasoning_types)),
        "MI(cluster, model_identity)": float(mutual_info_score(cluster_ids, model_ids)),
    }


def accuracy_by_type(results: list[dict[str, Any]]) -> dict[str, dict[str, float]]:
    """Per-model, per-reasoning-type accuracy."""
    totals: Counter[tuple[str, str]] = Counter()
    correct: Counter[tuple[str, str]] = Counter()
    for r in results:
        key = (r["model_id"], r["reasoning_type"])
        totals[key] += 1
        if r["is_correct"]:
            correct[key] += 1
    out: dict[str, dict[str, float]] = {}
    for (model_id, rtype), total in sorted(totals.items()):
        out.setdefault(model_id, {})[rtype] = correct[(model_id, rtype)] / total if total else 0.0
    return out
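

# A minimal usage sketch, assuming result records shaped like the dicts that
# accuracy_by_type expects; the field values below are illustrative, not real
# evaluation data. cluster_ids must be aligned index-for-index with the label
# lists passed to compute_mi_scores. Note that sklearn's mutual_info_score
# reports MI in nats.
if __name__ == "__main__":
    results = [
        {"model_id": "model_a", "reasoning_type": "deductive", "is_correct": True},
        {"model_id": "model_a", "reasoning_type": "deductive", "is_correct": False},
        {"model_id": "model_b", "reasoning_type": "inductive", "is_correct": True},
    ]
    # Hypothetical cluster assignments, one per record.
    cluster_ids = [0, 0, 1]
    reasoning_types = [r["reasoning_type"] for r in results]
    model_ids = [r["model_id"] for r in results]

    print(compute_mi_scores(cluster_ids, reasoning_types, model_ids))
    print(accuracy_by_type(results))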