from __future__ import annotations

from collections import Counter
from typing import Any

from sklearn.metrics import mutual_info_score


def compute_mi_scores(
    cluster_ids: list[int],
    reasoning_types: list[str],
    model_ids: list[str],
) -> dict[str, float]:
    """Mutual information (in nats) between clusters and reasoning type / model identity."""
    if not cluster_ids:
        # Degenerate case: no samples, so report zero mutual information for both pairings.
        return {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
    return {
        "MI(cluster, reasoning_type)": float(mutual_info_score(cluster_ids, reasoning_types)),
        "MI(cluster, model_identity)": float(mutual_info_score(cluster_ids, model_ids)),
    }
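# Note: sklearn's mutual_info_score uses natural logarithms, so both scores are in nats.
# Comparing the two values gives a rough, heuristic read on whether clusters align more with
# reasoning type or with model identity; it is not an exact comparison, since the two label
# sets can have different entropies.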


def accuracy_by_type(results: list[dict[str, Any]]) -> dict[str, dict[str, float]]:
    """Per-model, per-reasoning-type accuracy."""
    totals: Counter[tuple[str, str]] = Counter()
    correct: Counter[tuple[str, str]] = Counter()

    # Tally attempts and correct answers per (model_id, reasoning_type) pair.
    for r in results:
        key = (r["model_id"], r["reasoning_type"])
        totals[key] += 1
        if r["is_correct"]:
            correct[key] += 1

    # Convert the raw counts into per-type accuracy, grouped by model.
    out: dict[str, dict[str, float]] = {}
    for (model_id, rtype), total in sorted(totals.items()):
        out.setdefault(model_id, {})[rtype] = correct[(model_id, rtype)] / total if total else 0.0

    return out
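

# Minimal usage sketch (illustrative only): the records and label lists below are hypothetical
# stand-ins for real evaluation output, just to show the expected shapes and return values.
if __name__ == "__main__":
    demo_results = [
        {"model_id": "model_a", "reasoning_type": "deductive", "is_correct": True},
        {"model_id": "model_a", "reasoning_type": "deductive", "is_correct": False},
        {"model_id": "model_b", "reasoning_type": "inductive", "is_correct": True},
    ]
    print(accuracy_by_type(demo_results))
    # -> {'model_a': {'deductive': 0.5}, 'model_b': {'inductive': 1.0}}

    print(compute_mi_scores(
        cluster_ids=[0, 0, 1, 1],
        reasoning_types=["deductive", "deductive", "inductive", "inductive"],
        model_ids=["model_a", "model_b", "model_a", "model_b"],
    ))
    # Clusters track reasoning type exactly (MI about ln 2, roughly 0.693 nats) and are
    # independent of model identity (MI of 0.0).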