from __future__ import annotations

from collections import Counter
from typing import Any

from sklearn.metrics import mutual_info_score


def compute_mi_scores(
    cluster_ids: list[int],
    reasoning_types: list[str],
    model_ids: list[str],
) -> dict[str, float]:
    """Mutual information between cluster assignments and each label set."""
    if not cluster_ids:
        # Avoid passing empty label sequences to sklearn.
        return {
            "MI(cluster, reasoning_type)": 0.0,
            "MI(cluster, model_identity)": 0.0,
        }
    return {
        "MI(cluster, reasoning_type)": float(mutual_info_score(cluster_ids, reasoning_types)),
        "MI(cluster, model_identity)": float(mutual_info_score(cluster_ids, model_ids)),
    }


def accuracy_by_type(results: list[dict[str, Any]]) -> dict[str, dict[str, float]]:
    """Per-model, per-reasoning-type accuracy."""
    totals: Counter[tuple[str, str]] = Counter()
    correct: Counter[tuple[str, str]] = Counter()
    for r in results:
        key = (r["model_id"], r["reasoning_type"])
        totals[key] += 1
        if r["is_correct"]:
            correct[key] += 1
    out: dict[str, dict[str, float]] = {}
    for (model_id, rtype), total in sorted(totals.items()):
        out.setdefault(model_id, {})[rtype] = correct[(model_id, rtype)] / total if total else 0.0
    return out
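

# A minimal usage sketch, assuming result records shaped like the dicts that
# accuracy_by_type expects; the field values below are illustrative, not real
# evaluation data. cluster_ids must be aligned index-for-index with the label
# lists passed to compute_mi_scores. Note that sklearn's mutual_info_score
# reports MI in nats.
if __name__ == "__main__":
    results = [
        {"model_id": "model_a", "reasoning_type": "deductive", "is_correct": True},
        {"model_id": "model_a", "reasoning_type": "deductive", "is_correct": False},
        {"model_id": "model_b", "reasoning_type": "inductive", "is_correct": True},
    ]
    # Hypothetical cluster assignments, one per record.
    cluster_ids = [0, 0, 1]
    reasoning_types = [r["reasoning_type"] for r in results]
    model_ids = [r["model_id"] for r in results]

    print(compute_mi_scores(cluster_ids, reasoning_types, model_ids))
    print(accuracy_by_type(results))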