"""
Evaluation metrics for anti-spoofing.

The standard metric in this field is Equal Error Rate (EER):
    The threshold at which the false-accept rate (FAR) equals
    the false-reject rate (FRR).

For a binary classifier outputting per-utterance scores:
    score > threshold  → predict spoof
    score <= threshold → predict bonafide

We sweep thresholds, compute FAR and FRR at each, and pick the threshold
where the two are closest to equal (on a finite test set they rarely match
exactly). The EER is the average of FAR and FRR at that point.

Lower EER = better. State-of-the-art on ASVspoof 2019 LA is below 1%;
a strong student project lands in the 2-6% range.
"""

import numpy as np
from typing import Tuple
from sklearn.metrics import roc_curve, roc_auc_score


def compute_eer(
    scores: np.ndarray,
    labels: np.ndarray,
) -> Tuple[float, float]:
    """Compute the Equal Error Rate (EER) and the threshold at which it occurs.

    Spoof (label 1) is treated as the positive class, so in ROC terms:

    * false-positive rate (FPR): bonafide samples flagged as spoof;
    * false-negative rate (FNR = 1 - TPR): spoof samples passed as bonafide.

    The EER is the operating point where these two rates are equal. On a
    finite ROC curve they rarely coincide exactly, so the ROC point with the
    smallest ``|FPR - FNR|`` is selected and the EER is reported as the mean
    of the two rates at that point.

    Args:
        scores: Per-sample scores. Higher = more spoof-like.
                We use P(spoof) from softmax for this. Flattened to 1-D.
        labels: Binary ground-truth labels, 0 = bonafide, 1 = spoof.
                Flattened to 1-D.

    Returns:
        eer: Equal error rate as a fraction in [0, 1].
        threshold: Score threshold of the selected ROC point, as reported
                   by ``roc_curve``.

    Raises:
        ValueError: If ``scores`` and ``labels`` differ in length, or if the
            labels do not contain at least one spoof and one bonafide sample
            (the EER is undefined in that case).
    """
    scores = np.asarray(scores).ravel()
    labels = np.asarray(labels).ravel()

    if scores.size != labels.size:
        raise ValueError(
            "scores and labels must have the same number of elements, "
            f"got {scores.size} and {labels.size}"
        )

    # With a single class present one of the two error rates is undefined;
    # roc_curve would only warn and hand back NaNs, which would then
    # propagate into a silent NaN EER. Fail loudly instead.
    n_spoof = int(np.count_nonzero(labels == 1))
    n_bonafide = int(labels.size) - n_spoof
    if n_spoof == 0 or n_bonafide == 0:
        raise ValueError(
            "EER is undefined unless both classes are present: "
            f"got {n_bonafide} bonafide and {n_spoof} spoof samples"
        )

    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
    fnr = 1.0 - tpr  # miss rate: spoof predicted as bonafide

    # Pick the ROC point where the two error rates are closest, and average
    # them to approximate the crossing point.
    idx_eer = int(np.argmin(np.abs(fpr - fnr)))
    eer = (fpr[idx_eer] + fnr[idx_eer]) / 2.0
    threshold = thresholds[idx_eer]

    return float(eer), float(threshold)


def compute_auc(scores: np.ndarray, labels: np.ndarray) -> float:
    """Return the area under the ROC curve for the given scores.

    0.5 = random, 1.0 = perfect.

    Args:
        scores: Per-sample scores.
        labels: Ground-truth labels for the same samples.

    Returns:
        The ROC AUC as a plain Python float.
    """
    # Note the argument order: roc_auc_score takes the labels first.
    auc = roc_auc_score(labels, scores)
    return float(auc)


def aggregate_window_scores_to_utterance(
    window_scores: np.ndarray,
    window_utt_ids: list,
    method: str = "mean",
) -> Tuple[np.ndarray, list]:
    """Aggregate per-window scores back to per-utterance scores.

    Many utterances in our dataset produce multiple 4-sec windows.
    For evaluation we need ONE score per utterance, so the scores of all
    windows sharing an utterance ID are reduced to a single value.

    Args:
        window_scores: shape (N_windows,) — scores for every window.
        window_utt_ids: list of N_windows utterance IDs (with duplicates);
            entry i is the utterance that window i came from.
        method: "mean" or "max" — how to aggregate within each utterance.

    Returns:
        utt_scores: shape (N_utterances,) — one score per unique utterance,
            in the same order as ``utt_ids``.
        utt_ids: list of N_utterances unique utterance IDs (sorted).

    Raises:
        ValueError: If ``method`` is not "mean" or "max", or if
            ``window_scores`` and ``window_utt_ids`` differ in length.
    """
    from collections import defaultdict

    # Validate the method first so a typo fails before any grouping work.
    if method == "mean":
        reduce_fn = np.mean
    elif method == "max":
        reduce_fn = np.max
    else:
        raise ValueError(f"Unknown aggregation method: {method}")

    # zip() stops at the shorter input, so a mismatch would silently drop
    # windows (or IDs) and skew the per-utterance scores.
    n_scores = len(window_scores)
    n_ids = len(window_utt_ids)
    if n_scores != n_ids:
        raise ValueError(
            "window_scores and window_utt_ids must have the same length, "
            f"got {n_scores} and {n_ids}"
        )

    grouped = defaultdict(list)
    for score, utt_id in zip(window_scores, window_utt_ids):
        grouped[utt_id].append(float(score))

    utt_ids_sorted = sorted(grouped)
    utt_scores = np.array([reduce_fn(grouped[u]) for u in utt_ids_sorted])

    return utt_scores, utt_ids_sorted