Spaces:

Sara1708
/

deepfake-audio-detector

Running

deepfake-audio-detector / src /evaluation /metrics.py

Saracasm

Phase 3: add model, metrics, and training loop

f35a6e2 24 days ago

3.52 kB

	"""
	Evaluation metrics for anti-spoofing.

	The standard metric in this field is Equal Error Rate (EER):
	The threshold at which the false-accept rate (FAR) equals
	the false-reject rate (FRR).

	For a binary classifier outputting per-utterance scores:
	score >= threshold → predict spoof
	score < threshold → predict bonafide

	We sweep thresholds, compute FAR and FRR at each, and find the
	threshold where they are equal. The error rate at that point is the EER.

	Lower EER = better. State-of-the-art on ASVspoof 2019 LA is below 1%;
	a strong student project lands in the 2-6% range.
	"""

	import numpy as np
	from typing import Tuple
	from sklearn.metrics import roc_curve, roc_auc_score


	def compute_eer(
	    scores: np.ndarray,
	    labels: np.ndarray,
	) -> Tuple[float, float]:
	    """Compute the Equal Error Rate (EER) and the threshold at which it occurs.

	    The ROC operating points come from ``sklearn.metrics.roc_curve`` with
	    spoof (label 1) as the positive class. The EER is read off at the
	    operating point where the false-alarm rate and the miss rate are closest,
	    and is reported as the mean of the two rates at that point.

	    Args:
	        scores: 1-D array of per-sample scores. Higher = more spoof-like.
	            We use P(spoof) from softmax for this.
	        labels: 1-D array of binary ground-truth labels.
	            0 = bonafide, 1 = spoof.

	    Returns:
	        eer: Equal error rate as a fraction in [0, 1].
	        threshold: Score threshold at which FAR is closest to FRR. Following
	            the ``roc_curve`` convention, a sample is flagged as spoof when
	            ``score >= threshold``.

	    Raises:
	        ValueError: If the inputs are empty, differ in length, or if
	            ``labels`` does not contain both spoof (1) and non-spoof samples
	            (the EER is undefined when one class is missing).
	    """
	    scores = np.asarray(scores).ravel()
	    labels = np.asarray(labels).ravel()

	    if scores.size == 0:
	        raise ValueError("compute_eer requires at least one sample")
	    if scores.size != labels.size:
	        raise ValueError(
	            f"scores and labels must have the same length, "
	            f"got {scores.size} and {labels.size}"
	        )

	    # With a single class present either FPR or TPR is undefined and the
	    # result would silently become NaN, so fail loudly instead.
	    n_spoof = int(np.count_nonzero(labels == 1))
	    if n_spoof == 0 or n_spoof == labels.size:
	        raise ValueError(
	            "compute_eer needs both bonafide (0) and spoof (1) samples in labels"
	        )

	    # Convention used here ("positive" = spoof, label 1):
	    #   FAR = false alarm = bonafide flagged as spoof      = FPR
	    #   FRR = miss        = spoof predicted as bonafide    = 1 - TPR
	    # drop_intermediate=False keeps every candidate threshold; the default
	    # prunes points from the curve, which could remove the operating point
	    # that lies closest to FAR == FRR.
	    fpr, tpr, thresholds = roc_curve(
	        labels, scores, pos_label=1, drop_intermediate=False
	    )
	    fnr = 1.0 - tpr  # false negative rate = miss rate

	    # The curve is discrete, so FAR and FRR rarely coincide exactly: take the
	    # operating point where they are closest and average the two rates.
	    idx_eer = int(np.argmin(np.abs(fpr - fnr)))
	    eer = (fpr[idx_eer] + fnr[idx_eer]) / 2.0
	    threshold = thresholds[idx_eer]

	    return float(eer), float(threshold)


	def compute_auc(scores: np.ndarray, labels: np.ndarray) -> float:
	    """Return the area under the ROC curve. 0.5 = random, 1.0 = perfect.

	    Args:
	        scores: Array of per-sample scores. Higher = more spoof-like.
	        labels: Array of binary ground-truth labels, one per score.

	    Returns:
	        The ROC AUC reported by ``sklearn.metrics.roc_auc_score`` as a
	        Python float.
	    """
	    # Flatten both inputs the same way compute_eer does, so row- or
	    # column-shaped model outputs are scored identically by both metrics.
	    scores = np.asarray(scores).ravel()
	    labels = np.asarray(labels).ravel()
	    return float(roc_auc_score(labels, scores))


	def aggregate_window_scores_to_utterance(
	    window_scores: np.ndarray,
	    window_utt_ids: list,
	    method: str = "mean",
	) -> Tuple[np.ndarray, list]:
	    """Aggregate per-window scores back to per-utterance scores.

	    Many utterances in our dataset produce multiple 4-sec windows.
	    For evaluation we need ONE score per utterance, so the scores of all
	    windows that share an utterance ID are reduced to a single value.

	    Args:
	        window_scores: shape (N_windows,) — scores for every window.
	            Flattened before use, so an (N_windows, 1) array also works.
	        window_utt_ids: list of N_windows utterance IDs (with duplicates),
	            aligned element-by-element with ``window_scores``.
	        method: "mean" or "max" — how to aggregate within each utterance.

	    Returns:
	        utt_scores: shape (N_utterances,) — one score per unique utterance.
	        utt_ids: list of N_utterances unique utterance IDs (sorted), in the
	            same order as ``utt_scores``.

	    Raises:
	        ValueError: If ``method`` is not "mean" or "max", or if the number
	            of scores differs from the number of utterance IDs.
	    """
	    from collections import defaultdict

	    # Reject an unknown method before doing any work.
	    reducers = {"mean": np.mean, "max": np.max}
	    if method not in reducers:
	        raise ValueError(f"Unknown aggregation method: {method}")
	    reduce_fn = reducers[method]

	    flat_scores = np.asarray(window_scores, dtype=float).ravel()
	    # zip() would silently drop the surplus windows on a length mismatch,
	    # hiding a misaligned evaluation; make that an explicit error.
	    if flat_scores.size != len(window_utt_ids):
	        raise ValueError(
	            f"window_scores and window_utt_ids must have the same length, "
	            f"got {flat_scores.size} and {len(window_utt_ids)}"
	        )

	    grouped = defaultdict(list)
	    for score, uid in zip(flat_scores.tolist(), window_utt_ids):
	        grouped[uid].append(score)

	    # Sorted IDs give a deterministic output order.
	    utt_ids_sorted = sorted(grouped)
	    utt_scores = np.array(
	        [reduce_fn(grouped[uid]) for uid in utt_ids_sorted], dtype=float
	    )

	    return utt_scores, utt_ids_sorted