Upload clashcr/core/evaluator.py with huggingface_hub
clashcr/core/evaluator.py
ADDED
@@ -0,0 +1,163 @@
"""Offline evaluator: replay recordings and report metrics.

Metrics:
- precision, recall
- false positives per minute
- missed events
- timing error (seconds)
- confusion matrix
"""
from __future__ import annotations

import csv
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np

logger = logging.getLogger(__name__)


@dataclass
class EvalResult:
    precision: float
    recall: float
    f1: float
    false_positives_per_minute: float
    missed_events: int
    mean_timing_error_seconds: float
    median_timing_error_seconds: float
    confusion_matrix: Dict[str, Dict[str, int]] = field(default_factory=dict)
    total_predictions: int = 0
    total_labels: int = 0
    correct: int = 0
    false_positives: int = 0
    false_negatives: int = 0


class OfflineEvaluator:
    """Evaluate predictions against ground-truth labels from a recording."""

    def __init__(self, timing_tolerance_seconds: float = 3.0):
        self.timing_tolerance = timing_tolerance_seconds

    def load_labels(self, labels_path: str) -> List[dict]:
        rows = []
        with open(labels_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                rows.append({
                    "timestamp": float(row["timestamp"]),
                    "frame_idx": int(row.get("frame_idx", 0)),
                    "side": row["side"],
                    "card_key": row["card_key"],
                    "confidence": float(row.get("confidence", 1.0)),
                    "note": row.get("manual_note", ""),
                })
        return rows

    def load_predictions(self, predictions_path: str) -> List[dict]:
        rows = []
        with open(predictions_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                rows.append({
                    "timestamp": float(row["timestamp"]),
                    "frame_idx": int(row.get("frame_idx", 0)),
                    "side": row["side"],
                    "card_key": row["card_key"],
                    "confidence": float(row.get("confidence", 0.0)),
                    "evidence": row.get("evidence", ""),
                    "resolver_reason": row.get("resolver_reason", ""),
                })
        return rows

    def evaluate(self, labels: List[dict], predictions: List[dict],
                 recording_duration_seconds: Optional[float] = None) -> EvalResult:
        # Filter to opponent-side only
        labels = [r for r in labels if r["side"] == "opponent"]
        predictions = [r for r in predictions if r["side"] == "opponent"]

        total_labels = len(labels)
        total_predictions = len(predictions)

        matched_labels = set()
        matched_preds = set()
        timing_errors = []
        confusion: Dict[str, Dict[str, int]] = {}

        # Greedy one-to-one matching: each label claims the closest
        # unmatched prediction within the timing tolerance.
        for li, lab in enumerate(labels):
            best_pi = -1
            best_err = float("inf")
            for pi, pred in enumerate(predictions):
                if pi in matched_preds:
                    continue
                err = abs(pred["timestamp"] - lab["timestamp"])
                if err <= self.timing_tolerance and err < best_err:
                    best_err = err
                    best_pi = pi

            if best_pi >= 0:
                matched_labels.add(li)
                matched_preds.add(best_pi)
                timing_errors.append(best_err)
                pred_card = predictions[best_pi]["card_key"]
                true_card = lab["card_key"]
                confusion.setdefault(true_card, {}).setdefault(pred_card, 0)
                confusion[true_card][pred_card] += 1

        # Correct = matched pairs whose predicted card equals the true card,
        # i.e. the diagonal of the confusion matrix.
        correct = sum(confusion.get(c, {}).get(c, 0) for c in confusion)

        false_positives = total_predictions - len(matched_preds)
        false_negatives = total_labels - len(matched_labels)

        precision = correct / total_predictions if total_predictions > 0 else 0.0
        recall = correct / total_labels if total_labels > 0 else 0.0
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

        # Use the supplied duration when available; otherwise fall back to
        # the span covered by the labels.
        duration = recording_duration_seconds or (
            (labels[-1]["timestamp"] - labels[0]["timestamp"]) if labels else 1.0
        )
        fp_per_min = (false_positives / duration) * 60.0 if duration > 0 else 0.0

        mean_timing = float(np.mean(timing_errors)) if timing_errors else 0.0
        median_timing = float(np.median(timing_errors)) if timing_errors else 0.0

        return EvalResult(
            precision=precision,
            recall=recall,
            f1=f1,
            false_positives_per_minute=fp_per_min,
            missed_events=false_negatives,
            mean_timing_error_seconds=mean_timing,
            median_timing_error_seconds=median_timing,
            confusion_matrix=confusion,
            total_predictions=total_predictions,
            total_labels=total_labels,
            correct=correct,
            false_positives=false_positives,
            false_negatives=false_negatives,
        )

    def evaluate_recording_dir(self, recording_dir: str) -> EvalResult:
        rec = Path(recording_dir)
        labels = self.load_labels(str(rec / "labels.csv"))
        predictions = self.load_predictions(str(rec / "predictions.csv"))
        meta_path = rec / "metadata.jsonl"
        duration = None
        if meta_path.exists():
            with open(meta_path, "r") as f:
                lines = f.readlines()
                if lines:
                    first = json.loads(lines[0])
                    last = json.loads(lines[-1])
                    duration = last["timestamp"] - first["timestamp"]
        return self.evaluate(labels, predictions, duration)
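
A minimal usage sketch (not part of the uploaded file): the dicts below carry the same keys the CSV loaders produce, so `evaluate` can be driven directly from memory; the timestamps and `card_key` values are invented for illustration.

```python
from clashcr.core.evaluator import OfflineEvaluator

evaluator = OfflineEvaluator(timing_tolerance_seconds=3.0)

# Invented events; keys mirror what load_labels / load_predictions emit.
labels = [
    {"timestamp": 10.0, "side": "opponent", "card_key": "hog_rider"},
    {"timestamp": 25.0, "side": "opponent", "card_key": "fireball"},
]
predictions = [
    {"timestamp": 11.2, "side": "opponent", "card_key": "hog_rider"},   # matched, 1.2 s off
    {"timestamp": 40.0, "side": "opponent", "card_key": "musketeer"},   # unmatched: false positive
]

result = evaluator.evaluate(labels, predictions, recording_duration_seconds=60.0)
print(f"precision={result.precision:.2f} recall={result.recall:.2f} "
      f"FP/min={result.false_positives_per_minute:.2f} missed={result.missed_events}")
# precision=0.50 recall=0.50 FP/min=1.00 missed=1
```

For end-to-end runs, `evaluate_recording_dir` expects `labels.csv` and `predictions.csv` in the recording directory, plus an optional `metadata.jsonl` whose first and last `timestamp` fields bound the recording duration.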