"""Metrics and scoring utilities.""" import re import statistics from typing import List, Dict, Any def exact_match_score(prediction: str, target: str) -> float: """Binary exact-match score.""" return 1.0 if target.lower() in prediction.lower() else 0.0 def numeric_match(prediction: str, target: float, tolerance: float = 0.5) -> float: """Check if any number in the prediction is within tolerance of target.""" nums = re.findall(r"[\d,]+\.?\d*", prediction.replace(",", "")) if not nums: return 0.0 for n in nums: try: if abs(float(n) - target) < tolerance: return 1.0 except ValueError: continue return 0.0 def compute_accuracy(predictions: List[Dict[str, Any]], key: str = "correct") -> float: """Mean accuracy from a list of prediction records.""" vals = [p[key] for p in predictions] return statistics.mean(vals) if vals else 0.0 def position_bias_index(positions: List[float], accuracies: List[float]) -> float: """ Compute Position Bias Index (PBI): PBI = (acc_first + acc_last) / 2 - acc_middle Higher PBI = stronger U-shape (worse). """ if len(positions) < 3: return 0.0 mid_idx = len(positions) // 2 edge_acc = (accuracies[0] + accuracies[-1]) / 2.0 mid_acc = accuracies[mid_idx] return edge_acc - mid_acc