| """ |
| Position Bias Taxonomy Framework |
| |
| Defines formal taxonomy of position bias phenomena: |
| - Primacy Bias: Strong performance at beginning |
| - Recency Bias: Strong performance at end |
| - U-Shaped Bias: Both primacy and recency |
| - Middle-Sag: Worst at exact middle |
| - Flat: No position effect |
| - Inverted-U: Best at middle (rare) |
| |
| Provides Position Bias Index (PBI) and cross-task correlation. |
| """ |
| import numpy as np |
| from typing import List, Dict, Tuple |
| from scipy import stats |
|
|
|
|
def classify_bias(
    positions: List[float],
    accuracies: List[float],
    flat_std: float = 0.05,
    edge_margin: float = 0.1,
    side_margin: float = 0.05,
) -> str:
    """
    Classify the type of position bias from an accuracy-vs-position curve.

    Args:
        positions: Normalized positions [0, 1] (only its length is used).
        accuracies: Accuracy at each position.
        flat_std: Std-dev threshold below which the curve counts as "flat".
        edge_margin: Minimum edge-over-middle gap for the U-shape family,
            and minimum edge-over-edge gap for a monotonic tilt.
        side_margin: Minimum single-edge-over-middle gap used to attribute
            the U-shape family to primacy/recency/both.

    Returns:
        One of: "primacy", "recency", "u-shaped", "middle-sag",
        "flat", "inverted-u", "insufficient-data"
    """
    if len(positions) < 3:
        return "insufficient-data"

    first = accuracies[0]
    last = accuracies[-1]
    # NOTE: for even-length curves this picks the upper-middle sample.
    mid_idx = len(positions) // 2
    middle = accuracies[mid_idx]

    edge_mean = (first + last) / 2.0

    # Near-constant curve: no position effect.
    if np.std(accuracies) < flat_std:
        return "flat"

    # Middle clearly beats both edges (rare).
    if middle > max(first, last) + edge_margin:
        return "inverted-u"

    # Edges beat the middle: U-shape family.
    if edge_mean > middle + edge_margin:
        if first > middle + side_margin and last > middle + side_margin:
            return "u-shaped"
        elif first > middle + side_margin:
            return "primacy"
        elif last > middle + side_margin:
            return "recency"
        # NOTE(review): with the default margins this fallthrough is
        # unreachable (edge_mean > middle + edge_margin implies at least one
        # edge exceeds middle + side_margin when side_margin <= edge_margin).
        # Kept for custom margin combinations and interface stability.
        return "middle-sag"

    # No U-shape: check for a monotonic tilt toward one edge.
    if first > last + edge_margin:
        return "primacy"
    if last > first + edge_margin:
        return "recency"

    return "flat"
|
|
|
|
def position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Compute the Position Bias Index:

        PBI = (acc_start + acc_end) / 2 - acc_middle

    Positive = U-shaped bias (bad). Zero = flat (good). Negative = inverted (rare).
    Returns 0.0 when fewer than three positions are available.
    """
    n = len(positions)
    if n < 3:
        return 0.0
    first_acc, last_acc = accuracies[0], accuracies[-1]
    middle_acc = accuracies[n // 2]
    return (first_acc + last_acc) / 2.0 - middle_acc
|
|
|
|
def weighted_position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Weighted PBI that accounts for more of the curve than the 3-point PBI.

    Splits the curve into an edge region (first and last quarters of the
    samples) and a middle region (central half), then returns

        PBI_w = mean(edge_region) - mean(middle_region)

    Args:
        positions: Normalized positions [0, 1] (only its length is used).
        accuracies: Accuracy at each position (plain Python list; the edge
            region is built via list concatenation).

    Returns:
        Mean edge accuracy minus mean middle accuracy; 0.0 for fewer than
        three points. Falls back to the simple 3-point PBI if either region
        would be empty.
    """
    if len(positions) < 3:
        return 0.0

    n = len(positions)
    edge_start = int(0.25 * n)
    edge_end = int(0.75 * n)

    edge_acc = accuracies[:edge_start] + accuracies[edge_end:]
    mid_acc = accuracies[edge_start:edge_end]

    # Defensive fallback; with n >= 3 both regions are non-empty in practice.
    if not edge_acc or not mid_acc:
        return position_bias_index(positions, accuracies)

    return float(np.mean(edge_acc) - np.mean(mid_acc))
|
|
|
|
def cross_task_pbi_correlation(
    model_results: Dict[str, Dict[str, List[float]]],
    task_order: List[str],
) -> Dict[str, List[float]]:
    """
    Compute a per-model vector of Position Bias Index values across tasks.

    (Despite the name, this does not compute a correlation matrix; it builds
    the aligned PBI vectors that such a correlation would be computed from.)

    Args:
        model_results: {model_name: {task_name: [accuracies]}}
        task_order: Ordered list of task names for consistent indexing

    Returns:
        {model_name: [pbi_per_task]} with one entry per name in task_order.
        A missing task — or a task with fewer than two accuracy points, for
        which normalized positions are undefined — contributes 0.0. Models
        are included only when task_order has at least two entries.
    """
    correlations: Dict[str, List[float]] = {}
    for model_name, tasks in model_results.items():
        pbi_vector = []
        for task in task_order:
            curve = tasks.get(task)
            # Guard len(curve) >= 2: avoids division by zero when
            # normalizing positions for a single-point curve.
            if curve and len(curve) >= 2:
                positions = [i / (len(curve) - 1) for i in range(len(curve))]
                pbi_vector.append(position_bias_index(positions, curve))
            else:
                pbi_vector.append(0.0)

        if len(pbi_vector) >= 2:
            correlations[model_name] = pbi_vector

    return correlations
|
|
|
|
def rank_models_by_bias(pbi_scores: Dict[str, float], ascending: bool = True) -> List[Tuple[str, float]]:
    """Rank models by PBI. Lower = less bias (better)."""
    ranked = list(pbi_scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=not ascending)
    return ranked
|
|