File size: 4,239 Bytes
7e9ce74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Position Bias Taxonomy Framework

Defines formal taxonomy of position bias phenomena:
- Primacy Bias: Strong performance at beginning
- Recency Bias: Strong performance at end
- U-Shaped Bias: Both primacy and recency
- Middle-Sag: Worst at exact middle
- Flat: No position effect
- Inverted-U: Best at middle (rare)

Provides Position Bias Index (PBI) and cross-task correlation.
"""
import numpy as np
from typing import List, Dict, Tuple
from scipy import stats


def classify_bias(positions: List[float], accuracies: List[float]) -> str:
    """
    Classify the type of position bias from a curve.

    Args:
        positions: Normalized positions [0, 1] (only its length is used here)
        accuracies: Accuracy at each position

    Returns:
        One of: "primacy", "recency", "u-shaped", "middle-sag",
                "flat", "inverted-u", "insufficient-data"
    """
    if len(positions) < 3:
        return "insufficient-data"

    first = accuracies[0]
    last = accuracies[-1]
    mid_idx = len(positions) // 2
    middle = accuracies[mid_idx]

    edge_mean = (first + last) / 2.0
    std = np.std(accuracies)

    # Flat: negligible variation across positions (std below 0.05).
    if std < 0.05:
        return "flat"

    # Inverted-U: middle clearly beats both edges (rare).
    if middle > max(first, last) + 0.1:
        return "inverted-u"

    # Edges clearly beat the middle: decide which edge(s) drive the gap.
    if edge_mean > middle + 0.1:
        if first > middle + 0.05 and last > middle + 0.05:
            return "u-shaped"
        elif first > middle + 0.05:
            return "primacy"
        elif last > middle + 0.05:
            return "recency"
        # Defensive fallback: with these thresholds, edge_mean > middle + 0.1
        # implies at least one edge exceeds middle + 0.05, so this is
        # effectively unreachable; kept so every path returns a label.
        return "middle-sag"

    # No strong middle dip: a single dominant edge decides.
    if first > last + 0.1:
        return "primacy"
    if last > first + 0.1:
        return "recency"

    return "flat"


def position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Compute the Position Bias Index: mean of the endpoint accuracies
    minus the accuracy at the midpoint.

    Positive = U-shaped bias (bad). Zero = flat (good). Negative = inverted (rare).
    Returns 0.0 when fewer than three points are available.
    """
    if len(positions) < 3:
        return 0.0
    center = accuracies[len(positions) // 2]
    return (accuracies[0] + accuracies[-1]) / 2.0 - center


def weighted_position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Weighted PBI that accounts for the full curve shape.

    Splits the points into an edge region (roughly the first 25% and last
    25% of points) and a middle region (the remaining ~50%) and returns
    mean(edge accuracies) - mean(middle accuracies).

    Positive = U-shaped bias (bad). Zero = flat (good). Negative = inverted.
    Returns 0.0 when fewer than three points are available.
    """
    if len(positions) < 3:
        return 0.0

    n = len(positions)
    # Clamp the split indices so the edge region always contains at least the
    # first and last points and the middle region is never empty; without the
    # clamp, n == 3 gives int(0.25 * 3) == 0 and the first (primacy) point is
    # wrongly lumped into the middle region.
    edge_start = max(1, int(0.25 * n))
    edge_end = min(n - 1, int(0.75 * n))

    edge_acc = accuracies[:edge_start] + accuracies[edge_end:]
    mid_acc = accuracies[edge_start:edge_end]

    if not edge_acc or not mid_acc:
        # Defensive fallback to the simple 3-point PBI (should not trigger
        # with the clamped indices above).
        return position_bias_index(positions, accuracies)

    return np.mean(edge_acc) - np.mean(mid_acc)


def cross_task_pbi_correlation(
    model_results: Dict[str, Dict[str, List[float]]],
    task_order: List[str],
) -> Dict[str, List[float]]:
    """
    Collect each model's PBI across tasks as a vector for cross-task comparison.

    Args:
        model_results: {model_name: {task_name: [accuracies]}}
        task_order: Ordered list of task names for consistent indexing

    Returns:
        {model_name: [PBI per task, in task_order]} for models with at least
        two tasks in the vector. Missing or degenerate (fewer than two
        points) tasks contribute 0.0.
    """
    correlations: Dict[str, List[float]] = {}
    for model_name, tasks in model_results.items():
        pbi_vector = []
        for task in task_order:
            accs = tasks.get(task)
            # Guard len < 2: with a single point the uniform-position
            # computation below would divide by zero.
            if accs is None or len(accs) < 2:
                pbi_vector.append(0.0)
                continue
            # Positions are assumed uniformly spaced over [0, 1].
            positions = [i / (len(accs) - 1) for i in range(len(accs))]
            pbi_vector.append(position_bias_index(positions, accs))

        if len(pbi_vector) >= 2:
            correlations[model_name] = pbi_vector

    return correlations


def rank_models_by_bias(pbi_scores: Dict[str, float], ascending: bool = True) -> List[Tuple[str, float]]:
    """Order (model, PBI) pairs by PBI; lower PBI means less bias (better)."""
    ranking = list(pbi_scores.items())
    ranking.sort(key=lambda pair: pair[1], reverse=not ascending)
    return ranking