File size: 4,239 Bytes

7e9ce74

"""
Position Bias Taxonomy Framework

Defines formal taxonomy of position bias phenomena:
- Primacy Bias: Strong performance at beginning
- Recency Bias: Strong performance at end
- U-Shaped Bias: Both primacy and recency
- Middle-Sag: Worst at exact middle
- Flat: No position effect
- Inverted-U: Best at middle (rare)

Provides Position Bias Index (PBI) and cross-task correlation.
"""
import numpy as np
from typing import List, Dict, Tuple
from scipy import stats


def classify_bias(positions: List[float], accuracies: List[float]) -> str:
    """
    Classify the type of position bias from an accuracy-vs-position curve.

    Only three samples drive the decision: the first accuracy, the last
    accuracy, and the one at index ``len(accuracies) // 2``. ``accuracies``
    is assumed to be ordered by position; ``positions`` is consulted only
    for its length.

    Args:
        positions: Normalized positions [0, 1]
        accuracies: Accuracy at each position

    Returns:
        One of: "primacy", "recency", "u-shaped", "flat", "inverted-u",
        "insufficient-data".

        "middle-sag" belongs to the taxonomy but is never produced by the
        current thresholds: whenever the edge mean beats the middle by more
        than 0.1, at least one edge individually beats it by more than 0.05,
        so the curve is labelled "u-shaped", "primacy" or "recency" instead.
    """
    n_points = len(accuracies)
    # Guard on the sequence that is actually indexed, not just on positions.
    if len(positions) < 3 or n_points < 3:
        return "insufficient-data"

    first = accuracies[0]
    last = accuracies[-1]
    middle = accuracies[n_points // 2]

    # Flat: population standard deviation of the whole curve is below 0.05.
    if np.std(accuracies) < 0.05:
        return "flat"

    # Inverted-U: the middle clearly beats both edges.
    if middle > max(first, last) + 0.1:
        return "inverted-u"

    # Edges (on average) clearly beat the middle.
    if (first + last) / 2.0 > middle + 0.1:
        first_strong = first > middle + 0.05
        last_strong = last > middle + 0.05
        if first_strong and last_strong:
            return "u-shaped"
        # The edge-mean condition above guarantees that at least one edge is
        # strong, so exactly one of the two flags is set here.
        return "primacy" if first_strong else "recency"

    # No dip in the middle: compare the two edges directly.
    if first > last + 0.1:
        return "primacy"
    if last > first + 0.1:
        return "recency"

    return "flat"


def position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    PBI = (acc_start + acc_end) / 2 - acc_middle
    Positive = U-shaped bias (bad). Zero = flat (good). Negative = inverted (rare).

    ``acc_middle`` is the sample at index ``len(accuracies) // 2``;
    ``accuracies`` is assumed to be ordered by position and ``positions`` is
    consulted only for its length.

    Args:
        positions: Normalized positions [0, 1]
        accuracies: Accuracy at each position

    Returns:
        The index, or 0.0 when fewer than three positions or accuracies are
        supplied (start, middle and end cannot all be distinct samples).
    """
    n_points = len(accuracies)
    # Check the sequence that is actually indexed so a short or empty
    # accuracy list yields the neutral value instead of an IndexError.
    if len(positions) < 3 or n_points < 3:
        return 0.0
    edge_mean = (accuracies[0] + accuracies[-1]) / 2.0
    return edge_mean - accuracies[n_points // 2]


def weighted_position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Region-averaged PBI that uses the whole curve rather than three samples.

    PBI_w = mean(accuracy over edge region) - mean(accuracy over middle region)

    The edge region is the first ``int(0.25 * n)`` samples plus the last
    ``int(0.25 * n)`` samples; the middle region is everything in between.
    This is a difference of plain means, not a numerical integration.
    ``accuracies`` is assumed to be ordered by position and ``positions`` is
    consulted only for its length.

    Args:
        positions: Normalized positions [0, 1]
        accuracies: Accuracy at each position

    Returns:
        The weighted index; 0.0 when fewer than three positions or
        accuracies are supplied; the plain three-point PBI when the curve is
        too short to have a non-empty edge region (n == 3).
    """
    # Copy into a list so slicing + "+" concatenates even for ndarray input
    # (ndarray "+" would add element-wise instead).
    values = list(accuracies)
    n = len(values)
    if len(positions) < 3 or n < 3:
        return 0.0

    # Use the same width at both ends so the measure is symmetric: a curve
    # and its mirror image get the same score.
    edge_width = int(0.25 * n)
    if edge_width == 0:
        return position_bias_index(positions, accuracies)

    edge_acc = values[:edge_width] + values[n - edge_width:]
    # Non-empty by construction: it holds n - 2 * edge_width >= n / 2 samples.
    mid_acc = values[edge_width:n - edge_width]

    return float(np.mean(edge_acc) - np.mean(mid_acc))


def cross_task_pbi_correlation(
    model_results: Dict[str, Dict[str, List[float]]],
    task_order: List[str],
) -> Dict[str, List[float]]:
    """
    Collect the per-task PBI vector for each model.

    No correlation coefficient is computed here; the function returns the
    PBI values, aligned with ``task_order``, from which cross-task
    comparisons can be made by the caller.

    Args:
        model_results: {model_name: {task_name: [accuracies]}}
        task_order: Ordered list of task names for consistent indexing

    Returns:
        {model_name: [pbi for each task in task_order]}. A task missing from
        a model contributes 0.0, as does a task with fewer than two accuracy
        values. Models are omitted entirely when the resulting vector has
        fewer than two entries.
    """
    correlations = {}
    for model_name, tasks in model_results.items():
        pbi_vector = []
        for task in task_order:
            curve = tasks.get(task)
            if curve is None:
                # Task not evaluated for this model.
                pbi_vector.append(0.0)
                continue

            n_points = len(curve)
            if n_points < 2:
                # A curve with 0 or 1 samples has no position spread; uniform
                # positions would also divide by zero for a single sample.
                pbi_vector.append(0.0)
                continue

            # Positions are assumed uniform over [0, 1].
            positions = [i / (n_points - 1) for i in range(n_points)]
            pbi_vector.append(position_bias_index(positions, curve))

        if len(pbi_vector) >= 2:
            correlations[model_name] = pbi_vector

    return correlations


def rank_models_by_bias(pbi_scores: Dict[str, float], ascending: bool = True) -> List[Tuple[str, float]]:
    """
    Order models by their PBI score.

    Args:
        pbi_scores: {model_name: pbi}
        ascending: When True (default) the lowest PBI, i.e. the least biased
            model, comes first; when False the order is reversed.

    Returns:
        List of (model_name, pbi) pairs in ranked order.
    """
    def _score(entry):
        # entry is a (model_name, pbi) pair
        return entry[1]

    ranking = list(pbi_scores.items())
    ranking.sort(key=_score, reverse=not ascending)
    return ranking