# position-bias-taxonomy/src/taxonomy.py
# Uploaded by abhshkp via huggingface_hub (commit 7e9ce74, verified)
"""
Position Bias Taxonomy Framework
Defines formal taxonomy of position bias phenomena:
- Primacy Bias: Strong performance at beginning
- Recency Bias: Strong performance at end
- U-Shaped Bias: Both primacy and recency
- Middle-Sag: Worst at exact middle
- Flat: No position effect
- Inverted-U: Best at middle (rare)
Provides Position Bias Index (PBI) and cross-task correlation.
"""
import numpy as np
from typing import List, Dict, Tuple
from scipy import stats
def classify_bias(positions: List[float], accuracies: List[float]) -> str:
    """
    Classify the type of position bias from a position/accuracy curve.

    Args:
        positions: Normalized positions in [0, 1] (only the count is used).
        accuracies: Accuracy at each position, parallel to ``positions``.

    Returns:
        One of: "primacy", "recency", "u-shaped", "middle-sag",
        "flat", "inverted-u", "insufficient-data"
    """
    # Need at least start / middle / end samples to say anything.
    if len(positions) < 3:
        return "insufficient-data"
    first = accuracies[0]
    last = accuracies[-1]
    mid_idx = len(positions) // 2
    middle = accuracies[mid_idx]
    edge_mean = (first + last) / 2.0
    std = np.std(accuracies)
    # Flat if the curve barely varies: absolute std threshold of 0.05
    # (i.e. accuracies move less than ~5 points overall).
    if std < 0.05:
        return "flat"
    # Inverted-U: middle clearly beats both edges (rare).
    if middle > max(first, last) + 0.1:
        return "inverted-u"
    # Edge region clearly beats the middle.
    if edge_mean > middle + 0.1:
        if first > middle + 0.05 and last > middle + 0.05:
            return "u-shaped"
        elif first > middle + 0.05:
            return "primacy"
        elif last > middle + 0.05:
            return "recency"
        # NOTE(review): with these thresholds this fallback is unreachable —
        # edge_mean > middle + 0.1 forces at least one edge > middle + 0.05.
        # Kept as a defensive default.
        return "middle-sag"
    # No strong edge-vs-middle gap; compare the two edges directly.
    if first > last + 0.1:
        return "primacy"
    if last > first + 0.1:
        return "recency"
    return "flat"
def position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Position Bias Index: mean of the endpoint accuracies minus the midpoint.

    PBI = (acc_start + acc_end) / 2 - acc_middle
    Positive = U-shaped bias (bad). Zero = flat (good). Negative = inverted (rare).
    Returns 0.0 when fewer than three positions are available.
    """
    if len(positions) < 3:
        return 0.0
    center = accuracies[len(positions) // 2]
    endpoint_avg = 0.5 * (accuracies[0] + accuracies[-1])
    return endpoint_avg - center
def weighted_position_bias_index(positions: List[float], accuracies: List[float]) -> float:
    """
    Weighted PBI that accounts for the full curve shape.

    PBI_w = mean(edge-region accuracies) - mean(middle-region accuracies),
    where the edge region is the first 25% plus the last 25% of positions
    and the middle region is the central 50%.

    (Previous docstring claimed Simpson's-rule AUC; the implementation is a
    simple difference of region means, documented as such here.)

    Returns 0.0 for fewer than three positions. Note that for small n the
    integer truncation of the 25%/75% cut points makes the regions slightly
    asymmetric (e.g. n=3 puts only the last point in the edge region).
    """
    if len(positions) < 3:
        return 0.0
    n = len(positions)
    # Region boundaries: [0, edge_start) and [edge_end, n) are "edge".
    edge_start = int(0.25 * n)
    edge_end = int(0.75 * n)
    edge_acc = accuracies[:edge_start] + accuracies[edge_end:]
    mid_acc = accuracies[edge_start:edge_end]
    if not edge_acc or not mid_acc:
        # Defensive fallback to the simple three-point PBI.
        return position_bias_index(positions, accuracies)
    # float() keeps the return a plain float rather than np.float64.
    return float(np.mean(edge_acc) - np.mean(mid_acc))
def cross_task_pbi_correlation(
    model_results: Dict[str, Dict[str, List[float]]],
    task_order: List[str],
) -> Dict[str, List[float]]:
    """
    Build a per-model vector of PBI values across tasks.

    NOTE(review): despite the name, no correlation is computed here — the
    function returns the raw PBI vectors (one entry per task in
    ``task_order``), which a caller can feed into a correlation routine.

    Args:
        model_results: {model_name: {task_name: [accuracies]}}
        task_order: Ordered list of task names for consistent indexing.

    Returns:
        {model_name: [PBI per task]}; tasks missing for a model (or with
        fewer than two data points) contribute 0.0. Models are included
        only when the vector has at least two entries.
    """
    correlations: Dict[str, List[float]] = {}
    for model_name, tasks in model_results.items():
        pbi_vector: List[float] = []
        for task in task_order:
            accs = tasks.get(task)
            # Guard len(accs) >= 2: a single-point curve previously caused
            # ZeroDivisionError in the position normalization below; PBI is
            # 0.0 for any curve with fewer than three points anyway.
            if accs and len(accs) >= 2:
                # Positions are assumed uniformly spaced over [0, 1].
                positions = [i / (len(accs) - 1) for i in range(len(accs))]
                pbi_vector.append(position_bias_index(positions, accs))
            else:
                pbi_vector.append(0.0)
        if len(pbi_vector) >= 2:
            correlations[model_name] = pbi_vector
    return correlations
def rank_models_by_bias(pbi_scores: Dict[str, float], ascending: bool = True) -> List[Tuple[str, float]]:
    """Rank models by PBI. Lower = less bias (better)."""
    ranked = list(pbi_scores.items())
    ranked.sort(key=lambda entry: entry[1], reverse=not ascending)
    return ranked