// Benchmark Saturation Detector (v0.8.0 anti-bullshit pack #6)
// Pure logic — no human-readable strings. Returns codes+params; main.js
// does the i18n lookup.
//
// Quality bar: this addresses the explicit pain "MMLU is saturated, what
// should I use instead?" documented in survey arxiv 2508.15361 and across
// 2026 leaderboards. Validated 2026-05-07 against pre-registered cases:
// 3 clean pass, 3 borderline, 1 falsified (AIME 2025 saturated faster
// than expected). Tool ships with honest threshold-sensitivity disclaimer.
//
// Data sources: DemandSphere AI Frontier Tracker (CC BY-NC 4.0, primary)
// + baked snapshot fallback (data/saturation_kb.json).
// Endpoint requested by tryFetchLive() for the live DemandSphere tracker data.
const DEMANDSPHERE_API =
"https://www.demandsphere.com/research/demandsphere-radar/ai-frontier-model-tracker/api.json";
// Milliseconds tryFetchLive() waits before aborting the live request.
const FETCH_TIMEOUT_MS = 4000;
// Map DemandSphere benchmark key → our KB benchmark name.
// classifyBenchmark() uses this table as an allow-list: live data is only
// consulted for KB entries whose `key` appears here. The mapped name itself
// is not compared against the KB name anywhere in this file.
const DS_KEY_TO_NAME = {
mmlu: "MMLU",
gpqa: "GPQA-Diamond",
swe: "SWE-bench-Verified",
he: "HumanEval",
lcb: "LiveCodeBench-Pro",
math: "MATH",
aime: "AIME-2025",
hle: "HLE",
};
// Saturation thresholds — pre-registered 2026-05-07. Values within the
// borderline band (±BORDERLINE_BAND_PP) of any cutoff are reported through
// the `borderline` flag that classify() returns, for honest UI.
// "saturated" requires spread <= SATURATED_SPREAD_MAX and mean >= SATURATED_MEAN_MIN.
const SATURATED_SPREAD_MAX = 2.0;
// "near_saturated" requires spread <= NEAR_SAT_SPREAD_MAX and mean >= NEAR_SAT_MEAN_MIN.
const NEAR_SAT_SPREAD_MAX = 5.0;
const SATURATED_MEAN_MIN = 90.0;
const NEAR_SAT_MEAN_MIN = 80.0;
// Half-width of the borderline band applied around every cutoff above.
const BORDERLINE_BAND_PP = 1.0;
// Module-level caches. `_kb` is filled by loadSaturationKB(); `_liveData`
// is filled by tryFetchLive(). Both stay null until a successful load.
let _kb = null;
let _liveData = null;
/**
 * Load the baked saturation knowledge base and cache it in module state.
 *
 * Once a KB has been cached, later calls return it without fetching again,
 * regardless of the `url` argument they pass.
 *
 * @param {string} [url="./data/saturation_kb.json"] Location of the KB JSON.
 * @returns {Promise<object>} The parsed KB object.
 * @throws {Error} When the response status is not OK.
 */
export async function loadSaturationKB(url = "./data/saturation_kb.json") {
  if (!_kb) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Saturation KB fetch failed: ${response.status}`);
    }
    _kb = await response.json();
  }
  return _kb;
}
/**
 * Synchronous accessor for the cached KB.
 *
 * @returns {object|null} The cached KB, or null if none has been loaded yet.
 */
export function getSaturationKB() {
  return _kb;
}
/**
 * Try to fetch fresh data from DemandSphere, caching the first success.
 *
 * Best-effort by design: any failure (CORS, network, non-OK status, invalid
 * JSON, timeout) yields null so the caller can fall back to the baked KB.
 *
 * @returns {Promise<object|null>} Parsed live payload, or null on any failure.
 */
export async function tryFetchLive() {
  if (_liveData) return _liveData;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(DEMANDSPHERE_API, { signal: controller.signal });
    if (!res.ok) return null;
    // The abort timer stays armed while the body is read, so a response
    // whose headers arrive but whose body stalls still times out.
    _liveData = await res.json();
    return _liveData;
  } catch {
    // Deliberately swallowed: live data is optional.
    return null;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Compute the highest-scoring (model, score) pairs for one DemandSphere
 * benchmark key from the live payload.
 *
 * Entries of `liveData.models` that are not objects, or whose value under
 * `dsKey` is not a finite number, are skipped. The payload comes from a
 * third-party API, so one malformed row must not throw.
 *
 * @param {object|null} liveData Live payload; only `liveData.models` is read.
 * @param {string} dsKey DemandSphere benchmark key (e.g. "mmlu").
 * @returns {Array<{model: *, score: number}>|null} Up to three rows sorted by
 *   descending score. Fewer than three rows are returned when fewer models
 *   report the benchmark; null when none do or the payload has no `models`
 *   array.
 */
function computeTop3FromLive(liveData, dsKey) {
  if (!liveData || !Array.isArray(liveData.models)) return null;
  const scored = liveData.models
    .filter((m) => m !== null && typeof m === "object" && Number.isFinite(m[dsKey]))
    .map((m) => ({ model: m.name || m.id, score: m[dsKey] }))
    .sort((a, b) => b.score - a.score);
  return scored.length === 0 ? null : scored.slice(0, 3);
}
/**
 * Summarise the numeric scores of a list of (model, score) rows.
 *
 * @param {Array<{score: *}>|null} top3 Rows to summarise; non-number scores
 *   are ignored.
 * @returns {object|null} null when there are no rows or no numeric scores;
 *   `{ count, sparse: true }` when fewer than three numeric scores exist;
 *   otherwise `{ count, spread, mean, max, min, sparse: false }`.
 */
function computeStats(top3) {
  if (!top3 || top3.length === 0) return null;

  const numeric = top3
    .map((row) => row.score)
    .filter((value) => typeof value === "number");
  const count = numeric.length;
  if (count === 0) return null;
  if (count < 3) return { count, sparse: true };

  // Single pass for sum / max / min; summation order matches a left fold from 0.
  let total = 0;
  let max = -Infinity;
  let min = Infinity;
  for (const value of numeric) {
    total += value;
    max = Math.max(max, value);
    min = Math.min(min, value);
  }

  return {
    count,
    spread: max - min,
    mean: total / count,
    max,
    min,
    sparse: false,
  };
}
/**
 * Turn summary stats into a saturation verdict code.
 *
 * @param {object|null} stats Output of computeStats(); reads `sparse`,
 *   `spread` and `mean`.
 * @returns {{code: string, borderline: boolean}} `code` is one of
 *   "sparse_data", "saturated", "near_saturated", "discriminative".
 *   `borderline` is true when spread or mean lies within BORDERLINE_BAND_PP
 *   of any of the four cutoffs.
 */
function classify(stats) {
  if (!stats || stats.sparse) return { code: "sparse_data", borderline: false };

  const { spread, mean } = stats;
  const isSaturated = spread <= SATURATED_SPREAD_MAX && mean >= SATURATED_MEAN_MIN;
  const isNearSaturated = spread <= NEAR_SAT_SPREAD_MAX && mean >= NEAR_SAT_MEAN_MIN;

  let code = "discriminative";
  if (isSaturated) {
    code = "saturated";
  } else if (isNearSaturated) {
    code = "near_saturated";
  }

  // Borderline detection: any threshold within the band of an observed value.
  const withinBand = (observed, cutoff) =>
    Math.abs(observed - cutoff) <= BORDERLINE_BAND_PP;
  const borderline =
    withinBand(spread, SATURATED_SPREAD_MAX) ||
    withinBand(spread, NEAR_SAT_SPREAD_MAX) ||
    withinBand(mean, SATURATED_MEAN_MIN) ||
    withinBand(mean, NEAR_SAT_MEAN_MIN);

  return { code, borderline };
}
/**
 * Classify one benchmark by its KB name (e.g. "MMLU", "GPQA-Diamond").
 *
 * Prefers live data when it yields at least three scored models for the
 * entry's DemandSphere key. Otherwise it falls back to the baked `top_3`
 * rows, and finally to the baked consensus classification when fewer than
 * three baked scores exist.
 *
 * Every result, including the unknown-benchmark one, carries the same keys,
 * so callers can read `recommendations` / `top3` without shape checks.
 *
 * @param {string} name KB benchmark name.
 * @param {object|null} [liveOverride=null] Live payload to use instead of
 *   the module's cached live data.
 * @returns {{code: string, params: object, top3: Array, recommendations: Array,
 *   note: *, source: (string|null), borderline: boolean}}
 *   `source` is "live", "baked", "baked_consensus", or null for an unknown
 *   benchmark.
 * @throws {Error} When the KB has not been loaded.
 */
export function classifyBenchmark(name, liveOverride = null) {
  if (!_kb) throw new Error("Saturation KB not loaded; call loadSaturationKB() first");

  // Own-property lookup: inherited names such as "constructor" or
  // "toString" are not benchmarks.
  const entry = Object.prototype.hasOwnProperty.call(_kb.benchmarks, name)
    ? _kb.benchmarks[name]
    : null;
  if (!entry) {
    return {
      code: "unknown_benchmark",
      params: { name },
      top3: [],
      recommendations: [],
      note: null,
      source: null,
      borderline: false,
    };
  }

  const recommendations = entry.recommendations || [];
  const note = entry.note || null;
  const basis = entry.classification_basis || null;

  const live = liveOverride !== null ? liveOverride : _liveData;
  let top3 = null;
  let source = "baked";
  if (live && entry.key && DS_KEY_TO_NAME[entry.key]) {
    const liveTop3 = computeTop3FromLive(live, entry.key);
    if (liveTop3 && liveTop3.length >= 3) {
      top3 = liveTop3;
      source = "live";
    }
  }

  if (!top3) {
    // Fall back to baked. Drop placeholder rows: null rows and null scores.
    const baked = (entry.top_3 || []).filter(
      (row) => row != null && typeof row.score === "number",
    );
    if (baked.length < 3) {
      // Use baked classification verbatim (e.g. MMLU/HellaSwag/GSM8K declared
      // saturated by consensus even when DemandSphere lists no scores).
      return {
        code: entry.classification || "sparse_data",
        params: {
          name,
          spread: null,
          mean: null,
          n: baked.length,
          basis,
        },
        top3: baked,
        recommendations,
        note,
        source: "baked_consensus",
        borderline: false,
      };
    }
    top3 = baked;
  }

  const stats = computeStats(top3);
  const { code, borderline } = classify(stats);
  return {
    code,
    params: {
      name,
      // Rounded to one decimal for display; classification used raw values.
      spread: stats.spread != null ? Math.round(stats.spread * 10) / 10 : null,
      mean: stats.mean != null ? Math.round(stats.mean * 10) / 10 : null,
      n: stats.count,
      basis,
    },
    top3,
    recommendations,
    note,
    source,
    borderline,
  };
}
/**
 * Classify every benchmark in the KB.
 *
 * @param {object|null} [liveOverride=null] Forwarded unchanged to
 *   classifyBenchmark() for each benchmark.
 * @returns {Array<object>} One result per KB benchmark, in KB key order;
 *   an empty array when the KB has not been loaded.
 */
export function classifyAll(liveOverride = null) {
  if (!_kb) return [];
  const results = [];
  for (const name of Object.keys(_kb.benchmarks)) {
    results.push(classifyBenchmark(name, liveOverride));
  }
  return results;
}
/**
 * Look up the curated alternative benchmarks for a KB benchmark name.
 * Reads the baked KB only: recommendations are curated, not derived from
 * scores.
 *
 * @param {string} name KB benchmark name.
 * @returns {Array} The entry's `recommendations`, or an empty array when the
 *   KB is not loaded, the name is unknown, or the entry has none.
 */
export function recommendAlternatives(name) {
  if (!_kb) return [];
  const { recommendations } = _kb.benchmarks[name] ?? {};
  return recommendations || [];
}
/**
 * List every benchmark name known to the KB (for UI dropdowns).
 *
 * @returns {string[]} Keys of the KB's `benchmarks` object, or an empty
 *   array when the KB has not been loaded.
 */
export function listBenchmarks() {
  return _kb ? Object.keys(_kb.benchmarks) : [];
}
/**
 * Attribution metadata for the UI footer, copied from the loaded KB.
 *
 * @returns {{primary: *, secondary: *, fetched_at: *}|null} The KB's
 *   `primary_source`, `secondary_sources` and `fetched_at` values, or null
 *   when the KB has not been loaded.
 */
export function attribution() {
  if (!_kb) return null;
  const {
    primary_source: primary,
    secondary_sources: secondary,
    fetched_at,
  } = _kb;
  return { primary, secondary, fetched_at };
}
|