// Benchmark Saturation Detector (v0.8.0 anti-bullshit pack #6)
// Pure logic — no human-readable strings. Returns codes + params; main.js
// does the i18n lookup.
//
// Quality bar: this addresses the explicit pain "MMLU is saturated, what
// should I use instead?" documented in the survey arXiv:2508.15361 and across
// 2026 leaderboards. Validated 2026-05-07 against pre-registered cases:
// 3 clean passes, 3 borderline, 1 falsified (AIME 2025 saturated faster
// than expected). The tool ships with an honest threshold-sensitivity
// disclaimer.
//
// Data sources: DemandSphere AI Frontier Tracker (CC BY-NC 4.0, primary)
// + baked snapshot fallback (data/saturation_kb.json).
// Live data endpoint (DemandSphere AI Frontier Tracker, JSON).
const DEMANDSPHERE_API =
  "https://www.demandsphere.com/research/demandsphere-radar/ai-frontier-model-tracker/api.json";

// The live fetch is aborted after this many milliseconds.
const FETCH_TIMEOUT_MS = 4000;

// Map DemandSphere benchmark key → our KB benchmark name.
// Frozen: this is a shared lookup table and must never be mutated at runtime.
const DS_KEY_TO_NAME = Object.freeze({
  mmlu: "MMLU",
  gpqa: "GPQA-Diamond",
  swe: "SWE-bench-Verified",
  he: "HumanEval",
  lcb: "LiveCodeBench-Pro",
  math: "MATH",
  aime: "AIME-2025",
  hle: "HLE",
});

// Saturation thresholds — pre-registered 2026-05-07.
// "saturated":      top-3 spread <= SATURATED_SPREAD_MAX and mean >= SATURATED_MEAN_MIN.
// "near_saturated": top-3 spread <= NEAR_SAT_SPREAD_MAX  and mean >= NEAR_SAT_MEAN_MIN.
// Any observed spread/mean within ±BORDERLINE_BAND_PP of a cutoff is reported
// through the verdict's `borderline` flag so the UI can disclose threshold
// sensitivity honestly.
const SATURATED_SPREAD_MAX = 2.0;
const NEAR_SAT_SPREAD_MAX = 5.0;
const SATURATED_MEAN_MIN = 90.0;
const NEAR_SAT_MEAN_MIN = 80.0;
const BORDERLINE_BAND_PP = 1.0;

// Module-level caches: the baked KB and the live DemandSphere payload.
// Both stay null until the corresponding loader succeeds.
let _kb = null;
let _liveData = null;
/**
 * Load the baked saturation knowledge base and cache it in module state.
 *
 * Once a KB has been loaded, later calls return the cached object without
 * fetching again — the `url` argument is ignored in that case.
 *
 * @param {string} [url="./data/saturation_kb.json"] Location of the KB JSON.
 * @returns {Promise<object>} The parsed knowledge base.
 * @throws {Error} When the HTTP response status is not OK.
 */
export async function loadSaturationKB(url = "./data/saturation_kb.json") {
  if (!_kb) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Saturation KB fetch failed: ${response.status}`);
    }
    _kb = await response.json();
  }
  return _kb;
}
/**
 * Synchronous accessor for the cached knowledge base.
 *
 * @returns {object|null} The cached KB, or null if nothing has been loaded yet.
 */
export function getSaturationKB() {
  return _kb;
}
/**
 * Try to fetch fresh data from DemandSphere, caching the first success.
 *
 * This is deliberately best-effort: any failure (non-OK status, CORS or
 * network error, timeout, unparseable body) resolves to null so the caller
 * can fall back to the baked KB.
 *
 * @returns {Promise<object|null>} The parsed live payload, or null on failure.
 */
export async function tryFetchLive() {
  if (_liveData) return _liveData;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(DEMANDSPHERE_API, { signal: controller.signal });
    if (!res.ok) return null;
    // The abort timer stays armed while the body is read, so a response whose
    // headers arrive quickly but whose body stalls is still cut off.
    _liveData = await res.json();
    return _liveData;
  } catch {
    // Expected failure modes (abort, network, CORS, bad JSON) all mean
    // "no live data available".
    return null;
  } finally {
    // Single cleanup point: disarm the timer on every exit path.
    clearTimeout(timer);
  }
}
/**
 * Compute the highest-scoring (model, score) pairs for a DemandSphere
 * benchmark key from the live payload.
 *
 * @param {object|null} liveData Live payload; expected to carry a `models` array.
 * @param {string} dsKey DemandSphere benchmark key (e.g. "mmlu").
 * @returns {Array<{model: *, score: number}>|null} Up to three entries sorted by
 *   score descending. Returns null when the payload has no `models` array or
 *   no model reports a usable score. The result may hold fewer than three
 *   entries, so callers must check `length` themselves.
 */
function computeTop3FromLive(liveData, dsKey) {
  if (!liveData || !Array.isArray(liveData.models)) return null;
  const ranked = liveData.models
    // Skip null/undefined rows and non-finite scores: NaN would pass a plain
    // typeof check and make the comparator below return NaN.
    .filter((m) => m != null && Number.isFinite(m[dsKey]))
    .map((m) => ({ model: m.name || m.id, score: m[dsKey] }))
    .sort((a, b) => b.score - a.score);
  if (ranked.length === 0) return null;
  return ranked.slice(0, 3);
}
/**
 * Summarise the scores of a top-3 list.
 *
 * @param {Array<{score: *}>|null} top3 Rows carrying a `score` field.
 * @returns {null | {count: number, sparse: true} |
 *   {count: number, spread: number, mean: number, max: number, min: number, sparse: false}}
 *   null when there is no usable score at all; a sparse marker when fewer
 *   than three usable scores exist; otherwise spread (max − min), mean, max
 *   and min over the usable scores.
 */
function computeStats(top3) {
  if (!top3 || top3.length === 0) return null;
  // Only finite numbers count. NaN/Infinity would otherwise propagate into
  // the mean/spread and yield a meaningless classification downstream.
  const scores = top3
    .map((row) => row?.score)
    .filter((score) => Number.isFinite(score));
  if (scores.length === 0) return null;
  if (scores.length < 3) {
    return { count: scores.length, sparse: true };
  }
  const max = Math.max(...scores);
  const min = Math.min(...scores);
  const mean = scores.reduce((sum, score) => sum + score, 0) / scores.length;
  return {
    count: scores.length,
    spread: max - min,
    mean,
    max,
    min,
    sparse: false,
  };
}
/**
 * Turn top-3 statistics into a saturation verdict.
 *
 * @param {object|null} stats Output of computeStats().
 * @returns {{code: string, borderline: boolean}} `code` is one of
 *   "sparse_data", "saturated", "near_saturated" or "discriminative".
 *   `borderline` is true when the spread or the mean lies within
 *   BORDERLINE_BAND_PP of any of the four cutoffs.
 */
function classify(stats) {
  if (!stats || stats.sparse) {
    return { code: "sparse_data", borderline: false };
  }

  const { spread, mean } = stats;
  const isSaturated =
    spread <= SATURATED_SPREAD_MAX && mean >= SATURATED_MEAN_MIN;
  const isNearSaturated =
    spread <= NEAR_SAT_SPREAD_MAX && mean >= NEAR_SAT_MEAN_MIN;

  let code = "discriminative";
  if (isSaturated) {
    code = "saturated";
  } else if (isNearSaturated) {
    code = "near_saturated";
  }

  // Borderline detection: any threshold within ±1pp of an observed value.
  const nearCutoff = (observed, cutoff) =>
    Math.abs(observed - cutoff) <= BORDERLINE_BAND_PP;
  const borderline = [
    [spread, SATURATED_SPREAD_MAX],
    [spread, NEAR_SAT_SPREAD_MAX],
    [mean, SATURATED_MEAN_MIN],
    [mean, NEAR_SAT_MEAN_MIN],
  ].some(([observed, cutoff]) => nearCutoff(observed, cutoff));

  return { code, borderline };
}
/**
 * Classify one benchmark by its KB name (e.g. "MMLU", "GPQA-Diamond").
 *
 * Prefers live data when it yields at least three scores for the benchmark;
 * otherwise falls back to the baked top-3. If there are fewer than three
 * baked scores, the baked consensus classification is returned verbatim.
 *
 * @param {string} name Benchmark name as keyed in the KB.
 * @param {object|null} [liveOverride=null] Live payload to use instead of the
 *   module-level cached one.
 * @returns {object} For a known benchmark:
 *   `{ code, params: { name, spread, mean, n, basis }, top3, recommendations,
 *   note, source, borderline }` where `source` is "live", "baked" or
 *   "baked_consensus". For an unknown name only
 *   `{ code: "unknown_benchmark", params: { name }, source: null }`.
 * @throws {Error} When the KB has not been loaded yet.
 */
export function classifyBenchmark(name, liveOverride = null) {
  if (!_kb) throw new Error("Saturation KB not loaded; call loadSaturationKB() first");

  // Own-property lookups only: inherited keys such as "__proto__" or
  // "toString" must not be mistaken for KB entries or DemandSphere keys.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  const roundTo1 = (value) => (value != null ? Math.round(value * 10) / 10 : null);

  const entry = hasOwn(_kb.benchmarks, name) ? _kb.benchmarks[name] : null;
  if (!entry) {
    return { code: "unknown_benchmark", params: { name }, source: null };
  }

  const live = liveOverride !== null ? liveOverride : _liveData;
  let top3 = null;
  let source = "baked";

  if (live && entry.key && hasOwn(DS_KEY_TO_NAME, entry.key)) {
    const liveTop3 = computeTop3FromLive(live, entry.key);
    if (liveTop3 && liveTop3.length >= 3) {
      top3 = liveTop3;
      source = "live";
    }
  }

  if (!top3) {
    // Fall back to baked. Drop placeholder rows (null rows or null scores).
    const baked = (entry.top_3 || []).filter((row) => Number.isFinite(row?.score));
    if (baked.length < 3) {
      // Use baked classification verbatim (e.g. MMLU/HellaSwag/GSM8K declared
      // saturated by consensus even when DemandSphere lists no scores).
      return {
        code: entry.classification || "sparse_data",
        params: {
          name,
          spread: null,
          mean: null,
          n: baked.length,
          basis: entry.classification_basis || null,
        },
        top3: baked,
        recommendations: entry.recommendations || [],
        note: entry.note || null,
        source: "baked_consensus",
        borderline: false,
      };
    }
    top3 = baked;
  }

  // stats can be null or sparse if the stats helper rejects scores the
  // selection step accepted; classify() maps both to "sparse_data".
  const stats = computeStats(top3);
  const { code, borderline } = classify(stats);
  return {
    code,
    params: {
      name,
      spread: roundTo1(stats?.spread),
      mean: roundTo1(stats?.mean),
      n: stats ? stats.count : 0,
      basis: entry.classification_basis || null,
    },
    top3,
    recommendations: entry.recommendations || [],
    note: entry.note || null,
    source,
    borderline,
  };
}
/**
 * Classify every benchmark in the KB.
 *
 * @param {object|null} [liveOverride=null] Passed through to classifyBenchmark().
 * @returns {object[]} One result per KB benchmark, in KB key order; an empty
 *   array when the KB has not been loaded.
 */
export function classifyAll(liveOverride = null) {
  if (!_kb) return [];
  const results = [];
  for (const benchmarkName of Object.keys(_kb.benchmarks)) {
    results.push(classifyBenchmark(benchmarkName, liveOverride));
  }
  return results;
}
/**
 * Recommend alternatives for a benchmark. Reads the baked KB only, since
 * recommendations are curated rather than derived from scores.
 *
 * @param {string} name Benchmark name as keyed in the KB.
 * @returns {Array} The entry's recommendations, or an empty array when the KB
 *   is not loaded, the benchmark is unknown, or it has none.
 */
export function recommendAlternatives(name) {
  if (!_kb) return [];
  const curated = _kb.benchmarks[name]?.recommendations;
  return curated || [];
}
/**
 * List every benchmark name known to the KB (for UI dropdowns).
 *
 * @returns {string[]} KB benchmark names; empty when the KB is not loaded.
 */
export function listBenchmarks() {
  return _kb ? Object.keys(_kb.benchmarks) : [];
}
/**
 * Attribution metadata for the UI footer, taken from the KB's top-level fields.
 *
 * @returns {{primary: *, secondary: *, fetched_at: *}|null} null when the KB
 *   is not loaded.
 */
export function attribution() {
  if (!_kb) return null;
  const {
    primary_source: primary,
    secondary_sources: secondary,
    fetched_at,
  } = _kb;
  return { primary, secondary, fetched_at };
}