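// Scraped repository-page header (not part of the module), kept as comments:
//   path:   taf-agent/js/saturation_detector.js
//   author: karlexmarin
//   commit: 7c80934 — "v0.8.0 Benchmark Saturation Detector — anti-bullshit pack #6"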
// Benchmark Saturation Detector (v0.8.0 anti-bullshit pack #6)
// Pure logic — no human-readable strings. Returns codes+params; main.js
// does the i18n lookup.
//
// Quality bar: this addresses the explicit pain "MMLU is saturated, what
// should I use instead?" documented in survey arxiv 2508.15361 and across
// 2026 leaderboards. Validated 2026-05-07 against pre-registered cases:
// 3 clean pass, 3 borderline, 1 falsified (AIME 2025 saturated faster
// than expected). Tool ships with honest threshold-sensitivity disclaimer.
//
// Data sources: DemandSphere AI Frontier Tracker (CC BY-NC 4.0, primary)
// + baked snapshot fallback (data/saturation_kb.json).
// Endpoint of the DemandSphere AI Frontier Tracker JSON feed (the live source).
const DEMANDSPHERE_API =
"https://www.demandsphere.com/research/demandsphere-radar/ai-frontier-model-tracker/api.json";
// The live request is aborted after this many milliseconds.
const FETCH_TIMEOUT_MS = 4000;
// Map DemandSphere benchmark key → our KB benchmark name. classifyBenchmark()
// consults it only as an allow-list: live data is tried for a KB entry only
// when that entry's `key` appears here.
const DS_KEY_TO_NAME = {
mmlu: "MMLU",
gpqa: "GPQA-Diamond",
swe: "SWE-bench-Verified",
he: "HumanEval",
lcb: "LiveCodeBench-Pro",
math: "MATH",
aime: "AIME-2025",
hle: "HLE",
};
// Saturation thresholds — pre-registered 2026-05-07. classify() reports
// "saturated" when spread <= SATURATED_SPREAD_MAX and mean >= SATURATED_MEAN_MIN,
// "near_saturated" when the NEAR_SAT_* pair holds instead, and
// "discriminative" otherwise. A spread or mean within ±BORDERLINE_BAND_PP of
// any of the four cutoffs sets the `borderline` flag returned next to the
// verdict code (it is not part of `params`), for honest UI.
const SATURATED_SPREAD_MAX = 2.0;
const NEAR_SAT_SPREAD_MAX = 5.0;
const SATURATED_MEAN_MIN = 90.0;
const NEAR_SAT_MEAN_MIN = 80.0;
const BORDERLINE_BAND_PP = 1.0;
// Module-level caches: the parsed baked KB (assigned by loadSaturationKB) and
// the parsed live payload (assigned by tryFetchLive). Both start as null.
let _kb = null;
let _liveData = null;
/**
 * Load the baked saturation KB and memoize it for the lifetime of the module.
 *
 * Once a KB has been cached, later calls return it without fetching again,
 * regardless of the `url` argument.
 *
 * @param {string} [url="./data/saturation_kb.json"] - Location of the KB JSON.
 * @returns {Promise<object>} The parsed KB.
 * @throws {Error} When the response status is not OK. Errors raised by
 *   fetch() or by JSON parsing propagate unchanged.
 */
export async function loadSaturationKB(url = "./data/saturation_kb.json") {
  if (!_kb) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Saturation KB fetch failed: ${response.status}`);
    }
    _kb = await response.json();
  }
  return _kb;
}
/**
 * Synchronous accessor for the cached KB.
 *
 * @returns {object|null} The KB stored by loadSaturationKB(), or null if
 *   nothing has been loaded yet.
 */
export function getSaturationKB() {
  return _kb;
}
/**
 * Try to fetch fresh data from DemandSphere, caching the first success.
 *
 * Best-effort by design: every failure (CORS, network error, non-OK status,
 * timeout, unparsable body) resolves to null so the caller can fall back to
 * the baked KB. Nothing is thrown.
 *
 * @returns {Promise<object|null>} The parsed live payload, or null on failure.
 */
export async function tryFetchLive() {
  if (_liveData) return _liveData;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(DEMANDSPHERE_API, { signal: controller.signal });
    if (!res.ok) return null;
    // Keep the timer armed while the body is read: fetch() resolves as soon
    // as the headers arrive, so a stalled body would otherwise never time out.
    _liveData = await res.json();
    return _liveData;
  } catch (e) {
    // Deliberately swallowed: live data is optional.
    return null;
  } finally {
    // Runs on every exit path, so the timer can never outlive the request.
    clearTimeout(timer);
  }
}
/**
 * Pick the best-scoring (model, score) pairs for one DemandSphere benchmark
 * key out of a live payload.
 *
 * @param {object|null} liveData - Live payload; only its `models` array is read.
 * @param {string} dsKey - Property on each model row that holds the score.
 * @returns {Array<{model: *, score: number}>|null} Pairs sorted by descending
 *   score, at most three of them; `model` is the row's `name`, or its `id`
 *   when `name` is falsy. Null when the payload has no `models` array or no
 *   row reports a usable score. The array may hold fewer than three pairs,
 *   so callers must check its length.
 */
function computeTop3FromLive(liveData, dsKey) {
  if (!liveData || !Array.isArray(liveData.models)) return null;
  const scored = liveData.models
    // Skip null rows (valid JSON, but would throw on property access) and
    // non-finite scores (NaN would break the comparator and the statistics).
    .filter(m => m != null && Number.isFinite(m[dsKey]))
    .map(m => ({ model: m.name || m.id, score: m[dsKey] }))
    .sort((a, b) => b.score - a.score);
  return scored.length === 0 ? null : scored.slice(0, 3);
}
/**
 * Summarise the numeric scores of a list of (model, score) rows.
 *
 * @param {Array<{score: *}>|null} top3 - Rows to summarise; rows whose
 *   `score` is not of type number are ignored.
 * @returns {object|null} Null when there are no rows or no numeric scores;
 *   `{ count, sparse: true }` when fewer than three scores remain; otherwise
 *   `{ count, spread, mean, max, min, sparse: false }` with spread = max - min.
 */
function computeStats(top3) {
  if (!top3 || top3.length === 0) return null;

  const numeric = [];
  top3.forEach((row) => {
    const score = row.score;
    if (typeof score === "number") numeric.push(score);
  });

  const count = numeric.length;
  if (count === 0) return null;
  if (count < 3) return { count, sparse: true };

  // One pass; the sum is accumulated left to right starting from 0.
  let total = 0;
  let max = -Infinity;
  let min = Infinity;
  for (const value of numeric) {
    total += value;
    max = Math.max(max, value);
    min = Math.min(min, value);
  }

  return {
    count,
    spread: max - min,
    mean: total / count,
    max,
    min,
    sparse: false,
  };
}
/**
 * Turn summary statistics into a saturation verdict.
 *
 * @param {object|null} stats - Output of computeStats().
 * @returns {{code: string, borderline: boolean}} `code` is "sparse_data" for
 *   missing or sparse stats, otherwise "saturated", "near_saturated" or
 *   "discriminative". `borderline` is true when the spread or the mean lies
 *   within BORDERLINE_BAND_PP of any of the four cutoffs.
 */
function classify(stats) {
  if (!stats || stats.sparse) {
    return { code: "sparse_data", borderline: false };
  }

  const { spread, mean } = stats;
  const isSaturated =
    spread <= SATURATED_SPREAD_MAX && mean >= SATURATED_MEAN_MIN;
  const isNearSaturated =
    spread <= NEAR_SAT_SPREAD_MAX && mean >= NEAR_SAT_MEAN_MIN;

  let code = "discriminative";
  if (isSaturated) {
    code = "saturated";
  } else if (isNearSaturated) {
    code = "near_saturated";
  }

  // Each observed value paired with a cutoff it is compared against.
  const cutoffPairs = [
    [spread, SATURATED_SPREAD_MAX],
    [spread, NEAR_SAT_SPREAD_MAX],
    [mean, SATURATED_MEAN_MIN],
    [mean, NEAR_SAT_MEAN_MIN],
  ];
  const borderline = cutoffPairs.some(
    ([observed, cutoff]) => Math.abs(observed - cutoff) <= BORDERLINE_BAND_PP,
  );

  return { code, borderline };
}
/**
 * Public: classify one benchmark by name (KB key, e.g. "MMLU", "GPQA-Diamond").
 *
 * Score resolution, first match wins:
 *   1. live data (liveOverride, else the cached live payload) when it yields
 *      at least three scores for the entry's DemandSphere key → source "live"
 *   2. the entry's baked `top_3` rows when at least three have a numeric
 *      score → source "baked"
 *   3. the entry's curated `classification`, taken verbatim (or "sparse_data"
 *      when absent) → source "baked_consensus"
 *
 * @param {string} name - Benchmark name as keyed in the KB.
 * @param {object|null} [liveOverride=null] - Live payload to use instead of
 *   the cached one.
 * @returns {{code: string, params: object, top3: Array, recommendations: Array,
 *   note: *, source: (string|null), borderline: boolean}} Every result has
 *   this shape. For a name the KB does not define, code is
 *   "unknown_benchmark", source is null and the list fields are empty.
 * @throws {Error} When the KB has not been loaded yet.
 */
export function classifyBenchmark(name, liveOverride = null) {
  if (!_kb) throw new Error("Saturation KB not loaded; call loadSaturationKB() first");

  // Own-property lookups only: a plain `obj[key]` also finds inherited
  // members, so a name such as "toString" would pass for a KB entry.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const entry = hasOwn(_kb.benchmarks, name) ? _kb.benchmarks[name] : null;
  if (!entry) {
    return {
      code: "unknown_benchmark",
      params: { name },
      top3: [],
      recommendations: [],
      note: null,
      source: null,
      borderline: false,
    };
  }

  const basis = entry.classification_basis || null;
  const recommendations = entry.recommendations || [];
  const note = entry.note || null;

  const live = liveOverride !== null ? liveOverride : _liveData;
  let top3 = null;
  let source = "baked";
  if (live && entry.key && hasOwn(DS_KEY_TO_NAME, entry.key)) {
    const liveTop3 = computeTop3FromLive(live, entry.key);
    if (liveTop3 && liveTop3.length >= 3) {
      top3 = liveTop3;
      source = "live";
    }
  }

  if (!top3) {
    // Fall back to baked. Drop placeholder rows: null rows as well as rows
    // whose score is not a number.
    const bakedRows = Array.isArray(entry.top_3) ? entry.top_3 : [];
    const baked = bakedRows.filter(x => typeof x?.score === "number");
    if (baked.length < 3) {
      // Use baked classification verbatim (e.g. MMLU/HellaSwag/GSM8K declared
      // saturated by consensus even when DemandSphere lists no scores).
      return {
        code: entry.classification || "sparse_data",
        params: { name, spread: null, mean: null, n: baked.length, basis },
        top3: baked,
        recommendations,
        note,
        source: "baked_consensus",
        borderline: false,
      };
    }
    top3 = baked;
  }

  const stats = computeStats(top3);
  const { code, borderline } = classify(stats);
  // Reported figures are rounded to one decimal place.
  const roundTo1 = value => (value != null ? Math.round(value * 10) / 10 : null);
  return {
    code,
    params: {
      name,
      spread: roundTo1(stats.spread),
      mean: roundTo1(stats.mean),
      n: stats.count,
      basis,
    },
    top3,
    recommendations,
    note,
    source,
    borderline,
  };
}
/**
 * Run classifyBenchmark() over every benchmark name in the KB, in the KB's
 * key order.
 *
 * @param {object|null} [liveOverride=null] - Passed through to each
 *   classifyBenchmark() call.
 * @returns {Array<object>} One result per benchmark; empty when no KB is loaded.
 */
export function classifyAll(liveOverride = null) {
  const results = [];
  if (_kb) {
    for (const name of Object.keys(_kb.benchmarks)) {
      results.push(classifyBenchmark(name, liveOverride));
    }
  }
  return results;
}
/**
 * Look up the curated alternatives for a benchmark. Reads the baked KB only:
 * recommendations are curated, not derived from scores.
 *
 * @param {string} name - Benchmark name as keyed in the KB.
 * @returns {Array} The entry's `recommendations`, or an empty array when the
 *   KB is not loaded, the name is unknown, or the entry lists none.
 */
export function recommendAlternatives(name) {
  if (_kb) {
    const curated = _kb.benchmarks[name]?.recommendations;
    if (curated) return curated;
  }
  return [];
}
/**
 * Names of every benchmark the KB defines (for UI dropdowns).
 *
 * @returns {string[]} The keys of the KB's `benchmarks` object; empty when no
 *   KB is loaded.
 */
export function listBenchmarks() {
  return _kb ? Object.keys(_kb.benchmarks) : [];
}
/**
 * Attribution metadata for the UI footer, copied from the KB.
 *
 * @returns {{primary: *, secondary: *, fetched_at: *}|null} The KB's
 *   `primary_source`, `secondary_sources` and `fetched_at` values, or null
 *   when no KB is loaded.
 */
export function attribution() {
  if (!_kb) return null;
  const {
    primary_source: primary,
    secondary_sources: secondary,
    fetched_at,
  } = _kb;
  return { primary, secondary, fetched_at };
}