File size: 7,231 Bytes
7c80934
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
// Benchmark Saturation Detector (v0.8.0 anti-bullshit pack #6)
// Pure logic — no human-readable strings. Returns codes+params; main.js
// does the i18n lookup.
//
// Quality bar: this addresses the explicit pain "MMLU is saturated, what
// should I use instead?" documented in survey arxiv 2508.15361 and across
// 2026 leaderboards. Validated 2026-05-07 against pre-registered cases:
// 3 clean pass, 3 borderline, 1 falsified (AIME 2025 saturated faster
// than expected). Tool ships with honest threshold-sensitivity disclaimer.
//
// Data sources: DemandSphere AI Frontier Tracker (CC BY-NC 4.0, primary)
// + baked snapshot fallback (data/saturation_kb.json).

// Endpoint of the DemandSphere AI Frontier Tracker; this is the URL that
// tryFetchLive() requests.
const DEMANDSPHERE_API =
  "https://www.demandsphere.com/research/demandsphere-radar/ai-frontier-model-tracker/api.json";

// Milliseconds tryFetchLive() waits before aborting the live request.
const FETCH_TIMEOUT_MS = 4000;

// Map DemandSphere benchmark key → our KB benchmark name.
// classifyBenchmark() only consults live data for KB entries whose `key`
// appears in this table.
const DS_KEY_TO_NAME = {
  mmlu: "MMLU",
  gpqa: "GPQA-Diamond",
  swe: "SWE-bench-Verified",
  he: "HumanEval",
  lcb: "LiveCodeBench-Pro",
  math: "MATH",
  aime: "AIME-2025",
  hle: "HLE",
};

// Saturation thresholds — pre-registered 2026-05-07. They are applied by
// classify() to the spread (max − min) and mean of the scores considered:
//   "saturated"      — spread <= SATURATED_SPREAD_MAX and mean >= SATURATED_MEAN_MIN
//   "near_saturated" — spread <= NEAR_SAT_SPREAD_MAX and mean >= NEAR_SAT_MEAN_MIN
// A value within ±BORDERLINE_BAND_PP of any cutoff sets the verdict's
// `borderline` flag (returned next to `params`, not inside it) for honest UI.
const SATURATED_SPREAD_MAX = 2.0;
const NEAR_SAT_SPREAD_MAX = 5.0;
const SATURATED_MEAN_MIN = 90.0;
const NEAR_SAT_MEAN_MIN = 80.0;
const BORDERLINE_BAND_PP = 1.0;

// Module-level caches. `_kb` holds the parsed knowledge base once
// loadSaturationKB() has succeeded; `_liveData` holds the live payload once
// tryFetchLive() has succeeded. Both start out null.
let _kb = null;
let _liveData = null;

// Load the baked saturation knowledge base and cache it in `_kb`.
// Once a load has succeeded, later calls return the cached object without
// fetching again (the `url` argument is then ignored).
// Rejects with an Error carrying the HTTP status when the response is not ok.
export async function loadSaturationKB(url = "./data/saturation_kb.json") {
  if (!_kb) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Saturation KB fetch failed: ${response.status}`);
    }
    _kb = await response.json();
  }
  return _kb;
}

// Synchronous accessor for the cached KB; yields null until
// loadSaturationKB() has completed successfully.
export function getSaturationKB() {
  return _kb;
}

// Try to fetch fresh data from DemandSphere. Returns null on any failure
// (CORS, network, non-ok status, unparsable body, timeout) — caller falls
// back to baked KB.
//
// A successfully parsed payload is cached in `_liveData`, so later calls
// return it without another request. Failures are not cached; the next call
// tries again.
//
// The abort timer stays armed until the body has been read and parsed, so a
// response whose headers arrive in time but whose body stalls is still cut
// off after FETCH_TIMEOUT_MS instead of hanging indefinitely.
export async function tryFetchLive() {
  if (_liveData) return _liveData;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(DEMANDSPHERE_API, { signal: controller.signal });
    if (!res.ok) return null;
    _liveData = await res.json();
    return _liveData;
  } catch {
    // Deliberate best effort: live data is optional, so every failure mode
    // collapses to "no live data" rather than propagating.
    return null;
  } finally {
    // Runs on every path (success, non-ok, error) so the timer never leaks.
    clearTimeout(timer);
  }
}

// Rank the models of a live DemandSphere payload by one benchmark key and
// return up to three { model, score } pairs, highest score first.
//
// liveData — live payload; only its `models` array is read.
// dsKey    — property name on each model row that holds the score.
//
// Returns null when the payload has no `models` array or no model reports a
// usable score. When only one or two models report a score, that shorter
// list is returned; callers that need a full top 3 must check the length.
// `model` is the row's `name`, falling back to its `id`.
function computeTop3FromLive(liveData, dsKey) {
  if (!liveData || !Array.isArray(liveData.models)) return null;
  const scored = liveData.models
    // Skip null rows and anything that is not a finite number: NaN would
    // make the comparator below inconsistent and scramble the ranking.
    .filter(m => m != null && Number.isFinite(m[dsKey]))
    .map(m => ({ model: m.name || m.id, score: m[dsKey] }))
    .sort((a, b) => b.score - a.score);
  return scored.length === 0 ? null : scored.slice(0, 3);
}

// Summarise the numeric scores of a list of { score } rows.
//   - null                      when the list is missing/empty or holds no numeric score
//   - { count, sparse: true }   when only one or two numeric scores exist
//   - { count, spread, mean, max, min, sparse: false } otherwise, where
//     spread is max − min and mean is the arithmetic mean of the scores
// Rows whose score is not of type number are ignored.
function computeStats(top3) {
  if (!top3 || top3.length === 0) return null;

  const numeric = top3
    .filter(row => typeof row.score === "number")
    .map(row => row.score);
  const count = numeric.length;

  if (count === 0) return null;
  if (count < 3) return { count, sparse: true };

  let total = 0;
  for (const value of numeric) {
    total += value;
  }
  const max = Math.max(...numeric);
  const min = Math.min(...numeric);

  return {
    count,
    spread: max - min,
    mean: total / count,
    max,
    min,
    sparse: false,
  };
}

// Turn score statistics into a verdict { code, borderline }.
//   code "sparse_data"     — stats missing or flagged sparse (borderline is false)
//   code "saturated"       — spread and mean both meet the saturated cutoffs
//   code "near_saturated"  — otherwise, both meet the near-saturated cutoffs
//   code "discriminative"  — neither tier matches
// `borderline` is true when spread or mean lies within BORDERLINE_BAND_PP of
// any of the four cutoffs, regardless of which code was chosen.
function classify(stats) {
  if (!stats || stats.sparse) {
    return { code: "sparse_data", borderline: false };
  }
  const { spread, mean } = stats;

  // Tiers are checked strictest first; the first match wins.
  const tiers = [
    { code: "saturated", spreadMax: SATURATED_SPREAD_MAX, meanMin: SATURATED_MEAN_MIN },
    { code: "near_saturated", spreadMax: NEAR_SAT_SPREAD_MAX, meanMin: NEAR_SAT_MEAN_MIN },
  ];
  const matched = tiers.find(tier => spread <= tier.spreadMax && mean >= tier.meanMin);
  const code = matched ? matched.code : "discriminative";

  // Each observed value paired with every cutoff it is compared against.
  const cutoffPairs = [
    [spread, SATURATED_SPREAD_MAX],
    [spread, NEAR_SAT_SPREAD_MAX],
    [mean, SATURATED_MEAN_MIN],
    [mean, NEAR_SAT_MEAN_MIN],
  ];
  const borderline = cutoffPairs.some(
    ([observed, cutoff]) => Math.abs(observed - cutoff) <= BORDERLINE_BAND_PP,
  );

  return { code, borderline };
}

// Public: classify one benchmark by name (KB key, e.g. "MMLU", "GPQA-Diamond").
// Prefers live data when available; falls back to baked stats.
//
// name         — benchmark name. It must be an own key of the KB's
//                `benchmarks` object; inherited names such as "toString" or
//                "constructor" are reported as unknown rather than resolving
//                to Object.prototype members.
// liveOverride — live payload to use instead of the one cached by
//                tryFetchLive(); null (the default) means "use the cache".
//
// Returns { code, params, top3, recommendations, note, source, borderline }
// where `source` tells which data produced the verdict:
//   "live"            — stats computed from the top 3 live scores
//   "baked"           — stats computed from the KB entry's numeric top_3 rows
//   "baked_consensus" — fewer than 3 numeric baked rows; the entry's own
//                       `classification` is returned verbatim ("sparse_data"
//                       when it has none) with null spread/mean
// For an unknown name only { code: "unknown_benchmark", params: { name },
// source: null } is returned.
// Throws Error when the KB has not been loaded yet.
export function classifyBenchmark(name, liveOverride = null) {
  if (!_kb) throw new Error("Saturation KB not loaded; call loadSaturationKB() first");

  // Own-property checks only: a plain `obj[key]` lookup also finds members
  // inherited from Object.prototype.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  // Display precision is one decimal place; null passes through untouched.
  const roundTo1dp = (value) => (value != null ? Math.round(value * 10) / 10 : null);

  const entry = hasOwn(_kb.benchmarks, name) ? _kb.benchmarks[name] : null;
  if (!entry) {
    return { code: "unknown_benchmark", params: { name }, source: null };
  }

  const basis = entry.classification_basis || null;
  const recommendations = entry.recommendations || [];
  const note = entry.note || null;

  const live = liveOverride !== null ? liveOverride : _liveData;
  let top3 = null;
  let source = "baked";
  if (live && entry.key && hasOwn(DS_KEY_TO_NAME, entry.key)) {
    const liveTop3 = computeTop3FromLive(live, entry.key);
    // Live data is only trusted when a full top 3 is available.
    if (liveTop3 && liveTop3.length >= 3) {
      top3 = liveTop3;
      source = "live";
    }
  }

  if (!top3) {
    // Fall back to baked. Filter out null rows and null scores (placeholder rows).
    const baked = (entry.top_3 || []).filter(
      row => row != null && typeof row.score === "number",
    );
    if (baked.length < 3) {
      // Use baked classification verbatim (e.g. MMLU/HellaSwag/GSM8K declared
      // saturated by consensus even when DemandSphere lists no scores).
      return {
        code: entry.classification || "sparse_data",
        params: {
          name,
          spread: null,
          mean: null,
          n: baked.length,
          basis,
        },
        top3: baked,
        recommendations,
        note,
        source: "baked_consensus",
        borderline: false,
      };
    }
    top3 = baked;
  }

  const stats = computeStats(top3);
  const { code, borderline } = classify(stats);
  return {
    code,
    params: {
      name,
      spread: roundTo1dp(stats.spread),
      mean: roundTo1dp(stats.mean),
      n: stats.count,
      basis,
    },
    top3,
    recommendations,
    note,
    source,
    borderline,
  };
}

// Run classifyBenchmark() over every benchmark name in the KB, in the KB's
// key order, forwarding `liveOverride` unchanged. Yields an empty array when
// the KB has not been loaded.
export function classifyAll(liveOverride = null) {
  if (!_kb) return [];
  const verdicts = [];
  for (const benchmarkName of Object.keys(_kb.benchmarks)) {
    verdicts.push(classifyBenchmark(benchmarkName, liveOverride));
  }
  return verdicts;
}

// Look up the curated alternatives for a benchmark name. Only the baked KB
// is consulted, because recommendations are curated rather than derived
// from scores. Yields an empty array when the KB is not loaded, the name is
// not found, or the entry carries no recommendations.
export function recommendAlternatives(name) {
  const curated = _kb ? _kb.benchmarks[name]?.recommendations : null;
  return curated || [];
}

// Names of all benchmarks in the KB, in key order (for UI dropdowns).
// Yields an empty array while the KB is not loaded.
export function listBenchmarks() {
  return _kb ? Object.keys(_kb.benchmarks) : [];
}

// Attribution metadata for the UI footer, copied from the KB's
// `primary_source`, `secondary_sources` and `fetched_at` fields.
// Yields null while the KB is not loaded.
export function attribution() {
  if (!_kb) return null;
  const {
    primary_source: primary,
    secondary_sources: secondary,
    fetched_at,
  } = _kb;
  return { primary, secondary, fetched_at };
}