Spaces:

karlexmarin
/

taf-agent

Running

File size: 7,231 Bytes

7c80934

// Benchmark Saturation Detector (v0.8.0 anti-bullshit pack #6)
// Pure logic — no human-readable strings. Returns codes+params; main.js
// does the i18n lookup.
//
// Quality bar: this addresses the explicit pain point "MMLU is saturated,
// what should I use instead?" documented in the survey arXiv:2508.15361 and
// across 2026 leaderboards. Validated 2026-05-07 against pre-registered
// cases: 3 clean passes, 3 borderline, 1 falsified (AIME 2025 saturated
// faster than expected). The tool ships with an honest
// threshold-sensitivity disclaimer.
//
// Data sources: DemandSphere AI Frontier Tracker (CC BY-NC 4.0, primary)
// + baked snapshot fallback (data/saturation_kb.json).

// Live-data endpoint (DemandSphere AI Frontier Tracker JSON feed) requested
// by tryFetchLive().
const DEMANDSPHERE_API =
  "https://www.demandsphere.com/research/demandsphere-radar/ai-frontier-model-tracker/api.json";

// Milliseconds tryFetchLive() waits before aborting the live request.
const FETCH_TIMEOUT_MS = 4000;

// Map DemandSphere benchmark key → our KB benchmark name.
// Frozen: this is a shared, module-wide lookup table, so accidental writes
// should fail loudly (TypeError in module/strict code) instead of silently
// changing which benchmarks are eligible for live data.
const DS_KEY_TO_NAME = Object.freeze({
  mmlu: "MMLU",
  gpqa: "GPQA-Diamond",
  swe: "SWE-bench-Verified",
  he: "HumanEval",
  lcb: "LiveCodeBench-Pro",
  math: "MATH",
  aime: "AIME-2025",
  hle: "HLE",
});

// Saturation thresholds — pre-registered 2026-05-07. classify() compares
// them against the spread (max − min) and mean computed by computeStats().
// A result is additionally marked borderline (the `borderline` field of the
// verdict) when spread or mean lies within BORDERLINE_BAND_PP (±1pp) of any
// of the four cutoffs, for honest UI.

// "saturated" requires spread <= this value (and mean >= SATURATED_MEAN_MIN).
const SATURATED_SPREAD_MAX = 2.0;
// "near_saturated" requires spread <= this value (and mean >= NEAR_SAT_MEAN_MIN).
const NEAR_SAT_SPREAD_MAX = 5.0;
// Minimum mean for the "saturated" verdict.
const SATURATED_MEAN_MIN = 90.0;
// Minimum mean for the "near_saturated" verdict.
const NEAR_SAT_MEAN_MIN = 80.0;
// Half-width of the borderline band around each cutoff.
const BORDERLINE_BAND_PP = 1.0;

// Module-level caches, both null until populated:
//   _kb       — parsed KB JSON, assigned by loadSaturationKB().
//   _liveData — parsed DemandSphere payload, assigned by tryFetchLive().
let _kb = null;
let _liveData = null;

// Load the baked saturation KB and memoise it in _kb.
// Once a KB is cached, later calls return it without fetching again
// (the url argument is then ignored).
// Throws Error when the HTTP response status is not OK.
export async function loadSaturationKB(url = "./data/saturation_kb.json") {
  if (!_kb) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Saturation KB fetch failed: ${response.status}`);
    }
    _kb = await response.json();
  }
  return _kb;
}

// Synchronous accessor for the cached KB; returns whatever _kb currently
// holds (null until loadSaturationKB() has stored a KB).
export function getSaturationKB() {
  return _kb;
}

// Try to fetch fresh data from DemandSphere. Returns the parsed payload
// (cached in _liveData for later calls) or null on any failure — CORS,
// network error, non-OK status, invalid JSON, or timeout — in which case
// the caller falls back to the baked KB. Never throws.
export async function tryFetchLive() {
  if (_liveData) return _liveData;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
  try {
    const res = await fetch(DEMANDSPHERE_API, { signal: controller.signal });
    if (!res.ok) return null;
    // The timer is deliberately still armed here: fetch() resolves as soon
    // as the headers arrive, so the body download/parse must stay under the
    // same abort signal or a stalled body would hang indefinitely.
    _liveData = await res.json();
    return _liveData;
  } catch {
    // Best-effort by design: any failure simply means "no live data".
    return null;
  } finally {
    // Single cleanup point for every exit path (success, non-OK, error).
    clearTimeout(timer);
  }
}

// Build the ranked (model, score) list for one DemandSphere benchmark key
// from a live payload of the form { models: [...] }.
//
// Returns:
//   - null when the payload has no `models` array, or no model reports a
//     usable score for dsKey;
//   - all scored models (1–2 entries, best first) when fewer than three
//     report it — callers must check the length before treating the result
//     as a real top-3;
//   - otherwise the three highest-scoring entries, best first.
//
// Each item is { model, score } where model is the entry's `name`, falling
// back to its `id`. Entries that are null/undefined, or whose score is not
// a finite number (missing, string, NaN, ±Infinity), are skipped so that a
// malformed row can neither throw nor corrupt the sort order.
function computeTop3FromLive(liveData, dsKey) {
  if (!liveData || !Array.isArray(liveData.models)) return null;
  const ranked = liveData.models
    .filter(m => m != null && Number.isFinite(m[dsKey]))
    .map(m => ({ model: m.name || m.id, score: m[dsKey] }))
    .sort((a, b) => b.score - a.score);
  if (ranked.length === 0) return null;
  // slice() yields the whole list when it has fewer than three entries.
  return ranked.slice(0, 3);
}

// Summarise the numeric scores of a top-N list of { score } items.
// Returns:
//   - null when the list is missing/empty or holds no numeric score;
//   - { count, sparse: true } when fewer than three numeric scores exist;
//   - { count, spread, mean, max, min, sparse: false } otherwise, where
//     spread is max − min.
function computeStats(top3) {
  if (!top3 || top3.length === 0) return null;

  const values = top3
    .filter(item => typeof item.score === "number")
    .map(item => item.score);
  const count = values.length;

  if (count === 0) return null;
  if (count < 3) return { count, sparse: true };

  // Single pass: running total plus running extremes.
  let total = 0;
  let max = -Infinity;
  let min = Infinity;
  for (const value of values) {
    total += value;
    max = Math.max(max, value);
    min = Math.min(min, value);
  }

  return {
    count,
    spread: max - min,
    mean: total / count,
    max,
    min,
    sparse: false,
  };
}

// Turn the summary produced by computeStats() into a verdict.
// Returns { code, borderline }:
//   - code is "sparse_data" (borderline always false) when stats is
//     missing or flagged sparse; otherwise "saturated", "near_saturated"
//     or "discriminative" depending on the spread/mean cutoffs;
//   - borderline is true when spread or mean lies within
//     BORDERLINE_BAND_PP of any of the four cutoffs.
function classify(stats) {
  if (!stats || stats.sparse) return { code: "sparse_data", borderline: false };

  const { spread, mean } = stats;
  const meetsSaturated =
    spread <= SATURATED_SPREAD_MAX && mean >= SATURATED_MEAN_MIN;
  const meetsNearSaturated =
    spread <= NEAR_SAT_SPREAD_MAX && mean >= NEAR_SAT_MEAN_MIN;

  let code = "discriminative";
  if (meetsSaturated) {
    code = "saturated";
  } else if (meetsNearSaturated) {
    code = "near_saturated";
  }

  // Each pair is (observed value, cutoff it is compared against).
  const comparisons = [
    [spread, SATURATED_SPREAD_MAX],
    [spread, NEAR_SAT_SPREAD_MAX],
    [mean, SATURATED_MEAN_MIN],
    [mean, NEAR_SAT_MEAN_MIN],
  ];
  const borderline = comparisons.some(
    ([observed, cutoff]) => Math.abs(observed - cutoff) <= BORDERLINE_BAND_PP,
  );

  return { code, borderline };
}

// Public: classify one benchmark by name (KB key, e.g. "MMLU", "GPQA-Diamond").
//
// Data preference:
//   1. live data, when it yields at least three scores for the entry's
//      DemandSphere key (source "live");
//   2. the baked top_3 rows, when at least three carry numeric scores
//      (source "baked");
//   3. the KB's own classification verbatim, or "sparse_data" when the
//      entry has none (source "baked_consensus").
//
// name         — benchmark name as keyed in the KB.
// liveOverride — live payload to use instead of the cached tryFetchLive()
//                result; null (the default) means "use the cache".
//
// Returns { code, params, top3, recommendations, note, source, borderline }.
// An unknown name yields code "unknown_benchmark" with source null and
// empty top3/recommendations, so every result has the same shape.
// Throws Error when the KB has not been loaded yet.
export function classifyBenchmark(name, liveOverride = null) {
  if (!_kb) throw new Error("Saturation KB not loaded; call loadSaturationKB() first");

  // Own-property lookups only: a name such as "constructor" or "toString"
  // must not resolve to a member inherited from Object.prototype and be
  // mistaken for a KB entry.
  const hasOwnKey = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  // One-decimal rounding for display params; null/undefined pass through as null.
  const roundTo1dp = value => (value != null ? Math.round(value * 10) / 10 : null);

  const entry = hasOwnKey(_kb.benchmarks, name) ? _kb.benchmarks[name] : null;
  if (!entry) {
    return {
      code: "unknown_benchmark",
      params: { name },
      top3: [],
      recommendations: [],
      note: null,
      source: null,
      borderline: false,
    };
  }

  const live = liveOverride !== null ? liveOverride : _liveData;
  let top3 = null;
  let source = "baked";
  if (live && entry.key && hasOwnKey(DS_KEY_TO_NAME, entry.key)) {
    const liveTop3 = computeTop3FromLive(live, entry.key);
    if (liveTop3 && liveTop3.length >= 3) {
      top3 = liveTop3;
      source = "live";
    }
  }

  if (!top3) {
    // Fall back to baked. Drop rows without a numeric score (placeholder
    // rows, including null rows) instead of throwing on them.
    const baked = (entry.top_3 || []).filter(row => typeof row?.score === "number");
    if (baked.length < 3) {
      // Use baked classification verbatim (e.g. MMLU/HellaSwag/GSM8K declared
      // saturated by consensus even when DemandSphere lists no scores).
      return {
        code: entry.classification || "sparse_data",
        params: {
          name,
          spread: null,
          mean: null,
          n: baked.length,
          basis: entry.classification_basis || null,
        },
        top3: baked,
        recommendations: entry.recommendations || [],
        note: entry.note || null,
        source: "baked_consensus",
        borderline: false,
      };
    }
    top3 = baked;
  }

  const stats = computeStats(top3);
  const { code, borderline } = classify(stats);
  return {
    code,
    params: {
      name,
      spread: roundTo1dp(stats.spread),
      mean: roundTo1dp(stats.mean),
      n: stats.count,
      basis: entry.classification_basis || null,
    },
    top3,
    recommendations: entry.recommendations || [],
    note: entry.note || null,
    source,
    borderline,
  };
}

// Run classifyBenchmark() over every benchmark name in the KB, in
// Object.keys order. Returns the array of results, or an empty array when
// no KB is loaded. liveOverride is forwarded unchanged to each call.
export function classifyAll(liveOverride = null) {
  if (!_kb) return [];
  const results = [];
  for (const name of Object.keys(_kb.benchmarks)) {
    results.push(classifyBenchmark(name, liveOverride));
  }
  return results;
}

// Look up the curated alternatives for a benchmark name. Reads the baked
// KB only, because recommendations are curated rather than derived from
// scores. Returns an empty array when no KB is loaded, the name is not in
// the KB, or the entry has no recommendations.
export function recommendAlternatives(name) {
  if (!_kb) return [];
  const curated = _kb.benchmarks[name]?.recommendations;
  return curated ? curated : [];
}

// Names of all benchmarks in the KB (for UI dropdowns); empty array when
// no KB is loaded.
export function listBenchmarks() {
  return _kb ? Object.keys(_kb.benchmarks) : [];
}

// Attribution metadata for the UI footer, copied from the KB's
// primary_source, secondary_sources and fetched_at fields.
// Returns null when no KB is loaded.
export function attribution() {
  if (!_kb) return null;
  const {
    primary_source: primary,
    secondary_sources: secondary,
    fetched_at,
  } = _kb;
  return { primary, secondary, fetched_at };
}