Spaces:

karlexmarin
/

taf-agent

Running

File size: 5,552 Bytes

edb4038

// Cross-framework drift bound (v0.7.5 anti-bullshit pack #6)
// Given two benchmark scores from different (framework, dtype, batch, chat_template)
// configurations, predicts the maximum allowable drift from numerical noise alone.
// If the observed gap exceeds this bound, flags it as a real bug — typically
// chat-template mismatch, KV-cache layout, or aggressive batching.
//
// Refs: arxiv 2506.09501 (numerical sources of LLM eval irreproducibility),
//       lm-evaluation-harness issue #1841 (chat_template auto-apply).

// dtype-pair bands (additive contribution to expected drift in benchmark points,
// assuming a 0-100 benchmark scale; roughly halve for 0-50 scale tasks).
// Keys are two dtype ids joined with "-"; values are drift in benchmark points.
const DTYPE_DRIFT = {
  // Identical dtype on both sides: only the run-to-run non-determinism floor.
  "bf16-bf16": 0.05,
  "fp16-fp16": 0.05,
  "fp32-fp32": 0.02,
  "nf4-nf4": 0.10,
  "int8-int8": 0.08,

  // Two different floating-point precisions.
  "bf16-fp16": 0.30,
  "fp16-bf16": 0.30,
  "bf16-fp32": 0.05,
  "fp32-bf16": 0.05,
  "fp16-fp32": 0.10,
  "fp32-fp16": 0.10,

  // At least one side quantized.
  "bf16-int8": 0.40,
  "int8-bf16": 0.40,
  "bf16-nf4": 0.80,
  "nf4-bf16": 0.80,
  "fp16-int8": 0.40,
  "int8-fp16": 0.40,
  "fp16-nf4": 0.80,
  "nf4-fp16": 0.80,
  "int8-nf4": 0.50,
  "nf4-int8": 0.50,
};

// framework-pair drift (different attention kernels, KV layouts, etc.).
// Conservative — empirical reports vary by model.
// Keys are two framework ids joined with "-"; values are drift in benchmark points.
const FRAMEWORK_DRIFT = {
  // lm-eval-harness (hf) against everything else
  "lm-eval-hf-vllm-served": 0.30,
  "lm-eval-hf-vllm-batched": 0.25,
  "lm-eval-hf-tgi": 0.20,
  "lm-eval-hf-transformers": 0.05,

  // vLLM serve against the remaining frameworks
  "vllm-served-vllm-batched": 0.10,
  "vllm-served-tgi": 0.20,
  "vllm-served-transformers": 0.30,

  // vLLM batched against the remaining frameworks
  "vllm-batched-tgi": 0.20,
  "vllm-batched-transformers": 0.30,

  // TGI against raw transformers
  "tgi-transformers": 0.25,
};

// Selectable frameworks as { id, label } records, in display order.
// `id` is the token used to build FRAMEWORK_DRIFT keys.
const FRAMEWORKS = [
  ["lm-eval-hf", "lm-eval-harness (hf)"],
  ["vllm-served", "vLLM serve (OpenAI API)"],
  ["vllm-batched", "vLLM batched (offline)"],
  ["tgi", "Text Generation Inference (TGI)"],
  ["transformers", "transformers (raw .generate)"],
].map(([id, label]) => ({ id, label }));

// Selectable dtypes as { id, label } records, in display order.
// `id` is the token used to build DTYPE_DRIFT keys.
const DTYPES = [
  ["bf16", "BF16"],
  ["fp16", "FP16"],
  ["fp32", "FP32"],
  ["int8", "int8"],
  ["nf4", "NF4 (4-bit)"],
].map(([id, label]) => ({ id, label }));

/**
 * Expected drift contributed by the dtype pairing of two setups.
 *
 * @param {string} a - dtype id of the first setup.
 * @param {string} b - dtype id of the second setup.
 * @returns {number} The DTYPE_DRIFT entry for the pair in either order, or
 *   0.20 as a generic upper bound when the pair is not listed.
 */
function dtypeDrift(a, b) {
  const candidateKeys = [`${a}-${b}`, `${b}-${a}`];
  for (const key of candidateKeys) {
    const drift = DTYPE_DRIFT[key];
    // `== null` matches both null and undefined, i.e. "no entry for this key".
    if (drift == null) continue;
    return drift;
  }
  return 0.20;
}

/**
 * Expected drift contributed by the framework pairing of two setups.
 *
 * The lookup tries the pair in both orders. Sorting the two ids alphabetically
 * is not enough: FRAMEWORK_DRIFT keys such as "vllm-served-vllm-batched" or
 * "vllm-served-tgi" are not written in sorted order, so a sorted key would
 * miss them and silently fall through to the default.
 *
 * @param {string} a - framework id of the first setup.
 * @param {string} b - framework id of the second setup.
 * @returns {number} 0.05 when both ids are identical (same framework,
 *   different runs/seeds); otherwise the FRAMEWORK_DRIFT entry for the pair,
 *   or 0.30 as the default upper bound for an unlisted cross-framework pair.
 */
function frameworkDrift(a, b) {
  if (a === b) return 0.05; // same framework, different runs/seeds
  return FRAMEWORK_DRIFT[`${a}-${b}`] ?? FRAMEWORK_DRIFT[`${b}-${a}`] ?? 0.30;
}

/**
 * Expected drift contributed by running the two setups at different batch sizes.
 *
 * @param {number} batchA - batch size of the first setup.
 * @param {number} batchB - batch size of the second setup.
 * @returns {number} 0 when either batch size is falsy or both are strictly
 *   equal; otherwise a step value that grows with the larger/smaller ratio
 *   (<=2 → 0.05, <=8 → 0.10, <=32 → 0.15, above → 0.20).
 */
function batchDrift(batchA, batchB) {
  const bothGiven = Boolean(batchA) && Boolean(batchB);
  if (!bothGiven || batchA === batchB) {
    return 0;
  }

  const larger = Math.max(batchA, batchB);
  const smaller = Math.min(batchA, batchB);
  // Clamp the divisor to 1 so a sub-1 smaller value cannot inflate the ratio.
  const spread = larger / Math.max(1, smaller);

  const tiers = [
    [2, 0.05],
    [8, 0.10],
    [32, 0.15],
  ];
  for (const [ceiling, drift] of tiers) {
    if (spread <= ceiling) return drift;
  }
  return 0.20;
}

/**
 * Structural (non-numerical) drift from a chat-template mismatch.
 *
 * Chat-template mismatch is the dominant failure mode and is kept apart from
 * the numerical drift terms because its cause is structural, not floating point.
 *
 * @param {string} templateA - chat template of the first setup.
 * @param {string} templateB - chat template of the second setup.
 * @returns {number|null} 25.0 (typical drop on multi-turn evals) when both
 *   templates are known and differ; null when they are identical or when
 *   either one is "unknown", i.e. only numerical drift applies.
 */
function templateDriftHuge(templateA, templateB) {
  const bothKnown = templateA !== "unknown" && templateB !== "unknown";
  const mismatched = bothKnown && templateA !== templateB;
  return mismatched ? 25.0 : null;
}

/**
 * Compare two benchmark runs and decide whether the gap between their scores
 * can be explained by numerical noise alone.
 *
 * @param {object} setupA - { score, framework, dtype, batch, chat_template, benchmark }
 * @param {object} setupB - same shape as setupA.
 * @returns {object} Result with:
 *   - observed_gap: |scoreA - scoreB| rounded to 2 decimals (a missing score counts as 0),
 *   - numerical_band: additive noise bound rounded to 2 decimals,
 *   - breakdown: per-source contributions { dtype, framework, batch, template_mismatch },
 *   - verdict: "bug_template" | "noise" | "suspicious" | "bug",
 *   - dominant_cause: "template_mismatch", the largest numerical contributor
 *     ("dtype" | "framework" | "batch") for suspicious/bug verdicts, or null for noise,
 *   - setup_a / setup_b: the input objects, passed through by reference.
 */
export function computeDriftBound(setupA, setupB) {
  const round2 = (value) => Math.round(value * 100) / 100;
  // Binary floating point makes e.g. 1.3 - 1.0 come out as 0.30000000000000004.
  // Without a tolerance such a gap would be classed above a 0.3 band even
  // though the rounded values reported to the caller are equal.
  const EPSILON = 1e-9;

  const dDtype = dtypeDrift(setupA.dtype, setupB.dtype);
  const dFw    = frameworkDrift(setupA.framework, setupB.framework);
  const dBatch = batchDrift(setupA.batch, setupB.batch);
  const dTpl   = templateDriftHuge(setupA.chat_template, setupB.chat_template);

  // Numerical-only bound (additive worst-case). Floor at 0.3 pts to account
  // for random-seed + run-to-run non-determinism that ALL setups have, even
  // when the configs match exactly.
  const numericalBand = Math.max(0.3, dDtype + dFw + dBatch);

  const observedGap = Math.abs((setupA.score ?? 0) - (setupB.score ?? 0));

  // Largest numerical contributor; on ties the earlier of dtype → framework →
  // batch wins (strict `>` keeps the first maximum).
  const largestContributor = () => {
    const contributions = [
      ["dtype", dDtype],
      ["framework", dFw],
      ["batch", dBatch],
    ];
    let [topName, topValue] = contributions[0];
    for (const [name, value] of contributions.slice(1)) {
      if (value > topValue) {
        topName = name;
        topValue = value;
      }
    }
    return topName;
  };

  let verdict;
  let dominantCause = null;

  if (dTpl !== null) {
    // chat-template mismatch dominates anything else by orders of magnitude
    verdict = "bug_template";
    dominantCause = "template_mismatch";
  } else if (observedGap <= numericalBand + EPSILON) {
    verdict = "noise";
  } else {
    // 1× to 2.5× the noise band → borderline: could be a real bug or just an
    // unlucky run combination. Beyond 2.5× → more than numerical noise can explain.
    verdict = observedGap <= 2.5 * numericalBand + EPSILON ? "suspicious" : "bug";
    dominantCause = largestContributor();
  }

  return {
    observed_gap: round2(observedGap),
    numerical_band: round2(numericalBand),
    breakdown: {
      dtype: round2(dDtype),
      framework: round2(dFw),
      batch: round2(dBatch),
      template_mismatch: dTpl,
    },
    verdict,
    dominant_cause: dominantCause,
    setup_a: setupA,
    setup_b: setupB,
  };
}

// Expose the framework and dtype option lists to importers of this module.
export { FRAMEWORKS, DTYPES };