taf-agent / js /cross_drift.js
karlexmarin's picture
v0.7.5: Cross-framework drift bound (anti-bullshit #6) + autocomplete UX
edb4038
// Cross-framework drift bound (v0.7.5 anti-bullshit pack #6)
// Given two benchmark scores from different (framework, dtype, batch, chat_template)
// configurations, predicts the maximum allowable drift from numerical noise alone.
// If the observed gap exceeds this bound, flags it as a real bug — typically
// chat-template mismatch, KV-cache layout, or aggressive batching.
//
// Refs: arxiv 2506.09501 (numerical sources of LLM eval irreproducibility),
// lm-evaluation-harness issue #1841 (chat_template auto-apply).
// dtype-pair bands: additive contribution to the expected drift, in benchmark
// points on a 0-100 benchmark scale (halve roughly for 0-50 scale tasks).
// Keys are `${dtypeA}-${dtypeB}`; both orientations of every pair are listed.
const DTYPE_DRIFT = {
  // same dtype, different runs → non-determinism floor
  "bf16-bf16": 0.05, "fp16-fp16": 0.05, "fp32-fp32": 0.02, "nf4-nf4": 0.10, "int8-int8": 0.08,
  // cross-precision
  "bf16-fp16": 0.30, "fp16-bf16": 0.30,
  "bf16-fp32": 0.05, "fp32-bf16": 0.05,
  "fp16-fp32": 0.10, "fp32-fp16": 0.10,
  // any quantized vs full-precision
  "bf16-int8": 0.40, "int8-bf16": 0.40,
  "bf16-nf4": 0.80, "nf4-bf16": 0.80,
  "fp16-int8": 0.40, "int8-fp16": 0.40,
  "fp16-nf4": 0.80, "nf4-fp16": 0.80,
  // fp32 vs quantized was missing, so those pairs fell through to the 0.20
  // generic default in dtypeDrift — lower than every other quantized-vs-float
  // band above. fp32 is within 0.05 of bf16, so it reuses the bf16 bands.
  "fp32-int8": 0.40, "int8-fp32": 0.40,
  "fp32-nf4": 0.80, "nf4-fp32": 0.80,
  // quantized vs quantized
  "int8-nf4": 0.50, "nf4-int8": 0.50,
};
// Framework-pair drift in benchmark points (different attention kernels, KV
// layouts, etc.). Conservative — empirical reports vary by model.
// Each triple is (framework id, framework id, points); the lookup key is the
// two ids joined with "-", in exactly the order written here.
// NOTE(review): several pairs are not in alphabetical id order (e.g.
// "vllm-served" before "tgi"), so a lookup has to try both orientations.
const FRAMEWORK_DRIFT = Object.fromEntries(
  [
    ["lm-eval-hf", "vllm-served", 0.30],
    ["lm-eval-hf", "vllm-batched", 0.25],
    ["lm-eval-hf", "tgi", 0.20],
    ["lm-eval-hf", "transformers", 0.05],
    ["vllm-served", "vllm-batched", 0.10],
    ["vllm-served", "tgi", 0.20],
    ["vllm-batched", "tgi", 0.20],
    ["vllm-served", "transformers", 0.30],
    ["vllm-batched", "transformers", 0.30],
    ["tgi", "transformers", 0.25],
  ].map(([first, second, points]) => [`${first}-${second}`, points]),
);
// Known framework options as { id, label } records; `id` is the value used to
// build FRAMEWORK_DRIFT lookup keys, `label` is the human-readable name.
const FRAMEWORKS = [
  ["lm-eval-hf", "lm-eval-harness (hf)"],
  ["vllm-served", "vLLM serve (OpenAI API)"],
  ["vllm-batched", "vLLM batched (offline)"],
  ["tgi", "Text Generation Inference (TGI)"],
  ["transformers", "transformers (raw .generate)"],
].map(([id, label]) => ({ id, label }));
// Known dtype options as { id, label } records; `id` is the value used to
// build DTYPE_DRIFT lookup keys, `label` is the human-readable name.
const DTYPES = [
  ["bf16", "BF16"],
  ["fp16", "FP16"],
  ["fp32", "FP32"],
  ["int8", "int8"],
  ["nf4", "NF4 (4-bit)"],
].map(([id, label]) => ({ id, label }));
/**
 * Expected drift contribution (benchmark points) for a pair of dtypes.
 *
 * Looks the pair up in DTYPE_DRIFT in the given order first, then reversed.
 *
 * @param {string} a - dtype id of the first setup.
 * @param {string} b - dtype id of the second setup.
 * @returns {number} The table value, or 0.20 when neither orientation is listed.
 */
function dtypeDrift(a, b) {
  const GENERIC_UPPER_BOUND = 0.20; // used for pairs the table does not know
  for (const key of [`${a}-${b}`, `${b}-${a}`]) {
    const band = DTYPE_DRIFT[key];
    // Only null/undefined count as "missing"; a listed 0 is a valid band.
    if (band !== undefined && band !== null) {
      return band;
    }
  }
  return GENERIC_UPPER_BOUND;
}
/**
 * Expected drift contribution (benchmark points) for a pair of frameworks.
 *
 * @param {string} a - framework id of the first setup.
 * @param {string} b - framework id of the second setup.
 * @returns {number} 0.05 for the same framework, the FRAMEWORK_DRIFT entry for
 *   a listed pair (in either order), or 0.30 for an unlisted pair.
 */
function frameworkDrift(a, b) {
  if (a === b) return 0.05; // same framework, different runs/seeds
  // Probe both orientations instead of sorting the pair: the table's keys are
  // not all in sorted-id order (e.g. "vllm-served-tgi"), so a sorted lookup
  // missed those entries and silently returned the default.
  return (
    FRAMEWORK_DRIFT[`${a}-${b}`] ??
    FRAMEWORK_DRIFT[`${b}-${a}`] ??
    0.30 // default upper bound for any cross-framework pair
  );
}
/**
 * Expected drift contribution (benchmark points) from differing batch sizes.
 *
 * @param {number} batchA - batch size of the first setup.
 * @param {number} batchB - batch size of the second setup.
 * @returns {number} 0 when either batch is falsy or both are identical;
 *   otherwise a band that grows with the larger/smaller ratio:
 *   ≤2 → 0.05, ≤8 → 0.10, ≤32 → 0.15, above → 0.20.
 */
function batchDrift(batchA, batchB) {
  const comparable = Boolean(batchA) && Boolean(batchB) && batchA !== batchB;
  if (!comparable) {
    return 0;
  }

  const larger = Math.max(batchA, batchB);
  const smaller = Math.min(batchA, batchB);
  // Clamp the divisor to 1 so a sub-unit batch value cannot inflate the ratio.
  const ratio = larger / Math.max(1, smaller);

  // [ratio ceiling, drift points], checked in ascending order.
  const steps = [
    [2, 0.05],
    [8, 0.10],
    [32, 0.15],
  ];
  const step = steps.find(([ceiling]) => ratio <= ceiling);
  return step ? step[1] : 0.20;
}
// Chat-template mismatch is the dominant failure mode — separated from numerical
// drift because the cause is structural, not floating point.
/**
 * Detects a structural chat-template mismatch between two setups.
 *
 * @param {string} templateA - chat template of the first setup.
 * @param {string} templateB - chat template of the second setup.
 * @returns {number|null} null when the templates are identical or either one
 *   is "unknown" (only numerical drift applies); otherwise 25.0, the original
 *   author's figure for a typical drop on multi-turn evals.
 */
function templateDriftHuge(templateA, templateB) {
  const eitherUnknown = [templateA, templateB].includes("unknown");
  const isMismatch = templateA !== templateB && !eitherUnknown;
  return isMismatch ? 25.0 : null;
}
/**
 * Compares two benchmark setups and classifies the gap between their scores.
 *
 * Each setup is `{ score, framework, dtype, batch, chat_template, benchmark }`.
 *
 * @param {object} setupA - first setup.
 * @param {object} setupB - second setup.
 * @returns {object} `{ observed_gap, numerical_band, breakdown, verdict,
 *   dominant_cause, setup_a, setup_b }`. `verdict` is one of "bug_template",
 *   "noise", "suspicious" or "bug". `dominant_cause` is "template_mismatch",
 *   the largest of dtype/framework/batch, or null for "noise". Numeric fields
 *   are rounded to two decimals; the setups are returned by reference.
 */
export function computeDriftBound(setupA, setupB) {
  const toHundredths = (value) => Math.round(value * 100) / 100;

  // Per-source numerical contributions, in benchmark points.
  const parts = {
    dtype: dtypeDrift(setupA.dtype, setupB.dtype),
    framework: frameworkDrift(setupA.framework, setupB.framework),
    batch: batchDrift(setupA.batch, setupB.batch),
  };
  const templatePenalty = templateDriftHuge(setupA.chat_template, setupB.chat_template);

  // Additive worst case, floored at 0.3 pts: random-seed and run-to-run
  // non-determinism affect every setup, even when the configs match exactly.
  const band = Math.max(0.3, parts.dtype + parts.framework + parts.batch);
  // A missing score is treated as 0.
  const gap = Math.abs((setupA.score ?? 0) - (setupB.score ?? 0));

  // Largest contributor; the stable sort keeps dtype → framework → batch on ties.
  const largestPart = () =>
    Object.entries(parts).sort(([, left], [, right]) => right - left)[0][0];

  let verdict;
  let dominantCause = null;
  if (templatePenalty !== null) {
    // A chat-template mismatch overrides every numerical consideration,
    // whatever the observed gap is.
    verdict = "bug_template";
    dominantCause = "template_mismatch";
  } else if (gap <= band) {
    verdict = "noise";
  } else {
    // 1× to 2.5× the band is borderline (real bug or unlucky runs);
    // beyond 2.5× numerical noise cannot explain the gap.
    verdict = gap <= 2.5 * band ? "suspicious" : "bug";
    dominantCause = largestPart();
  }

  return {
    observed_gap: toHundredths(gap),
    numerical_band: toHundredths(band),
    breakdown: {
      dtype: toHundredths(parts.dtype),
      framework: toHundredths(parts.framework),
      batch: toHundredths(parts.batch),
      template_mismatch: templatePenalty,
    },
    verdict,
    dominant_cause: dominantCause,
    setup_a: setupA,
    setup_b: setupB,
  };
}
// Also expose the framework and dtype option lists as named exports.
export { FRAMEWORKS, DTYPES };