Spaces:
Running
Running
File size: 5,552 Bytes
// Cross-framework drift bound (v0.7.5 anti-bullshit pack #6)
// Given two benchmark scores from different (framework, dtype, batch, chat_template)
// configurations, predicts the maximum allowable drift from numerical noise alone.
// If the observed gap exceeds this bound, flags it as a real bug — typically
// chat-template mismatch, KV-cache layout, or aggressive batching.
//
// Refs: arxiv 2506.09501 (numerical sources of LLM eval irreproducibility),
// lm-evaluation-harness issue #1841 (chat_template auto-apply).
// dtype-pair bands: additive contribution to the expected drift, in benchmark
// points, assuming a 0-100 benchmark scale (roughly halve for 0-50 scale tasks).
// Keys are `${dtypeA}-${dtypeB}`; every cross-dtype pair is listed in both
// orders, and all 5×5 combinations of bf16/fp16/fp32/int8/nf4 are present so
// none of them silently falls back to a generic default.
const DTYPE_DRIFT = {
  // same dtype, different runs → non-determinism floor
  "bf16-bf16": 0.05, "fp16-fp16": 0.05, "fp32-fp32": 0.02, "nf4-nf4": 0.10, "int8-int8": 0.08,
  // cross-precision
  "bf16-fp16": 0.30, "fp16-bf16": 0.30,
  "bf16-fp32": 0.05, "fp32-bf16": 0.05,
  "fp16-fp32": 0.10, "fp32-fp16": 0.10,
  // any quantized vs full-precision
  "bf16-int8": 0.40, "int8-bf16": 0.40,
  "bf16-nf4": 0.80, "nf4-bf16": 0.80,
  "fp16-int8": 0.40, "int8-fp16": 0.40,
  "fp16-nf4": 0.80, "nf4-fp16": 0.80,
  // NOTE(review): the fp32 rows mirror the bf16/fp16 quantized bands above;
  // they were previously absent — confirm against empirical data.
  "fp32-int8": 0.40, "int8-fp32": 0.40,
  "fp32-nf4": 0.80, "nf4-fp32": 0.80,
  // quantized vs quantized
  "int8-nf4": 0.50, "nf4-int8": 0.50,
};
// Drift attributed to the framework pair (different attention kernels, KV
// layouts, etc.), in benchmark points. Conservative — empirical reports vary
// by model. Keys are `${frameworkA}-${frameworkB}`; each unordered pair is
// listed once, under a single key order.
const FRAMEWORK_DRIFT = {
  // lm-eval-harness (hf) vs everything else
  "lm-eval-hf-vllm-served": 0.30,
  "lm-eval-hf-vllm-batched": 0.25,
  "lm-eval-hf-tgi": 0.20,
  "lm-eval-hf-transformers": 0.05,
  // vLLM serve vs the remaining frameworks
  "vllm-served-vllm-batched": 0.10,
  "vllm-served-tgi": 0.20,
  // vLLM batched vs TGI
  "vllm-batched-tgi": 0.20,
  // raw transformers vs the serving stacks
  "vllm-served-transformers": 0.30,
  "vllm-batched-transformers": 0.30,
  "tgi-transformers": 0.25,
};
// Selectable frameworks as { id, label } records: `id` is the machine key,
// `label` the human-readable name.
const FRAMEWORKS = [
  ["lm-eval-hf", "lm-eval-harness (hf)"],
  ["vllm-served", "vLLM serve (OpenAI API)"],
  ["vllm-batched", "vLLM batched (offline)"],
  ["tgi", "Text Generation Inference (TGI)"],
  ["transformers", "transformers (raw .generate)"],
].map(([id, label]) => ({ id, label }));
// Selectable dtypes as { id, label } records: `id` is the machine key,
// `label` the human-readable name.
const DTYPES = [
  ["bf16", "BF16"],
  ["fp16", "FP16"],
  ["fp32", "FP32"],
  ["int8", "int8"],
  ["nf4", "NF4 (4-bit)"],
].map(([id, label]) => ({ id, label }));
// Looks up the drift band for a dtype pair in DTYPE_DRIFT.
//   a, b — dtype ids used to build the `${a}-${b}` table key.
// Returns the table value for the pair, trying the given order first and the
// reversed order second; 0.20 when neither key is present.
function dtypeDrift(a, b) {
  const candidateKeys = [`${a}-${b}`, `${b}-${a}`];
  for (const key of candidateKeys) {
    const band = DTYPE_DRIFT[key];
    // Only a missing (null/undefined) entry moves on to the next candidate.
    if (band !== undefined && band !== null) {
      return band;
    }
  }
  return 0.20; // generic upper bound for unknown pairs
}
// Looks up the drift band for a framework pair in FRAMEWORK_DRIFT.
//   a, b — framework ids used to build the `${a}-${b}` table key.
// Returns 0.05 when both ids are identical (same framework, different
// runs/seeds), otherwise the table value for the pair, or 0.30 when the pair
// is not in the table.
function frameworkDrift(a, b) {
  if (a === b) return 0.05; // same framework, different runs/seeds
  // The table stores each pair under one key order only, and that order is not
  // alphabetical (e.g. "vllm-served-vllm-batched"), so sorting the ids before
  // the lookup misses some entries. Probe both orders to make it symmetric.
  return (
    FRAMEWORK_DRIFT[`${a}-${b}`] ??
    FRAMEWORK_DRIFT[`${b}-${a}`] ??
    0.30 // default upper bound for any cross-framework pair
  );
}
// Drift contribution from running the two setups at different batch sizes.
//   batchA, batchB — batch sizes of the two setups.
// Returns 0 when either value is falsy or both are strictly equal; otherwise
// a band chosen by the larger/smaller ratio (the smaller side is clamped to at
// least 1): ≤2 → 0.05, ≤8 → 0.10, ≤32 → 0.15, anything else → 0.20.
function batchDrift(batchA, batchB) {
  const comparable = Boolean(batchA) && Boolean(batchB) && batchA !== batchB;
  if (!comparable) {
    return 0;
  }

  const larger = Math.max(batchA, batchB);
  const smaller = Math.min(batchA, batchB);
  const ratio = larger / Math.max(1, smaller);

  // [inclusive ratio ceiling, drift band], checked in ascending order.
  const tiers = [
    [2, 0.05],
    [8, 0.10],
    [32, 0.15],
  ];
  const match = tiers.find(([ceiling]) => ratio <= ceiling);
  return match ? match[1] : 0.20;
}
// Chat-template mismatch is treated as the dominant failure mode and is kept
// apart from the numerical drift terms because its cause is structural, not
// floating point.
//   templateA, templateB — chat-template identifiers of the two setups.
// Returns null when the templates are identical or when either one is the
// string "unknown" (numerical-only comparison); otherwise the fixed value 25.0,
// which the original note describes as a typical drop on multi-turn evals.
function templateDriftHuge(templateA, templateB) {
  const identical = templateA === templateB;
  const undetermined = [templateA, templateB].includes("unknown");
  return identical || undetermined ? null : 25.0;
}
// Compares two benchmark setups and classifies the score gap between them.
//   setupA, setupB — { score, framework, dtype, batch, chat_template, benchmark }
//                    (a missing score is treated as 0).
// Returns an object with:
//   observed_gap    — |scoreA - scoreB|, rounded to 2 decimals
//   numerical_band  — allowed numerical drift, rounded to 2 decimals
//   breakdown       — per-source contributions (dtype, framework, batch) rounded
//                     to 2 decimals, plus the raw template_mismatch value
//   verdict         — "bug_template" | "noise" | "suspicious" | "bug"
//   dominant_cause  — "template_mismatch", the largest of dtype/framework/batch
//                     for "suspicious"/"bug", or null for "noise"
//   setup_a/setup_b — the input objects, passed through unchanged
export function computeDriftBound(setupA, setupB) {
  const dDtype = dtypeDrift(setupA.dtype, setupB.dtype);
  const dFw = frameworkDrift(setupA.framework, setupB.framework);
  const dBatch = batchDrift(setupA.batch, setupB.batch);
  const dTpl = templateDriftHuge(setupA.chat_template, setupB.chat_template);

  // Numerical-only bound (additive worst-case). Floor at 0.3 pts to account
  // for random-seed + run-to-run non-determinism that ALL setups have, even
  // when the configs match exactly.
  const numericalBand = Math.max(0.3, dDtype + dFw + dBatch);
  const observedGap = Math.abs((setupA.score ?? 0) - (setupB.score ?? 0));

  // Classify on integer hundredths — the same precision that is reported —
  // so float noise cannot flip the verdict at a boundary. For example,
  // 0.30 + 0.05 + 0.05 evaluates to 0.39999999999999997, which would otherwise
  // label a gap of exactly 0.4 as exceeding a band displayed as 0.4.
  const toHundredths = (value) => Math.round(value * 100);
  const gapPts = toHundredths(observedGap);
  const bandPts = toHundredths(numericalBand);

  let verdict;
  let dominantCause = null;
  if (dTpl !== null) {
    // A chat-template mismatch overrides the numerical comparison entirely.
    verdict = "bug_template";
    dominantCause = "template_mismatch";
  } else if (gapPts <= bandPts) {
    verdict = "noise";
  } else {
    // 1× to 2.5× the noise band → borderline: could be a real bug or just an
    // unlucky run combination. Beyond 2.5× → more than numerical noise explains.
    verdict = gapPts <= 2.5 * bandPts ? "suspicious" : "bug";
    // Largest contribution wins; the sort is stable, so ties resolve in the
    // order dtype, framework, batch.
    const contrib = { dtype: dDtype, framework: dFw, batch: dBatch };
    dominantCause = Object.entries(contrib).sort((a, b) => b[1] - a[1])[0][0];
  }

  return {
    observed_gap: gapPts / 100,
    numerical_band: bandPts / 100,
    breakdown: {
      dtype: toHundredths(dDtype) / 100,
      framework: toHundredths(dFw) / 100,
      batch: toHundredths(dBatch) / 100,
      template_mismatch: dTpl,
    },
    verdict,
    dominant_cause: dominantCause,
    setup_a: setupA,
    setup_b: setupB,
  };
}
export { FRAMEWORKS, DTYPES };
|