// Cross-framework drift bound (v0.7.5 anti-bullshit pack #6)
// Given two benchmark scores from different (framework, dtype, batch, chat_template)
// configurations, predicts the maximum allowable drift from numerical noise alone.
// If the observed gap exceeds this bound, flags it as a real bug — typically
// chat-template mismatch, KV-cache layout, or aggressive batching.
//
// Refs: arxiv 2506.09501 (numerical sources of LLM eval irreproducibility),
// lm-evaluation-harness issue #1841 (chat_template auto-apply).
// dtype-pair bands: additive contribution to expected drift in benchmark
// points, assuming a 0-100 benchmark scale (halve roughly for 0-50 scale
// tasks). Listed once per unordered pair; cross-dtype entries are mirrored
// so both "a-b" and "b-a" keys resolve to the same band.
const DTYPE_DRIFT = Object.fromEntries(
  [
    // same dtype, different runs → non-determinism floor
    ["bf16", "bf16", 0.05],
    ["fp16", "fp16", 0.05],
    ["fp32", "fp32", 0.02],
    ["nf4", "nf4", 0.10],
    ["int8", "int8", 0.08],
    // cross-precision
    ["bf16", "fp16", 0.30],
    ["bf16", "fp32", 0.05],
    ["fp16", "fp32", 0.10],
    // any quantized vs full-precision
    ["bf16", "int8", 0.40],
    ["bf16", "nf4", 0.80],
    ["fp16", "int8", 0.40],
    ["fp16", "nf4", 0.80],
    ["int8", "nf4", 0.50],
  ].flatMap(([a, b, band]) =>
    a === b
      ? [[`${a}-${b}`, band]]
      : [[`${a}-${b}`, band], [`${b}-${a}`, band]],
  ),
);
// framework-pair drift (different attention kernels, KV layouts, etc.).
// Conservative — empirical reports vary by model. Keys are one-directional
// ("a-b" only, in the order listed here).
const FRAMEWORK_DRIFT = Object.fromEntries([
  ["lm-eval-hf-vllm-served", 0.30],
  ["lm-eval-hf-vllm-batched", 0.25],
  ["lm-eval-hf-tgi", 0.20],
  ["lm-eval-hf-transformers", 0.05],
  ["vllm-served-vllm-batched", 0.10],
  ["vllm-served-tgi", 0.20],
  ["vllm-batched-tgi", 0.20],
  ["vllm-served-transformers", 0.30],
  ["vllm-batched-transformers", 0.30],
  ["tgi-transformers", 0.25],
]);
// Selectable serving frameworks (id is the lookup key, label is UI text).
const FRAMEWORKS = [
  ["lm-eval-hf", "lm-eval-harness (hf)"],
  ["vllm-served", "vLLM serve (OpenAI API)"],
  ["vllm-batched", "vLLM batched (offline)"],
  ["tgi", "Text Generation Inference (TGI)"],
  ["transformers", "transformers (raw .generate)"],
].map(([id, label]) => ({ id, label }));
// Selectable dtypes (id is the lookup key, label is UI text).
const DTYPES = [
  ["bf16", "BF16"],
  ["fp16", "FP16"],
  ["fp32", "FP32"],
  ["int8", "int8"],
  ["nf4", "NF4 (4-bit)"],
].map(([id, label]) => ({ id, label }));
// Drift band for a dtype pair; symmetric lookup in DTYPE_DRIFT.
// Unknown pairs fall back to a generic 0.20-point upper bound.
function dtypeDrift(a, b) {
  const forward = DTYPE_DRIFT[`${a}-${b}`];
  if (forward !== undefined) return forward;
  return DTYPE_DRIFT[`${b}-${a}`] ?? 0.20;
}
// Drift band for a framework pair.
//
// BUG FIX: the previous version sorted the pair lexicographically and looked
// up only `${sorted[0]}-${sorted[1]}`, but FRAMEWORK_DRIFT keys are NOT in
// sorted order — e.g. the table has "vllm-served-vllm-batched" while sorting
// produces "vllm-batched-vllm-served". Five of the ten table entries
// (every key involving vllm-* paired with tgi/transformers/each other) were
// unreachable and silently fell to the 0.30 default. Try both orderings
// instead, matching dtypeDrift's symmetric lookup.
function frameworkDrift(a, b) {
  if (a === b) return 0.05; // same framework, different runs/seeds
  return (
    FRAMEWORK_DRIFT[`${a}-${b}`] ??
    FRAMEWORK_DRIFT[`${b}-${a}`] ??
    0.30 // default upper bound for any unlisted cross-framework pair
  );
}
// Drift band from differing batch sizes. Equal, missing, or zero batch
// sizes contribute nothing; otherwise the band grows with the size ratio.
function batchDrift(batchA, batchB) {
  if (!batchA || !batchB || batchA === batchB) return 0;
  const smaller = Math.min(batchA, batchB);
  const larger = Math.max(batchA, batchB);
  const ratio = larger / Math.max(1, smaller);
  const bands = [
    [2, 0.05],
    [8, 0.10],
    [32, 0.15],
  ];
  for (const [limit, band] of bands) {
    if (ratio <= limit) return band;
  }
  return 0.20;
}
// Chat-template mismatch is the dominant failure mode — separated from
// numerical drift because the cause is structural, not floating point.
// Returns null when the templates match or either side is "unknown"
// (i.e. only numerical drift applies), otherwise a large fixed penalty.
function templateDriftHuge(templateA, templateB) {
  const bothKnown = templateA !== "unknown" && templateB !== "unknown";
  if (!bothKnown || templateA === templateB) return null;
  return 25.0; // typical drop on multi-turn evals; user will swamp this
}
// Compare two benchmark setups and decide whether their score gap is
// explainable by numerical noise alone.
//
// @param {object} setupA - { score, framework, dtype, batch, chat_template, benchmark }
// @param {object} setupB - same shape as setupA
// @returns {object} { observed_gap, numerical_band, breakdown, verdict,
//   dominant_cause, setup_a, setup_b } where verdict is one of
//   "bug_template" | "noise" | "suspicious" | "bug".
//
// Refactor: the "suspicious" and "bug" branches previously duplicated the
// dominant-contributor computation verbatim; it is now a single helper.
// Behavior and the returned shape are unchanged.
export function computeDriftBound(setupA, setupB) {
  const dDtype = dtypeDrift(setupA.dtype, setupB.dtype);
  const dFw = frameworkDrift(setupA.framework, setupB.framework);
  const dBatch = batchDrift(setupA.batch, setupB.batch);
  const dTpl = templateDriftHuge(setupA.chat_template, setupB.chat_template);

  // Numerical-only bound (additive worst-case). Floor at 0.3 pts to account
  // for random-seed + run-to-run non-determinism that ALL setups have, even
  // when the configs match exactly.
  const numericalBand = Math.max(0.3, dDtype + dFw + dBatch);
  const observedGap = Math.abs((setupA.score ?? 0) - (setupB.score ?? 0));

  // Round to 2 decimal places for display.
  const round2 = (x) => Math.round(x * 100) / 100;
  // Name of the largest numerical contributor (stable sort keeps the
  // dtype → framework → batch order on ties, as before).
  const largestContributor = () =>
    Object.entries({ dtype: dDtype, framework: dFw, batch: dBatch }).sort(
      (p, q) => q[1] - p[1],
    )[0][0];

  let verdict;
  let dominantCause = null;
  if (dTpl !== null) {
    // chat-template mismatch dominates anything else by orders of magnitude
    verdict = "bug_template";
    dominantCause = "template_mismatch";
  } else if (observedGap <= numericalBand) {
    verdict = "noise";
  } else if (observedGap <= 2.5 * numericalBand) {
    // 1× to 2.5× the noise band → borderline. Could be a real bug or just an
    // unlucky run combination. User should investigate before claiming a fix.
    verdict = "suspicious";
    dominantCause = largestContributor();
  } else {
    // > 2.5× → definitely beyond what numerical noise can explain.
    verdict = "bug";
    dominantCause = largestContributor();
  }

  return {
    observed_gap: round2(observedGap),
    numerical_band: round2(numericalBand),
    breakdown: {
      dtype: round2(dDtype),
      framework: round2(dFw),
      batch: round2(dBatch),
      template_mismatch: dTpl,
    },
    verdict,
    dominant_cause: dominantCause,
    setup_a: setupA,
    setup_b: setupB,
  };
}
| export { FRAMEWORKS, DTYPES }; | |