File size: 5,552 Bytes
edb4038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// Cross-framework drift bound (v0.7.5 anti-bullshit pack #6)
// Given two benchmark scores from different (framework, dtype, batch, chat_template)
// configurations, predicts the maximum allowable drift from numerical noise alone.
// If the observed gap exceeds this bound, flags it as a real bug — typically
// chat-template mismatch, KV-cache layout, or aggressive batching.
//
// Refs: arxiv 2506.09501 (numerical sources of LLM eval irreproducibility),
//       lm-evaluation-harness issue #1841 (chat_template auto-apply).

// dtype-pair bands (additive contribution to expected drift in benchmark points,
// assuming a 0-100 benchmark scale; halve roughly for 0-50 scale tasks).
// Keys are `${dtypeA}-${dtypeB}`. Every cross pair is listed in both orders,
// and every pairing of bf16 / fp16 / fp32 / int8 / nf4 has an explicit entry so
// none of them silently falls back to the generic band used for unknown pairs.
const DTYPE_DRIFT = {
  // same dtype, different runs → non-determinism floor
  "bf16-bf16": 0.05, "fp16-fp16": 0.05, "fp32-fp32": 0.02, "nf4-nf4": 0.10, "int8-int8": 0.08,
  // cross-precision
  "bf16-fp16": 0.30, "fp16-bf16": 0.30,
  "bf16-fp32": 0.05, "fp32-bf16": 0.05,
  "fp16-fp32": 0.10, "fp32-fp16": 0.10,
  // any quantized vs full-precision
  "bf16-int8": 0.40, "int8-bf16": 0.40,
  "bf16-nf4":  0.80, "nf4-bf16":  0.80,
  "fp16-int8": 0.40, "int8-fp16": 0.40,
  "fp16-nf4":  0.80, "nf4-fp16":  0.80,
  // fp32 baselines use the same bands as the bf16/fp16 rows above; without
  // these rows an fp32-vs-quantized comparison would get a much tighter band
  // than the equivalent bf16/fp16 comparison. Tune if fp32-specific data exists.
  "fp32-int8": 0.40, "int8-fp32": 0.40,
  "fp32-nf4":  0.80, "nf4-fp32":  0.80,
  // quantized vs quantized
  "int8-nf4":  0.50, "nf4-int8":  0.50,
};

// framework-pair drift (different attention kernels, KV layouts, etc.).
// Conservative — empirical reports vary by model.
// Keys are `${frameworkA}-${frameworkB}`; each unordered pair of distinct
// frameworks appears exactly once. Values are benchmark points.
const FRAMEWORK_DRIFT = {
  // lm-eval-harness (hf) against the others
  "lm-eval-hf-vllm-served": 0.30,
  "lm-eval-hf-vllm-batched": 0.25,
  "lm-eval-hf-tgi": 0.20,
  "lm-eval-hf-transformers": 0.05,
  // vLLM serve / vLLM batched against each other and TGI
  "vllm-served-vllm-batched": 0.10,
  "vllm-served-tgi": 0.20,
  "vllm-batched-tgi": 0.20,
  // everything else against raw transformers
  "vllm-served-transformers": 0.30,
  "vllm-batched-transformers": 0.30,
  "tgi-transformers": 0.25,
};

// Selectable frameworks as { id, label } records: `id` is the token used in
// FRAMEWORK_DRIFT keys, `label` is the human-readable name.
const FRAMEWORKS = [
  ["lm-eval-hf", "lm-eval-harness (hf)"],
  ["vllm-served", "vLLM serve (OpenAI API)"],
  ["vllm-batched", "vLLM batched (offline)"],
  ["tgi", "Text Generation Inference (TGI)"],
  ["transformers", "transformers (raw .generate)"],
].map(([id, label]) => ({ id, label }));

// Selectable precisions as { id, label } records: `id` is the token used in
// DTYPE_DRIFT keys, `label` is the display name.
const DTYPES = [
  ["bf16", "BF16"],
  ["fp16", "FP16"],
  ["fp32", "FP32"],
  ["int8", "int8"],
  ["nf4", "NF4 (4-bit)"],
].map(([id, label]) => ({ id, label }));

/**
 * Drift band contributed by the precision pair.
 *
 * @param {string} a - dtype id of the first setup
 * @param {string} b - dtype id of the second setup
 * @returns {number} the DTYPE_DRIFT entry for the pair (tried as `a-b`, then
 *   `b-a`), or 0.20 when neither key is present
 */
function dtypeDrift(a, b) {
  const candidates = [`${a}-${b}`, `${b}-${a}`];
  for (const key of candidates) {
    const band = DTYPE_DRIFT[key];
    // only null/undefined count as "missing"; a band of 0 is a valid entry
    if (band !== undefined && band !== null) return band;
  }
  return 0.20; // generic upper bound for unknown pairs
}

/**
 * Drift band contributed by the inference-framework pair.
 *
 * @param {string} a - framework id of the first setup
 * @param {string} b - framework id of the second setup
 * @returns {number} 0.05 when both ids are identical, otherwise the
 *   FRAMEWORK_DRIFT entry for the pair, or 0.30 when the pair is not listed
 */
function frameworkDrift(a, b) {
  if (a === b) return 0.05; // same framework, different runs/seeds
  // FRAMEWORK_DRIFT stores each pair once and its keys are not in alphabetical
  // order (e.g. "vllm-served-vllm-batched", "vllm-served-tgi"), so sorting the
  // ids before building the key misses those entries. Try both orderings.
  return (
    FRAMEWORK_DRIFT[`${a}-${b}`] ??
    FRAMEWORK_DRIFT[`${b}-${a}`] ??
    0.30 // default upper bound for any cross-framework pair not in the table
  );
}

/**
 * Drift band contributed by a difference in batch size.
 *
 * @param {number} batchA - batch size of the first setup
 * @param {number} batchB - batch size of the second setup
 * @returns {number} 0 when either size is falsy or both are equal; otherwise
 *   a band stepped on the larger/smaller ratio: ≤2 → 0.05, ≤8 → 0.10,
 *   ≤32 → 0.15, above → 0.20
 */
function batchDrift(batchA, batchB) {
  const comparable = batchA && batchB && batchA !== batchB;
  if (!comparable) return 0;

  const larger = Math.max(batchA, batchB);
  const smaller = Math.min(batchA, batchB);
  // the denominator is clamped to at least 1
  const ratio = larger / Math.max(1, smaller);

  const tiers = [
    [2, 0.05],
    [8, 0.10],
    [32, 0.15],
  ];
  const hit = tiers.find(([ceiling]) => ratio <= ceiling);
  return hit ? hit[1] : 0.20;
}

// Chat-template mismatch is the dominant failure mode — separated from numerical
// drift because the cause is structural, not floating point.
/**
 * Flags a chat-template mismatch between two setups.
 *
 * @param {string} templateA - chat template identifier of the first setup
 * @param {string} templateB - chat template identifier of the second setup
 * @returns {number|null} null when the templates are identical or either one
 *   is the literal "unknown"; otherwise 25.0
 */
function templateDriftHuge(templateA, templateB) {
  const bothKnown = templateA !== "unknown" && templateB !== "unknown";
  const mismatched = bothKnown && templateA !== templateB;
  // 25.0 stands for the typical drop on multi-turn evals
  return mismatched ? 25.0 : null;
}

/**
 * Compares two benchmark setups and classifies the score gap between them.
 *
 * Reads `dtype`, `framework`, `batch`, `chat_template` and `score` from each
 * setup (a missing score counts as 0).
 *
 * Verdicts:
 *  - "bug_template": templateDriftHuge reports a mismatch (checked first)
 *  - "noise":        gap ≤ numerical band
 *  - "suspicious":   gap ≤ 2.5 × numerical band
 *  - "bug":          anything larger
 *
 * @param {object} setupA - { score, framework, dtype, batch, chat_template, benchmark }
 * @param {object} setupB - same shape as setupA
 * @returns {object} observed_gap, numerical_band and breakdown (numeric parts
 *   rounded to 2 decimals), verdict, dominant_cause, and the two setups passed in
 */
export function computeDriftBound(setupA, setupB) {
  const toHundredths = (value) => Math.round(value * 100) / 100;

  // Per-source numerical contributions, in the order used for tie-breaking.
  const contributions = {
    dtype: dtypeDrift(setupA.dtype, setupB.dtype),
    framework: frameworkDrift(setupA.framework, setupB.framework),
    batch: batchDrift(setupA.batch, setupB.batch),
  };
  const templatePenalty = templateDriftHuge(setupA.chat_template, setupB.chat_template);

  // Additive worst case, floored at 0.3 pts for seed / run-to-run
  // non-determinism that exists even when the configs match exactly.
  const additive = contributions.dtype + contributions.framework + contributions.batch;
  const numericalBand = Math.max(0.3, additive);

  const observedGap = Math.abs((setupA.score ?? 0) - (setupB.score ?? 0));

  // Name of the largest contribution; on ties the earliest key wins.
  const largestContributor = () => {
    let winner = null;
    for (const [name, size] of Object.entries(contributions)) {
      if (winner === null || size > winner.size) winner = { name, size };
    }
    return winner.name;
  };

  const classify = () => {
    // a chat-template mismatch takes precedence over any numerical reading
    if (templatePenalty !== null) return ["bug_template", "template_mismatch"];
    if (observedGap <= numericalBand) return ["noise", null];
    // 1× to 2.5× the band is borderline; beyond that noise cannot explain it
    const label = observedGap <= 2.5 * numericalBand ? "suspicious" : "bug";
    return [label, largestContributor()];
  };
  const [verdict, dominantCause] = classify();

  return {
    observed_gap: toHundredths(observedGap),
    numerical_band: toHundredths(numericalBand),
    breakdown: {
      dtype: toHundredths(contributions.dtype),
      framework: toHundredths(contributions.framework),
      batch: toHundredths(contributions.batch),
      template_mismatch: templatePenalty,
    },
    verdict,
    dominant_cause: dominantCause,
    setup_a: setupA,
    setup_b: setupB,
  };
}

export { FRAMEWORKS, DTYPES };