// Cross-framework drift bound (v0.7.5 anti-bullshit pack #6)
// Given two benchmark scores from different (framework, dtype, batch, chat_template)
// configurations, predicts the maximum allowable drift from numerical noise alone.
// If the observed gap exceeds this bound, flags it as a real bug — typically
// chat-template mismatch, KV-cache layout, or aggressive batching.
//
// Refs: arxiv 2506.09501 (numerical sources of LLM eval irreproducibility),
// lm-evaluation-harness issue #1841 (chat_template auto-apply).
// dtype-pair bands: additive contribution to expected drift in benchmark
// points, assuming a 0-100 benchmark scale (halve roughly for 0-50 scale
// tasks). Listed once per unordered pair; cross-dtype entries are mirrored
// so both "a-b" and "b-a" keys resolve to the same band.
const DTYPE_DRIFT = Object.fromEntries(
  [
    // same dtype, different runs → non-determinism floor
    ["bf16", "bf16", 0.05],
    ["fp16", "fp16", 0.05],
    ["fp32", "fp32", 0.02],
    ["nf4", "nf4", 0.10],
    ["int8", "int8", 0.08],
    // cross-precision
    ["bf16", "fp16", 0.30],
    ["bf16", "fp32", 0.05],
    ["fp16", "fp32", 0.10],
    // any quantized vs full-precision
    ["bf16", "int8", 0.40],
    ["bf16", "nf4", 0.80],
    ["fp16", "int8", 0.40],
    ["fp16", "nf4", 0.80],
    ["int8", "nf4", 0.50],
  ].flatMap(([a, b, band]) =>
    a === b
      ? [[`${a}-${b}`, band]]
      : [[`${a}-${b}`, band], [`${b}-${a}`, band]],
  ),
);
// framework-pair drift (different attention kernels, KV layouts, etc.).
// Conservative — empirical reports vary by model. Keys are one-directional
// ("a-b" only, in the order listed here).
const FRAMEWORK_DRIFT = Object.fromEntries([
  ["lm-eval-hf-vllm-served", 0.30],
  ["lm-eval-hf-vllm-batched", 0.25],
  ["lm-eval-hf-tgi", 0.20],
  ["lm-eval-hf-transformers", 0.05],
  ["vllm-served-vllm-batched", 0.10],
  ["vllm-served-tgi", 0.20],
  ["vllm-batched-tgi", 0.20],
  ["vllm-served-transformers", 0.30],
  ["vllm-batched-transformers", 0.30],
  ["tgi-transformers", 0.25],
]);
// Selectable serving frameworks (id is the lookup key, label is UI text).
const FRAMEWORKS = [
  ["lm-eval-hf", "lm-eval-harness (hf)"],
  ["vllm-served", "vLLM serve (OpenAI API)"],
  ["vllm-batched", "vLLM batched (offline)"],
  ["tgi", "Text Generation Inference (TGI)"],
  ["transformers", "transformers (raw .generate)"],
].map(([id, label]) => ({ id, label }));
// Selectable dtypes (id is the lookup key, label is UI text).
const DTYPES = [
  ["bf16", "BF16"],
  ["fp16", "FP16"],
  ["fp32", "FP32"],
  ["int8", "int8"],
  ["nf4", "NF4 (4-bit)"],
].map(([id, label]) => ({ id, label }));
// Drift band for a dtype pair; symmetric lookup in DTYPE_DRIFT.
// Unknown pairs fall back to a generic 0.20-point upper bound.
function dtypeDrift(a, b) {
  const forward = DTYPE_DRIFT[`${a}-${b}`];
  if (forward !== undefined) return forward;
  return DTYPE_DRIFT[`${b}-${a}`] ?? 0.20;
}
// Drift band for a framework pair.
//
// BUG FIX: the previous version sorted the pair lexicographically and looked
// up only `${sorted[0]}-${sorted[1]}`, but FRAMEWORK_DRIFT keys are NOT in
// sorted order — e.g. the table has "vllm-served-vllm-batched" while sorting
// produces "vllm-batched-vllm-served". Five of the ten table entries
// (every key involving vllm-* paired with tgi/transformers/each other) were
// unreachable and silently fell to the 0.30 default. Try both orderings
// instead, matching dtypeDrift's symmetric lookup.
function frameworkDrift(a, b) {
  if (a === b) return 0.05; // same framework, different runs/seeds
  return (
    FRAMEWORK_DRIFT[`${a}-${b}`] ??
    FRAMEWORK_DRIFT[`${b}-${a}`] ??
    0.30 // default upper bound for any unlisted cross-framework pair
  );
}
// Drift band from differing batch sizes. Equal, missing, or zero batch
// sizes contribute nothing; otherwise the band grows with the size ratio.
function batchDrift(batchA, batchB) {
  if (!batchA || !batchB || batchA === batchB) return 0;
  const smaller = Math.min(batchA, batchB);
  const larger = Math.max(batchA, batchB);
  const ratio = larger / Math.max(1, smaller);
  const bands = [
    [2, 0.05],
    [8, 0.10],
    [32, 0.15],
  ];
  for (const [limit, band] of bands) {
    if (ratio <= limit) return band;
  }
  return 0.20;
}
// Chat-template mismatch is the dominant failure mode — separated from
// numerical drift because the cause is structural, not floating point.
// Returns null when the templates match or either side is "unknown"
// (i.e. only numerical drift applies), otherwise a large fixed penalty.
function templateDriftHuge(templateA, templateB) {
  const bothKnown = templateA !== "unknown" && templateB !== "unknown";
  if (!bothKnown || templateA === templateB) return null;
  return 25.0; // typical drop on multi-turn evals; user will swamp this
}
// Compare two benchmark setups and decide whether their score gap is
// explainable by numerical noise alone.
//
// @param {object} setupA - { score, framework, dtype, batch, chat_template, benchmark }
// @param {object} setupB - same shape as setupA
// @returns {object} { observed_gap, numerical_band, breakdown, verdict,
//   dominant_cause, setup_a, setup_b } where verdict is one of
//   "bug_template" | "noise" | "suspicious" | "bug".
//
// Refactor: the "suspicious" and "bug" branches previously duplicated the
// dominant-contributor computation verbatim; it is now a single helper.
// Behavior and the returned shape are unchanged.
export function computeDriftBound(setupA, setupB) {
  const dDtype = dtypeDrift(setupA.dtype, setupB.dtype);
  const dFw = frameworkDrift(setupA.framework, setupB.framework);
  const dBatch = batchDrift(setupA.batch, setupB.batch);
  const dTpl = templateDriftHuge(setupA.chat_template, setupB.chat_template);

  // Numerical-only bound (additive worst-case). Floor at 0.3 pts to account
  // for random-seed + run-to-run non-determinism that ALL setups have, even
  // when the configs match exactly.
  const numericalBand = Math.max(0.3, dDtype + dFw + dBatch);
  const observedGap = Math.abs((setupA.score ?? 0) - (setupB.score ?? 0));

  // Round to 2 decimal places for display.
  const round2 = (x) => Math.round(x * 100) / 100;
  // Name of the largest numerical contributor (stable sort keeps the
  // dtype → framework → batch order on ties, as before).
  const largestContributor = () =>
    Object.entries({ dtype: dDtype, framework: dFw, batch: dBatch }).sort(
      (p, q) => q[1] - p[1],
    )[0][0];

  let verdict;
  let dominantCause = null;
  if (dTpl !== null) {
    // chat-template mismatch dominates anything else by orders of magnitude
    verdict = "bug_template";
    dominantCause = "template_mismatch";
  } else if (observedGap <= numericalBand) {
    verdict = "noise";
  } else if (observedGap <= 2.5 * numericalBand) {
    // 1× to 2.5× the noise band → borderline. Could be a real bug or just an
    // unlucky run combination. User should investigate before claiming a fix.
    verdict = "suspicious";
    dominantCause = largestContributor();
  } else {
    // > 2.5× → definitely beyond what numerical noise can explain.
    verdict = "bug";
    dominantCause = largestContributor();
  }

  return {
    observed_gap: round2(observedGap),
    numerical_band: round2(numericalBand),
    breakdown: {
      dtype: round2(dDtype),
      framework: round2(dFw),
      batch: round2(dBatch),
      template_mismatch: dTpl,
    },
    verdict,
    dominant_cause: dominantCause,
    setup_a: setupA,
    setup_b: setupB,
  };
}
| export { FRAMEWORKS, DTYPES }; | |