Spaces:

karlexmarin
/

taf-agent

Running

App Files Files Community

taf-agent / scripts /test_longscore_e2e.mjs

karlexmarin's picture

v0.8.8 LongScore mode — anti-bullshit pack #14 + Hub badge readability fix

ebabb49 5 days ago

history blame contribute delete

1.51 kB

	// E2E lookup smoke test for the 3 example buttons (Jamba/Llama/dbrx), a HELMET-only model, and an unknown model that must miss.
	import { readFileSync } from "fs";
	// Replace the global fetch with a disk-backed stub so lookups read local JSON files.
	// A URL starting with "data/" is prefixed with "./"; any other URL is used as the
	// file path unchanged. The file is only read when json() is called, and the
	// response always reports ok: true.
	globalThis.fetch = async (url) => {
	  let filePath = url;
	  if (url.startsWith("data/")) {
	    filePath = `./${url}`;
	  }
	  return {
	    ok: true,
	    async json() {
	      const text = readFileSync(filePath, "utf-8");
	      return JSON.parse(text);
	    },
	  };
	};

	const { lookup } = await import("../js/longscore.js");

	// Lookup expectations as [input, expected code, expected verdict?] rows.
	// Rows without a verdict produce an `expect` object that has no `verdict` key.
	const cases = [
	  ["Jamba-1.5-Large", "ruler_hit", "no_degradation"],
	  ["Llama-3.1-70B-Instruct", "ruler_hit", "moderate"],
	  ["dbrx", "ruler_hit", "extreme"],
	  ["GPT-4", "helmet_only"], // HELMET-only
	  ["totally-fake-model-xyz", "miss"],
	].map(([input, code, verdict]) => ({
	  input,
	  expect: verdict === undefined ? { code } : { code, verdict },
	}));

	// Run every case through lookup(), print one line per case, then a summary.
	// A case passes when the returned `code` matches and, if the case specifies a
	// verdict, the returned `verdict` matches as well. The process exits with
	// status 1 if any case failed, 0 otherwise.
	let pass = 0;
	let fail = 0;
	for (const c of cases) {
	  const r = await lookup(c.input);
	  const codeMatches = r.code === c.expect.code;
	  // Cases that declare no expected verdict are judged on the code alone.
	  const verdictMatches = !c.expect.verdict || r.verdict === c.expect.verdict;
	  if (codeMatches && verdictMatches) {
	    pass++;
	    let score = "";
	    if (r.ruler_long_score) {
	      score = `LongScore=${(r.ruler_long_score.avg_lc * 100).toFixed(1)}%`;
	    } else if (r.helmet) {
	      score = `HELMET overall=${r.helmet.overall}`;
	    }
	    // Apply the "n/a" fallback BEFORE padding: a method call binds tighter than ||,
	    // so without the parentheses only the fallback would be padded and real
	    // verdicts would break the column alignment.
	    const verdictCol = String(r.verdict || "n/a").padEnd(15);
	    console.log(` ✓ ${c.input.padEnd(30)} → ${r.code.padEnd(12)} ${verdictCol} ${score}`);
	  } else {
	    fail++;
	    console.log(` ✗ ${c.input.padEnd(30)} → got code=${r.code} verdict=${r.verdict}, expected=${JSON.stringify(c.expect)}`);
	  }
	}
	console.log(`\n${pass}/${pass + fail} cases pass`);
	process.exit(fail > 0 ? 1 : 0);