Spaces:
Running
Running
File size: 3,189 Bytes
// Smoke test for js/longscore.js — verifies normalize, classify, lookup result codes, and rank ordering.
// Run: node scripts/test_longscore.mjs
import { readFileSync } from "fs";
// Filesystem-backed replacement for fetch() when running under Node.
// A URL that begins with "data/" is read as "./data/..." (relative to the
// current working directory); any other value is handed to readFileSync as-is.
// A read failure rejects the returned promise.
globalThis.fetch = async (url) => {
  let filePath = url;
  if (url.startsWith("data/")) {
    filePath = `./${url}`;
  }
  const body = readFileSync(filePath, "utf-8");
  // Only `ok` and `json()` are provided; the text is parsed when json() is called.
  const response = {
    ok: true,
    json: async () => JSON.parse(body),
  };
  return response;
};
const { normalize, lookup, classify, rank } = await import("../js/longscore.js");
// Running tallies of passed and failed checks, updated by check().
let pass = 0;
let fail = 0;

/**
 * Record the outcome of one check and print a one-line result.
 * @param {string} name - label printed after the ✓ / ✗ mark
 * @param {*} cond - truthy counts as a pass, falsy as a failure
 * @param {string} [detail] - appended after ": " on a failure line when truthy
 */
function check(name, cond, detail) {
  if (!cond) {
    fail += 1;
    const suffix = detail ? ": " + detail : "";
    console.log(` ✗ ${name}${suffix}`);
    return;
  }
  pass += 1;
  console.log(` ✓ ${name}`);
}
console.log("--- normalize ---");
// Each row is [check label, input passed to normalize(), expected return value].
for (const [label, raw, expected] of [
  ["trims + lowercases", " Qwen2.5 ", "qwen-2-5"],
  ["strips meta-llama/", "meta-llama/Llama-3.1-70B-Instruct", "llama-3-1-70-b-instruct"],
  ["strips 01-ai/", "01-ai/Yi-34B-200K", "yi-34-b-200-k"],
  ["inst → instruct", "Mistral-7B-Inst-v0.2", "mistral-7-b-instruct-v-0-2"],
  ["dot → dash", "Phi-3.5-mini-instruct", "phi-3-5-mini-instruct"],
  ["empty", "", ""],
]) {
  check(label, normalize(raw) === expected);
}
console.log("\n--- classify ---");
// Threshold table passed as the second argument to classify().
const t = { no_degradation: -0.02, mild: -0.10, moderate: -0.20, severe: -0.30 };
// Each row is [expected verdict, value passed to classify()]; the expected
// verdict doubles as the check label.
for (const [verdict, value] of [
  ["no_data", null],
  ["no_degradation", 0.0],
  ["mild", -0.05],
  ["moderate", -0.15],
  ["severe", -0.25],
  ["extreme", -0.50],
]) {
  check(verdict, classify(value, t) === verdict);
}
console.log("\n--- lookup (RULER hit) ---");
const r1 = await lookup("Llama-3.1-70B-Instruct");
check("ruler_hit code", r1.code === "ruler_hit");
{
  // Read the nested fields once; optional chaining leaves them undefined
  // (so the checks fail rather than throw) when ruler_long_score is absent.
  const avgLc = r1.ruler_long_score?.avg_lc;
  const base = r1.ruler_long_score?.base;

  check("longscore present", typeof avgLc === "number");
  check("verdict assigned", r1.verdict !== null);
  check("base ~96", base > 95 && base < 97, `got base=${base}`);

  // Expected value with an absolute tolerance of 0.001.
  const delta = Math.abs(avgLc - (-0.1024));
  check("Llama-3.1-70B avg_lc ~-0.10", delta < 0.001, `got ${avgLc}`);
}
console.log("\n--- lookup (Jamba — best LongScore) ---");
const r2 = await lookup("Jamba-1.5-Large");
check("ruler_hit", "ruler_hit" === r2.code);
// Passes only when avg_lc is above -0.02; a missing score compares false.
check("Jamba near-zero degradation", -0.02 < r2.ruler_long_score?.avg_lc);

console.log("\n--- lookup (dbrx — severe) ---");
const r3 = await lookup("dbrx");
check("ruler_hit", "ruler_hit" === r3.code);
// Either of the two verdicts is accepted.
check(
  "dbrx severe verdict",
  ["severe", "extreme"].includes(r3.verdict),
  `got verdict=${r3.verdict} for avg_lc=${r3.ruler_long_score?.avg_lc}`,
);

console.log("\n--- lookup (miss) ---");
const r4 = await lookup("nonexistent-model-123");
check("miss code", "miss" === r4.code);
check("normalized id present", "nonexistent-model-123" === r4.normalized_id);
console.log("\n--- rank ---");
const ranking = await rank("worst");
check("ranking returned", Array.isArray(ranking) && ranking.length > 0);
// Optional chaining turns an empty or non-array result into a recorded failure
// (undefined < undefined is false) instead of a TypeError that would abort the
// script before the summary line and exit code below.
const firstAvgLc = ranking?.[0]?.avg_lc;
const lastAvgLc = ranking?.[ranking.length - 1]?.avg_lc;
check("worst is most negative", firstAvgLc < lastAvgLc,
  `got first=${firstAvgLc}, last=${lastAvgLc}`);

// Summary line, then exit non-zero when any check failed.
console.log(`\n${pass} passed, ${fail} failed`);
process.exit(fail > 0 ? 1 : 0);
|