// Spaces:
// Running
// Running
// Smoke test for js/longscore.js — verifies normalize, classify, lookup result codes, and rank ordering.
// Run: node scripts/test_longscore.mjs
import { readFileSync } from "fs";
// Mock fetch for Node ESM.
//
// Replaces globalThis.fetch with a file-backed stub so code that fetches
// relative "data/…" URLs can run under Node without an HTTP server.
/**
 * @param {string|URL} url - Resource to "fetch". A string starting with
 *   "data/" is read as "./data/…" (i.e. relative to the current working
 *   directory); any other string is used as a filesystem path unchanged.
 *   A URL object is handed to readFileSync as-is (it accepts file: URLs).
 * @returns {Promise<{ok: boolean, status: number, json: () => Promise<any>, text: () => Promise<string>}>}
 *   `ok: true, status: 200` when the file was read; `ok: false, status: 404`
 *   when it does not exist. `json()` parses lazily, so malformed JSON only
 *   rejects when `json()` is actually called.
 * @throws Re-throws any read error other than ENOENT (e.g. EACCES, EISDIR).
 */
globalThis.fetch = async (url) => {
  // Coerce non-URL inputs to a string so `.startsWith` cannot throw.
  const target = url instanceof URL ? url : String(url);
  const path =
    typeof target === "string" && target.startsWith("data/") ? `./${target}` : target;

  let txt;
  try {
    txt = readFileSync(path, "utf-8");
  } catch (err) {
    if (err?.code !== "ENOENT") throw err;
    // Real fetch resolves (does not reject) for a missing resource, with ok=false.
    return {
      ok: false,
      status: 404,
      json: async () => {
        throw new Error(`mock fetch: no such file: ${String(path)}`, { cause: err });
      },
      text: async () => "",
    };
  }

  return {
    ok: true,
    status: 200,
    json: async () => JSON.parse(txt),
    text: async () => txt,
  };
};
// Dynamic import() rather than a static `import`: static imports are hoisted
// and evaluated before this module's body, i.e. before the fetch mock is
// installed. NOTE(review): presumably longscore.js fetches its data via the
// global fetch — confirm against js/longscore.js.
const { normalize, lookup, classify, rank } = await import("../js/longscore.js");
// Running tallies of passed / failed checks; the summary at the end of the
// script prints them and derives the process exit code from `fail`.
let pass = 0;
let fail = 0;

/**
 * Record one assertion and print a one-line result to stdout.
 *
 * @param {string} name - Label printed after the ✓ / ✗ mark.
 * @param {*} cond - Truthy counts as a pass, falsy as a failure.
 * @param {string} [detail] - Extra context appended as ": detail" on failure
 *   only; skipped when falsy.
 * @returns {void}
 */
function check(name, cond, detail) {
  if (cond) {
    pass++;
    console.log(` ✓ ${name}`);
    return;
  }
  fail++;
  const suffix = detail ? `: ${detail}` : "";
  console.log(` ✗ ${name}${suffix}`);
}
// ---------------------------------------------------------------------------
// Test body. Every assertion goes through check(), which tallies pass/fail;
// the script exits non-zero if any check failed.
// ---------------------------------------------------------------------------

// Strict-equality check that reports expected vs. actual when it fails. On
// success it prints exactly what check(name, actual === expected) would.
const checkEq = (name, actual, expected) =>
  check(
    name,
    actual === expected,
    `expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`,
  );

console.log("--- normalize ---");
checkEq("trims + lowercases", normalize(" Qwen2.5 "), "qwen-2-5");
checkEq("strips meta-llama/", normalize("meta-llama/Llama-3.1-70B-Instruct"), "llama-3-1-70-b-instruct");
checkEq("strips 01-ai/", normalize("01-ai/Yi-34B-200K"), "yi-34-b-200-k");
checkEq("inst → instruct", normalize("Mistral-7B-Inst-v0.2"), "mistral-7-b-instruct-v-0-2");
checkEq("dot → dash", normalize("Phi-3.5-mini-instruct"), "phi-3-5-mini-instruct");
checkEq("empty", normalize(""), "");

console.log("\n--- classify ---");
// Thresholds handed to classify(); each label's cut-off is more negative than the previous one.
const t = { no_degradation: -0.02, mild: -0.10, moderate: -0.20, severe: -0.30 };
checkEq("no_data", classify(null, t), "no_data");
checkEq("no_degradation", classify(0.0, t), "no_degradation");
checkEq("mild", classify(-0.05, t), "mild");
checkEq("moderate", classify(-0.15, t), "moderate");
checkEq("severe", classify(-0.25, t), "severe");
checkEq("extreme", classify(-0.50, t), "extreme");

// In the lookup sections below, results are read through `?.` so that a
// null/undefined result is reported as failed checks instead of throwing
// before the summary is printed.

console.log("\n--- lookup (RULER hit) ---");
const r1 = await lookup("Llama-3.1-70B-Instruct");
const score1 = r1?.ruler_long_score;
checkEq("ruler_hit code", r1?.code, "ruler_hit");
check("longscore present", typeof score1?.avg_lc === "number");
// `!= null` (not `!== null`) so a missing/undefined verdict also fails.
check("verdict assigned", r1?.verdict != null, `got verdict=${r1?.verdict}`);
check("base ~96", score1?.base > 95 && score1?.base < 97,
  `got base=${score1?.base}`);
// Compared with a tolerance because avg_lc is a float.
check("Llama-3.1-70B avg_lc ~-0.10", Math.abs(score1?.avg_lc - (-0.1024)) < 0.001,
  `got ${score1?.avg_lc}`);

console.log("\n--- lookup (Jamba — best LongScore) ---");
const r2 = await lookup("Jamba-1.5-Large");
checkEq("ruler_hit", r2?.code, "ruler_hit");
check("Jamba near-zero degradation", r2?.ruler_long_score?.avg_lc > -0.02,
  `got ${r2?.ruler_long_score?.avg_lc}`);

console.log("\n--- lookup (dbrx — severe) ---");
const r3 = await lookup("dbrx");
checkEq("ruler_hit", r3?.code, "ruler_hit");
check("dbrx severe verdict", r3?.verdict === "severe" || r3?.verdict === "extreme",
  `got verdict=${r3?.verdict} for avg_lc=${r3?.ruler_long_score?.avg_lc}`);

console.log("\n--- lookup (miss) ---");
const r4 = await lookup("nonexistent-model-123");
checkEq("miss code", r4?.code, "miss");
checkEq("normalized id present", r4?.normalized_id, "nonexistent-model-123");

console.log("\n--- rank ---");
const ranking = await rank("worst");
const hasRanking = Array.isArray(ranking) && ranking.length > 0;
check("ranking returned", hasRanking);
// Guarded: indexing an empty or non-array ranking would throw and skip the summary.
const firstLc = hasRanking ? ranking[0]?.avg_lc : undefined;
const lastLc = hasRanking ? ranking[ranking.length - 1]?.avg_lc : undefined;
check("worst is most negative", hasRanking && firstLc < lastLc,
  hasRanking ? `first avg_lc=${firstLc}, last avg_lc=${lastLc}` : "no ranking to compare");

console.log(`\n${pass} passed, ${fail} failed`);
// Non-zero exit code signals failure to the caller (CI, shell).
process.exit(fail > 0 ? 1 : 0);