Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Add Expand and Collapse button for long samples in the detail section
Browse files
- backend/data_loader.py +17 -5
- frontend/leaderboard.html +39 -7
backend/data_loader.py
CHANGED
|
@@ -593,6 +593,11 @@ def _norm_answer(value: Any) -> str:
|
|
| 593 |
return str(value).strip()
|
| 594 |
|
| 595 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
def _pick_metric(
|
| 597 |
metric: Dict[str, Any],
|
| 598 |
benchmark_base: str,
|
|
@@ -810,10 +815,14 @@ def _read_detail_parquet(
|
|
| 810 |
gold = rec.get("gold_answer") or rec.get("gold_verse_explanations")
|
| 811 |
|
| 812 |
is_correct = None
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
|
| 818 |
rows.append(_make_simple_row(
|
| 819 |
subtask=subtask,
|
|
@@ -899,8 +908,11 @@ def _read_detail_fannorflop_rows(records: List[Any], subtask: str, benchmark_bas
|
|
| 899 |
predicted = rec.get("predicted_answer") or output
|
| 900 |
gold = rec.get("gold_answer")
|
| 901 |
is_correct = None
|
| 902 |
-
|
|
|
|
| 903 |
is_correct = (_norm_answer(gold) == _norm_answer(predicted))
|
|
|
|
|
|
|
| 904 |
rows.append(_make_simple_row(
|
| 905 |
subtask=subtask,
|
| 906 |
question_id=rec.get("id"),
|
|
|
|
| 593 |
return str(value).strip()
|
| 594 |
|
| 595 |
|
| 596 |
+
def _is_primitive_answer(value: Any) -> bool:
    """Report whether *value* is a scalar answer that can be compared directly.

    The value is first passed through ``_py_scalar``; the result counts as
    primitive when it is not ``None`` and is a string, a Python number or
    boolean, or a NumPy integer/floating scalar.
    """
    scalar = _py_scalar(value)
    if scalar is None:
        return False
    primitive_types = (str, int, float, bool, np.integer, np.floating)
    return isinstance(scalar, primitive_types)
|
| 599 |
+
|
| 600 |
+
|
| 601 |
def _pick_metric(
|
| 602 |
metric: Dict[str, Any],
|
| 603 |
benchmark_base: str,
|
|
|
|
| 815 |
gold = rec.get("gold_answer") or rec.get("gold_verse_explanations")
|
| 816 |
|
| 817 |
is_correct = None
|
| 818 |
+
binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
|
| 819 |
+
if binary_mode:
|
| 820 |
+
gold_norm = _norm_answer(gold)
|
| 821 |
+
pred_norm = _norm_answer(predicted)
|
| 822 |
+
if gold_norm and pred_norm:
|
| 823 |
+
is_correct = (gold_norm == pred_norm)
|
| 824 |
+
metric_key = "fannorflop"
|
| 825 |
+
metric_value = 1.0 if is_correct else 0.0
|
| 826 |
|
| 827 |
rows.append(_make_simple_row(
|
| 828 |
subtask=subtask,
|
|
|
|
| 908 |
predicted = rec.get("predicted_answer") or output
|
| 909 |
gold = rec.get("gold_answer")
|
| 910 |
is_correct = None
|
| 911 |
+
binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
|
| 912 |
+
if binary_mode and gold not in (None, "") and predicted not in (None, ""):
|
| 913 |
is_correct = (_norm_answer(gold) == _norm_answer(predicted))
|
| 914 |
+
metric_key = "fannorflop"
|
| 915 |
+
metric_value = 1.0 if is_correct else 0.0
|
| 916 |
rows.append(_make_simple_row(
|
| 917 |
subtask=subtask,
|
| 918 |
question_id=rec.get("id"),
|
frontend/leaderboard.html
CHANGED
|
@@ -640,6 +640,22 @@
|
|
| 640 |
.replace(/>/g, "&gt;")
|
| 641 |
.replace(/\"/g, "&quot;")
|
| 642 |
.replace(/'/g, "&#39;");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
|
| 645 |
let currentSort = { colId: null, dir: 'none' };
|
|
@@ -760,6 +776,16 @@
|
|
| 760 |
applyFilters();
|
| 761 |
}
|
| 762 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 763 |
// --- BENCHMARK DETAILS MODAL ---
|
| 764 |
window.openBenchmarkDetails = async function (modelName, benchmark) {
|
| 765 |
$('#benchmarkModalTitle').innerText = `${benchmark} Details — ${modelName}`;
|
|
@@ -815,11 +841,12 @@
|
|
| 815 |
`).join("");
|
| 816 |
|
| 817 |
const isCodeEvalBenchmark = benchmark === "HumanEval+" || benchmark === "MBPP+";
|
| 818 |
-
$('#benchmarkRows').innerHTML = rows.map(r => {
|
| 819 |
const promptColSpan = showSubtaskColumn ? "col-span-6" : "col-span-8";
|
| 820 |
const subtaskCell = showSubtaskColumn
|
| 821 |
? `<div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(r.subtask)}</div>`
|
| 822 |
: ``;
|
|
|
|
| 823 |
let correctBadge = `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`;
|
| 824 |
if (r.is_correct === true) {
|
| 825 |
correctBadge = `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`;
|
|
@@ -830,12 +857,14 @@
|
|
| 830 |
const scoreText = Number.isFinite(n) ? n.toFixed(4) : escapeHtml(r.metric);
|
| 831 |
correctBadge = `<span class="text-sky-600 dark:text-sky-400 font-semibold">Score: ${scoreText}</span>`;
|
| 832 |
}
|
| 833 |
-
const prompt =
|
| 834 |
isCodeEvalBenchmark
|
| 835 |
? formatDetailValue(r.prompt, false)
|
| 836 |
: asUnknown(r.prompt)
|
| 837 |
);
|
| 838 |
-
const output =
|
|
|
|
|
|
|
| 839 |
const sampleMeta = [
|
| 840 |
r.question_id ? `id: ${escapeHtml(r.question_id)}` : null,
|
| 841 |
r.metric !== null && r.metric !== undefined ? `score: ${escapeHtml(r.metric)}` : null,
|
|
@@ -845,13 +874,16 @@
|
|
| 845 |
<div class="grid grid-cols-12 gap-3 px-4 py-3 text-xs">
|
| 846 |
${subtaskCell}
|
| 847 |
<div class="${promptColSpan} text-slate-700 dark:text-slate-200">
|
| 848 |
-
<div class="font-semibold text-slate-800 dark:text-slate-100
|
| 849 |
-
<div class="mt-2 text-slate-500 dark:text-slate-400
|
|
|
|
|
|
|
|
|
|
| 850 |
${sampleMeta ? `<div class="mt-1 text-slate-400 dark:text-slate-500">${sampleMeta}</div>` : ``}
|
| 851 |
<div class="mt-1">${correctBadge}</div>
|
| 852 |
</div>
|
| 853 |
-
<div class="col-span-2 text-slate-600 dark:text-slate-300
|
| 854 |
-
<div class="col-span-2 text-slate-600 dark:text-slate-300
|
| 855 |
</div>
|
| 856 |
`;
|
| 857 |
}).join("");
|
|
|
|
| 640 |
.replace(/>/g, "&gt;")
|
| 641 |
.replace(/\"/g, "&quot;")
|
| 642 |
.replace(/'/g, "&#39;");
|
| 643 |
+
// Maximum number of UTF-16 code units shown before a detail cell is truncated.
const DETAIL_CELL_LIMIT = 260;

/**
 * Render a value as HTML-escaped text, hiding long values behind an
 * Expand/Collapse button.
 *
 * Values no longer than `limit` are returned as a single wrapped <div>.
 * Longer values produce a truncated "short" element, a hidden "full"
 * element and a button that calls window.toggleExpandText(idBase, button).
 *
 * @param {*} value - Value to display; null/undefined are treated as "".
 * @param {string} idBase - Prefix for the generated `${idBase}-short` and
 *   `${idBase}-full` element ids. It is inserted unescaped into the markup
 *   and the inline handler, so it must be a trusted, generated identifier.
 * @param {number} [limit=DETAIL_CELL_LIMIT] - Length threshold, measured in
 *   UTF-16 code units (String.prototype.length).
 * @returns {string} HTML markup, or "" when the value is empty.
 */
const expandableTextHtml = (value, idBase, limit = DETAIL_CELL_LIMIT) => {
  const raw = String(value ?? "");
  if (!raw) return "";
  if (raw.length <= limit) {
    return `<div class="whitespace-pre-wrap break-words">${escapeHtml(raw)}</div>`;
  }

  // Never cut between the two halves of a surrogate pair: a dangling high
  // surrogate would render as U+FFFD right before the ellipsis.
  let cut = limit;
  const lastUnit = raw.charCodeAt(cut - 1);
  if (lastUnit >= 0xD800 && lastUnit <= 0xDBFF) {
    cut -= 1;
  }

  const short = `${raw.slice(0, cut).trimEnd()}...`;
  return `
    <div>
      <div id="${idBase}-short" class="whitespace-pre-wrap break-words">${escapeHtml(short)}</div>
      <div id="${idBase}-full" class="hidden whitespace-pre-wrap break-words">${escapeHtml(raw)}</div>
      <button type="button" onclick="window.toggleExpandText('${idBase}', this)" class="mt-2 inline-flex items-center gap-1 px-2 py-1 rounded-md border border-indigo-300 dark:border-indigo-600 bg-indigo-50 dark:bg-indigo-900/30 text-xs font-bold text-indigo-700 dark:text-indigo-300 hover:bg-indigo-100 dark:hover:bg-indigo-900/50 transition-colors">Expand</button>
    </div>
  `;
};
|
| 659 |
|
| 660 |
let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
|
| 661 |
let currentSort = { colId: null, dir: 'none' };
|
|
|
|
| 776 |
applyFilters();
|
| 777 |
}
|
| 778 |
|
| 779 |
+
/**
 * Switch a truncated detail cell between its short and full text.
 *
 * Looks up the `${idBase}-short` and `${idBase}-full` elements, shows one
 * and hides the other via the `hidden` class, and relabels the button.
 * Does nothing when either element or the button is missing.
 *
 * @param {string} idBase - Id prefix shared by the short/full elements.
 * @param {HTMLElement} btn - The button that triggered the toggle.
 */
window.toggleExpandText = function (idBase, btn) {
  const shortEl = document.getElementById(`${idBase}-short`);
  const fullEl = document.getElementById(`${idBase}-full`);
  if (!(shortEl && fullEl && btn)) {
    return;
  }

  // The full text being hidden means the next state is "expanded".
  const expanding = fullEl.classList.contains('hidden');
  if (expanding) {
    shortEl.classList.add('hidden');
    fullEl.classList.remove('hidden');
    btn.innerText = 'Collapse';
  } else {
    shortEl.classList.remove('hidden');
    fullEl.classList.add('hidden');
    btn.innerText = 'Expand';
  }
};
|
| 788 |
+
|
| 789 |
// --- BENCHMARK DETAILS MODAL ---
|
| 790 |
window.openBenchmarkDetails = async function (modelName, benchmark) {
|
| 791 |
$('#benchmarkModalTitle').innerText = `${benchmark} Details — ${modelName}`;
|
|
|
|
| 841 |
`).join("");
|
| 842 |
|
| 843 |
const isCodeEvalBenchmark = benchmark === "HumanEval+" || benchmark === "MBPP+";
|
| 844 |
+
$('#benchmarkRows').innerHTML = rows.map((r, idx) => {
|
| 845 |
const promptColSpan = showSubtaskColumn ? "col-span-6" : "col-span-8";
|
| 846 |
const subtaskCell = showSubtaskColumn
|
| 847 |
? `<div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(r.subtask)}</div>`
|
| 848 |
: ``;
|
| 849 |
+
const rowId = `bd-row-${idx}`;
|
| 850 |
let correctBadge = `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`;
|
| 851 |
if (r.is_correct === true) {
|
| 852 |
correctBadge = `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`;
|
|
|
|
| 857 |
const scoreText = Number.isFinite(n) ? n.toFixed(4) : escapeHtml(r.metric);
|
| 858 |
correctBadge = `<span class="text-sky-600 dark:text-sky-400 font-semibold">Score: ${scoreText}</span>`;
|
| 859 |
}
|
| 860 |
+
const prompt = (
|
| 861 |
isCodeEvalBenchmark
|
| 862 |
? formatDetailValue(r.prompt, false)
|
| 863 |
: asUnknown(r.prompt)
|
| 864 |
);
|
| 865 |
+
const output = asUnknown(r.output);
|
| 866 |
+
const gold = formatDetailValue(r.gold_answer, !isCodeEvalBenchmark);
|
| 867 |
+
const predicted = formatDetailValue(r.predicted_answer);
|
| 868 |
const sampleMeta = [
|
| 869 |
r.question_id ? `id: ${escapeHtml(r.question_id)}` : null,
|
| 870 |
r.metric !== null && r.metric !== undefined ? `score: ${escapeHtml(r.metric)}` : null,
|
|
|
|
| 874 |
<div class="grid grid-cols-12 gap-3 px-4 py-3 text-xs">
|
| 875 |
${subtaskCell}
|
| 876 |
<div class="${promptColSpan} text-slate-700 dark:text-slate-200">
|
| 877 |
+
<div class="font-semibold text-slate-800 dark:text-slate-100">${expandableTextHtml(prompt, `${rowId}-prompt`)}</div>
|
| 878 |
+
<div class="mt-2 text-slate-500 dark:text-slate-400">
|
| 879 |
+
<span class="font-semibold">Output:</span>
|
| 880 |
+
${expandableTextHtml(output, `${rowId}-output`)}
|
| 881 |
+
</div>
|
| 882 |
${sampleMeta ? `<div class="mt-1 text-slate-400 dark:text-slate-500">${sampleMeta}</div>` : ``}
|
| 883 |
<div class="mt-1">${correctBadge}</div>
|
| 884 |
</div>
|
| 885 |
+
<div class="col-span-2 text-slate-600 dark:text-slate-300">${expandableTextHtml(gold, `${rowId}-gold`)}</div>
|
| 886 |
+
<div class="col-span-2 text-slate-600 dark:text-slate-300">${expandableTextHtml(predicted, `${rowId}-pred`)}</div>
|
| 887 |
</div>
|
| 888 |
`;
|
| 889 |
}).join("");
|