Alyafeai committed on
Commit
129be1e
·
1 Parent(s): b50cc9d

Add Expand and Collapse button for long samples in the detail section

Browse files
Files changed (2) hide show
  1. backend/data_loader.py +17 -5
  2. frontend/leaderboard.html +39 -7
backend/data_loader.py CHANGED
@@ -593,6 +593,11 @@ def _norm_answer(value: Any) -> str:
593
  return str(value).strip()
594
 
595
 
 
 
 
 
 
596
  def _pick_metric(
597
  metric: Dict[str, Any],
598
  benchmark_base: str,
@@ -810,10 +815,14 @@ def _read_detail_parquet(
810
  gold = rec.get("gold_answer") or rec.get("gold_verse_explanations")
811
 
812
  is_correct = None
813
- gold_norm = _norm_answer(gold)
814
- pred_norm = _norm_answer(predicted)
815
- if gold_norm and pred_norm:
816
- is_correct = (gold_norm == pred_norm)
 
 
 
 
817
 
818
  rows.append(_make_simple_row(
819
  subtask=subtask,
@@ -899,8 +908,11 @@ def _read_detail_fannorflop_rows(records: List[Any], subtask: str, benchmark_bas
899
  predicted = rec.get("predicted_answer") or output
900
  gold = rec.get("gold_answer")
901
  is_correct = None
902
- if gold not in (None, "") and predicted not in (None, ""):
 
903
  is_correct = (_norm_answer(gold) == _norm_answer(predicted))
 
 
904
  rows.append(_make_simple_row(
905
  subtask=subtask,
906
  question_id=rec.get("id"),
 
593
  return str(value).strip()
594
 
595
 
596
+ def _is_primitive_answer(value: Any) -> bool:
597
+ value = _py_scalar(value)
598
+ return value is not None and isinstance(value, (str, int, float, bool, np.integer, np.floating))
599
+
600
+
601
  def _pick_metric(
602
  metric: Dict[str, Any],
603
  benchmark_base: str,
 
815
  gold = rec.get("gold_answer") or rec.get("gold_verse_explanations")
816
 
817
  is_correct = None
818
+ binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
819
+ if binary_mode:
820
+ gold_norm = _norm_answer(gold)
821
+ pred_norm = _norm_answer(predicted)
822
+ if gold_norm and pred_norm:
823
+ is_correct = (gold_norm == pred_norm)
824
+ metric_key = "fannorflop"
825
+ metric_value = 1.0 if is_correct else 0.0
826
 
827
  rows.append(_make_simple_row(
828
  subtask=subtask,
 
908
  predicted = rec.get("predicted_answer") or output
909
  gold = rec.get("gold_answer")
910
  is_correct = None
911
+ binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
912
+ if binary_mode and gold not in (None, "") and predicted not in (None, ""):
913
  is_correct = (_norm_answer(gold) == _norm_answer(predicted))
914
+ metric_key = "fannorflop"
915
+ metric_value = 1.0 if is_correct else 0.0
916
  rows.append(_make_simple_row(
917
  subtask=subtask,
918
  question_id=rec.get("id"),
frontend/leaderboard.html CHANGED
@@ -640,6 +640,22 @@
640
  .replace(/>/g, "&gt;")
641
  .replace(/\"/g, "&quot;")
642
  .replace(/'/g, "&#039;");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
 
644
  let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
645
  let currentSort = { colId: null, dir: 'none' };
@@ -760,6 +776,16 @@
760
  applyFilters();
761
  }
762
 
 
 
 
 
 
 
 
 
 
 
763
  // --- BENCHMARK DETAILS MODAL ---
764
  window.openBenchmarkDetails = async function (modelName, benchmark) {
765
  $('#benchmarkModalTitle').innerText = `${benchmark} Details — ${modelName}`;
@@ -815,11 +841,12 @@
815
  `).join("");
816
 
817
  const isCodeEvalBenchmark = benchmark === "HumanEval+" || benchmark === "MBPP+";
818
- $('#benchmarkRows').innerHTML = rows.map(r => {
819
  const promptColSpan = showSubtaskColumn ? "col-span-6" : "col-span-8";
820
  const subtaskCell = showSubtaskColumn
821
  ? `<div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(r.subtask)}</div>`
822
  : ``;
 
823
  let correctBadge = `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`;
824
  if (r.is_correct === true) {
825
  correctBadge = `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`;
@@ -830,12 +857,14 @@
830
  const scoreText = Number.isFinite(n) ? n.toFixed(4) : escapeHtml(r.metric);
831
  correctBadge = `<span class="text-sky-600 dark:text-sky-400 font-semibold">Score: ${scoreText}</span>`;
832
  }
833
- const prompt = escapeHtml(
834
  isCodeEvalBenchmark
835
  ? formatDetailValue(r.prompt, false)
836
  : asUnknown(r.prompt)
837
  );
838
- const output = escapeHtml(asUnknown(r.output));
 
 
839
  const sampleMeta = [
840
  r.question_id ? `id: ${escapeHtml(r.question_id)}` : null,
841
  r.metric !== null && r.metric !== undefined ? `score: ${escapeHtml(r.metric)}` : null,
@@ -845,13 +874,16 @@
845
  <div class="grid grid-cols-12 gap-3 px-4 py-3 text-xs">
846
  ${subtaskCell}
847
  <div class="${promptColSpan} text-slate-700 dark:text-slate-200">
848
- <div class="font-semibold text-slate-800 dark:text-slate-100 whitespace-pre-wrap">${prompt}</div>
849
- <div class="mt-2 text-slate-500 dark:text-slate-400 whitespace-pre-wrap"><span class="font-semibold">Output:</span> ${output}</div>
 
 
 
850
  ${sampleMeta ? `<div class="mt-1 text-slate-400 dark:text-slate-500">${sampleMeta}</div>` : ``}
851
  <div class="mt-1">${correctBadge}</div>
852
  </div>
853
- <div class="col-span-2 text-slate-600 dark:text-slate-300 whitespace-pre-wrap">${escapeHtml(formatDetailValue(r.gold_answer, !isCodeEvalBenchmark))}</div>
854
- <div class="col-span-2 text-slate-600 dark:text-slate-300 whitespace-pre-wrap">${escapeHtml(formatDetailValue(r.predicted_answer))}</div>
855
  </div>
856
  `;
857
  }).join("");
 
640
  .replace(/>/g, "&gt;")
641
  .replace(/\"/g, "&quot;")
642
  .replace(/'/g, "&#039;");
643
+ const DETAIL_CELL_LIMIT = 260;
644
+ const expandableTextHtml = (value, idBase, limit = DETAIL_CELL_LIMIT) => {
645
+ const raw = String(value ?? "");
646
+ if (!raw) return "";
647
+ if (raw.length <= limit) {
648
+ return `<div class="whitespace-pre-wrap break-words">${escapeHtml(raw)}</div>`;
649
+ }
650
+ const short = `${raw.slice(0, limit).trimEnd()}...`;
651
+ return `
652
+ <div>
653
+ <div id="${idBase}-short" class="whitespace-pre-wrap break-words">${escapeHtml(short)}</div>
654
+ <div id="${idBase}-full" class="hidden whitespace-pre-wrap break-words">${escapeHtml(raw)}</div>
655
+ <button type="button" onclick="window.toggleExpandText('${idBase}', this)" class="mt-2 inline-flex items-center gap-1 px-2 py-1 rounded-md border border-indigo-300 dark:border-indigo-600 bg-indigo-50 dark:bg-indigo-900/30 text-xs font-bold text-indigo-700 dark:text-indigo-300 hover:bg-indigo-100 dark:hover:bg-indigo-900/50 transition-colors">Expand</button>
656
+ </div>
657
+ `;
658
+ };
659
 
660
  let lbData = [], grid, maxMeta = 100, minMeta = 0, tableColumns = [];
661
  let currentSort = { colId: null, dir: 'none' };
 
776
  applyFilters();
777
  }
778
 
779
+ window.toggleExpandText = function (idBase, btn) {
780
+ const shortEl = document.getElementById(`${idBase}-short`);
781
+ const fullEl = document.getElementById(`${idBase}-full`);
782
+ if (!shortEl || !fullEl || !btn) return;
783
+ const expanding = fullEl.classList.contains('hidden');
784
+ shortEl.classList.toggle('hidden', expanding);
785
+ fullEl.classList.toggle('hidden', !expanding);
786
+ btn.innerText = expanding ? 'Collapse' : 'Expand';
787
+ };
788
+
789
  // --- BENCHMARK DETAILS MODAL ---
790
  window.openBenchmarkDetails = async function (modelName, benchmark) {
791
  $('#benchmarkModalTitle').innerText = `${benchmark} Details — ${modelName}`;
 
841
  `).join("");
842
 
843
  const isCodeEvalBenchmark = benchmark === "HumanEval+" || benchmark === "MBPP+";
844
+ $('#benchmarkRows').innerHTML = rows.map((r, idx) => {
845
  const promptColSpan = showSubtaskColumn ? "col-span-6" : "col-span-8";
846
  const subtaskCell = showSubtaskColumn
847
  ? `<div class="col-span-2 text-slate-600 dark:text-slate-300">${escapeHtml(r.subtask)}</div>`
848
  : ``;
849
+ const rowId = `bd-row-${idx}`;
850
  let correctBadge = `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`;
851
  if (r.is_correct === true) {
852
  correctBadge = `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`;
 
857
  const scoreText = Number.isFinite(n) ? n.toFixed(4) : escapeHtml(r.metric);
858
  correctBadge = `<span class="text-sky-600 dark:text-sky-400 font-semibold">Score: ${scoreText}</span>`;
859
  }
860
+ const prompt = (
861
  isCodeEvalBenchmark
862
  ? formatDetailValue(r.prompt, false)
863
  : asUnknown(r.prompt)
864
  );
865
+ const output = asUnknown(r.output);
866
+ const gold = formatDetailValue(r.gold_answer, !isCodeEvalBenchmark);
867
+ const predicted = formatDetailValue(r.predicted_answer);
868
  const sampleMeta = [
869
  r.question_id ? `id: ${escapeHtml(r.question_id)}` : null,
870
  r.metric !== null && r.metric !== undefined ? `score: ${escapeHtml(r.metric)}` : null,
 
874
  <div class="grid grid-cols-12 gap-3 px-4 py-3 text-xs">
875
  ${subtaskCell}
876
  <div class="${promptColSpan} text-slate-700 dark:text-slate-200">
877
+ <div class="font-semibold text-slate-800 dark:text-slate-100">${expandableTextHtml(prompt, `${rowId}-prompt`)}</div>
878
+ <div class="mt-2 text-slate-500 dark:text-slate-400">
879
+ <span class="font-semibold">Output:</span>
880
+ ${expandableTextHtml(output, `${rowId}-output`)}
881
+ </div>
882
  ${sampleMeta ? `<div class="mt-1 text-slate-400 dark:text-slate-500">${sampleMeta}</div>` : ``}
883
  <div class="mt-1">${correctBadge}</div>
884
  </div>
885
+ <div class="col-span-2 text-slate-600 dark:text-slate-300">${expandableTextHtml(gold, `${rowId}-gold`)}</div>
886
+ <div class="col-span-2 text-slate-600 dark:text-slate-300">${expandableTextHtml(predicted, `${rowId}-pred`)}</div>
887
  </div>
888
  `;
889
  }).join("");