Alyafeai committed on
Commit
7749d9c
·
1 Parent(s): 4f2765c

fix(ui): render structured benchmark details correctly

Browse files
backend/data_loader.py CHANGED
@@ -590,6 +590,30 @@ def _py_scalar(value: Any) -> Any:
590
  return value
591
 
592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  def _json_safe(value: Any) -> Any:
594
  value = _py_scalar(value)
595
  if isinstance(value, dict):
@@ -853,7 +877,11 @@ def _read_detail_parquet(
853
  )
854
  predicted = rec.get("predicted_answer") or output
855
  gold_raw = rec.get("gold_answer")
856
- gold_display = gold_raw if gold_raw not in (None, "") else rec.get("gold_verse_explanations")
 
 
 
 
857
 
858
  is_correct = None
859
  # Only enable binary correct/wrong mode for explicit gold_answer labels.
@@ -920,12 +948,12 @@ def _make_simple_row(
920
  "subtask": subtask,
921
  "question_id": _py_scalar(question_id),
922
  "task_name": _py_scalar(task_name),
923
- "prompt": prompt or "",
924
  "input_prompt": None,
925
- "output": output,
926
  "choices": [],
927
- "gold_answer": _py_scalar(gold_answer),
928
- "predicted_answer": _py_scalar(predicted_answer),
929
  "is_correct": is_correct,
930
  "metric_name": metric_name,
931
  "metric": _to_float_scalar(metric_value),
@@ -949,6 +977,10 @@ def _read_detail_fannorflop_rows(records: List[Any], subtask: str, benchmark_bas
949
  output = rec.get("extracted_response") or rec.get("response")
950
  predicted = rec.get("predicted_answer") or output
951
  gold = rec.get("gold_answer")
 
 
 
 
952
  is_correct = None
953
  # Only enable binary mode when explicit gold_answer exists.
954
  binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
 
590
  return value
591
 
592
 
593
+ def _decode_structured_string(value: Any) -> Any:
594
+ value = _py_scalar(value)
595
+ if not isinstance(value, str):
596
+ return value
597
+
598
+ s = value.strip()
599
+ if not s:
600
+ return value
601
+
602
+ looks_structured = (
603
+ (s.startswith("{") and s.endswith("}")) or
604
+ (s.startswith("[") and s.endswith("]"))
605
+ )
606
+ if not looks_structured:
607
+ return value
608
+
609
+ for parser in (json.loads, ast.literal_eval):
610
+ with contextlib.suppress(Exception):
611
+ parsed = parser(s)
612
+ if isinstance(parsed, (dict, list)):
613
+ return _json_safe(parsed)
614
+ return value
615
+
616
+
617
  def _json_safe(value: Any) -> Any:
618
  value = _py_scalar(value)
619
  if isinstance(value, dict):
 
877
  )
878
  predicted = rec.get("predicted_answer") or output
879
  gold_raw = rec.get("gold_answer")
880
+ gold_display = gold_raw if gold_raw not in (None, "") else (
881
+ rec.get("gold_verse_explanations")
882
+ if rec.get("gold_verse_explanations") not in (None, "")
883
+ else rec.get("verse_explanations")
884
+ )
885
 
886
  is_correct = None
887
  # Only enable binary correct/wrong mode for explicit gold_answer labels.
 
948
  "subtask": subtask,
949
  "question_id": _py_scalar(question_id),
950
  "task_name": _py_scalar(task_name),
951
+ "prompt": _decode_structured_string(prompt or ""),
952
  "input_prompt": None,
953
+ "output": _decode_structured_string(output),
954
  "choices": [],
955
+ "gold_answer": _decode_structured_string(gold_answer),
956
+ "predicted_answer": _decode_structured_string(predicted_answer),
957
  "is_correct": is_correct,
958
  "metric_name": metric_name,
959
  "metric": _to_float_scalar(metric_value),
 
977
  output = rec.get("extracted_response") or rec.get("response")
978
  predicted = rec.get("predicted_answer") or output
979
  gold = rec.get("gold_answer")
980
+ if gold in (None, ""):
981
+ gold = rec.get("gold_verse_explanations")
982
+ if gold in (None, ""):
983
+ gold = rec.get("verse_explanations")
984
  is_correct = None
985
  # Only enable binary mode when explicit gold_answer exists.
986
  binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
frontend/leaderboard.html CHANGED
@@ -1517,7 +1517,7 @@ window.toggleExpandText = function (idBase, btn) {
1517
  ? formatDetailValue(r.prompt, false)
1518
  : asUnknown(r.prompt)
1519
  );
1520
- const output = asUnknown(r.output);
1521
  const gold = formatDetailValue(r.gold_answer, !isCodeEvalBenchmark);
1522
  const predicted = formatDetailValue(r.predicted_answer);
1523
  const sampleMeta = [
 
1517
  ? formatDetailValue(r.prompt, false)
1518
  : asUnknown(r.prompt)
1519
  );
1520
+ const output = formatDetailValue(r.output);
1521
  const gold = formatDetailValue(r.gold_answer, !isCodeEvalBenchmark);
1522
  const predicted = formatDetailValue(r.predicted_answer);
1523
  const sampleMeta = [