Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
fix(ui): render structured benchmark details correctly
Browse files
- backend/data_loader.py +37 -5
- frontend/leaderboard.html +1 -1
backend/data_loader.py
CHANGED
|
@@ -590,6 +590,30 @@ def _py_scalar(value: Any) -> Any:
|
|
| 590 |
return value
|
| 591 |
|
| 592 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
def _json_safe(value: Any) -> Any:
|
| 594 |
value = _py_scalar(value)
|
| 595 |
if isinstance(value, dict):
|
|
@@ -853,7 +877,11 @@ def _read_detail_parquet(
|
|
| 853 |
)
|
| 854 |
predicted = rec.get("predicted_answer") or output
|
| 855 |
gold_raw = rec.get("gold_answer")
|
| 856 |
-
gold_display = gold_raw if gold_raw not in (None, "") else
|
|
|
|
|
|
|
|
|
|
|
|
|
| 857 |
|
| 858 |
is_correct = None
|
| 859 |
# Only enable binary correct/wrong mode for explicit gold_answer labels.
|
|
@@ -920,12 +948,12 @@ def _make_simple_row(
|
|
| 920 |
"subtask": subtask,
|
| 921 |
"question_id": _py_scalar(question_id),
|
| 922 |
"task_name": _py_scalar(task_name),
|
| 923 |
-
"prompt": prompt or "",
|
| 924 |
"input_prompt": None,
|
| 925 |
-
"output": output,
|
| 926 |
"choices": [],
|
| 927 |
-
"gold_answer":
|
| 928 |
-
"predicted_answer":
|
| 929 |
"is_correct": is_correct,
|
| 930 |
"metric_name": metric_name,
|
| 931 |
"metric": _to_float_scalar(metric_value),
|
|
@@ -949,6 +977,10 @@ def _read_detail_fannorflop_rows(records: List[Any], subtask: str, benchmark_bas
|
|
| 949 |
output = rec.get("extracted_response") or rec.get("response")
|
| 950 |
predicted = rec.get("predicted_answer") or output
|
| 951 |
gold = rec.get("gold_answer")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
is_correct = None
|
| 953 |
# Only enable binary mode when explicit gold_answer exists.
|
| 954 |
binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
|
|
|
|
| 590 |
return value
|
| 591 |
|
| 592 |
|
| 593 |
+
def _decode_structured_string(value: Any) -> Any:
    """Decode a string that holds a serialized dict or list.

    The input is first normalised with ``_py_scalar``. The normalised value
    is returned unchanged when it is not a string, is blank after stripping,
    or is not wrapped in a matching ``{...}`` / ``[...]`` pair.

    Otherwise the stripped text is handed to ``json.loads`` and then, as a
    fallback, to ``ast.literal_eval``. The first parser that yields a dict
    or list wins, and its result is returned after ``_json_safe``.

    Decoding is best effort: any exception raised while parsing or
    converting is swallowed, and if no parser succeeds the normalised
    original value is returned.
    """
    value = _py_scalar(value)
    if not isinstance(value, str):
        return value

    text = value.strip()
    # Only text wrapped in a matching brace or bracket pair is worth parsing.
    if not text or (text[0], text[-1]) not in (("{", "}"), ("[", "]")):
        return value

    # JSON first; the Python-literal parser covers repr()-style payloads
    # (single quotes, None/True/False) that JSON rejects.
    for parse in (json.loads, ast.literal_eval):
        try:
            decoded = parse(text)
            if isinstance(decoded, (dict, list)):
                return _json_safe(decoded)
        except Exception:
            # Deliberate best effort: try the next parser, else fall through.
            continue
    return value
|
| 615 |
+
|
| 616 |
+
|
| 617 |
def _json_safe(value: Any) -> Any:
|
| 618 |
value = _py_scalar(value)
|
| 619 |
if isinstance(value, dict):
|
|
|
|
| 877 |
)
|
| 878 |
predicted = rec.get("predicted_answer") or output
|
| 879 |
gold_raw = rec.get("gold_answer")
|
| 880 |
+
gold_display = gold_raw if gold_raw not in (None, "") else (
|
| 881 |
+
rec.get("gold_verse_explanations")
|
| 882 |
+
if rec.get("gold_verse_explanations") not in (None, "")
|
| 883 |
+
else rec.get("verse_explanations")
|
| 884 |
+
)
|
| 885 |
|
| 886 |
is_correct = None
|
| 887 |
# Only enable binary correct/wrong mode for explicit gold_answer labels.
|
|
|
|
| 948 |
"subtask": subtask,
|
| 949 |
"question_id": _py_scalar(question_id),
|
| 950 |
"task_name": _py_scalar(task_name),
|
| 951 |
+
"prompt": _decode_structured_string(prompt or ""),
|
| 952 |
"input_prompt": None,
|
| 953 |
+
"output": _decode_structured_string(output),
|
| 954 |
"choices": [],
|
| 955 |
+
"gold_answer": _decode_structured_string(gold_answer),
|
| 956 |
+
"predicted_answer": _decode_structured_string(predicted_answer),
|
| 957 |
"is_correct": is_correct,
|
| 958 |
"metric_name": metric_name,
|
| 959 |
"metric": _to_float_scalar(metric_value),
|
|
|
|
| 977 |
output = rec.get("extracted_response") or rec.get("response")
|
| 978 |
predicted = rec.get("predicted_answer") or output
|
| 979 |
gold = rec.get("gold_answer")
|
| 980 |
+
if gold in (None, ""):
|
| 981 |
+
gold = rec.get("gold_verse_explanations")
|
| 982 |
+
if gold in (None, ""):
|
| 983 |
+
gold = rec.get("verse_explanations")
|
| 984 |
is_correct = None
|
| 985 |
# Only enable binary mode when explicit gold_answer exists.
|
| 986 |
binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
|
frontend/leaderboard.html
CHANGED
|
@@ -1517,7 +1517,7 @@ window.toggleExpandText = function (idBase, btn) {
|
|
| 1517 |
? formatDetailValue(r.prompt, false)
|
| 1518 |
: asUnknown(r.prompt)
|
| 1519 |
);
|
| 1520 |
-
const output =
|
| 1521 |
const gold = formatDetailValue(r.gold_answer, !isCodeEvalBenchmark);
|
| 1522 |
const predicted = formatDetailValue(r.predicted_answer);
|
| 1523 |
const sampleMeta = [
|
|
|
|
| 1517 |
? formatDetailValue(r.prompt, false)
|
| 1518 |
: asUnknown(r.prompt)
|
| 1519 |
);
|
| 1520 |
+
const output = formatDetailValue(r.output);
|
| 1521 |
const gold = formatDetailValue(r.gold_answer, !isCodeEvalBenchmark);
|
| 1522 |
const predicted = formatDetailValue(r.predicted_answer);
|
| 1523 |
const sampleMeta = [
|