Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Fix to show BERT score for each sample
Browse files: backend/data_loader.py (+7 -4)
backend/data_loader.py
CHANGED
|
@@ -848,12 +848,14 @@ def _read_detail_parquet(
|
|
| 848 |
|
| 849 |
)
|
| 850 |
predicted = rec.get("predicted_answer") or output
|
| 851 |
-
|
|
|
|
| 852 |
|
| 853 |
is_correct = None
|
| 854 |
-
|
|
|
|
| 855 |
if binary_mode:
|
| 856 |
-
gold_norm = _norm_answer(
|
| 857 |
pred_norm = _norm_answer(predicted)
|
| 858 |
if gold_norm and pred_norm:
|
| 859 |
is_correct = (gold_norm == pred_norm)
|
|
@@ -866,7 +868,7 @@ def _read_detail_parquet(
|
|
| 866 |
task_name=benchmark_base,
|
| 867 |
prompt=rec.get("prompt"),
|
| 868 |
output=output,
|
| 869 |
-
gold_answer=
|
| 870 |
predicted_answer=predicted,
|
| 871 |
metric_name=metric_key,
|
| 872 |
metric_value=metric_value,
|
|
@@ -944,6 +946,7 @@ def _read_detail_fannorflop_rows(records: List[Any], subtask: str, benchmark_bas
|
|
| 944 |
predicted = rec.get("predicted_answer") or output
|
| 945 |
gold = rec.get("gold_answer")
|
| 946 |
is_correct = None
|
|
|
|
| 947 |
binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
|
| 948 |
if binary_mode and gold not in (None, "") and predicted not in (None, ""):
|
| 949 |
is_correct = (_norm_answer(gold) == _norm_answer(predicted))
|
|
|
|
| 848 |
|
| 849 |
)
|
| 850 |
predicted = rec.get("predicted_answer") or output
|
| 851 |
+
gold_raw = rec.get("gold_answer")
|
| 852 |
+
gold_display = gold_raw if gold_raw not in (None, "") else rec.get("gold_verse_explanations")
|
| 853 |
|
| 854 |
is_correct = None
|
| 855 |
+
# Only enable binary correct/wrong mode for explicit gold_answer labels.
|
| 856 |
+
binary_mode = _is_primitive_answer(gold_raw) and _is_primitive_answer(predicted)
|
| 857 |
if binary_mode:
|
| 858 |
+
gold_norm = _norm_answer(gold_raw)
|
| 859 |
pred_norm = _norm_answer(predicted)
|
| 860 |
if gold_norm and pred_norm:
|
| 861 |
is_correct = (gold_norm == pred_norm)
|
|
|
|
| 868 |
task_name=benchmark_base,
|
| 869 |
prompt=rec.get("prompt"),
|
| 870 |
output=output,
|
| 871 |
+
gold_answer=gold_display,
|
| 872 |
predicted_answer=predicted,
|
| 873 |
metric_name=metric_key,
|
| 874 |
metric_value=metric_value,
|
|
|
|
| 946 |
predicted = rec.get("predicted_answer") or output
|
| 947 |
gold = rec.get("gold_answer")
|
| 948 |
is_correct = None
|
| 949 |
+
# Only enable binary mode when explicit gold_answer exists.
|
| 950 |
binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
|
| 951 |
if binary_mode and gold not in (None, "") and predicted not in (None, ""):
|
| 952 |
is_correct = (_norm_answer(gold) == _norm_answer(predicted))
|