Alyafeai committed on
Commit
8cbc289
·
1 Parent(s): 5af9331

Fix to show the BERTScore for each sample

Browse files
Files changed (1) hide show
  1. backend/data_loader.py +7 -4
backend/data_loader.py CHANGED
@@ -848,12 +848,14 @@ def _read_detail_parquet(
848
 
849
  )
850
  predicted = rec.get("predicted_answer") or output
851
- gold = rec.get("gold_answer") or rec.get("gold_verse_explanations")
 
852
 
853
  is_correct = None
854
- binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
 
855
  if binary_mode:
856
- gold_norm = _norm_answer(gold)
857
  pred_norm = _norm_answer(predicted)
858
  if gold_norm and pred_norm:
859
  is_correct = (gold_norm == pred_norm)
@@ -866,7 +868,7 @@ def _read_detail_parquet(
866
  task_name=benchmark_base,
867
  prompt=rec.get("prompt"),
868
  output=output,
869
- gold_answer=gold,
870
  predicted_answer=predicted,
871
  metric_name=metric_key,
872
  metric_value=metric_value,
@@ -944,6 +946,7 @@ def _read_detail_fannorflop_rows(records: List[Any], subtask: str, benchmark_bas
944
  predicted = rec.get("predicted_answer") or output
945
  gold = rec.get("gold_answer")
946
  is_correct = None
 
947
  binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
948
  if binary_mode and gold not in (None, "") and predicted not in (None, ""):
949
  is_correct = (_norm_answer(gold) == _norm_answer(predicted))
 
848
 
849
  )
850
  predicted = rec.get("predicted_answer") or output
851
+ gold_raw = rec.get("gold_answer")
852
+ gold_display = gold_raw if gold_raw not in (None, "") else rec.get("gold_verse_explanations")
853
 
854
  is_correct = None
855
+ # Only enable binary correct/wrong mode for explicit gold_answer labels.
856
+ binary_mode = _is_primitive_answer(gold_raw) and _is_primitive_answer(predicted)
857
  if binary_mode:
858
+ gold_norm = _norm_answer(gold_raw)
859
  pred_norm = _norm_answer(predicted)
860
  if gold_norm and pred_norm:
861
  is_correct = (gold_norm == pred_norm)
 
868
  task_name=benchmark_base,
869
  prompt=rec.get("prompt"),
870
  output=output,
871
+ gold_answer=gold_display,
872
  predicted_answer=predicted,
873
  metric_name=metric_key,
874
  metric_value=metric_value,
 
946
  predicted = rec.get("predicted_answer") or output
947
  gold = rec.get("gold_answer")
948
  is_correct = None
949
+ # Only enable binary mode when explicit gold_answer exists.
950
  binary_mode = _is_primitive_answer(gold) and _is_primitive_answer(predicted)
951
  if binary_mode and gold not in (None, "") and predicted not in (None, ""):
952
  is_correct = (_norm_answer(gold) == _norm_answer(predicted))