Alyafeai committed on
Commit
53dfe4f
·
1 Parent(s): 2f314ce

Fix issues with multi-option answers and with samples that don't have a binary score

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. backend/data_loader.py +183 -25
  3. frontend/leaderboard.html +11 -6
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  **/__pycache__/**
2
- .vscode/
 
 
1
  **/__pycache__/**
2
+ .vscode/
3
+ scripts/*
backend/data_loader.py CHANGED
@@ -69,6 +69,39 @@ for task_key, _, display in TASKS:
69
  bases.append(base)
70
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  # -----------------------------------------------------------------------------
73
  # Utilities
74
  # -----------------------------------------------------------------------------
@@ -437,6 +470,91 @@ def _json_safe(value: Any) -> Any:
437
  return value
438
 
439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  def _extract_predicted_answer(model_response: Dict[str, Any], choices: List[Any]) -> Any:
441
  logprobs = model_response.get("logprobs")
442
  if logprobs is not None and choices:
@@ -469,7 +587,12 @@ def _first_non_empty(values: Any) -> Optional[str]:
469
  return None
470
 
471
 
472
- def _read_detail_parquet(path: str, subtask: str) -> List[Dict[str, Any]]:
 
 
 
 
 
473
  try:
474
  df = pd.read_parquet(path)
475
  except Exception as e:
@@ -484,19 +607,14 @@ def _read_detail_parquet(path: str, subtask: str) -> List[Dict[str, Any]]:
484
 
485
  choices = _as_list(doc.get("choices"))
486
  choices = [_py_scalar(c) for c in choices]
487
- gold_idx = doc.get("gold_index")
488
- gold_answer = None
489
- if isinstance(gold_idx, (int, np.integer)) and 0 <= int(gold_idx) < len(choices):
490
- gold_answer = choices[int(gold_idx)]
491
-
492
- metric_value = None
493
- metric_name = None
494
- if isinstance(metric, dict) and metric:
495
- metric_name = next(iter(metric.keys()))
496
- try:
497
- metric_value = float(next(iter(metric.values())))
498
- except Exception:
499
- metric_value = None
500
 
501
  model_response_dict = model_response if isinstance(model_response, dict) else {}
502
  predicted_answer = _extract_predicted_answer(model_response_dict, choices)
@@ -507,8 +625,23 @@ def _read_detail_parquet(path: str, subtask: str) -> List[Dict[str, Any]]:
507
  output_text = str(predicted_answer)
508
 
509
  is_correct = None
510
- if metric_value is not None and metric_value in (0.0, 1.0):
511
  is_correct = bool(metric_value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
 
513
  prompt = (
514
  doc.get("query")
@@ -557,39 +690,64 @@ def load_benchmark_details(
557
  if not benchmark_bases:
558
  benchmark_bases = [benchmark_display]
559
 
560
- selected_entries: List[tuple[str, Dict[str, Any]]] = []
561
  for base in benchmark_bases:
562
  subtasks = model_bucket.get(base, {})
 
563
  if not subtasks:
564
  base_l = base.strip().lower()
565
  for indexed_base, bucket in model_bucket.items():
566
  if indexed_base.strip().lower() == base_l:
 
567
  subtasks = bucket
568
  break
 
 
 
 
 
 
 
 
 
569
  for subtask, info in subtasks.items():
570
- selected_entries.append((subtask, info))
571
 
572
  if not selected_entries:
573
  return {"benchmark": benchmark_display, "subtasks": [], "rows": []}
574
 
575
- selected_entries.sort(key=lambda x: x[0].lower())
576
 
577
  rows_by_subtask: List[List[Dict[str, Any]]] = []
578
  subtasks_summary: List[Dict[str, Any]] = []
579
- for subtask, info in selected_entries:
580
- rows = _read_detail_parquet(info["path"], subtask)
581
  rows_by_subtask.append(rows)
582
 
583
- valid = [r for r in rows if isinstance(r.get("is_correct"), bool)]
584
- correct = sum(1 for r in valid if r["is_correct"])
585
- total = len(valid)
586
- accuracy = round((correct / total) * 100, 2) if total > 0 else None
 
 
 
 
 
 
 
 
 
 
 
 
 
587
  subtasks_summary.append({
588
  "subtask": subtask,
589
  "total": len(rows),
590
- "scored": total,
591
  "correct": correct,
592
  "accuracy": accuracy,
 
593
  })
594
 
595
  total_rows = sum(len(rows) for rows in rows_by_subtask)
 
69
  bases.append(base)
70
 
71
 
72
+ def _extract_base_metric_pairs(task_key: Any, metric_key: Any) -> List[tuple[str, str]]:
73
+ pairs: List[tuple[str, str]] = []
74
+
75
+ if isinstance(task_key, list):
76
+ if isinstance(metric_key, list):
77
+ for tk, mk in zip(task_key, metric_key):
78
+ if isinstance(mk, tuple):
79
+ mk = mk[0]
80
+ pairs.extend(_extract_base_metric_pairs(tk, mk))
81
+ return pairs
82
+
83
+ if not isinstance(task_key, str) or not isinstance(metric_key, str):
84
+ return pairs
85
+
86
+ base = task_key.split(":", 1)[0].split("|", 1)[0].strip()
87
+ if base:
88
+ pairs.append((base, metric_key))
89
+ return pairs
90
+
91
+
92
+ BENCHMARK_BASE_TO_METRICS: Dict[str, List[str]] = {}
93
+ BENCHMARK_DISPLAY_TO_BASE_METRICS: Dict[str, Dict[str, List[str]]] = {}
94
+ for task_key, metric_key, display in TASKS:
95
+ display_bucket = BENCHMARK_DISPLAY_TO_BASE_METRICS.setdefault(display, {})
96
+ for base, metric_name in _extract_base_metric_pairs(task_key, metric_key):
97
+ base_bucket = BENCHMARK_BASE_TO_METRICS.setdefault(base, [])
98
+ if metric_name and metric_name not in base_bucket:
99
+ base_bucket.append(metric_name)
100
+ display_metric_bucket = display_bucket.setdefault(base, [])
101
+ if metric_name and metric_name not in display_metric_bucket:
102
+ display_metric_bucket.append(metric_name)
103
+
104
+
105
  # -----------------------------------------------------------------------------
106
  # Utilities
107
  # -----------------------------------------------------------------------------
 
470
  return value
471
 
472
 
473
+ def _to_float_scalar(value: Any) -> Optional[float]:
474
+ value = _py_scalar(value)
475
+ if isinstance(value, (int, float, np.integer, np.floating)):
476
+ return float(value)
477
+ return None
478
+
479
+
480
+ def _normalize_indices(value: Any) -> List[int]:
481
+ indices: List[int] = []
482
+ for item in _as_list(value):
483
+ item = _py_scalar(item)
484
+ if isinstance(item, (int, np.integer)):
485
+ indices.append(int(item))
486
+ return indices
487
+
488
+
489
+ def _format_answer(values: List[Any]) -> Any:
490
+ if not values:
491
+ return None
492
+ clean = [str(_py_scalar(v)) for v in values]
493
+ if len(clean) == 1:
494
+ return clean[0]
495
+ return ", ".join(clean)
496
+
497
+
498
+ def _norm_answer(value: Any) -> str:
499
+ value = _py_scalar(value)
500
+ if value is None:
501
+ return ""
502
+ return str(value).strip()
503
+
504
+
505
+ def _pick_metric(
506
+ metric: Dict[str, Any],
507
+ benchmark_base: str,
508
+ preferred_metrics: Optional[List[str]] = None,
509
+ ) -> tuple[Optional[str], Optional[float]]:
510
+ if not isinstance(metric, dict) or not metric:
511
+ return None, None
512
+
513
+ preferred = preferred_metrics or BENCHMARK_BASE_TO_METRICS.get(benchmark_base, [])
514
+ for name in preferred:
515
+ if name in metric:
516
+ val = _to_float_scalar(metric.get(name))
517
+ if val is not None:
518
+ return name, val
519
+
520
+ # Fallback for known detail formats.
521
+ for name in ["normalized_score_norm", "BERTScore-F", "acc", "accuracy"]:
522
+ if name in metric:
523
+ val = _to_float_scalar(metric.get(name))
524
+ if val is not None:
525
+ return name, val
526
+
527
+ for name, raw_val in metric.items():
528
+ val = _to_float_scalar(raw_val)
529
+ if val is not None:
530
+ return str(name), val
531
+ return None, None
532
+
533
+
534
+ def _is_binary_metric_name(metric_name: Optional[str]) -> bool:
535
+ if not metric_name:
536
+ return False
537
+ n = metric_name.lower()
538
+ return (
539
+ n.startswith("acc")
540
+ or "accuracy" in n
541
+ or "score_norm" in n
542
+ or n in {"exact_match", "fann_or_flop"}
543
+ )
544
+
545
+
546
+ def _is_choice_metric_name(metric_name: Optional[str]) -> bool:
547
+ if not metric_name:
548
+ return False
549
+ n = metric_name.lower()
550
+ return (
551
+ n.startswith("acc")
552
+ or "mc_prob" in n
553
+ or "score_norm" in n
554
+ or n in {"exact_match", "fann_or_flop"}
555
+ )
556
+
557
+
558
  def _extract_predicted_answer(model_response: Dict[str, Any], choices: List[Any]) -> Any:
559
  logprobs = model_response.get("logprobs")
560
  if logprobs is not None and choices:
 
587
  return None
588
 
589
 
590
+ def _read_detail_parquet(
591
+ path: str,
592
+ subtask: str,
593
+ benchmark_base: str,
594
+ preferred_metrics: Optional[List[str]] = None,
595
+ ) -> List[Dict[str, Any]]:
596
  try:
597
  df = pd.read_parquet(path)
598
  except Exception as e:
 
607
 
608
  choices = _as_list(doc.get("choices"))
609
  choices = [_py_scalar(c) for c in choices]
610
+ gold_indices = _normalize_indices(doc.get("gold_index"))
611
+ gold_values: List[Any] = []
612
+ for idx in gold_indices:
613
+ if 0 <= idx < len(choices):
614
+ gold_values.append(choices[idx])
615
+ gold_answer = _format_answer(gold_values)
616
+
617
+ metric_name, metric_value = _pick_metric(metric, benchmark_base, preferred_metrics)
 
 
 
 
 
618
 
619
  model_response_dict = model_response if isinstance(model_response, dict) else {}
620
  predicted_answer = _extract_predicted_answer(model_response_dict, choices)
 
625
  output_text = str(predicted_answer)
626
 
627
  is_correct = None
628
+ if metric_value is not None and _is_binary_metric_name(metric_name) and metric_value in (0.0, 1.0):
629
  is_correct = bool(metric_value)
630
+ else:
631
+ binary_score = _to_float_scalar(metric.get("normalized_score_norm"))
632
+ if binary_score is not None and binary_score in (0.0, 1.0):
633
+ is_correct = bool(binary_score)
634
+
635
+ # For multi-gold classification (e.g. Mizan), accept prediction if it matches any gold option.
636
+ pred_norm = _norm_answer(predicted_answer)
637
+ choice_norms = {_norm_answer(c) for c in choices if _norm_answer(c)}
638
+ gold_norms = {_norm_answer(g) for g in gold_values if _norm_answer(g)}
639
+ if _is_choice_metric_name(metric_name) and pred_norm and pred_norm in choice_norms and gold_norms:
640
+ is_correct = pred_norm in gold_norms
641
+
642
+ predicted_answer = _py_scalar(predicted_answer)
643
+ if isinstance(predicted_answer, list):
644
+ predicted_answer = _format_answer(predicted_answer)
645
 
646
  prompt = (
647
  doc.get("query")
 
690
  if not benchmark_bases:
691
  benchmark_bases = [benchmark_display]
692
 
693
+ selected_entries: List[tuple[str, str, Dict[str, Any], List[str]]] = []
694
  for base in benchmark_bases:
695
  subtasks = model_bucket.get(base, {})
696
+ selected_base = base
697
  if not subtasks:
698
  base_l = base.strip().lower()
699
  for indexed_base, bucket in model_bucket.items():
700
  if indexed_base.strip().lower() == base_l:
701
+ selected_base = indexed_base
702
  subtasks = bucket
703
  break
704
+ display_metric_bucket = BENCHMARK_DISPLAY_TO_BASE_METRICS.get(benchmark_display, {})
705
+ preferred_metrics = display_metric_bucket.get(selected_base)
706
+ if preferred_metrics is None:
707
+ # Case-insensitive fallback.
708
+ for k, v in display_metric_bucket.items():
709
+ if k.strip().lower() == selected_base.strip().lower():
710
+ preferred_metrics = v
711
+ break
712
+ preferred_metrics = preferred_metrics or BENCHMARK_BASE_TO_METRICS.get(selected_base, [])
713
  for subtask, info in subtasks.items():
714
+ selected_entries.append((selected_base, subtask, info, preferred_metrics))
715
 
716
  if not selected_entries:
717
  return {"benchmark": benchmark_display, "subtasks": [], "rows": []}
718
 
719
+ selected_entries.sort(key=lambda x: x[1].lower())
720
 
721
  rows_by_subtask: List[List[Dict[str, Any]]] = []
722
  subtasks_summary: List[Dict[str, Any]] = []
723
+ for base, subtask, info, preferred_metrics in selected_entries:
724
+ rows = _read_detail_parquet(info["path"], subtask, base, preferred_metrics)
725
  rows_by_subtask.append(rows)
726
 
727
+ scored_rows = [r for r in rows if r.get("metric") is not None]
728
+ metric_name = next((str(r.get("metric_name")) for r in scored_rows if r.get("metric_name")), None)
729
+ use_metric_mode = metric_name is not None and not _is_binary_metric_name(metric_name)
730
+
731
+ if use_metric_mode:
732
+ correct = None
733
+ scored = len(scored_rows)
734
+ avg_metric = (sum(float(r["metric"]) for r in scored_rows) / scored) if scored > 0 else None
735
+ accuracy = round(avg_metric * 100, 2) if avg_metric is not None else None
736
+ summary_mode = "metric"
737
+ else:
738
+ binary_rows = [r for r in rows if isinstance(r.get("is_correct"), bool)]
739
+ correct = sum(1 for r in binary_rows if r["is_correct"])
740
+ scored = len(binary_rows)
741
+ accuracy = round((correct / scored) * 100, 2) if scored > 0 else None
742
+ summary_mode = "binary"
743
+
744
  subtasks_summary.append({
745
  "subtask": subtask,
746
  "total": len(rows),
747
+ "scored": scored,
748
  "correct": correct,
749
  "accuracy": accuracy,
750
+ "mode": summary_mode,
751
  })
752
 
753
  total_rows = sum(len(rows) for rows in rows_by_subtask)
frontend/leaderboard.html CHANGED
@@ -763,16 +763,21 @@
763
  <div class="p-3 rounded-lg border border-slate-200 dark:border-slate-700 bg-slate-50 dark:bg-slate-800/70">
764
  <div class="text-xs text-slate-500 dark:text-slate-400">${escapeHtml(s.subtask)}</div>
765
  <div class="text-sm font-bold text-slate-800 dark:text-slate-100 mt-1">${s.accuracy === null ? "Unknown" : `${s.accuracy}%`}</div>
766
- <div class="text-xs text-slate-500 dark:text-slate-400 mt-0.5">${s.correct}/${s.scored} correct</div>
767
  </div>
768
  `).join("");
769
 
770
  $('#benchmarkRows').innerHTML = rows.map(r => {
771
- const correctBadge = r.is_correct === true
772
- ? `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`
773
- : (r.is_correct === false
774
- ? `<span class="text-rose-600 dark:text-rose-400 font-semibold">Wrong</span>`
775
- : `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`);
 
 
 
 
 
776
  const prompt = escapeHtml(asUnknown(r.prompt));
777
  const output = escapeHtml(asUnknown(r.output));
778
  const sampleMeta = [
 
763
  <div class="p-3 rounded-lg border border-slate-200 dark:border-slate-700 bg-slate-50 dark:bg-slate-800/70">
764
  <div class="text-xs text-slate-500 dark:text-slate-400">${escapeHtml(s.subtask)}</div>
765
  <div class="text-sm font-bold text-slate-800 dark:text-slate-100 mt-1">${s.accuracy === null ? "Unknown" : `${s.accuracy}%`}</div>
766
+ <div class="text-xs text-slate-500 dark:text-slate-400 mt-0.5">${s.mode === "metric" ? `${s.scored} scored` : `${s.correct}/${s.scored} correct`}</div>
767
  </div>
768
  `).join("");
769
 
770
  $('#benchmarkRows').innerHTML = rows.map(r => {
771
+ let correctBadge = `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`;
772
+ if (r.is_correct === true) {
773
+ correctBadge = `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`;
774
+ } else if (r.is_correct === false) {
775
+ correctBadge = `<span class="text-rose-600 dark:text-rose-400 font-semibold">Wrong</span>`;
776
+ } else if (r.metric !== null && r.metric !== undefined) {
777
+ const n = Number(r.metric);
778
+ const scoreText = Number.isFinite(n) ? n.toFixed(4) : escapeHtml(r.metric);
779
+ correctBadge = `<span class="text-sky-600 dark:text-sky-400 font-semibold">Score: ${scoreText}</span>`;
780
+ }
781
  const prompt = escapeHtml(asUnknown(r.prompt));
782
  const output = escapeHtml(asUnknown(r.output));
783
  const sampleMeta = [