Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Fix issues with multi-option answers and with samples that don't have a binary score
Browse files- .gitignore +2 -1
- backend/data_loader.py +183 -25
- frontend/leaderboard.html +11 -6
.gitignore
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
**/__pycache__/**
|
| 2 |
-
.vscode/
|
|
|
|
|
|
| 1 |
**/__pycache__/**
|
| 2 |
+
.vscode/
|
| 3 |
+
scripts/*
|
backend/data_loader.py
CHANGED
|
@@ -69,6 +69,39 @@ for task_key, _, display in TASKS:
|
|
| 69 |
bases.append(base)
|
| 70 |
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
# -----------------------------------------------------------------------------
|
| 73 |
# Utilities
|
| 74 |
# -----------------------------------------------------------------------------
|
|
@@ -437,6 +470,91 @@ def _json_safe(value: Any) -> Any:
|
|
| 437 |
return value
|
| 438 |
|
| 439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
def _extract_predicted_answer(model_response: Dict[str, Any], choices: List[Any]) -> Any:
|
| 441 |
logprobs = model_response.get("logprobs")
|
| 442 |
if logprobs is not None and choices:
|
|
@@ -469,7 +587,12 @@ def _first_non_empty(values: Any) -> Optional[str]:
|
|
| 469 |
return None
|
| 470 |
|
| 471 |
|
| 472 |
-
def _read_detail_parquet(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
try:
|
| 474 |
df = pd.read_parquet(path)
|
| 475 |
except Exception as e:
|
|
@@ -484,19 +607,14 @@ def _read_detail_parquet(path: str, subtask: str) -> List[Dict[str, Any]]:
|
|
| 484 |
|
| 485 |
choices = _as_list(doc.get("choices"))
|
| 486 |
choices = [_py_scalar(c) for c in choices]
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
metric_name = next(iter(metric.keys()))
|
| 496 |
-
try:
|
| 497 |
-
metric_value = float(next(iter(metric.values())))
|
| 498 |
-
except Exception:
|
| 499 |
-
metric_value = None
|
| 500 |
|
| 501 |
model_response_dict = model_response if isinstance(model_response, dict) else {}
|
| 502 |
predicted_answer = _extract_predicted_answer(model_response_dict, choices)
|
|
@@ -507,8 +625,23 @@ def _read_detail_parquet(path: str, subtask: str) -> List[Dict[str, Any]]:
|
|
| 507 |
output_text = str(predicted_answer)
|
| 508 |
|
| 509 |
is_correct = None
|
| 510 |
-
if metric_value is not None and metric_value in (0.0, 1.0):
|
| 511 |
is_correct = bool(metric_value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
|
| 513 |
prompt = (
|
| 514 |
doc.get("query")
|
|
@@ -557,39 +690,64 @@ def load_benchmark_details(
|
|
| 557 |
if not benchmark_bases:
|
| 558 |
benchmark_bases = [benchmark_display]
|
| 559 |
|
| 560 |
-
selected_entries: List[tuple[str, Dict[str, Any]]] = []
|
| 561 |
for base in benchmark_bases:
|
| 562 |
subtasks = model_bucket.get(base, {})
|
|
|
|
| 563 |
if not subtasks:
|
| 564 |
base_l = base.strip().lower()
|
| 565 |
for indexed_base, bucket in model_bucket.items():
|
| 566 |
if indexed_base.strip().lower() == base_l:
|
|
|
|
| 567 |
subtasks = bucket
|
| 568 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
for subtask, info in subtasks.items():
|
| 570 |
-
selected_entries.append((subtask, info))
|
| 571 |
|
| 572 |
if not selected_entries:
|
| 573 |
return {"benchmark": benchmark_display, "subtasks": [], "rows": []}
|
| 574 |
|
| 575 |
-
selected_entries.sort(key=lambda x: x[
|
| 576 |
|
| 577 |
rows_by_subtask: List[List[Dict[str, Any]]] = []
|
| 578 |
subtasks_summary: List[Dict[str, Any]] = []
|
| 579 |
-
for subtask, info in selected_entries:
|
| 580 |
-
rows = _read_detail_parquet(info["path"], subtask)
|
| 581 |
rows_by_subtask.append(rows)
|
| 582 |
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
subtasks_summary.append({
|
| 588 |
"subtask": subtask,
|
| 589 |
"total": len(rows),
|
| 590 |
-
"scored":
|
| 591 |
"correct": correct,
|
| 592 |
"accuracy": accuracy,
|
|
|
|
| 593 |
})
|
| 594 |
|
| 595 |
total_rows = sum(len(rows) for rows in rows_by_subtask)
|
|
|
|
| 69 |
bases.append(base)
|
| 70 |
|
| 71 |
|
| 72 |
+
def _extract_base_metric_pairs(task_key: Any, metric_key: Any) -> List[tuple[str, str]]:
|
| 73 |
+
pairs: List[tuple[str, str]] = []
|
| 74 |
+
|
| 75 |
+
if isinstance(task_key, list):
|
| 76 |
+
if isinstance(metric_key, list):
|
| 77 |
+
for tk, mk in zip(task_key, metric_key):
|
| 78 |
+
if isinstance(mk, tuple):
|
| 79 |
+
mk = mk[0]
|
| 80 |
+
pairs.extend(_extract_base_metric_pairs(tk, mk))
|
| 81 |
+
return pairs
|
| 82 |
+
|
| 83 |
+
if not isinstance(task_key, str) or not isinstance(metric_key, str):
|
| 84 |
+
return pairs
|
| 85 |
+
|
| 86 |
+
base = task_key.split(":", 1)[0].split("|", 1)[0].strip()
|
| 87 |
+
if base:
|
| 88 |
+
pairs.append((base, metric_key))
|
| 89 |
+
return pairs
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# Metric names registered for each benchmark base, in first-seen order.
BENCHMARK_BASE_TO_METRICS: Dict[str, List[str]] = {}
# Same registry, keyed first by display name and then by benchmark base.
BENCHMARK_DISPLAY_TO_BASE_METRICS: Dict[str, Dict[str, List[str]]] = {}
for task_key, metric_key, display in TASKS:
    display_bucket = BENCHMARK_DISPLAY_TO_BASE_METRICS.setdefault(display, {})
    for base, metric_name in _extract_base_metric_pairs(task_key, metric_key):
        # Both buckets are created even when the metric name is empty, so the
        # base is always present as a key.
        base_bucket = BENCHMARK_BASE_TO_METRICS.setdefault(base, [])
        display_metric_bucket = display_bucket.setdefault(base, [])
        if not metric_name:
            continue
        # Lists (not sets) keep insertion order; membership checks de-duplicate.
        if metric_name not in base_bucket:
            base_bucket.append(metric_name)
        if metric_name not in display_metric_bucket:
            display_metric_bucket.append(metric_name)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
# -----------------------------------------------------------------------------
|
| 106 |
# Utilities
|
| 107 |
# -----------------------------------------------------------------------------
|
|
|
|
| 470 |
return value
|
| 471 |
|
| 472 |
|
| 473 |
+
def _to_float_scalar(value: Any) -> Optional[float]:
    """Convert a numeric scalar to ``float``; return ``None`` for anything else.

    The value is first passed through ``_py_scalar`` (defined elsewhere in
    this module; presumably unwraps NumPy scalars — verify against its
    definition). Python ints/floats and NumPy integer/floating scalars are
    accepted. Note that ``bool`` is a subclass of ``int`` and is therefore
    converted too.
    """
    scalar = _py_scalar(value)
    numeric_types = (int, float, np.integer, np.floating)
    return float(scalar) if isinstance(scalar, numeric_types) else None
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def _normalize_indices(value: Any) -> List[int]:
    """Return the integer entries of ``value`` as a list of Python ints.

    ``value`` is expanded with ``_as_list`` and each entry is passed through
    ``_py_scalar`` (both defined elsewhere in this module). Entries that are
    not Python or NumPy integers are dropped silently; order is preserved.
    """
    scalars = (_py_scalar(entry) for entry in _as_list(value))
    return [int(s) for s in scalars if isinstance(s, (int, np.integer))]
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
def _format_answer(values: List[Any]) -> Any:
    """Render a list of answer values as one display string.

    Returns ``None`` for an empty input. Otherwise each value is passed
    through ``_py_scalar`` (defined elsewhere in this module), stringified,
    and the results are joined with ``", "``.
    """
    if not values:
        return None
    rendered = [str(_py_scalar(item)) for item in values]
    # Joining a single-element list yields that element unchanged, so the
    # one-answer case needs no special handling.
    return ", ".join(rendered)
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
def _norm_answer(value: Any) -> str:
    """Normalize an answer for comparison: stringify and strip whitespace.

    The value is first passed through ``_py_scalar`` (defined elsewhere in
    this module). ``None`` maps to the empty string.
    """
    scalar = _py_scalar(value)
    return "" if scalar is None else str(scalar).strip()
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
def _pick_metric(
    metric: Dict[str, Any],
    benchmark_base: str,
    preferred_metrics: Optional[List[str]] = None,
) -> tuple[Optional[str], Optional[float]]:
    """Choose the metric entry that represents a sample's score.

    Candidates are tried in this order, and the first one that is present
    in ``metric`` and converts to a float via ``_to_float_scalar`` wins:

    1. ``preferred_metrics`` if non-empty, otherwise the names registered
       for ``benchmark_base`` in ``BENCHMARK_BASE_TO_METRICS``;
    2. a fixed list of known detail-format metric names;
    3. every entry of ``metric`` in its own iteration order.

    Returns:
        ``(metric_name, metric_value)``, or ``(None, None)`` when ``metric``
        is not a non-empty dict or holds no numeric value.
    """
    if not (isinstance(metric, dict) and metric):
        return None, None

    configured = preferred_metrics or BENCHMARK_BASE_TO_METRICS.get(benchmark_base, [])
    # Fallback for known detail formats.
    known_fallbacks = ["normalized_score_norm", "BERTScore-F", "acc", "accuracy"]

    for candidate in [*configured, *known_fallbacks]:
        if candidate not in metric:
            continue
        number = _to_float_scalar(metric.get(candidate))
        if number is not None:
            return candidate, number

    # Last resort: take the first entry that holds a numeric value.
    for key, raw in metric.items():
        number = _to_float_scalar(raw)
        if number is not None:
            return str(key), number
    return None, None
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
def _is_binary_metric_name(metric_name: Optional[str]) -> bool:
|
| 535 |
+
if not metric_name:
|
| 536 |
+
return False
|
| 537 |
+
n = metric_name.lower()
|
| 538 |
+
return (
|
| 539 |
+
n.startswith("acc")
|
| 540 |
+
or "accuracy" in n
|
| 541 |
+
or "score_norm" in n
|
| 542 |
+
or n in {"exact_match", "fann_or_flop"}
|
| 543 |
+
)
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
def _is_choice_metric_name(metric_name: Optional[str]) -> bool:
|
| 547 |
+
if not metric_name:
|
| 548 |
+
return False
|
| 549 |
+
n = metric_name.lower()
|
| 550 |
+
return (
|
| 551 |
+
n.startswith("acc")
|
| 552 |
+
or "mc_prob" in n
|
| 553 |
+
or "score_norm" in n
|
| 554 |
+
or n in {"exact_match", "fann_or_flop"}
|
| 555 |
+
)
|
| 556 |
+
|
| 557 |
+
|
| 558 |
def _extract_predicted_answer(model_response: Dict[str, Any], choices: List[Any]) -> Any:
|
| 559 |
logprobs = model_response.get("logprobs")
|
| 560 |
if logprobs is not None and choices:
|
|
|
|
| 587 |
return None
|
| 588 |
|
| 589 |
|
| 590 |
+
def _read_detail_parquet(
|
| 591 |
+
path: str,
|
| 592 |
+
subtask: str,
|
| 593 |
+
benchmark_base: str,
|
| 594 |
+
preferred_metrics: Optional[List[str]] = None,
|
| 595 |
+
) -> List[Dict[str, Any]]:
|
| 596 |
try:
|
| 597 |
df = pd.read_parquet(path)
|
| 598 |
except Exception as e:
|
|
|
|
| 607 |
|
| 608 |
choices = _as_list(doc.get("choices"))
|
| 609 |
choices = [_py_scalar(c) for c in choices]
|
| 610 |
+
gold_indices = _normalize_indices(doc.get("gold_index"))
|
| 611 |
+
gold_values: List[Any] = []
|
| 612 |
+
for idx in gold_indices:
|
| 613 |
+
if 0 <= idx < len(choices):
|
| 614 |
+
gold_values.append(choices[idx])
|
| 615 |
+
gold_answer = _format_answer(gold_values)
|
| 616 |
+
|
| 617 |
+
metric_name, metric_value = _pick_metric(metric, benchmark_base, preferred_metrics)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
|
| 619 |
model_response_dict = model_response if isinstance(model_response, dict) else {}
|
| 620 |
predicted_answer = _extract_predicted_answer(model_response_dict, choices)
|
|
|
|
| 625 |
output_text = str(predicted_answer)
|
| 626 |
|
| 627 |
is_correct = None
|
| 628 |
+
if metric_value is not None and _is_binary_metric_name(metric_name) and metric_value in (0.0, 1.0):
|
| 629 |
is_correct = bool(metric_value)
|
| 630 |
+
else:
|
| 631 |
+
binary_score = _to_float_scalar(metric.get("normalized_score_norm"))
|
| 632 |
+
if binary_score is not None and binary_score in (0.0, 1.0):
|
| 633 |
+
is_correct = bool(binary_score)
|
| 634 |
+
|
| 635 |
+
# For multi-gold classification (e.g. Mizan), accept prediction if it matches any gold option.
|
| 636 |
+
pred_norm = _norm_answer(predicted_answer)
|
| 637 |
+
choice_norms = {_norm_answer(c) for c in choices if _norm_answer(c)}
|
| 638 |
+
gold_norms = {_norm_answer(g) for g in gold_values if _norm_answer(g)}
|
| 639 |
+
if _is_choice_metric_name(metric_name) and pred_norm and pred_norm in choice_norms and gold_norms:
|
| 640 |
+
is_correct = pred_norm in gold_norms
|
| 641 |
+
|
| 642 |
+
predicted_answer = _py_scalar(predicted_answer)
|
| 643 |
+
if isinstance(predicted_answer, list):
|
| 644 |
+
predicted_answer = _format_answer(predicted_answer)
|
| 645 |
|
| 646 |
prompt = (
|
| 647 |
doc.get("query")
|
|
|
|
| 690 |
if not benchmark_bases:
|
| 691 |
benchmark_bases = [benchmark_display]
|
| 692 |
|
| 693 |
+
selected_entries: List[tuple[str, str, Dict[str, Any], List[str]]] = []
|
| 694 |
for base in benchmark_bases:
|
| 695 |
subtasks = model_bucket.get(base, {})
|
| 696 |
+
selected_base = base
|
| 697 |
if not subtasks:
|
| 698 |
base_l = base.strip().lower()
|
| 699 |
for indexed_base, bucket in model_bucket.items():
|
| 700 |
if indexed_base.strip().lower() == base_l:
|
| 701 |
+
selected_base = indexed_base
|
| 702 |
subtasks = bucket
|
| 703 |
break
|
| 704 |
+
display_metric_bucket = BENCHMARK_DISPLAY_TO_BASE_METRICS.get(benchmark_display, {})
|
| 705 |
+
preferred_metrics = display_metric_bucket.get(selected_base)
|
| 706 |
+
if preferred_metrics is None:
|
| 707 |
+
# Case-insensitive fallback.
|
| 708 |
+
for k, v in display_metric_bucket.items():
|
| 709 |
+
if k.strip().lower() == selected_base.strip().lower():
|
| 710 |
+
preferred_metrics = v
|
| 711 |
+
break
|
| 712 |
+
preferred_metrics = preferred_metrics or BENCHMARK_BASE_TO_METRICS.get(selected_base, [])
|
| 713 |
for subtask, info in subtasks.items():
|
| 714 |
+
selected_entries.append((selected_base, subtask, info, preferred_metrics))
|
| 715 |
|
| 716 |
if not selected_entries:
|
| 717 |
return {"benchmark": benchmark_display, "subtasks": [], "rows": []}
|
| 718 |
|
| 719 |
+
selected_entries.sort(key=lambda x: x[1].lower())
|
| 720 |
|
| 721 |
rows_by_subtask: List[List[Dict[str, Any]]] = []
|
| 722 |
subtasks_summary: List[Dict[str, Any]] = []
|
| 723 |
+
for base, subtask, info, preferred_metrics in selected_entries:
|
| 724 |
+
rows = _read_detail_parquet(info["path"], subtask, base, preferred_metrics)
|
| 725 |
rows_by_subtask.append(rows)
|
| 726 |
|
| 727 |
+
scored_rows = [r for r in rows if r.get("metric") is not None]
|
| 728 |
+
metric_name = next((str(r.get("metric_name")) for r in scored_rows if r.get("metric_name")), None)
|
| 729 |
+
use_metric_mode = metric_name is not None and not _is_binary_metric_name(metric_name)
|
| 730 |
+
|
| 731 |
+
if use_metric_mode:
|
| 732 |
+
correct = None
|
| 733 |
+
scored = len(scored_rows)
|
| 734 |
+
avg_metric = (sum(float(r["metric"]) for r in scored_rows) / scored) if scored > 0 else None
|
| 735 |
+
accuracy = round(avg_metric * 100, 2) if avg_metric is not None else None
|
| 736 |
+
summary_mode = "metric"
|
| 737 |
+
else:
|
| 738 |
+
binary_rows = [r for r in rows if isinstance(r.get("is_correct"), bool)]
|
| 739 |
+
correct = sum(1 for r in binary_rows if r["is_correct"])
|
| 740 |
+
scored = len(binary_rows)
|
| 741 |
+
accuracy = round((correct / scored) * 100, 2) if scored > 0 else None
|
| 742 |
+
summary_mode = "binary"
|
| 743 |
+
|
| 744 |
subtasks_summary.append({
|
| 745 |
"subtask": subtask,
|
| 746 |
"total": len(rows),
|
| 747 |
+
"scored": scored,
|
| 748 |
"correct": correct,
|
| 749 |
"accuracy": accuracy,
|
| 750 |
+
"mode": summary_mode,
|
| 751 |
})
|
| 752 |
|
| 753 |
total_rows = sum(len(rows) for rows in rows_by_subtask)
|
frontend/leaderboard.html
CHANGED
|
@@ -763,16 +763,21 @@
|
|
| 763 |
<div class="p-3 rounded-lg border border-slate-200 dark:border-slate-700 bg-slate-50 dark:bg-slate-800/70">
|
| 764 |
<div class="text-xs text-slate-500 dark:text-slate-400">${escapeHtml(s.subtask)}</div>
|
| 765 |
<div class="text-sm font-bold text-slate-800 dark:text-slate-100 mt-1">${s.accuracy === null ? "Unknown" : `${s.accuracy}%`}</div>
|
| 766 |
-
<div class="text-xs text-slate-500 dark:text-slate-400 mt-0.5">${s.correct}/${s.scored} correct</div>
|
| 767 |
</div>
|
| 768 |
`).join("");
|
| 769 |
|
| 770 |
$('#benchmarkRows').innerHTML = rows.map(r => {
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
const prompt = escapeHtml(asUnknown(r.prompt));
|
| 777 |
const output = escapeHtml(asUnknown(r.output));
|
| 778 |
const sampleMeta = [
|
|
|
|
| 763 |
<div class="p-3 rounded-lg border border-slate-200 dark:border-slate-700 bg-slate-50 dark:bg-slate-800/70">
|
| 764 |
<div class="text-xs text-slate-500 dark:text-slate-400">${escapeHtml(s.subtask)}</div>
|
| 765 |
<div class="text-sm font-bold text-slate-800 dark:text-slate-100 mt-1">${s.accuracy === null ? "Unknown" : `${s.accuracy}%`}</div>
|
| 766 |
+
<div class="text-xs text-slate-500 dark:text-slate-400 mt-0.5">${s.mode === "metric" ? `${s.scored} scored` : `${s.correct}/${s.scored} correct`}</div>
|
| 767 |
</div>
|
| 768 |
`).join("");
|
| 769 |
|
| 770 |
$('#benchmarkRows').innerHTML = rows.map(r => {
|
| 771 |
+
let correctBadge = `<span class="text-slate-500 dark:text-slate-400 font-semibold">Unknown</span>`;
|
| 772 |
+
if (r.is_correct === true) {
|
| 773 |
+
correctBadge = `<span class="text-emerald-600 dark:text-emerald-400 font-semibold">Correct</span>`;
|
| 774 |
+
} else if (r.is_correct === false) {
|
| 775 |
+
correctBadge = `<span class="text-rose-600 dark:text-rose-400 font-semibold">Wrong</span>`;
|
| 776 |
+
} else if (r.metric !== null && r.metric !== undefined) {
|
| 777 |
+
const n = Number(r.metric);
|
| 778 |
+
const scoreText = Number.isFinite(n) ? n.toFixed(4) : escapeHtml(r.metric);
|
| 779 |
+
correctBadge = `<span class="text-sky-600 dark:text-sky-400 font-semibold">Score: ${scoreText}</span>`;
|
| 780 |
+
}
|
| 781 |
const prompt = escapeHtml(asUnknown(r.prompt));
|
| 782 |
const output = escapeHtml(asUnknown(r.output));
|
| 783 |
const sampleMeta = [
|