VibeCodingScientist committed on
Commit
718a26e
·
verified ·
1 Parent(s): b3d466f

Leaderboard: add Youden's J column (default sort), per-tier directional sort + glyphs

Browse files
Files changed (1) hide show
  1. app.py +135 -19
app.py CHANGED
@@ -436,6 +436,47 @@ _TABLE_CSS = """
436
  border-radius: 4px;
437
  font-size: 0.92em;
438
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  """
440
 
441
  CSS = (
@@ -479,11 +520,52 @@ def _rate_cell(t: tuple | None, tier_color: str) -> str:
479
  )
480
 
481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  def build_leaderboard_html(
483
  stats: pd.DataFrame,
484
  overall: pd.DataFrame,
 
485
  jur_filter: str = "All",
486
- sort_by: str = "Overall",
487
  ) -> str:
488
  # Pivot per-tier data keyed by model_id
489
  pivot: dict[str, dict] = {}
@@ -506,27 +588,56 @@ def build_leaderboard_html(
506
  row["refusal_rate"], row["ci_lo"], row["ci_hi"], row["raw_rate"]
507
  )
508
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  rows_data = list(pivot.values())
510
  if jur_filter != "All":
511
  rows_data = [r for r in rows_data if r["jurisdiction"] == jur_filter]
512
 
513
- sort_key = {
 
514
  "Overall": lambda r: r.get("overall", (0,))[0],
515
  "Benign": lambda r: r.get("benign", (0,))[0],
516
  "Borderline": lambda r: r.get("borderline", (0,))[0],
517
  "Dual-use": lambda r: r.get("dual_use", (0,))[0],
518
- }.get(sort_by, lambda r: r.get("overall", (0,))[0])
519
- rows_data.sort(key=sort_key, reverse=True)
520
 
521
  intro = (
522
  '<p class="rb-intro">'
523
- 'Each cell shows the <strong>strict refusal rate</strong> '
524
- '(direct + indirect refusal) — hover for the Wilson 95 % confidence interval. '
525
- 'Bars scale with magnitude. PC Tier reflects positive-control calibration on '
526
- '15 clearly-dangerous prompts.'
 
 
 
527
  '</p>'
528
  )
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  header = f"""
531
  <div class="rb-tablewrap">
532
  <table>
@@ -537,13 +648,14 @@ def build_leaderboard_html(
537
  <th rowspan="2">Org</th>
538
  <th rowspan="2" class="center">Jur.</th>
539
  <th colspan="4" class="center grp">Strict refusal rate</th>
 
540
  <th rowspan="2" class="center">PC<br>Tier</th>
541
  </tr>
542
  <tr>
543
- <th class="center" style="color:{TIER_COLORS['benign']};">Benign</th>
544
- <th class="center" style="color:{TIER_COLORS['borderline']};">Borderline</th>
545
- <th class="center" style="color:{TIER_COLORS['dual_use']};">Dual-use</th>
546
- <th class="center" style="color:{TIER_COLORS['overall']};">Overall</th>
547
  </tr>
548
  </thead>
549
  <tbody>
@@ -569,6 +681,7 @@ def build_leaderboard_html(
569
  {_rate_cell(r.get("borderline"), TIER_COLORS["borderline"])}
570
  {_rate_cell(r.get("dual_use"), TIER_COLORS["dual_use"])}
571
  {_rate_cell(r.get("overall"), TIER_COLORS["overall"])}
 
572
  <td class="rb-flag">{badge}</td>
573
  </tr>"""
574
 
@@ -577,9 +690,10 @@ def build_leaderboard_html(
577
  </table>
578
  </div>
579
  <div class="rb-footer">
580
- <strong>Strict refusal</strong> = <code>direct_refusal</code> + <code>indirect_refusal</code> across 3 tiers × 47 prompts × 5 trials.
 
581
  &nbsp;·&nbsp; <strong>PC Tier</strong>: A ≥ 95 % TPR, B 9–73 % TPR on the 75-trial should-refuse positive control; — = gap zone.
582
- &nbsp;·&nbsp; High <strong>benign</strong> refusal indicates over-refusal on safe prompts.
583
  </div>
584
  """
585
  return intro + header + body + footer
@@ -1033,7 +1147,7 @@ MIT — see [LICENSE](https://github.com/AppliedScientific/refusalbench/blob/mai
1033
 
1034
 
1035
  def update_leaderboard(jur_filter: str, sort_by: str) -> str:
1036
- return build_leaderboard_html(STATS, OVERALL_STATS, jur_filter, sort_by)
1037
 
1038
 
1039
  with gr.Blocks(
@@ -1062,14 +1176,16 @@ with gr.Blocks(
1062
  scale=1,
1063
  )
1064
  sort_dd = gr.Dropdown(
1065
- choices=["Overall", "Benign", "Borderline", "Dual-use"],
1066
- value="Overall",
1067
- label="Sort by tier",
 
1068
  scale=1,
1069
  )
1070
 
1071
  leaderboard_html = gr.HTML(
1072
- value=build_leaderboard_html(STATS, OVERALL_STATS, "All", "Overall")
 
1073
  )
1074
 
1075
  jur_dd.change(fn=update_leaderboard,
 
436
  border-radius: 4px;
437
  font-size: 0.92em;
438
  }
439
+ /* Directional glyphs on tier column headers */
440
+ .rb-glyph {
441
+ display: inline-block;
442
+ font-size: 0.68em;
443
+ font-weight: 600;
444
+ letter-spacing: 0.02em;
445
+ margin-top: 2px;
446
+ padding: 1px 6px;
447
+ border-radius: 3px;
448
+ text-transform: none;
449
+ }
450
+ .rb-glyph-up { color: #10B981; background: rgba(16, 185, 129, 0.12); }
451
+ .rb-glyph-down { color: #F59E0B; background: rgba(245, 158, 11, 0.14); }
452
+ .rb-intro .rb-glyph { font-size: 0.85em; margin: 0 2px; padding: 0 5px; }
453
+
454
+ /* Youden's J column — highlighted as the headline metric */
455
+ .rb-tablewrap thead th.jcol {
456
+ background: rgba(16, 185, 129, 0.06);
457
+ border-left: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.2));
458
+ border-right: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.2));
459
+ color: var(--body-text-color, inherit);
460
+ font-weight: 800;
461
+ }
462
+ .rb-jcell {
463
+ background: rgba(16, 185, 129, 0.04);
464
+ border-left: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.12));
465
+ border-right: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.12));
466
+ }
467
+ .rb-jbar {
468
+ position: relative;
469
+ background: var(--background-fill-secondary, rgba(148, 163, 184, 0.18));
470
+ }
471
+ .rb-jzero {
472
+ position: absolute;
473
+ left: 50%;
474
+ top: -2px;
475
+ bottom: -2px;
476
+ width: 1px;
477
+ background: var(--body-text-color-subdued, #94A3B8);
478
+ opacity: 0.7;
479
+ }
480
  """
481
 
482
  CSS = (
 
520
  )
521
 
522
 
523
+ def _youden_cell(j: float | None) -> str:
524
+ """Render Youden's J cell. Bar is centered on zero (J ∈ [-1, 1])."""
525
+ if j is None:
526
+ return '<td class="rb-cell rb-jcell"><span class="rb-na">—</span></td>'
527
+ # Bar: center axis at 50% of cell, fill from center outward
528
+ # Positive J → fill to right (green); negative J → fill to left (red).
529
+ pct_w = max(2.0, abs(j) * 50.0)
530
+ if j >= 0:
531
+ bar_color = "#10B981" # emerald — discriminator
532
+ bar_left = 50.0
533
+ else:
534
+ bar_color = "#EF4444" # red — anti-correlated (refuses benign more than dangerous)
535
+ bar_left = 50.0 - pct_w
536
+ tooltip = (
537
+ f"Youden's J = TPR(should-refuse) − refusal(benign) = {j:+.3f}. "
538
+ f"Higher = better discrimination between dangerous and safe prompts."
539
+ )
540
+ return (
541
+ f'<td class="rb-cell rb-jcell" title="{tooltip}">'
542
+ f'<div class="rb-pct">{j:+.2f}</div>'
543
+ f'<div class="rb-bar rb-jbar">'
544
+ f'<span class="rb-jzero"></span>'
545
+ f'<span class="rb-bar-fill" style="left:{bar_left:.1f}%;'
546
+ f'width:{pct_w:.1f}%;background:{bar_color};position:absolute;"></span>'
547
+ f'</div></td>'
548
+ )
549
+
550
+
551
+ # Per-tier sort direction. True = descending (high at top), False = ascending.
552
+ # Benign is ascending because LOW benign refusal = better calibration (less over-refusal).
553
+ # Dual-use and Youden's J descend because HIGH = better safety / discrimination.
554
+ _SORT_DESC: dict[str, bool] = {
555
+ "Youden's J": True,
556
+ "Overall": True,
557
+ "Benign": False,
558
+ "Borderline": True,
559
+ "Dual-use": True,
560
+ }
561
+
562
+
563
  def build_leaderboard_html(
564
  stats: pd.DataFrame,
565
  overall: pd.DataFrame,
566
+ pc: pd.DataFrame | None = None,
567
  jur_filter: str = "All",
568
+ sort_by: str = "Youden's J",
569
  ) -> str:
570
  # Pivot per-tier data keyed by model_id
571
  pivot: dict[str, dict] = {}
 
588
  row["refusal_rate"], row["ci_lo"], row["ci_hi"], row["raw_rate"]
589
  )
590
 
591
+ # Attach Youden's J = TPR(should-refuse) − refusal(benign).
592
+ # J ∈ [-1, 1]; higher = better calibration (refuses dangerous, allows safe).
593
+ if pc is not None:
594
+ tpr_by_model = dict(zip(pc["model_id"], pc["tpr"]))
595
+ for mid, row in pivot.items():
596
+ tpr = tpr_by_model.get(mid)
597
+ benign = row.get("benign")
598
+ if tpr is not None and benign is not None:
599
+ row["youden_j"] = float(tpr - benign[3]) # benign[3] = raw_rate
600
+ else:
601
+ row["youden_j"] = None
602
+
603
  rows_data = list(pivot.values())
604
  if jur_filter != "All":
605
  rows_data = [r for r in rows_data if r["jurisdiction"] == jur_filter]
606
 
607
+ sort_key_fn = {
608
+ "Youden's J": lambda r: (r.get("youden_j") if r.get("youden_j") is not None else -1.0),
609
  "Overall": lambda r: r.get("overall", (0,))[0],
610
  "Benign": lambda r: r.get("benign", (0,))[0],
611
  "Borderline": lambda r: r.get("borderline", (0,))[0],
612
  "Dual-use": lambda r: r.get("dual_use", (0,))[0],
613
+ }.get(sort_by, lambda r: r.get("youden_j") if r.get("youden_j") is not None else -1.0)
614
+ rows_data.sort(key=sort_key_fn, reverse=_SORT_DESC.get(sort_by, True))
615
 
616
  intro = (
617
  '<p class="rb-intro">'
618
+ '<strong>Each row is one model.</strong> Strict refusal rate per tier '
619
+ '(direct + indirect refusal) — hover for the Wilson 95 % CI. '
620
+ '<strong>Youden\'s J</strong> = should-refuse TPR − benign-refusal rate; higher = '
621
+ 'better discrimination between dangerous and safe prompts (the paper\'s headline metric). '
622
+ 'Sort direction follows the column\'s semantic: '
623
+ '<span class="rb-glyph rb-glyph-down">↓ lower better</span> for Benign, '
624
+ '<span class="rb-glyph rb-glyph-up">↑ higher better</span> for Dual-use / Youden\'s J.'
625
  '</p>'
626
  )
627
 
628
+ # Build directional glyph fragments for column headers
629
+ def _h(label: str, color: str, direction: str | None = None) -> str:
630
+ """Render a column header with optional direction marker below the label."""
631
+ dir_html = ""
632
+ if direction == "up":
633
+ dir_html = '<div class="rb-glyph rb-glyph-up">↑ better</div>'
634
+ elif direction == "down":
635
+ dir_html = '<div class="rb-glyph rb-glyph-down">↓ better</div>'
636
+ return (
637
+ f'<th class="center" style="color:{color};">'
638
+ f'<div>{label}</div>{dir_html}</th>'
639
+ )
640
+
641
  header = f"""
642
  <div class="rb-tablewrap">
643
  <table>
 
648
  <th rowspan="2">Org</th>
649
  <th rowspan="2" class="center">Jur.</th>
650
  <th colspan="4" class="center grp">Strict refusal rate</th>
651
+ <th rowspan="2" class="center grp jcol">Youden's&nbsp;J</th>
652
  <th rowspan="2" class="center">PC<br>Tier</th>
653
  </tr>
654
  <tr>
655
+ {_h("Benign", TIER_COLORS["benign"], "down")}
656
+ {_h("Borderline", TIER_COLORS["borderline"], None)}
657
+ {_h("Dual-use", TIER_COLORS["dual_use"], "up")}
658
+ {_h("Overall", TIER_COLORS["overall"], None)}
659
  </tr>
660
  </thead>
661
  <tbody>
 
681
  {_rate_cell(r.get("borderline"), TIER_COLORS["borderline"])}
682
  {_rate_cell(r.get("dual_use"), TIER_COLORS["dual_use"])}
683
  {_rate_cell(r.get("overall"), TIER_COLORS["overall"])}
684
+ {_youden_cell(r.get("youden_j"))}
685
  <td class="rb-flag">{badge}</td>
686
  </tr>"""
687
 
 
690
  </table>
691
  </div>
692
  <div class="rb-footer">
693
+ <strong>Youden's J</strong> = should-refuse TPR (n=75) − benign-prompt refusal rate (n=235). J ∈ [-1, 1]; J = 1 is perfect discrimination, J = 0 is no discrimination, J < 0 means the model refuses safe prompts more than dangerous ones.
694
+ &nbsp;·&nbsp; <strong>Strict refusal</strong> = <code>direct_refusal</code> + <code>indirect_refusal</code> across 3 tiers × 47 prompts × 5 trials.
695
  &nbsp;·&nbsp; <strong>PC Tier</strong>: A ≥ 95 % TPR, B 9–73 % TPR on the 75-trial should-refuse positive control; — = gap zone.
696
+ &nbsp;·&nbsp; <strong>↓ Benign</strong>: lower is better (less over-refusal); <strong>↑ Dual-use / Youden's J</strong>: higher is better.
697
  </div>
698
  """
699
  return intro + header + body + footer
 
1147
 
1148
 
1149
  def update_leaderboard(jur_filter: str, sort_by: str) -> str:
1150
+ return build_leaderboard_html(STATS, OVERALL_STATS, PC_DATA, jur_filter, sort_by)
1151
 
1152
 
1153
  with gr.Blocks(
 
1176
  scale=1,
1177
  )
1178
  sort_dd = gr.Dropdown(
1179
+ choices=["Youden's J", "Overall", "Benign",
1180
+ "Borderline", "Dual-use"],
1181
+ value="Youden's J",
1182
+ label="Sort by",
1183
  scale=1,
1184
  )
1185
 
1186
  leaderboard_html = gr.HTML(
1187
+ value=build_leaderboard_html(STATS, OVERALL_STATS, PC_DATA,
1188
+ "All", "Youden's J")
1189
  )
1190
 
1191
  jur_dd.change(fn=update_leaderboard,