county committed on
Commit
0da3d51
·
1 Parent(s): bcb2aa6

Add OE/MC tabs and place MC order SVG above benchmark table

Browse files
Files changed (4) hide show
  1. assets/mc_order_gap_paper_style_v3.svg +1348 -0
  2. index.html +12 -5
  3. script.js +109 -31
  4. styles.css +22 -0
assets/mc_order_gap_paper_style_v3.svg ADDED
index.html CHANGED
@@ -207,6 +207,7 @@
207
  </div>
208
  <section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
209
  <div class="cross-toolbar">
 
210
  <div class="toggle-group" id="judge-toggle"></div>
211
  <div class="toggle-group metric" id="metric-toggle"></div>
212
  </div>
@@ -214,11 +215,17 @@
214
  <div class="results-panel" id="results-table"></div>
215
  </section>
216
  </div>
217
- <figure class="paper-figure compact benchmark-figure">
218
- <img src="./assets/wildtom_benchmark_comparison.png" alt="Benchmark capability coverage comparison including WildToM-Bench">
219
- <figcaption>Benchmark capability coverage across representative ToM datasets.</figcaption>
220
- <p class="benchmark-footnote">Rank consistency: Spearman &rho; = 0.867, p &lt; 0.01.</p>
221
- </figure>
 
 
 
 
 
 
222
  </div>
223
  </div>
224
  </section>
 
207
  </div>
208
  <section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
209
  <div class="cross-toolbar">
210
+ <div class="toggle-group task" id="task-toggle"></div>
211
  <div class="toggle-group" id="judge-toggle"></div>
212
  <div class="toggle-group metric" id="metric-toggle"></div>
213
  </div>
 
215
  <div class="results-panel" id="results-table"></div>
216
  </section>
217
  </div>
218
+ <div class="analysis-side">
219
+ <figure class="paper-figure compact mc-figure">
220
+ <img src="./assets/mc_order_gap_paper_style_v3.svg" alt="Mean MC accuracy by reasoning order across mental-state dimensions">
221
+ <figcaption>Mean MC accuracy by reasoning order across dimensions.</figcaption>
222
+ </figure>
223
+ <figure class="paper-figure compact benchmark-figure">
224
+ <img src="./assets/wildtom_benchmark_comparison.png" alt="Benchmark capability coverage comparison including WildToM-Bench">
225
+ <figcaption>Benchmark capability coverage across representative ToM datasets.</figcaption>
226
+ <p class="benchmark-footnote">Rank consistency: Spearman &rho; = 0.867, p &lt; 0.01.</p>
227
+ </figure>
228
+ </div>
229
  </div>
230
  </div>
231
  </section>
script.js CHANGED
@@ -194,9 +194,22 @@ const judgeResults = {
194
  }
195
  };
196
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  const filterRoot = document.getElementById("showcase-filters");
198
  const showcaseRoot = document.getElementById("showcase-browser");
199
  const resultsRoot = document.getElementById("results-table");
 
200
  const judgeToggleRoot = document.getElementById("judge-toggle");
201
  const metricToggleRoot = document.getElementById("metric-toggle");
202
  const resultsCaptionRoot = document.getElementById("results-caption");
@@ -205,6 +218,7 @@ const carouselState = {
205
  activeIndex: 0
206
  };
207
  const crossJudgeState = {
 
208
  activeJudge: "gpt4o",
209
  activeMetric: "oeAcc"
210
  };
@@ -336,53 +350,74 @@ function getDeltaClass(delta) {
336
  return delta > 0 ? "positive" : "negative";
337
  }
338
 
339
- function renderJudgeControls() {
340
- if (!judgeToggleRoot || !metricToggleRoot) {
341
  return;
342
  }
343
 
344
- const judgeItems = [
345
- { key: "gpt4o", label: "GPT-4o (full set)" },
346
- { key: "gemini", label: "Gemini-2.5-Flash (N=40)" }
347
  ];
348
- judgeToggleRoot.innerHTML = "";
349
- judgeItems.forEach((item) => {
350
  const button = document.createElement("button");
351
  button.type = "button";
352
- button.className = `toggle-btn${item.key === crossJudgeState.activeJudge ? " active" : ""}`;
353
- button.textContent = item.label;
354
  button.addEventListener("click", () => {
355
- renderCrossJudgeResults(item.key, crossJudgeState.activeMetric);
356
  });
357
- judgeToggleRoot.appendChild(button);
358
  });
359
 
360
- const metricItems = [
361
- { key: "oeAcc", label: "OE_acc" },
362
- { key: "oeScr", label: "OE_scr" }
363
- ];
364
  metricToggleRoot.innerHTML = "";
365
- metricItems.forEach((item) => {
366
- const button = document.createElement("button");
367
- button.type = "button";
368
- button.className = `toggle-btn${item.key === crossJudgeState.activeMetric ? " active" : ""}`;
369
- button.textContent = item.label;
370
- button.addEventListener("click", () => {
371
- renderCrossJudgeResults(crossJudgeState.activeJudge, item.key);
 
 
 
 
 
 
 
 
 
 
372
  });
373
- metricToggleRoot.appendChild(button);
374
- });
375
- }
376
 
377
- function renderCrossJudgeResults(activeJudge = crossJudgeState.activeJudge, activeMetric = crossJudgeState.activeMetric) {
378
- if (!resultsRoot) {
 
 
 
 
 
 
 
 
 
 
 
 
379
  return;
380
  }
381
 
382
- crossJudgeState.activeJudge = activeJudge;
383
- crossJudgeState.activeMetric = activeMetric;
384
- renderJudgeControls();
 
 
 
 
385
 
 
386
  const otherJudge = getOtherJudge(activeJudge);
387
  const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
388
  const valueDiff = getMetricValue(b, activeMetric) - getMetricValue(a, activeMetric);
@@ -424,5 +459,48 @@ function renderCrossJudgeResults(activeJudge = crossJudgeState.activeJudge, acti
424
  });
425
  }
426
 
427
- renderCrossJudgeResults("gpt4o", "oeAcc");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  renderSamples("Belief", 0);
 
194
  }
195
  };
196
 
197
+ const mcResults = [
198
+ { model: "WildToM-Reasoner", mcAcc: 72.7, isOurs: true },
199
+ { model: "Qwen3-VL", mcAcc: 62.1 },
200
+ { model: "Qwen3-Omni", mcAcc: 61.8 },
201
+ { model: "GPT-4o-mini", mcAcc: 57.2 },
202
+ { model: "Emotion-Qwen", mcAcc: 54.2 },
203
+ { model: "GLM-4.6V", mcAcc: 51.2 },
204
+ { model: "MiniCPM-V-4.5", mcAcc: 46.8 },
205
+ { model: "AffectGPT", mcAcc: 35.9 },
206
+ { model: "Video-LLaVA", mcAcc: 25.8 }
207
+ ];
208
+
209
  const filterRoot = document.getElementById("showcase-filters");
210
  const showcaseRoot = document.getElementById("showcase-browser");
211
  const resultsRoot = document.getElementById("results-table");
212
+ const taskToggleRoot = document.getElementById("task-toggle");
213
  const judgeToggleRoot = document.getElementById("judge-toggle");
214
  const metricToggleRoot = document.getElementById("metric-toggle");
215
  const resultsCaptionRoot = document.getElementById("results-caption");
 
218
  activeIndex: 0
219
  };
220
  const crossJudgeState = {
221
+ activeTask: "oe",
222
  activeJudge: "gpt4o",
223
  activeMetric: "oeAcc"
224
  };
 
350
  return delta > 0 ? "positive" : "negative";
351
  }
352
 
353
+ function renderControls() {
354
+ if (!taskToggleRoot || !judgeToggleRoot || !metricToggleRoot) {
355
  return;
356
  }
357
 
358
+ const taskItems = [
359
+ { key: "oe", label: "OE" },
360
+ { key: "mc", label: "MC" }
361
  ];
362
+ taskToggleRoot.innerHTML = "";
363
+ taskItems.forEach((item) => {
364
  const button = document.createElement("button");
365
  button.type = "button";
366
+ button.className = `toggle-btn${item.key === crossJudgeState.activeTask ? " active" : ""}`;
367
+ button.textContent = item.label === "MC" ? "MC (main benchmark)" : "OE (cross-judge)";
368
  button.addEventListener("click", () => {
369
+ renderResultsPanel(item.key, crossJudgeState.activeJudge, crossJudgeState.activeMetric);
370
  });
371
+ taskToggleRoot.appendChild(button);
372
  });
373
 
374
+ judgeToggleRoot.innerHTML = "";
 
 
 
375
  metricToggleRoot.innerHTML = "";
376
+
377
+ if (crossJudgeState.activeTask === "oe") {
378
+ judgeToggleRoot.classList.remove("is-hidden");
379
+
380
+ const judgeItems = [
381
+ { key: "gpt4o", label: "GPT-4o (full set)" },
382
+ { key: "gemini", label: "Gemini-2.5-Flash (N=40)" }
383
+ ];
384
+ judgeItems.forEach((item) => {
385
+ const button = document.createElement("button");
386
+ button.type = "button";
387
+ button.className = `toggle-btn${item.key === crossJudgeState.activeJudge ? " active" : ""}`;
388
+ button.textContent = item.label;
389
+ button.addEventListener("click", () => {
390
+ renderResultsPanel(crossJudgeState.activeTask, item.key, crossJudgeState.activeMetric);
391
+ });
392
+ judgeToggleRoot.appendChild(button);
393
  });
 
 
 
394
 
395
+ const metricItems = [
396
+ { key: "oeAcc", label: "OE_acc" },
397
+ { key: "oeScr", label: "OE_scr" }
398
+ ];
399
+ metricItems.forEach((item) => {
400
+ const button = document.createElement("button");
401
+ button.type = "button";
402
+ button.className = `toggle-btn${item.key === crossJudgeState.activeMetric ? " active" : ""}`;
403
+ button.textContent = item.label;
404
+ button.addEventListener("click", () => {
405
+ renderResultsPanel(crossJudgeState.activeTask, crossJudgeState.activeJudge, item.key);
406
+ });
407
+ metricToggleRoot.appendChild(button);
408
+ });
409
  return;
410
  }
411
 
412
+ judgeToggleRoot.classList.add("is-hidden");
413
+ const mcButton = document.createElement("button");
414
+ mcButton.type = "button";
415
+ mcButton.className = "toggle-btn active";
416
+ mcButton.textContent = "MC_acc";
417
+ metricToggleRoot.appendChild(mcButton);
418
+ }
419
 
420
+ function renderOeResults(activeJudge, activeMetric) {
421
  const otherJudge = getOtherJudge(activeJudge);
422
  const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
423
  const valueDiff = getMetricValue(b, activeMetric) - getMetricValue(a, activeMetric);
 
459
  });
460
  }
461
 
462
+ function renderMcResults() {
463
+ const ranked = [...mcResults].sort((a, b) => b.mcAcc - a.mcAcc);
464
+ if (resultsCaptionRoot) {
465
+ resultsCaptionRoot.textContent = "Main benchmark (MC) · Sorted by MC_acc";
466
+ }
467
+
468
+ resultsRoot.innerHTML = "";
469
+ ranked.forEach((item, index) => {
470
+ const row = document.createElement("div");
471
+ row.className = `result-row${item.isOurs ? " ours" : ""}`;
472
+ row.innerHTML = `
473
+ <div class="result-rank">#${index + 1}</div>
474
+ <div class="result-model-wrap">
475
+ <div class="result-model">${item.model}</div>
476
+ ${item.isOurs ? '<span class="result-tag">Ours</span>' : ""}
477
+ </div>
478
+ <div class="result-score">${item.mcAcc.toFixed(1)}%</div>
479
+ <div class="result-delta neutral">single-judge MC setting</div>
480
+ `;
481
+ resultsRoot.appendChild(row);
482
+ });
483
+ }
484
+
485
+ function renderResultsPanel(
486
+ activeTask = crossJudgeState.activeTask,
487
+ activeJudge = crossJudgeState.activeJudge,
488
+ activeMetric = crossJudgeState.activeMetric
489
+ ) {
490
+ if (!resultsRoot) {
491
+ return;
492
+ }
493
+ crossJudgeState.activeTask = activeTask;
494
+ crossJudgeState.activeJudge = activeJudge;
495
+ crossJudgeState.activeMetric = activeMetric;
496
+
497
+ renderControls();
498
+ if (crossJudgeState.activeTask === "mc") {
499
+ renderMcResults();
500
+ } else {
501
+ renderOeResults(crossJudgeState.activeJudge, crossJudgeState.activeMetric);
502
+ }
503
+ }
504
+
505
+ renderResultsPanel("oe", "gpt4o", "oeAcc");
506
  renderSamples("Belief", 0);
styles.css CHANGED
@@ -451,6 +451,12 @@ code {
451
  align-items: start;
452
  }
453
 
 
 
 
 
 
 
454
  .text-content {
455
  font-size: 1.05rem;
456
  }
@@ -476,6 +482,10 @@ code {
476
  margin-bottom: 8px;
477
  }
478
 
 
 
 
 
479
  .toggle-group {
480
  display: flex;
481
  flex-wrap: wrap;
@@ -500,6 +510,10 @@ code {
500
  color: #ffffff;
501
  }
502
 
 
 
 
 
503
  .results-caption {
504
  margin: 0 0 8px;
505
  font-size: 0.85rem;
@@ -599,6 +613,10 @@ code {
599
  border-radius: 10px;
600
  }
601
 
 
 
 
 
602
  .benchmark-figure figcaption {
603
  margin-bottom: 3px;
604
  }
@@ -858,6 +876,10 @@ code {
858
  border-top: 1px solid var(--line);
859
  }
860
 
 
 
 
 
861
  .table-grid {
862
  grid-template-columns: 1fr;
863
  }
 
451
  align-items: start;
452
  }
453
 
454
+ .analysis-side {
455
+ display: grid;
456
+ gap: 12px;
457
+ margin-top: 72px;
458
+ }
459
+
460
  .text-content {
461
  font-size: 1.05rem;
462
  }
 
482
  margin-bottom: 8px;
483
  }
484
 
485
+ .toggle-group.task .toggle-btn {
486
+ font-size: 0.84rem;
487
+ }
488
+
489
  .toggle-group {
490
  display: flex;
491
  flex-wrap: wrap;
 
510
  color: #ffffff;
511
  }
512
 
513
+ .is-hidden {
514
+ display: none !important;
515
+ }
516
+
517
  .results-caption {
518
  margin: 0 0 8px;
519
  font-size: 0.85rem;
 
613
  border-radius: 10px;
614
  }
615
 
616
+ .mc-figure {
617
+ background: #fdfcf9;
618
+ }
619
+
620
  .benchmark-figure figcaption {
621
  margin-bottom: 3px;
622
  }
 
876
  border-top: 1px solid var(--line);
877
  }
878
 
879
+ .analysis-side {
880
+ margin-top: 0;
881
+ }
882
+
883
  .table-grid {
884
  grid-template-columns: 1fr;
885
  }