county committed on
Commit ·
0da3d51
1
Parent(s): bcb2aa6
Add OE/MC tabs and place MC order SVG above benchmark table
Browse files- assets/mc_order_gap_paper_style_v3.svg +1348 -0
- index.html +12 -5
- script.js +109 -31
- styles.css +22 -0
assets/mc_order_gap_paper_style_v3.svg
ADDED
|
|
index.html
CHANGED
|
@@ -207,6 +207,7 @@
|
|
| 207 |
</div>
|
| 208 |
<section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
|
| 209 |
<div class="cross-toolbar">
|
|
|
|
| 210 |
<div class="toggle-group" id="judge-toggle"></div>
|
| 211 |
<div class="toggle-group metric" id="metric-toggle"></div>
|
| 212 |
</div>
|
|
@@ -214,11 +215,17 @@
|
|
| 214 |
<div class="results-panel" id="results-table"></div>
|
| 215 |
</section>
|
| 216 |
</div>
|
| 217 |
-
<
|
| 218 |
-
<
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
</div>
|
| 223 |
</div>
|
| 224 |
</section>
|
|
|
|
| 207 |
</div>
|
| 208 |
<section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
|
| 209 |
<div class="cross-toolbar">
|
| 210 |
+
<div class="toggle-group task" id="task-toggle"></div>
|
| 211 |
<div class="toggle-group" id="judge-toggle"></div>
|
| 212 |
<div class="toggle-group metric" id="metric-toggle"></div>
|
| 213 |
</div>
|
|
|
|
| 215 |
<div class="results-panel" id="results-table"></div>
|
| 216 |
</section>
|
| 217 |
</div>
|
| 218 |
+
<div class="analysis-side">
|
| 219 |
+
<figure class="paper-figure compact mc-figure">
|
| 220 |
+
<img src="./assets/mc_order_gap_paper_style_v3.svg" alt="Mean MC accuracy by reasoning order across mental-state dimensions">
|
| 221 |
+
<figcaption>Mean MC accuracy by reasoning order across dimensions.</figcaption>
|
| 222 |
+
</figure>
|
| 223 |
+
<figure class="paper-figure compact benchmark-figure">
|
| 224 |
+
<img src="./assets/wildtom_benchmark_comparison.png" alt="Benchmark capability coverage comparison including WildToM-Bench">
|
| 225 |
+
<figcaption>Benchmark capability coverage across representative ToM datasets.</figcaption>
|
| 226 |
+
<p class="benchmark-footnote">Rank consistency: Spearman ρ = 0.867, p < 0.01.</p>
|
| 227 |
+
</figure>
|
| 228 |
+
</div>
|
| 229 |
</div>
|
| 230 |
</div>
|
| 231 |
</section>
|
script.js
CHANGED
|
@@ -194,9 +194,22 @@ const judgeResults = {
|
|
| 194 |
}
|
| 195 |
};
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
const filterRoot = document.getElementById("showcase-filters");
|
| 198 |
const showcaseRoot = document.getElementById("showcase-browser");
|
| 199 |
const resultsRoot = document.getElementById("results-table");
|
|
|
|
| 200 |
const judgeToggleRoot = document.getElementById("judge-toggle");
|
| 201 |
const metricToggleRoot = document.getElementById("metric-toggle");
|
| 202 |
const resultsCaptionRoot = document.getElementById("results-caption");
|
|
@@ -205,6 +218,7 @@ const carouselState = {
|
|
| 205 |
activeIndex: 0
|
| 206 |
};
|
| 207 |
const crossJudgeState = {
|
|
|
|
| 208 |
activeJudge: "gpt4o",
|
| 209 |
activeMetric: "oeAcc"
|
| 210 |
};
|
|
@@ -336,53 +350,74 @@ function getDeltaClass(delta) {
|
|
| 336 |
return delta > 0 ? "positive" : "negative";
|
| 337 |
}
|
| 338 |
|
| 339 |
-
function
|
| 340 |
-
if (!judgeToggleRoot || !metricToggleRoot) {
|
| 341 |
return;
|
| 342 |
}
|
| 343 |
|
| 344 |
-
const
|
| 345 |
-
{ key: "
|
| 346 |
-
{ key: "
|
| 347 |
];
|
| 348 |
-
|
| 349 |
-
|
| 350 |
const button = document.createElement("button");
|
| 351 |
button.type = "button";
|
| 352 |
-
button.className = `toggle-btn${item.key === crossJudgeState.
|
| 353 |
-
button.textContent = item.label;
|
| 354 |
button.addEventListener("click", () => {
|
| 355 |
-
|
| 356 |
});
|
| 357 |
-
|
| 358 |
});
|
| 359 |
|
| 360 |
-
|
| 361 |
-
{ key: "oeAcc", label: "OE_acc" },
|
| 362 |
-
{ key: "oeScr", label: "OE_scr" }
|
| 363 |
-
];
|
| 364 |
metricToggleRoot.innerHTML = "";
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
});
|
| 373 |
-
metricToggleRoot.appendChild(button);
|
| 374 |
-
});
|
| 375 |
-
}
|
| 376 |
|
| 377 |
-
|
| 378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
return;
|
| 380 |
}
|
| 381 |
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
|
|
|
|
| 386 |
const otherJudge = getOtherJudge(activeJudge);
|
| 387 |
const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
|
| 388 |
const valueDiff = getMetricValue(b, activeMetric) - getMetricValue(a, activeMetric);
|
|
@@ -424,5 +459,48 @@ function renderCrossJudgeResults(activeJudge = crossJudgeState.activeJudge, acti
|
|
| 424 |
});
|
| 425 |
}
|
| 426 |
|
| 427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
renderSamples("Belief", 0);
|
|
|
|
| 194 |
}
|
| 195 |
};
|
| 196 |
|
| 197 |
+
/**
 * Multiple-choice (MC) accuracy per model, in percent.
 * `isOurs` is set only on the entry that should be highlighted as "Ours".
 */
const mcResults = [
  { model: "WildToM-Reasoner", mcAcc: 72.7, isOurs: true },
  { model: "Qwen3-VL", mcAcc: 62.1 },
  { model: "Qwen3-Omni", mcAcc: 61.8 },
  { model: "GPT-4o-mini", mcAcc: 57.2 },
  { model: "Emotion-Qwen", mcAcc: 54.2 },
  { model: "GLM-4.6V", mcAcc: 51.2 },
  { model: "MiniCPM-V-4.5", mcAcc: 46.8 },
  { model: "AffectGPT", mcAcc: 35.9 },
  { model: "Video-LLaVA", mcAcc: 25.8 },
];
|
| 208 |
+
|
| 209 |
const filterRoot = document.getElementById("showcase-filters");
|
| 210 |
const showcaseRoot = document.getElementById("showcase-browser");
|
| 211 |
const resultsRoot = document.getElementById("results-table");
|
| 212 |
+
const taskToggleRoot = document.getElementById("task-toggle");
|
| 213 |
const judgeToggleRoot = document.getElementById("judge-toggle");
|
| 214 |
const metricToggleRoot = document.getElementById("metric-toggle");
|
| 215 |
const resultsCaptionRoot = document.getElementById("results-caption");
|
|
|
|
| 218 |
activeIndex: 0
|
| 219 |
};
|
| 220 |
const crossJudgeState = {
|
| 221 |
+
activeTask: "oe",
|
| 222 |
activeJudge: "gpt4o",
|
| 223 |
activeMetric: "oeAcc"
|
| 224 |
};
|
|
|
|
| 350 |
return delta > 0 ? "positive" : "negative";
|
| 351 |
}
|
| 352 |
|
| 353 |
+
/**
 * Rebuild the task, judge and metric toggle groups from `crossJudgeState`.
 *
 * Does nothing when any of the three toggle containers is missing.
 * In the "oe" task the judge group is shown and both the judge and the
 * metric groups get clickable buttons; in any other task the judge group is
 * hidden via the `is-hidden` class and the metric group shows a single
 * static "MC_acc" button.
 */
function renderControls() {
  if (!taskToggleRoot || !judgeToggleRoot || !metricToggleRoot) {
    return;
  }

  // One builder for all three groups; the original repeated this loop
  // verbatim for task, judge and metric buttons.
  const appendToggleButtons = (root, items, activeKey, onSelect) => {
    items.forEach(({ key, label }) => {
      const isActive = key === activeKey;
      const button = document.createElement("button");
      button.type = "button";
      button.className = `toggle-btn${isActive ? " active" : ""}`;
      button.textContent = label;
      // Expose the toggle state to assistive technology, not only via CSS.
      button.setAttribute("aria-pressed", String(isActive));
      button.addEventListener("click", () => onSelect(key));
      root.appendChild(button);
    });
  };

  // Handlers read crossJudgeState at click time (not render time) so the
  // untouched dimensions always reflect the latest state.
  taskToggleRoot.innerHTML = "";
  appendToggleButtons(
    taskToggleRoot,
    [
      { key: "oe", label: "OE (cross-judge)" },
      { key: "mc", label: "MC (main benchmark)" },
    ],
    crossJudgeState.activeTask,
    (key) => renderResultsPanel(key, crossJudgeState.activeJudge, crossJudgeState.activeMetric)
  );

  judgeToggleRoot.innerHTML = "";
  metricToggleRoot.innerHTML = "";

  if (crossJudgeState.activeTask === "oe") {
    judgeToggleRoot.classList.remove("is-hidden");

    appendToggleButtons(
      judgeToggleRoot,
      [
        { key: "gpt4o", label: "GPT-4o (full set)" },
        { key: "gemini", label: "Gemini-2.5-Flash (N=40)" },
      ],
      crossJudgeState.activeJudge,
      (key) => renderResultsPanel(crossJudgeState.activeTask, key, crossJudgeState.activeMetric)
    );

    appendToggleButtons(
      metricToggleRoot,
      [
        { key: "oeAcc", label: "OE_acc" },
        { key: "oeScr", label: "OE_scr" },
      ],
      crossJudgeState.activeMetric,
      (key) => renderResultsPanel(crossJudgeState.activeTask, crossJudgeState.activeJudge, key)
    );
    return;
  }

  // MC has no judge choice and a single metric: hide the judge group and
  // show one always-active, non-interactive metric button.
  judgeToggleRoot.classList.add("is-hidden");
  const mcButton = document.createElement("button");
  mcButton.type = "button";
  mcButton.className = "toggle-btn active";
  mcButton.textContent = "MC_acc";
  mcButton.setAttribute("aria-pressed", "true");
  metricToggleRoot.appendChild(mcButton);
}
|
| 419 |
|
| 420 |
+
function renderOeResults(activeJudge, activeMetric) {
|
| 421 |
const otherJudge = getOtherJudge(activeJudge);
|
| 422 |
const currentRows = [...judgeResults[activeJudge].rows].sort((a, b) => {
|
| 423 |
const valueDiff = getMetricValue(b, activeMetric) - getMetricValue(a, activeMetric);
|
|
|
|
| 459 |
});
|
| 460 |
}
|
| 461 |
|
| 462 |
+
/**
 * Fill the results panel with the MC ranking.
 *
 * Sorts a copy of `mcResults` by `mcAcc` in descending order, updates the
 * caption when a caption element exists, clears `resultsRoot` and appends
 * one `.result-row` per model. The row flagged `isOurs` gets the extra
 * `ours` class and an "Ours" tag.
 */
function renderMcResults() {
  const byAccuracyDesc = mcResults.slice().sort((left, right) => right.mcAcc - left.mcAcc);

  if (resultsCaptionRoot) {
    resultsCaptionRoot.textContent = "Main benchmark (MC) · Sorted by MC_acc";
  }

  resultsRoot.innerHTML = "";

  for (const [position, entry] of byAccuracyDesc.entries()) {
    const rowClass = entry.isOurs ? "result-row ours" : "result-row";
    const oursTag = entry.isOurs ? '<span class="result-tag">Ours</span>' : "";

    const rowElement = document.createElement("div");
    rowElement.className = rowClass;
    rowElement.innerHTML = `
      <div class="result-rank">#${position + 1}</div>
      <div class="result-model-wrap">
        <div class="result-model">${entry.model}</div>
        ${oursTag}
      </div>
      <div class="result-score">${entry.mcAcc.toFixed(1)}%</div>
      <div class="result-delta neutral">single-judge MC setting</div>
    `;
    resultsRoot.appendChild(rowElement);
  }
}
|
| 484 |
+
|
| 485 |
+
/**
 * Store the requested selection in `crossJudgeState`, then redraw the
 * controls and the matching results view.
 *
 * Each parameter defaults to the current value in `crossJudgeState`.
 * Returns without touching state when the results container is missing.
 *
 * @param {string} activeTask - Task key; "mc" selects the MC view, anything else the OE view.
 * @param {string} activeJudge - Judge key passed on to the OE view.
 * @param {string} activeMetric - Metric key passed on to the OE view.
 */
function renderResultsPanel(
  activeTask = crossJudgeState.activeTask,
  activeJudge = crossJudgeState.activeJudge,
  activeMetric = crossJudgeState.activeMetric
) {
  if (!resultsRoot) {
    return;
  }

  Object.assign(crossJudgeState, { activeTask, activeJudge, activeMetric });

  renderControls();

  if (crossJudgeState.activeTask !== "mc") {
    renderOeResults(crossJudgeState.activeJudge, crossJudgeState.activeMetric);
    return;
  }
  renderMcResults();
}
|
| 504 |
+
|
| 505 |
+
renderResultsPanel("oe", "gpt4o", "oeAcc");
|
| 506 |
renderSamples("Belief", 0);
|
styles.css
CHANGED
|
@@ -451,6 +451,12 @@ code {
|
|
| 451 |
align-items: start;
|
| 452 |
}
|
| 453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
.text-content {
|
| 455 |
font-size: 1.05rem;
|
| 456 |
}
|
|
@@ -476,6 +482,10 @@ code {
|
|
| 476 |
margin-bottom: 8px;
|
| 477 |
}
|
| 478 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
.toggle-group {
|
| 480 |
display: flex;
|
| 481 |
flex-wrap: wrap;
|
|
@@ -500,6 +510,10 @@ code {
|
|
| 500 |
color: #ffffff;
|
| 501 |
}
|
| 502 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 503 |
.results-caption {
|
| 504 |
margin: 0 0 8px;
|
| 505 |
font-size: 0.85rem;
|
|
@@ -599,6 +613,10 @@ code {
|
|
| 599 |
border-radius: 10px;
|
| 600 |
}
|
| 601 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
.benchmark-figure figcaption {
|
| 603 |
margin-bottom: 3px;
|
| 604 |
}
|
|
@@ -858,6 +876,10 @@ code {
|
|
| 858 |
border-top: 1px solid var(--line);
|
| 859 |
}
|
| 860 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
.table-grid {
|
| 862 |
grid-template-columns: 1fr;
|
| 863 |
}
|
|
|
|
| 451 |
align-items: start;
|
| 452 |
}
|
| 453 |
|
| 454 |
+
.analysis-side {
|
| 455 |
+
display: grid;
|
| 456 |
+
gap: 12px;
|
| 457 |
+
margin-top: 72px;
|
| 458 |
+
}
|
| 459 |
+
|
| 460 |
.text-content {
|
| 461 |
font-size: 1.05rem;
|
| 462 |
}
|
|
|
|
| 482 |
margin-bottom: 8px;
|
| 483 |
}
|
| 484 |
|
| 485 |
+
.toggle-group.task .toggle-btn {
|
| 486 |
+
font-size: 0.84rem;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
.toggle-group {
|
| 490 |
display: flex;
|
| 491 |
flex-wrap: wrap;
|
|
|
|
| 510 |
color: #ffffff;
|
| 511 |
}
|
| 512 |
|
| 513 |
+
.is-hidden {
|
| 514 |
+
display: none !important;
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
.results-caption {
|
| 518 |
margin: 0 0 8px;
|
| 519 |
font-size: 0.85rem;
|
|
|
|
| 613 |
border-radius: 10px;
|
| 614 |
}
|
| 615 |
|
| 616 |
+
.mc-figure {
|
| 617 |
+
background: #fdfcf9;
|
| 618 |
+
}
|
| 619 |
+
|
| 620 |
.benchmark-figure figcaption {
|
| 621 |
margin-bottom: 3px;
|
| 622 |
}
|
|
|
|
| 876 |
border-top: 1px solid var(--line);
|
| 877 |
}
|
| 878 |
|
| 879 |
+
.analysis-side {
|
| 880 |
+
margin-top: 0;
|
| 881 |
+
}
|
| 882 |
+
|
| 883 |
.table-grid {
|
| 884 |
grid-template-columns: 1fr;
|
| 885 |
}
|