county committed on
Commit ·
fdc1e03
1
Parent(s): caf6918
Align cross-judge layout and remove full-set label
Browse files
- index.html +8 -12
- script.js +2 -13
- styles.css +17 -5
index.html
CHANGED
|
@@ -204,16 +204,16 @@
|
|
| 204 |
robustness and expose judge-specific gaps.
|
| 205 |
</p>
|
| 206 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
<div class="analysis-grid">
|
| 208 |
<div class="analysis-main">
|
| 209 |
-
<div class="text-content result-copy">
|
| 210 |
-
<p>
|
| 211 |
-
Across both judges, <strong>WildToM-Reasoner</strong> ranks first. On Gemini
|
| 212 |
-
common-exam grading, its lead widens, while rank correlation remains high
|
| 213 |
-
(<strong>Spearman ρ = 0.867</strong>), indicating that the OE ranking trend is
|
| 214 |
-
robust to judge replacement.
|
| 215 |
-
</p>
|
| 216 |
-
</div>
|
| 217 |
<section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
|
| 218 |
<div class="cross-toolbar">
|
| 219 |
<div class="toolbar-row">
|
|
@@ -224,10 +224,6 @@
|
|
| 224 |
<span class="toolbar-label">Judge</span>
|
| 225 |
<div class="toggle-group" id="judge-toggle"></div>
|
| 226 |
</div>
|
| 227 |
-
<div class="toolbar-row">
|
| 228 |
-
<span class="toolbar-label">Metric</span>
|
| 229 |
-
<div class="toggle-group metric" id="metric-toggle"></div>
|
| 230 |
-
</div>
|
| 231 |
</div>
|
| 232 |
<div class="results-caption" id="results-caption"></div>
|
| 233 |
<div class="results-panel" id="results-table"></div>
|
|
|
|
| 204 |
robustness and expose judge-specific gaps.
|
| 205 |
</p>
|
| 206 |
</div>
|
| 207 |
+
<div class="text-content result-copy result-copy-wide">
|
| 208 |
+
<p>
|
| 209 |
+
Across both judges, <strong>WildToM-Reasoner</strong> ranks first. On Gemini grading,
|
| 210 |
+
its lead widens, while rank correlation remains high
|
| 211 |
+
(<strong>Spearman ρ = 0.867</strong>), indicating that the OE ranking trend is
|
| 212 |
+
robust to judge replacement.
|
| 213 |
+
</p>
|
| 214 |
+
</div>
|
| 215 |
<div class="analysis-grid">
|
| 216 |
<div class="analysis-main">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
<section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
|
| 218 |
<div class="cross-toolbar">
|
| 219 |
<div class="toolbar-row">
|
|
|
|
| 224 |
<span class="toolbar-label">Judge</span>
|
| 225 |
<div class="toggle-group" id="judge-toggle"></div>
|
| 226 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
</div>
|
| 228 |
<div class="results-caption" id="results-caption"></div>
|
| 229 |
<div class="results-panel" id="results-table"></div>
|
script.js
CHANGED
|
@@ -161,7 +161,7 @@ const filters = ["Belief", "Desire", "Intention", "Emotion", "Knowledge"];
|
|
| 161 |
|
| 162 |
const judgeResults = {
|
| 163 |
gpt4o: {
|
| 164 |
-
label: "GPT-4o
|
| 165 |
deltaLabel: "Gemini",
|
| 166 |
rows: [
|
| 167 |
{ model: "WildToM-Reasoner", rank: 1, oeAcc: 32.7, oeScr: 2.6, isOurs: true },
|
|
@@ -211,7 +211,6 @@ const showcaseRoot = document.getElementById("showcase-browser");
|
|
| 211 |
const resultsRoot = document.getElementById("results-table");
|
| 212 |
const taskToggleRoot = document.getElementById("task-toggle");
|
| 213 |
const judgeToggleRoot = document.getElementById("judge-toggle");
|
| 214 |
-
const metricToggleRoot = document.getElementById("metric-toggle");
|
| 215 |
const resultsCaptionRoot = document.getElementById("results-caption");
|
| 216 |
const carouselState = {
|
| 217 |
activeDimension: "Belief",
|
|
@@ -385,7 +384,6 @@ function renderControls() {
|
|
| 385 |
return;
|
| 386 |
}
|
| 387 |
const judgeToggleRow = judgeToggleRoot.closest(".toolbar-row");
|
| 388 |
-
const metricToggleRow = metricToggleRoot ? metricToggleRoot.closest(".toolbar-row") : null;
|
| 389 |
|
| 390 |
const taskItems = [
|
| 391 |
{ key: "oe", label: "OE" },
|
|
@@ -404,21 +402,15 @@ function renderControls() {
|
|
| 404 |
});
|
| 405 |
|
| 406 |
judgeToggleRoot.innerHTML = "";
|
| 407 |
-
if (metricToggleRoot) {
|
| 408 |
-
metricToggleRoot.innerHTML = "";
|
| 409 |
-
}
|
| 410 |
|
| 411 |
if (crossJudgeState.activeTask === "oe") {
|
| 412 |
judgeToggleRoot.classList.remove("is-hidden");
|
| 413 |
if (judgeToggleRow) {
|
| 414 |
judgeToggleRow.classList.remove("is-hidden");
|
| 415 |
}
|
| 416 |
-
if (metricToggleRow) {
|
| 417 |
-
metricToggleRow.classList.add("is-hidden");
|
| 418 |
-
}
|
| 419 |
|
| 420 |
const judgeItems = [
|
| 421 |
-
{ key: "gpt4o", label: "GPT-4o
|
| 422 |
{ key: "gemini", label: "Gemini-2.5-Flash" }
|
| 423 |
];
|
| 424 |
judgeItems.forEach((item) => {
|
|
@@ -434,9 +426,6 @@ function renderControls() {
|
|
| 434 |
if (judgeToggleRow) {
|
| 435 |
judgeToggleRow.classList.add("is-hidden");
|
| 436 |
}
|
| 437 |
-
if (metricToggleRow) {
|
| 438 |
-
metricToggleRow.classList.add("is-hidden");
|
| 439 |
-
}
|
| 440 |
}
|
| 441 |
|
| 442 |
function renderOeResults(activeJudge) {
|
|
|
|
| 161 |
|
| 162 |
const judgeResults = {
|
| 163 |
gpt4o: {
|
| 164 |
+
label: "GPT-4o",
|
| 165 |
deltaLabel: "Gemini",
|
| 166 |
rows: [
|
| 167 |
{ model: "WildToM-Reasoner", rank: 1, oeAcc: 32.7, oeScr: 2.6, isOurs: true },
|
|
|
|
| 211 |
const resultsRoot = document.getElementById("results-table");
|
| 212 |
const taskToggleRoot = document.getElementById("task-toggle");
|
| 213 |
const judgeToggleRoot = document.getElementById("judge-toggle");
|
|
|
|
| 214 |
const resultsCaptionRoot = document.getElementById("results-caption");
|
| 215 |
const carouselState = {
|
| 216 |
activeDimension: "Belief",
|
|
|
|
| 384 |
return;
|
| 385 |
}
|
| 386 |
const judgeToggleRow = judgeToggleRoot.closest(".toolbar-row");
|
|
|
|
| 387 |
|
| 388 |
const taskItems = [
|
| 389 |
{ key: "oe", label: "OE" },
|
|
|
|
| 402 |
});
|
| 403 |
|
| 404 |
judgeToggleRoot.innerHTML = "";
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
if (crossJudgeState.activeTask === "oe") {
|
| 407 |
judgeToggleRoot.classList.remove("is-hidden");
|
| 408 |
if (judgeToggleRow) {
|
| 409 |
judgeToggleRow.classList.remove("is-hidden");
|
| 410 |
}
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
const judgeItems = [
|
| 413 |
+
{ key: "gpt4o", label: "GPT-4o" },
|
| 414 |
{ key: "gemini", label: "Gemini-2.5-Flash" }
|
| 415 |
];
|
| 416 |
judgeItems.forEach((item) => {
|
|
|
|
| 426 |
if (judgeToggleRow) {
|
| 427 |
judgeToggleRow.classList.add("is-hidden");
|
| 428 |
}
|
|
|
|
|
|
|
|
|
|
| 429 |
}
|
| 430 |
|
| 431 |
function renderOeResults(activeJudge) {
|
styles.css
CHANGED
|
@@ -358,13 +358,14 @@ code {
|
|
| 358 |
display: grid;
|
| 359 |
grid-template-columns: minmax(0, 1fr) minmax(0, 1fr);
|
| 360 |
gap: 14px;
|
| 361 |
-
align-items:
|
| 362 |
}
|
| 363 |
|
| 364 |
.stats-left-stack {
|
| 365 |
display: grid;
|
| 366 |
gap: 10px;
|
| 367 |
align-content: start;
|
|
|
|
| 368 |
}
|
| 369 |
|
| 370 |
.distribution-figure {
|
|
@@ -383,6 +384,7 @@ code {
|
|
| 383 |
.table-stack {
|
| 384 |
display: grid;
|
| 385 |
gap: 10px;
|
|
|
|
| 386 |
}
|
| 387 |
|
| 388 |
.dimension-grid {
|
|
@@ -416,6 +418,8 @@ code {
|
|
| 416 |
display: grid;
|
| 417 |
grid-template-columns: repeat(2, minmax(0, 1fr));
|
| 418 |
gap: 10px;
|
|
|
|
|
|
|
| 419 |
}
|
| 420 |
|
| 421 |
.table-card {
|
|
@@ -423,6 +427,7 @@ code {
|
|
| 423 |
border-radius: 12px;
|
| 424 |
background: #ffffff;
|
| 425 |
padding: 12px 12px 8px;
|
|
|
|
| 426 |
}
|
| 427 |
|
| 428 |
.table-card h3 {
|
|
@@ -463,19 +468,16 @@ code {
|
|
| 463 |
display: grid;
|
| 464 |
grid-template-columns: minmax(0, 1.15fr) minmax(320px, 0.85fr);
|
| 465 |
gap: 16px;
|
| 466 |
-
align-items:
|
| 467 |
}
|
| 468 |
|
| 469 |
.analysis-main {
|
| 470 |
-
display: grid;
|
| 471 |
-
grid-template-rows: auto 1fr;
|
| 472 |
min-height: 100%;
|
| 473 |
}
|
| 474 |
|
| 475 |
.analysis-side {
|
| 476 |
display: grid;
|
| 477 |
gap: 12px;
|
| 478 |
-
margin-top: 92px;
|
| 479 |
}
|
| 480 |
|
| 481 |
.text-content {
|
|
@@ -490,6 +492,16 @@ code {
|
|
| 490 |
margin-bottom: 12px;
|
| 491 |
}
|
| 492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
.cross-judge-card {
|
| 494 |
border: 1px solid var(--line);
|
| 495 |
border-radius: 14px;
|
|
|
|
| 358 |
display: grid;
|
| 359 |
grid-template-columns: minmax(0, 1fr) minmax(0, 1fr);
|
| 360 |
gap: 14px;
|
| 361 |
+
align-items: stretch;
|
| 362 |
}
|
| 363 |
|
| 364 |
.stats-left-stack {
|
| 365 |
display: grid;
|
| 366 |
gap: 10px;
|
| 367 |
align-content: start;
|
| 368 |
+
height: 100%;
|
| 369 |
}
|
| 370 |
|
| 371 |
.distribution-figure {
|
|
|
|
| 384 |
.table-stack {
|
| 385 |
display: grid;
|
| 386 |
gap: 10px;
|
| 387 |
+
height: 100%;
|
| 388 |
}
|
| 389 |
|
| 390 |
.dimension-grid {
|
|
|
|
| 418 |
display: grid;
|
| 419 |
grid-template-columns: repeat(2, minmax(0, 1fr));
|
| 420 |
gap: 10px;
|
| 421 |
+
height: 100%;
|
| 422 |
+
align-items: stretch;
|
| 423 |
}
|
| 424 |
|
| 425 |
.table-card {
|
|
|
|
| 427 |
border-radius: 12px;
|
| 428 |
background: #ffffff;
|
| 429 |
padding: 12px 12px 8px;
|
| 430 |
+
height: 100%;
|
| 431 |
}
|
| 432 |
|
| 433 |
.table-card h3 {
|
|
|
|
| 468 |
display: grid;
|
| 469 |
grid-template-columns: minmax(0, 1.15fr) minmax(320px, 0.85fr);
|
| 470 |
gap: 16px;
|
| 471 |
+
align-items: start;
|
| 472 |
}
|
| 473 |
|
| 474 |
.analysis-main {
|
|
|
|
|
|
|
| 475 |
min-height: 100%;
|
| 476 |
}
|
| 477 |
|
| 478 |
.analysis-side {
|
| 479 |
display: grid;
|
| 480 |
gap: 12px;
|
|
|
|
| 481 |
}
|
| 482 |
|
| 483 |
.text-content {
|
|
|
|
| 492 |
margin-bottom: 12px;
|
| 493 |
}
|
| 494 |
|
| 495 |
+
.result-copy-wide {
|
| 496 |
+
margin-bottom: 14px;
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
.result-copy-wide p {
|
| 500 |
+
text-align: justify;
|
| 501 |
+
text-justify: inter-word;
|
| 502 |
+
max-width: none;
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
.cross-judge-card {
|
| 506 |
border: 1px solid var(--line);
|
| 507 |
border-radius: 14px;
|