county committed on
Commit
bcb2aa6
·
1 Parent(s): 755164f

Update results section with cross-judge comparison and benchmark figure

Browse files
.gitattributes CHANGED
@@ -1,2 +1,2 @@
1
- *.mp4 filter=lfs diff=lfs merge=lfs -text
2
  *.png filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  *.png filter=lfs diff=lfs merge=lfs -text
2
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
assets/wildtom_benchmark_comparison.pdf ADDED
Binary file (79.5 kB). View file
 
assets/wildtom_benchmark_comparison.png ADDED

Git LFS Details

  • SHA256: 388bf381a26742775bd6b1e63f8bd705aa1bd5931bd93efd702090ae2572d506
  • Pointer size: 130 Bytes
  • Size of remote file: 65.7 kB
index.html CHANGED
@@ -189,27 +189,35 @@
189
  <section class="section-block section-alt" id="results">
190
  <div class="container wide">
191
  <div class="section-head">
192
- <h2 class="section-title">Evidence of Difficulty</h2>
193
  <p class="section-subtitle">
194
- Performance gaps reveal that nested social reasoning is still a major bottleneck for
195
- current multimodal systems.
196
  </p>
197
  </div>
198
  <div class="analysis-grid">
199
  <div>
200
  <div class="text-content result-copy">
201
  <p>
202
- In the current draft, <strong>WildToM-Reasoner</strong> reaches
203
- <strong>72.7%</strong> MC accuracy, compared with <strong>62.1%</strong> for the
204
- strongest baseline Qwen3-VL-32B. The first-to-second order gap is not uniform: it
205
- is small for Belief/Desire but expands sharply for Intention and Knowledge.
206
  </p>
207
  </div>
208
- <div class="results-panel" id="results-table"></div>
 
 
 
 
 
 
 
209
  </div>
210
- <figure class="paper-figure compact">
211
- <img src="./assets/mc_order_gap_paper_style.png" alt="Difficulty gap between first-order and second-order reasoning">
212
- <figcaption>Dimension-specific gap between 1st-order and 2nd-order reasoning.</figcaption>
 
213
  </figure>
214
  </div>
215
  </div>
 
189
  <section class="section-block section-alt" id="results">
190
  <div class="container wide">
191
  <div class="section-head">
192
+ <h2 class="section-title">Cross-Judge Evaluation</h2>
193
  <p class="section-subtitle">
194
+ We compare open-ended performance under two independent judges to test ranking
195
+ robustness and expose judge-specific gaps.
196
  </p>
197
  </div>
198
  <div class="analysis-grid">
199
  <div>
200
  <div class="text-content result-copy">
201
  <p>
202
+ Across both judges, <strong>WildToM-Reasoner</strong> ranks first. On Gemini
203
+ common-exam grading, its lead widens, while rank correlation remains high
204
+ (<strong>Spearman &rho; = 0.867</strong>), indicating that the OE ranking trend is
205
+ robust to judge replacement.
206
  </p>
207
  </div>
208
+ <section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
209
+ <div class="cross-toolbar">
210
+ <div class="toggle-group" id="judge-toggle"></div>
211
+ <div class="toggle-group metric" id="metric-toggle"></div>
212
+ </div>
213
+ <div class="results-caption" id="results-caption"></div>
214
+ <div class="results-panel" id="results-table"></div>
215
+ </section>
216
  </div>
217
+ <figure class="paper-figure compact benchmark-figure">
218
+ <img src="./assets/wildtom_benchmark_comparison.png" alt="Benchmark capability coverage comparison including WildToM-Bench">
219
+ <figcaption>Benchmark capability coverage across representative ToM datasets.</figcaption>
220
+ <p class="benchmark-footnote">Rank consistency: Spearman &rho; = 0.867, p &lt; 0.01.</p>
221
  </figure>
222
  </div>
223
  </div>
script.js CHANGED
@@ -159,25 +159,55 @@ const sampleData = [
159
 
160
  const filters = ["Belief", "Desire", "Intention", "Emotion", "Knowledge"];
161
 
162
- const modelResults = [
163
- { model: "Video-LLaVA-7B", score: 25.8 },
164
- { model: "AffectGPT-7B", score: 35.9 },
165
- { model: "MiniCPM-V-4.5", score: 46.8 },
166
- { model: "GLM-4.6V-9B", score: 51.2 },
167
- { model: "Emotion-Qwen-7B", score: 54.2 },
168
- { model: "GPT-4o-mini (8F)", score: 57.2 },
169
- { model: "Qwen3-Omni-30B", score: 61.8 },
170
- { model: "Qwen3-VL-32B", score: 62.1 },
171
- { model: "Reasoner (Ours)", score: 72.7, ours: true }
172
- ];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  const filterRoot = document.getElementById("showcase-filters");
175
  const showcaseRoot = document.getElementById("showcase-browser");
176
  const resultsRoot = document.getElementById("results-table");
 
 
 
177
  const carouselState = {
178
  activeDimension: "Belief",
179
  activeIndex: 0
180
  };
 
 
 
 
181
 
182
  function renderFilters(active) {
183
  filterRoot.innerHTML = "";
@@ -275,21 +305,124 @@ function renderSamples(active = carouselState.activeDimension, index = 0) {
275
  });
276
  }
277
 
278
- function renderResults() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  resultsRoot.innerHTML = "";
280
- modelResults.forEach((item) => {
281
  const row = document.createElement("div");
282
- row.className = "result-row";
 
 
 
 
 
 
 
 
 
283
  row.innerHTML = `
284
- <div class="result-model">${item.model}</div>
285
- <div class="result-bar">
286
- <div class="result-fill${item.ours ? " ours" : ""}" style="width:${item.score}%;"></div>
 
287
  </div>
288
- <div class="result-score">${item.score.toFixed(1)}%</div>
 
289
  `;
290
  resultsRoot.appendChild(row);
291
  });
292
  }
293
 
294
- renderResults();
295
  renderSamples("Belief", 0);
 
159
 
160
  const filters = ["Belief", "Desire", "Intention", "Emotion", "Knowledge"];
161
 
162
// Per-judge open-ended (OE) leaderboard data.
// Each judge entry carries a display label, the name of the OTHER judge
// (used when rendering the Δ column), and the ranked model rows.
// Row fields: rank (judge-native rank), oeAcc (OE accuracy, %),
// oeScr (OE score on a 0–3 scale), isOurs (highlight flag for our model).
const judgeResults = {
  gpt4o: {
    label: "GPT-4o (full set)",
    deltaLabel: "Gemini",
    rows: [
      { model: "WildToM-Reasoner", rank: 1, oeAcc: 32.7, oeScr: 2.6, isOurs: true },
      { model: "GPT-4o-mini", rank: 2, oeAcc: 32.2, oeScr: 2.6 },
      { model: "Qwen3-Omni", rank: 3, oeAcc: 29.1, oeScr: 2.3 },
      { model: "Qwen3-VL", rank: 4, oeAcc: 28.6, oeScr: 2.2 },
      { model: "GPT-5-mini", rank: 5, oeAcc: 28.0, oeScr: 2.11 },
      { model: "GLM-4.6V", rank: 6, oeAcc: 16.6, oeScr: 1.9 },
      { model: "MiniCPM-V-4.5", rank: 7, oeAcc: 14.6, oeScr: 1.7 },
      { model: "Emotion-Qwen", rank: 8, oeAcc: 12.1, oeScr: 1.6 },
      { model: "Video-LLaVA", rank: 9, oeAcc: 8.4, oeScr: 1.7 },
      { model: "AffectGPT", rank: 10, oeAcc: 4.8, oeScr: 1.1 }
    ]
  },
  gemini: {
    // NOTE(review): this judge grades a 40-item common exam, so its values
    // are not sample-matched to the gpt4o full-set numbers above.
    label: "Gemini-2.5-Flash (N=40 common exam)",
    deltaLabel: "GPT-4o",
    rows: [
      { model: "WildToM-Reasoner", rank: 1, oeAcc: 47.5, oeScr: 2.45, isOurs: true },
      { model: "Qwen3-VL", rank: 2, oeAcc: 40.0, oeScr: 2.3 },
      { model: "GPT-4o-mini", rank: 3, oeAcc: 37.5, oeScr: 2.23 },
      { model: "GLM-4.6V", rank: 4, oeAcc: 35.0, oeScr: 2.02 },
      { model: "GPT-5-mini", rank: 5, oeAcc: 25.0, oeScr: 1.4 },
      { model: "Qwen3-Omni", rank: 6, oeAcc: 22.5, oeScr: 1.68 },
      { model: "Emotion-Qwen", rank: 7, oeAcc: 20.0, oeScr: 1.43 },
      { model: "MiniCPM-V-4.5", rank: 8, oeAcc: 17.5, oeScr: 1.35 },
      { model: "AffectGPT", rank: 9, oeAcc: 15.0, oeScr: 1.35 },
      { model: "Video-LLaVA", rank: 10, oeAcc: 12.5, oeScr: 1.43 }
    ]
  }
};
196
 
197
  const filterRoot = document.getElementById("showcase-filters");
198
  const showcaseRoot = document.getElementById("showcase-browser");
199
  const resultsRoot = document.getElementById("results-table");
200
+ const judgeToggleRoot = document.getElementById("judge-toggle");
201
+ const metricToggleRoot = document.getElementById("metric-toggle");
202
+ const resultsCaptionRoot = document.getElementById("results-caption");
203
  const carouselState = {
204
  activeDimension: "Belief",
205
  activeIndex: 0
206
  };
207
+ const crossJudgeState = {
208
+ activeJudge: "gpt4o",
209
+ activeMetric: "oeAcc"
210
+ };
211
 
212
  function renderFilters(active) {
213
  filterRoot.innerHTML = "";
 
305
  });
306
  }
307
 
308
// Display label for a metric key ("oeAcc" → "OE_acc", anything else → "OE_scr").
function getMetricLabel(metric) {
  if (metric === "oeAcc") {
    return "OE_acc";
  }
  return "OE_scr";
}
311
+
312
// Read the selected metric off a result row ("oeAcc" field or "oeScr" field).
function getMetricValue(item, metric) {
  if (metric === "oeAcc") {
    return item.oeAcc;
  }
  return item.oeScr;
}
315
+
316
// The two judges are a fixed pair; return the one that is not active.
function getOtherJudge(activeJudge) {
  if (activeJudge === "gpt4o") {
    return "gemini";
  }
  return "gpt4o";
}
319
+
320
// Format a metric for display: accuracy as a percentage with one decimal,
// score as a plain two-decimal number.
function formatMetricValue(value, metric) {
  if (metric === "oeAcc") {
    return `${value.toFixed(1)}%`;
  }
  return value.toFixed(2);
}
323
+
324
// Format a signed delta: accuracy deltas in percentage points ("+1.2 pp"),
// score deltas as two-decimal numbers ("+0.30"). Non-negative deltas get an
// explicit "+" prefix; negative ones rely on the "-" from toFixed.
function formatDeltaValue(delta, metric) {
  const prefix = delta >= 0 ? "+" : "";
  const body = metric === "oeAcc" ? `${delta.toFixed(1)} pp` : delta.toFixed(2);
  return `${prefix}${body}`;
}
331
+
332
// CSS modifier class for a delta: "neutral" when effectively zero
// (|delta| < 0.05), otherwise "positive" or "negative" by sign.
function getDeltaClass(delta) {
  const magnitude = Math.abs(delta);
  if (magnitude < 0.05) {
    return "neutral";
  }
  return delta > 0 ? "positive" : "negative";
}
338
+
339
// Rebuild both toggle groups (judge picker and metric picker) so the button
// marked "active" matches crossJudgeState. Clicking a button re-renders the
// table via renderCrossJudgeResults. No-op if either container is missing.
function renderJudgeControls() {
  if (!judgeToggleRoot || !metricToggleRoot) {
    return;
  }

  // Shared builder: wipe a container and fill it with one pill button per option.
  const buildToggle = (root, options, activeKey, onPick) => {
    root.innerHTML = "";
    for (const option of options) {
      const btn = document.createElement("button");
      btn.type = "button";
      btn.className = option.key === activeKey ? "toggle-btn active" : "toggle-btn";
      btn.textContent = option.label;
      btn.addEventListener("click", () => onPick(option.key));
      root.appendChild(btn);
    }
  };

  buildToggle(
    judgeToggleRoot,
    [
      { key: "gpt4o", label: "GPT-4o (full set)" },
      { key: "gemini", label: "Gemini-2.5-Flash (N=40)" }
    ],
    crossJudgeState.activeJudge,
    (key) => renderCrossJudgeResults(key, crossJudgeState.activeMetric)
  );

  buildToggle(
    metricToggleRoot,
    [
      { key: "oeAcc", label: "OE_acc" },
      { key: "oeScr", label: "OE_scr" }
    ],
    crossJudgeState.activeMetric,
    (key) => renderCrossJudgeResults(crossJudgeState.activeJudge, key)
  );
}
376
+
377
// Render the cross-judge leaderboard: store the selection, refresh the toggle
// controls, then rebuild the caption and one row per model, sorted by the
// active metric (descending, judge-native rank breaks near-ties). Each row
// shows its position, model name (tagged "Ours" for our model), the metric
// value, and the delta against the other judge's value for the same model
// ("N/A" when that judge has no entry for the model).
function renderCrossJudgeResults(activeJudge = crossJudgeState.activeJudge, activeMetric = crossJudgeState.activeMetric) {
  if (!resultsRoot) {
    return;
  }

  crossJudgeState.activeJudge = activeJudge;
  crossJudgeState.activeMetric = activeMetric;
  renderJudgeControls();

  const active = judgeResults[activeJudge];
  const peer = judgeResults[getOtherJudge(activeJudge)];
  // Lookup table so each row's cross-judge counterpart is found in O(1).
  const peerByModel = new Map(peer.rows.map((entry) => [entry.model, entry]));

  // Copy before sorting — sort() mutates and the source order must survive.
  const ranked = [...active.rows].sort((a, b) => {
    const diff = getMetricValue(b, activeMetric) - getMetricValue(a, activeMetric);
    return Math.abs(diff) > 1e-6 ? diff : a.rank - b.rank;
  });

  if (resultsCaptionRoot) {
    resultsCaptionRoot.textContent =
      `${active.label} · Sorted by ${getMetricLabel(activeMetric)} · Δ vs ${active.deltaLabel}`;
  }

  resultsRoot.innerHTML = "";
  ranked.forEach((entry, position) => {
    const score = getMetricValue(entry, activeMetric);
    const counterpart = peerByModel.get(entry.model);
    // Delta defaults to 0 (rendered "neutral") when the model is missing
    // from the other judge; the cell text then reads "N/A".
    const delta = counterpart ? score - getMetricValue(counterpart, activeMetric) : 0;
    const deltaText = counterpart
      ? `${formatDeltaValue(delta, activeMetric)} vs ${active.deltaLabel}`
      : "N/A";

    const row = document.createElement("div");
    row.className = entry.isOurs ? "result-row ours" : "result-row";
    row.innerHTML = `
      <div class="result-rank">#${position + 1}</div>
      <div class="result-model-wrap">
        <div class="result-model">${entry.model}</div>
        ${entry.isOurs ? '<span class="result-tag">Ours</span>' : ""}
      </div>
      <div class="result-score">${formatMetricValue(score, activeMetric)}</div>
      <div class="result-delta ${getDeltaClass(delta)}">${deltaText}</div>
    `;
    resultsRoot.appendChild(row);
  });
}
426
 
427
+ renderCrossJudgeResults("gpt4o", "oeAcc");
428
  renderSamples("Belief", 0);
styles.css CHANGED
@@ -446,8 +446,8 @@ code {
446
 
447
  .analysis-grid {
448
  display: grid;
449
- grid-template-columns: minmax(0, 1.1fr) minmax(320px, 0.9fr);
450
- gap: 14px;
451
  align-items: start;
452
  }
453
 
@@ -460,51 +460,133 @@ code {
460
  }
461
 
462
  .result-copy {
463
- margin-bottom: 10px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  }
465
 
466
  .results-panel {
467
  display: grid;
468
- gap: 10px;
469
  }
470
 
471
  .result-row {
472
  display: grid;
473
- grid-template-columns: minmax(170px, 230px) minmax(0, 1fr) auto;
474
- gap: 14px;
475
  align-items: center;
476
- padding: 10px 12px;
477
- border: 1px solid var(--line);
478
- border-radius: 12px;
479
- background: #ffffff;
480
  }
481
 
482
- .result-model {
483
- font-weight: 700;
 
484
  }
485
 
486
- .result-bar {
487
- height: 13px;
 
 
488
  border-radius: 999px;
489
- background: #ebe7de;
490
- overflow: hidden;
 
 
 
491
  }
492
 
493
- .result-fill {
494
- height: 100%;
495
- background: var(--blue);
 
496
  }
497
 
498
- .result-fill.ours {
499
- background: var(--orange);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  }
501
 
502
  .result-score {
503
- min-width: 58px;
504
  text-align: right;
 
 
505
  font-weight: 700;
506
  }
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  .compact {
509
  border: 1px solid var(--line);
510
  border-radius: var(--radius-lg);
@@ -517,6 +599,16 @@ code {
517
  border-radius: 10px;
518
  }
519
 
 
 
 
 
 
 
 
 
 
 
520
  .filter-row {
521
  display: flex;
522
  flex-wrap: wrap;
@@ -822,7 +914,32 @@ code {
822
  grid-template-columns: 1fr;
823
  }
824
 
 
 
 
 
 
825
  .result-row {
826
- grid-template-columns: 1fr;
 
 
 
 
 
 
 
 
827
  }
 
 
 
 
 
 
 
 
 
 
 
 
828
  }
 
446
 
447
  .analysis-grid {
448
  display: grid;
449
+ grid-template-columns: minmax(0, 1.15fr) minmax(320px, 0.85fr);
450
+ gap: 16px;
451
  align-items: start;
452
  }
453
 
 
460
  }
461
 
462
  .result-copy {
463
+ margin-bottom: 12px;
464
+ }
465
+
466
+ .cross-judge-card {
467
+ border: 1px solid var(--line);
468
+ border-radius: 14px;
469
+ background: #ffffff;
470
+ padding: 12px;
471
+ }
472
+
473
+ .cross-toolbar {
474
+ display: grid;
475
+ gap: 9px;
476
+ margin-bottom: 8px;
477
+ }
478
+
479
+ .toggle-group {
480
+ display: flex;
481
+ flex-wrap: wrap;
482
+ gap: 7px;
483
+ }
484
+
485
+ .toggle-btn {
486
+ border: 1px solid #cac4b8;
487
+ border-radius: 999px;
488
+ padding: 6px 11px;
489
+ background: #f8f5ee;
490
+ color: #47423a;
491
+ font: inherit;
492
+ font-size: 0.85rem;
493
+ font-weight: 600;
494
+ cursor: pointer;
495
+ }
496
+
497
+ .toggle-btn.active {
498
+ border-color: #171717;
499
+ background: #151515;
500
+ color: #ffffff;
501
+ }
502
+
503
+ .results-caption {
504
+ margin: 0 0 8px;
505
+ font-size: 0.85rem;
506
+ color: #5e5a52;
507
  }
508
 
509
  .results-panel {
510
  display: grid;
511
+ gap: 8px;
512
  }
513
 
514
  .result-row {
515
  display: grid;
516
+ grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
517
+ gap: 10px;
518
  align-items: center;
519
+ padding: 8px 10px;
520
+ border: 1px solid #ddd8cc;
521
+ border-radius: 10px;
522
+ background: #fcfbf8;
523
  }
524
 
525
+ .result-row.ours {
526
+ border-color: #d89984;
527
+ background: linear-gradient(90deg, #fff7f3 0%, #fffdfa 100%);
528
  }
529
 
530
+ .result-rank {
531
+ justify-self: start;
532
+ min-width: 44px;
533
+ padding: 3px 7px;
534
  border-radius: 999px;
535
+ border: 1px solid #d8d2c8;
536
+ font-size: 0.78rem;
537
+ font-weight: 700;
538
+ color: #5a554c;
539
+ background: #ffffff;
540
  }
541
 
542
+ .result-model-wrap {
543
+ display: flex;
544
+ align-items: center;
545
+ gap: 8px;
546
  }
547
 
548
+ .result-model {
549
+ min-width: 0;
550
+ font-size: 0.95rem;
551
+ font-weight: 700;
552
+ line-height: 1.2;
553
+ overflow-wrap: anywhere;
554
+ }
555
+
556
+ .result-tag {
557
+ border-radius: 999px;
558
+ border: 1px solid #cb7f66;
559
+ color: #9c3f22;
560
+ background: #fff1eb;
561
+ padding: 2px 7px;
562
+ font-size: 0.72rem;
563
+ font-weight: 700;
564
  }
565
 
566
  .result-score {
567
+ min-width: 0;
568
  text-align: right;
569
+ font-size: 0.92rem;
570
+ color: #202020;
571
  font-weight: 700;
572
  }
573
 
574
+ .result-delta {
575
+ min-width: 0;
576
+ text-align: right;
577
+ font-size: 0.8rem;
578
+ font-weight: 600;
579
+ color: #55514a;
580
+ }
581
+
582
+ .result-delta.positive {
583
+ color: #2f7c4f;
584
+ }
585
+
586
+ .result-delta.negative {
587
+ color: #9b4e39;
588
+ }
589
+
590
  .compact {
591
  border: 1px solid var(--line);
592
  border-radius: var(--radius-lg);
 
599
  border-radius: 10px;
600
  }
601
 
602
+ .benchmark-figure figcaption {
603
+ margin-bottom: 3px;
604
+ }
605
+
606
+ .benchmark-footnote {
607
+ margin: 0;
608
+ color: #67625a;
609
+ font-size: 0.83rem;
610
+ }
611
+
612
  .filter-row {
613
  display: flex;
614
  flex-wrap: wrap;
 
914
  grid-template-columns: 1fr;
915
  }
916
 
917
+ .toggle-btn {
918
+ padding: 5px 9px;
919
+ font-size: 0.78rem;
920
+ }
921
+
922
  .result-row {
923
+ grid-template-columns: 46px minmax(0, 1fr) auto;
924
+ gap: 6px 8px;
925
+ align-items: start;
926
+ padding: 8px;
927
+ }
928
+
929
+ .result-model-wrap {
930
+ grid-column: 2;
931
+ grid-row: 1;
932
  }
933
+
934
+ .result-score {
935
+ grid-column: 3;
936
+ grid-row: 1;
937
+ }
938
+
939
+ .result-delta {
940
+ grid-column: 2 / 4;
941
+ grid-row: 2;
942
+ text-align: left;
943
+ }
944
+
945
  }