county committed on
Commit
bcb2aa6
·
1 Parent(s): 755164f

Update results section with cross-judge comparison and benchmark figure

Browse files
.gitattributes CHANGED
@@ -1,2 +1,2 @@
1
- *.mp4 filter=lfs diff=lfs merge=lfs -text
2
  *.png filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  *.png filter=lfs diff=lfs merge=lfs -text
2
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
assets/wildtom_benchmark_comparison.pdf ADDED
Binary file (79.5 kB). View file
 
assets/wildtom_benchmark_comparison.png ADDED

Git LFS Details

  • SHA256: 388bf381a26742775bd6b1e63f8bd705aa1bd5931bd93efd702090ae2572d506
  • Pointer size: 130 Bytes
  • Size of remote file: 65.7 kB
index.html CHANGED
@@ -189,27 +189,35 @@
189
  <section class="section-block section-alt" id="results">
190
  <div class="container wide">
191
  <div class="section-head">
192
- <h2 class="section-title">Evidence of Difficulty</h2>
193
  <p class="section-subtitle">
194
- Performance gaps reveal that nested social reasoning is still a major bottleneck for
195
- current multimodal systems.
196
  </p>
197
  </div>
198
  <div class="analysis-grid">
199
  <div>
200
  <div class="text-content result-copy">
201
  <p>
202
- In the current draft, <strong>WildToM-Reasoner</strong> reaches
203
- <strong>72.7%</strong> MC accuracy, compared with <strong>62.1%</strong> for the
204
- strongest baseline Qwen3-VL-32B. The first-to-second order gap is not uniform: it
205
- is small for Belief/Desire but expands sharply for Intention and Knowledge.
206
  </p>
207
  </div>
208
- <div class="results-panel" id="results-table"></div>
 
 
 
 
 
 
 
209
  </div>
210
- <figure class="paper-figure compact">
211
- <img src="./assets/mc_order_gap_paper_style.png" alt="Difficulty gap between first-order and second-order reasoning">
212
- <figcaption>Dimension-specific gap between 1st-order and 2nd-order reasoning.</figcaption>
 
213
  </figure>
214
  </div>
215
  </div>
 
189
  <section class="section-block section-alt" id="results">
190
  <div class="container wide">
191
  <div class="section-head">
192
+ <h2 class="section-title">Cross-Judge Evaluation</h2>
193
  <p class="section-subtitle">
194
+ We compare open-ended performance under two independent judges to test ranking
195
+ robustness and expose judge-specific gaps.
196
  </p>
197
  </div>
198
  <div class="analysis-grid">
199
  <div>
200
  <div class="text-content result-copy">
201
  <p>
202
+ Across both judges, <strong>WildToM-Reasoner</strong> ranks first. On Gemini
203
+ common-exam grading, its lead widens, while rank correlation remains high
204
+ (<strong>Spearman &rho; = 0.867</strong>), indicating that the OE ranking trend is
205
+ robust to judge replacement.
206
  </p>
207
  </div>
208
+ <section class="cross-judge-card" aria-label="Cross-judge model ranking comparison">
209
+ <div class="cross-toolbar">
210
+ <div class="toggle-group" id="judge-toggle"></div>
211
+ <div class="toggle-group metric" id="metric-toggle"></div>
212
+ </div>
213
+ <div class="results-caption" id="results-caption"></div>
214
+ <div class="results-panel" id="results-table"></div>
215
+ </section>
216
  </div>
217
+ <figure class="paper-figure compact benchmark-figure">
218
+ <img src="./assets/wildtom_benchmark_comparison.png" alt="Benchmark capability coverage comparison including WildToM-Bench">
219
+ <figcaption>Benchmark capability coverage across representative ToM datasets.</figcaption>
220
+ <p class="benchmark-footnote">Rank consistency: Spearman &rho; = 0.867, p &lt; 0.01.</p>
221
  </figure>
222
  </div>
223
  </div>
script.js CHANGED
@@ -159,25 +159,55 @@ const sampleData = [
159
 
160
  const filters = ["Belief", "Desire", "Intention", "Emotion", "Knowledge"];
161
 
162
- const modelResults = [
163
- { model: "Video-LLaVA-7B", score: 25.8 },
164
- { model: "AffectGPT-7B", score: 35.9 },
165
- { model: "MiniCPM-V-4.5", score: 46.8 },
166
- { model: "GLM-4.6V-9B", score: 51.2 },
167
- { model: "Emotion-Qwen-7B", score: 54.2 },
168
- { model: "GPT-4o-mini (8F)", score: 57.2 },
169
- { model: "Qwen3-Omni-30B", score: 61.8 },
170
- { model: "Qwen3-VL-32B", score: 62.1 },
171
- { model: "Reasoner (Ours)", score: 72.7, ours: true }
172
- ];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  const filterRoot = document.getElementById("showcase-filters");
175
  const showcaseRoot = document.getElementById("showcase-browser");
176
  const resultsRoot = document.getElementById("results-table");
 
 
 
177
  const carouselState = {
178
  activeDimension: "Belief",
179
  activeIndex: 0
180
  };
 
 
 
 
181
 
182
  function renderFilters(active) {
183
  filterRoot.innerHTML = "";
@@ -275,21 +305,124 @@ function renderSamples(active = carouselState.activeDimension, index = 0) {
275
  });
276
  }
277
 
278
- function renderResults() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  resultsRoot.innerHTML = "";
280
- modelResults.forEach((item) => {
281
  const row = document.createElement("div");
282
- row.className = "result-row";
 
 
 
 
 
 
 
 
 
283
  row.innerHTML = `
284
- <div class="result-model">${item.model}</div>
285
- <div class="result-bar">
286
- <div class="result-fill${item.ours ? " ours" : ""}" style="width:${item.score}%;"></div>
 
287
  </div>
288
- <div class="result-score">${item.score.toFixed(1)}%</div>
 
289
  `;
290
  resultsRoot.appendChild(row);
291
  });
292
  }
293
 
294
- renderResults();
295
  renderSamples("Belief", 0);
 
159
 
160
  const filters = ["Belief", "Desire", "Intention", "Emotion", "Knowledge"];
161
 
162
// Per-judge open-ended (OE) leaderboard data.
// Each judge entry carries a display label, the name of the OTHER judge
// (used when rendering the Δ column), and the ranked model rows.
// Row fields: rank (judge-native rank), oeAcc (OE accuracy, %),
// oeScr (OE score on a 0–3 scale), isOurs (highlight flag for our model).
const judgeResults = {
  gpt4o: {
    label: "GPT-4o (full set)",
    deltaLabel: "Gemini",
    rows: [
      { model: "WildToM-Reasoner", rank: 1, oeAcc: 32.7, oeScr: 2.6, isOurs: true },
      { model: "GPT-4o-mini", rank: 2, oeAcc: 32.2, oeScr: 2.6 },
      { model: "Qwen3-Omni", rank: 3, oeAcc: 29.1, oeScr: 2.3 },
      { model: "Qwen3-VL", rank: 4, oeAcc: 28.6, oeScr: 2.2 },
      { model: "GPT-5-mini", rank: 5, oeAcc: 28.0, oeScr: 2.11 },
      { model: "GLM-4.6V", rank: 6, oeAcc: 16.6, oeScr: 1.9 },
      { model: "MiniCPM-V-4.5", rank: 7, oeAcc: 14.6, oeScr: 1.7 },
      { model: "Emotion-Qwen", rank: 8, oeAcc: 12.1, oeScr: 1.6 },
      { model: "Video-LLaVA", rank: 9, oeAcc: 8.4, oeScr: 1.7 },
      { model: "AffectGPT", rank: 10, oeAcc: 4.8, oeScr: 1.1 }
    ]
  },
  gemini: {
    // NOTE(review): this judge grades a 40-item common exam, so its values
    // are not sample-matched to the gpt4o full-set numbers above.
    label: "Gemini-2.5-Flash (N=40 common exam)",
    deltaLabel: "GPT-4o",
    rows: [
      { model: "WildToM-Reasoner", rank: 1, oeAcc: 47.5, oeScr: 2.45, isOurs: true },
      { model: "Qwen3-VL", rank: 2, oeAcc: 40.0, oeScr: 2.3 },
      { model: "GPT-4o-mini", rank: 3, oeAcc: 37.5, oeScr: 2.23 },
      { model: "GLM-4.6V", rank: 4, oeAcc: 35.0, oeScr: 2.02 },
      { model: "GPT-5-mini", rank: 5, oeAcc: 25.0, oeScr: 1.4 },
      { model: "Qwen3-Omni", rank: 6, oeAcc: 22.5, oeScr: 1.68 },
      { model: "Emotion-Qwen", rank: 7, oeAcc: 20.0, oeScr: 1.43 },
      { model: "MiniCPM-V-4.5", rank: 8, oeAcc: 17.5, oeScr: 1.35 },
      { model: "AffectGPT", rank: 9, oeAcc: 15.0, oeScr: 1.35 },
      { model: "Video-LLaVA", rank: 10, oeAcc: 12.5, oeScr: 1.43 }
    ]
  }
};
196
 
197
  const filterRoot = document.getElementById("showcase-filters");
198
  const showcaseRoot = document.getElementById("showcase-browser");
199
  const resultsRoot = document.getElementById("results-table");
200
+ const judgeToggleRoot = document.getElementById("judge-toggle");
201
+ const metricToggleRoot = document.getElementById("metric-toggle");
202
+ const resultsCaptionRoot = document.getElementById("results-caption");
203
  const carouselState = {
204
  activeDimension: "Belief",
205
  activeIndex: 0
206
  };
207
+ const crossJudgeState = {
208
+ activeJudge: "gpt4o",
209
+ activeMetric: "oeAcc"
210
+ };
211
 
212
  function renderFilters(active) {
213
  filterRoot.innerHTML = "";
 
305
  });
306
  }
307
 
308
// Display label for a metric key ("oeAcc" → "OE_acc", anything else → "OE_scr").
function getMetricLabel(metric) {
  if (metric === "oeAcc") {
    return "OE_acc";
  }
  return "OE_scr";
}
311
+
312
// Read the selected metric off a result row ("oeAcc" field or "oeScr" field).
function getMetricValue(item, metric) {
  if (metric === "oeAcc") {
    return item.oeAcc;
  }
  return item.oeScr;
}
315
+
316
// The two judges are a fixed pair; return the one that is not active.
function getOtherJudge(activeJudge) {
  if (activeJudge === "gpt4o") {
    return "gemini";
  }
  return "gpt4o";
}
319
+
320
// Format a metric for display: accuracy as a percentage with one decimal,
// score as a plain two-decimal number.
function formatMetricValue(value, metric) {
  if (metric === "oeAcc") {
    return `${value.toFixed(1)}%`;
  }
  return value.toFixed(2);
}
323
+
324
// Format a signed delta: accuracy deltas in percentage points ("+1.2 pp"),
// score deltas as two-decimal numbers ("+0.30"). Non-negative deltas get an
// explicit "+" prefix; negative ones rely on the "-" from toFixed.
function formatDeltaValue(delta, metric) {
  const prefix = delta >= 0 ? "+" : "";
  const body = metric === "oeAcc" ? `${delta.toFixed(1)} pp` : delta.toFixed(2);
  return `${prefix}${body}`;
}
331
+
332
// CSS modifier class for a delta: "neutral" when effectively zero
// (|delta| < 0.05), otherwise "positive" or "negative" by sign.
function getDeltaClass(delta) {
  const magnitude = Math.abs(delta);
  if (magnitude < 0.05) {
    return "neutral";
  }
  return delta > 0 ? "positive" : "negative";
}
338
+
339
// Rebuild both toggle groups (judge picker and metric picker) so the button
// marked "active" matches crossJudgeState. Clicking a button re-renders the
// table via renderCrossJudgeResults. No-op if either container is missing.
function renderJudgeControls() {
  if (!judgeToggleRoot || !metricToggleRoot) {
    return;
  }

  // Shared builder: wipe a container and fill it with one pill button per option.
  const buildToggle = (root, options, activeKey, onPick) => {
    root.innerHTML = "";
    for (const option of options) {
      const btn = document.createElement("button");
      btn.type = "button";
      btn.className = option.key === activeKey ? "toggle-btn active" : "toggle-btn";
      btn.textContent = option.label;
      btn.addEventListener("click", () => onPick(option.key));
      root.appendChild(btn);
    }
  };

  buildToggle(
    judgeToggleRoot,
    [
      { key: "gpt4o", label: "GPT-4o (full set)" },
      { key: "gemini", label: "Gemini-2.5-Flash (N=40)" }
    ],
    crossJudgeState.activeJudge,
    (key) => renderCrossJudgeResults(key, crossJudgeState.activeMetric)
  );

  buildToggle(
    metricToggleRoot,
    [
      { key: "oeAcc", label: "OE_acc" },
      { key: "oeScr", label: "OE_scr" }
    ],
    crossJudgeState.activeMetric,
    (key) => renderCrossJudgeResults(crossJudgeState.activeJudge, key)
  );
}
376
+
377
// Render the cross-judge leaderboard: store the selection, refresh the toggle
// controls, then rebuild the caption and one row per model, sorted by the
// active metric (descending, judge-native rank breaks near-ties). Each row
// shows its position, model name (tagged "Ours" for our model), the metric
// value, and the delta against the other judge's value for the same model
// ("N/A" when that judge has no entry for the model).
function renderCrossJudgeResults(activeJudge = crossJudgeState.activeJudge, activeMetric = crossJudgeState.activeMetric) {
  if (!resultsRoot) {
    return;
  }

  crossJudgeState.activeJudge = activeJudge;
  crossJudgeState.activeMetric = activeMetric;
  renderJudgeControls();

  const active = judgeResults[activeJudge];
  const peer = judgeResults[getOtherJudge(activeJudge)];
  // Lookup table so each row's cross-judge counterpart is found in O(1).
  const peerByModel = new Map(peer.rows.map((entry) => [entry.model, entry]));

  // Copy before sorting — sort() mutates and the source order must survive.
  const ranked = [...active.rows].sort((a, b) => {
    const diff = getMetricValue(b, activeMetric) - getMetricValue(a, activeMetric);
    return Math.abs(diff) > 1e-6 ? diff : a.rank - b.rank;
  });

  if (resultsCaptionRoot) {
    resultsCaptionRoot.textContent =
      `${active.label} · Sorted by ${getMetricLabel(activeMetric)} · Δ vs ${active.deltaLabel}`;
  }

  resultsRoot.innerHTML = "";
  ranked.forEach((entry, position) => {
    const score = getMetricValue(entry, activeMetric);
    const counterpart = peerByModel.get(entry.model);
    // Delta defaults to 0 (rendered "neutral") when the model is missing
    // from the other judge; the cell text then reads "N/A".
    const delta = counterpart ? score - getMetricValue(counterpart, activeMetric) : 0;
    const deltaText = counterpart
      ? `${formatDeltaValue(delta, activeMetric)} vs ${active.deltaLabel}`
      : "N/A";

    const row = document.createElement("div");
    row.className = entry.isOurs ? "result-row ours" : "result-row";
    row.innerHTML = `
      <div class="result-rank">#${position + 1}</div>
      <div class="result-model-wrap">
        <div class="result-model">${entry.model}</div>
        ${entry.isOurs ? '<span class="result-tag">Ours</span>' : ""}
      </div>
      <div class="result-score">${formatMetricValue(score, activeMetric)}</div>
      <div class="result-delta ${getDeltaClass(delta)}">${deltaText}</div>
    `;
    resultsRoot.appendChild(row);
  });
}
426
 
427
+ renderCrossJudgeResults("gpt4o", "oeAcc");
428
  renderSamples("Belief", 0);
styles.css CHANGED
@@ -446,8 +446,8 @@ code {
446
 
447
  .analysis-grid {
448
  display: grid;
449
- grid-template-columns: minmax(0, 1.1fr) minmax(320px, 0.9fr);
450
- gap: 14px;
451
  align-items: start;
452
  }
453
 
@@ -460,51 +460,133 @@ code {
460
  }
461
 
462
  .result-copy {
463
- margin-bottom: 10px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  }
465
 
466
  .results-panel {
467
  display: grid;
468
- gap: 10px;
469
  }
470
 
471
  .result-row {
472
  display: grid;
473
- grid-template-columns: minmax(170px, 230px) minmax(0, 1fr) auto;
474
- gap: 14px;
475
  align-items: center;
476
- padding: 10px 12px;
477
- border: 1px solid var(--line);
478
- border-radius: 12px;
479
- background: #ffffff;
480
  }
481
 
482
- .result-model {
483
- font-weight: 700;
 
484
  }
485
 
486
- .result-bar {
487
- height: 13px;
 
 
488
  border-radius: 999px;
489
- background: #ebe7de;
490
- overflow: hidden;
 
 
 
491
  }
492
 
493
- .result-fill {
494
- height: 100%;
495
- background: var(--blue);
 
496
  }
497
 
498
- .result-fill.ours {
499
- background: var(--orange);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  }
501
 
502
  .result-score {
503
- min-width: 58px;
504
  text-align: right;
 
 
505
  font-weight: 700;
506
  }
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  .compact {
509
  border: 1px solid var(--line);
510
  border-radius: var(--radius-lg);
@@ -517,6 +599,16 @@ code {
517
  border-radius: 10px;
518
  }
519
 
 
 
 
 
 
 
 
 
 
 
520
  .filter-row {
521
  display: flex;
522
  flex-wrap: wrap;
@@ -822,7 +914,32 @@ code {
822
  grid-template-columns: 1fr;
823
  }
824
 
 
 
 
 
 
825
  .result-row {
826
- grid-template-columns: 1fr;
 
 
 
 
 
 
 
 
827
  }
 
 
 
 
 
 
 
 
 
 
 
 
828
  }
 
446
 
447
  .analysis-grid {
448
  display: grid;
449
+ grid-template-columns: minmax(0, 1.15fr) minmax(320px, 0.85fr);
450
+ gap: 16px;
451
  align-items: start;
452
  }
453
 
 
460
  }
461
 
462
  .result-copy {
463
+ margin-bottom: 12px;
464
+ }
465
+
466
+ .cross-judge-card {
467
+ border: 1px solid var(--line);
468
+ border-radius: 14px;
469
+ background: #ffffff;
470
+ padding: 12px;
471
+ }
472
+
473
+ .cross-toolbar {
474
+ display: grid;
475
+ gap: 9px;
476
+ margin-bottom: 8px;
477
+ }
478
+
479
+ .toggle-group {
480
+ display: flex;
481
+ flex-wrap: wrap;
482
+ gap: 7px;
483
+ }
484
+
485
+ .toggle-btn {
486
+ border: 1px solid #cac4b8;
487
+ border-radius: 999px;
488
+ padding: 6px 11px;
489
+ background: #f8f5ee;
490
+ color: #47423a;
491
+ font: inherit;
492
+ font-size: 0.85rem;
493
+ font-weight: 600;
494
+ cursor: pointer;
495
+ }
496
+
497
+ .toggle-btn.active {
498
+ border-color: #171717;
499
+ background: #151515;
500
+ color: #ffffff;
501
+ }
502
+
503
+ .results-caption {
504
+ margin: 0 0 8px;
505
+ font-size: 0.85rem;
506
+ color: #5e5a52;
507
  }
508
 
509
  .results-panel {
510
  display: grid;
511
+ gap: 8px;
512
  }
513
 
514
  .result-row {
515
  display: grid;
516
+ grid-template-columns: 54px minmax(0, 1fr) 90px minmax(130px, 170px);
517
+ gap: 10px;
518
  align-items: center;
519
+ padding: 8px 10px;
520
+ border: 1px solid #ddd8cc;
521
+ border-radius: 10px;
522
+ background: #fcfbf8;
523
  }
524
 
525
+ .result-row.ours {
526
+ border-color: #d89984;
527
+ background: linear-gradient(90deg, #fff7f3 0%, #fffdfa 100%);
528
  }
529
 
530
+ .result-rank {
531
+ justify-self: start;
532
+ min-width: 44px;
533
+ padding: 3px 7px;
534
  border-radius: 999px;
535
+ border: 1px solid #d8d2c8;
536
+ font-size: 0.78rem;
537
+ font-weight: 700;
538
+ color: #5a554c;
539
+ background: #ffffff;
540
  }
541
 
542
+ .result-model-wrap {
543
+ display: flex;
544
+ align-items: center;
545
+ gap: 8px;
546
  }
547
 
548
+ .result-model {
549
+ min-width: 0;
550
+ font-size: 0.95rem;
551
+ font-weight: 700;
552
+ line-height: 1.2;
553
+ overflow-wrap: anywhere;
554
+ }
555
+
556
+ .result-tag {
557
+ border-radius: 999px;
558
+ border: 1px solid #cb7f66;
559
+ color: #9c3f22;
560
+ background: #fff1eb;
561
+ padding: 2px 7px;
562
+ font-size: 0.72rem;
563
+ font-weight: 700;
564
  }
565
 
566
  .result-score {
567
+ min-width: 0;
568
  text-align: right;
569
+ font-size: 0.92rem;
570
+ color: #202020;
571
  font-weight: 700;
572
  }
573
 
574
+ .result-delta {
575
+ min-width: 0;
576
+ text-align: right;
577
+ font-size: 0.8rem;
578
+ font-weight: 600;
579
+ color: #55514a;
580
+ }
581
+
582
+ .result-delta.positive {
583
+ color: #2f7c4f;
584
+ }
585
+
586
+ .result-delta.negative {
587
+ color: #9b4e39;
588
+ }
589
+
590
  .compact {
591
  border: 1px solid var(--line);
592
  border-radius: var(--radius-lg);
 
599
  border-radius: 10px;
600
  }
601
 
602
+ .benchmark-figure figcaption {
603
+ margin-bottom: 3px;
604
+ }
605
+
606
+ .benchmark-footnote {
607
+ margin: 0;
608
+ color: #67625a;
609
+ font-size: 0.83rem;
610
+ }
611
+
612
  .filter-row {
613
  display: flex;
614
  flex-wrap: wrap;
 
914
  grid-template-columns: 1fr;
915
  }
916
 
917
+ .toggle-btn {
918
+ padding: 5px 9px;
919
+ font-size: 0.78rem;
920
+ }
921
+
922
  .result-row {
923
+ grid-template-columns: 46px minmax(0, 1fr) auto;
924
+ gap: 6px 8px;
925
+ align-items: start;
926
+ padding: 8px;
927
+ }
928
+
929
+ .result-model-wrap {
930
+ grid-column: 2;
931
+ grid-row: 1;
932
  }
933
+
934
+ .result-score {
935
+ grid-column: 3;
936
+ grid-row: 1;
937
+ }
938
+
939
+ .result-delta {
940
+ grid-column: 2 / 4;
941
+ grid-row: 2;
942
+ text-align: left;
943
+ }
944
+
945
  }