Spaces:

Esvanth
/

mindscan

Running

App Files Files Community

Esvanth committed on Apr 20

Commit

818583c

verified ·

1 Parent(s): 528b24d

Update index.html

Browse files

Files changed (1) hide show

templates/index.html +656 -22

templates/index.html CHANGED Viewed

@@ -73,6 +73,32 @@ header{
 .sec-lead{font-size:14px;color:var(--ink2);max-width:560px;line-height:1.7;margin-bottom:36px}
 .section-divider{border:none;border-top:1px solid var(--border);margin:0}
 /* ── METHODOLOGY 3-STEP ── */
 .method-steps{display:grid;grid-template-columns:repeat(3,1fr);gap:0;position:relative}
 .method-steps::before{content:'';position:absolute;top:22px;left:22px;right:22px;height:2px;background:var(--bg3);z-index:0}
@@ -82,8 +108,6 @@ header{
 .method-step.active .ms-dot{background:var(--blue);border-color:var(--blue)}
 .ms-title{font-size:14px;font-weight:500;color:var(--ink);margin-bottom:7px}
 .ms-body{font-size:12px;color:var(--ink2);line-height:1.65}
-.ms-crisp{display:flex;flex-wrap:wrap;justify-content:center;gap:4px;margin-top:10px}
-.ms-crisp-tag{font-size:9px;font-family:'DM Mono',monospace;padding:2px 7px;border-radius:3px;background:var(--bg2);border:1px solid var(--border);color:var(--ink3);letter-spacing:.05em}
 /* Detail panel */
 .method-detail{margin-top:32px;background:#fff;border:1px solid var(--border);border-radius:14px;padding:28px 32px;box-shadow:var(--shadow);animation:fadeUp .25s ease both}
 .md-panel{display:none}
@@ -220,6 +244,52 @@ textarea::placeholder{color:var(--ink3)}
 .ci-title{font-size:13px;font-weight:500;color:var(--amber);margin-bottom:5px;display:flex;align-items:center;gap:7px}
 .ci-body{font-size:12px;color:#92400e;line-height:1.65}
 footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3);line-height:1.9}
 @media(max-width:768px){
   header,.section,.demo-section,footer{padding-left:20px;padding-right:20px}
@@ -245,11 +315,13 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
     <div class="logo-txt">Mind<em>Scan</em></div>
   </div>
   <nav class="nav-links">
     <a href="#methodology">Methodology</a>
     <a href="#matrix">Evidence Matrix</a>
     <a href="#findings">Findings</a>
     <a href="#verdict">Conclusions</a>
     <a href="#demo">Live Demo</a>
   </nav>
   <div class="nav-badge">NCI H9DAI 2026</div>
 </header>
@@ -275,38 +347,78 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
     <div class="stats-panel">
       <div class="stat-box"><div class="stat-num" data-target="3" data-suffix="">0</div><div class="stat-lbl">Datasets</div></div>
       <div class="stat-box"><div class="stat-num" data-target="12" data-suffix="">0</div><div class="stat-lbl">Models trained</div></div>
-      <div class="stat-box"><div class="stat-num" data-target="99.9" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">Best Accuracy</div></div>
-      <div class="stat-box"><div class="stat-num" data-target="98.1" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">D3 Best Accuracy</div></div>
     </div>
   </div>
 </div>
 <hr class="section-divider">
 <!-- METHODOLOGY -->
 <section class="section" id="methodology">
   <div class="sec-eyebrow">Methodology</div>
   <div class="sec-h2">Three-step <em>pipeline</em></div>
-  <p class="sec-lead">CRISP-DM applied across all three datasets — from raw social media text to parallel ensemble predictions.</p>
   <div class="method-steps">
     <div class="method-step active" onclick="showMethodDetail(0)">
       <div class="ms-dot">01</div>
       <div class="ms-title">Data</div>
       <div class="ms-body">3 clinical datasets spanning Twitter and Reddit, covering depression types, binary detection, and suicide risk.</div>
-      <div class="ms-crisp"><span class="ms-crisp-tag">CRISP-DM 1: Business Understanding</span><span class="ms-crisp-tag">CRISP-DM 2: Data Understanding</span></div>
     </div>
     <div class="method-step" onclick="showMethodDetail(1)">
       <div class="ms-dot">02</div>
       <div class="ms-title">Preprocessing</div>
       <div class="ms-body">6-stage text cleaning pipeline + SMOTE oversampling to address class imbalance left unresolved by the base paper.</div>
-      <div class="ms-crisp"><span class="ms-crisp-tag">CRISP-DM 3: Data Preparation</span></div>
     </div>
     <div class="method-step" onclick="showMethodDetail(2)">
       <div class="ms-dot">03</div>
       <div class="ms-title">Modelling</div>
       <div class="ms-body">Parallel ensemble of 12 classifiers — all run independently on every prediction, never as a sequential cascade.</div>
-      <div class="ms-crisp"><span class="ms-crisp-tag">CRISP-DM 4: Modelling</span><span class="ms-crisp-tag">CRISP-DM 5: Evaluation</span><span class="ms-crisp-tag">CRISP-DM 6: Deployment</span></div>
     </div>
   </div>
@@ -338,19 +450,19 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
       <div class="md-title"><div class="md-title-dot"></div>Preprocessing Pipeline</div>
       <div class="md-grid">
         <div class="md-block">
-          <div class="md-block-lbl">6-Stage Text Cleaning</div>
           <div class="md-block-val">1. Lowercase · 2. Strip URLs &amp; http links · 3. Remove @mentions · 4. Remove # symbols · 5. Strip punctuation · 6. Collapse whitespace. Applied identically across all three datasets for consistency.</div>
         </div>
         <div class="md-block">
-          <div class="md-block-lbl">SMOTE — Synthetic Oversampling</div>
           <div class="md-block-val">Applied to D1 and D2 training sets only (D3 is pre-balanced). D1: 11,986 → <em>17,982 samples</em>. D2: 8,251 → <em>12,800 samples</em>. Creates synthetic clinical neighbours in TF-IDF feature space. Directly addresses the base paper's (Tumaliuan 2024) biggest limitation — they trained on raw imbalanced data.</div>
         </div>
         <div class="md-block">
-          <div class="md-block-lbl">Feature Extraction — Classical Models</div>
-          <div class="md-block-val">TF-IDF vectoriser with unigrams + bigrams, fitted per-dataset on training data only. Captures frequency-weighted term co-occurrence patterns, well-suited for short Twitter text.</div>
         </div>
         <div class="md-block">
-          <div class="md-block-lbl">Feature Extraction — Transformers</div>
           <div class="md-block-val">XLM-RoBERTa tokeniser (max 128 tokens D1/D2, 256 tokens D3) with padding. Pre-trained multilingual contextual embeddings capture semantic meaning and long-range dependencies — critical for Reddit's longer posts.</div>
         </div>
       </div>
@@ -361,10 +473,10 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
       <div class="md-grid">
         <div class="md-block">
           <div class="md-block-lbl">4 Models per Dataset (12 total)</div>
-          <div class="md-block-val"><em>Logistic Regression</em> — L2 regularised, max_iter=1000. <em>SVM</em> — LinearSVC, C=1.0. <em>XGBoost</em> — 300 estimators, max_depth=6. <em>XLM-RoBERTa</em> — fine-tuned multilingual transformer, <em>278M parameters</em>, lr=2e-5, 3 epochs.</div>
         </div>
         <div class="md-block">
-          <div class="md-block-lbl">Parallel Architecture — Clinical Rationale</div>
           <div class="md-block-val">All 12 models run simultaneously on every input. A sequential design (check depression first, then suicide risk) would <strong>miss masked suicidality</strong> — a clinically documented pre-crisis pattern where affect appears normal but intent is resolved. Parallelism is a safety requirement, not a design preference.</div>
         </div>
         <div class="md-block">
@@ -372,7 +484,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
           <div class="md-block-val">XGBoost accuracy on D3: <em>91.6% (50K sample) → 70.5% (Full 232K) → 60.1% (H1 116K)</em>. Performance degrades as training data grows. The H1/H2 results are also inconsistent (60.1% vs 71.0%) — gradient boosting is highly sensitive to data distribution shifts at this scale, making it unreliable for large Reddit corpora.</div>
         </div>
         <div class="md-block">
-          <div class="md-block-lbl">D3 Split Study (RQ2)</div>
           <div class="md-block-val">D3 trained on 4 configurations: Full (232K), Half 1 (116K), Half 2 (116K), Sample (50K). XLM-RoBERTa accuracy: <em>98.1% (50K) → 97.8% (H1) → 98.0% (H2/Full)</em>. Δ = 0.3% across 4× more data. Kolmogorov-Smirnov tests confirm all splits share identical distributions (p &gt; 0.49), validating the comparison.</div>
         </div>
       </div>
@@ -386,17 +498,17 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
 <section class="section" id="matrix">
   <div class="sec-eyebrow">Core evaluation</div>
   <div class="sec-h2">Accuracy <em>Evidence Matrix</em></div>
-  <p class="sec-lead">All 4 models evaluated across all dataset splits. <strong>Bold</strong> = winner per row. <span style="color:var(--red)">Red</span> = XGBoost collapse on larger training sets.</p>
   <div class="matrix-wrap">
     <table class="matrix-tbl">
       <thead>
         <tr>
           <th>Dataset / Split</th>
-          <th>Logistic Regression</th>
-          <th>SVM</th>
-          <th>XGBoost</th>
-          <th>XLM-RoBERTa</th>
         </tr>
       </thead>
       <tbody>
@@ -532,7 +644,8 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
 <div class="demo-section" id="demo">
   <div class="sec-eyebrow">Live inference</div>
   <div class="sec-h2" style="margin-bottom:8px">Try it — <em>winner model per task</em></div>
-  <p class="sec-lead" style="margin-bottom:24px">Sample 3 demonstrates masked suicidality. Try typing clinical-style depressive language ("I feel exhausted, nothing feels enjoyable") to observe the Affective vs. Clinical Lexicon Gap documented in Finding 04.</p>
   <div class="disclaimer"><strong>Research prototype only.</strong> Not a clinical tool. If you or someone you know is in crisis, please contact a mental health professional or emergency services immediately.</div>
@@ -594,6 +707,107 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
   </div>
 </div>
 <footer>
   MindScan · NCI H9DAI Research Project 2026 · Academic Prototype Only<br>
   Datasets: Zenodo 14233292 · Kaggle albertobellardini · Kaggle nikhileswarkomati<br>
@@ -761,6 +975,426 @@ function setW(id,res){
 }
 function pct(v){return(v*100).toFixed(1)+'%'}
 </script>
 </body>
 </html>

 .sec-lead{font-size:14px;color:var(--ink2);max-width:560px;line-height:1.7;margin-bottom:36px}
 .section-divider{border:none;border-top:1px solid var(--border);margin:0}
+/* ── BASE PAPER COMPARISON ── */
+.comparison-wrap{display:grid;grid-template-columns:1fr auto 1fr;gap:16px;align-items:center}
+.comp-card{border-radius:14px;padding:26px;border:1px solid;box-shadow:var(--shadow)}
+.comp-card.theirs{background:var(--bg2);border-color:var(--border2)}
+.comp-card.ours{background:#fff;border-color:rgba(21,128,61,.25);box-shadow:var(--shadow-md)}
+.comp-label{font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.1em;text-transform:uppercase;margin-bottom:12px;padding:4px 10px;border-radius:4px;display:inline-block}
+.comp-card.theirs .comp-label{background:var(--bg3);color:var(--ink3)}
+.comp-card.ours .comp-label{background:var(--green-bg);color:var(--green)}
+.comp-title{font-family:'Instrument Serif',serif;font-size:18px;letter-spacing:-.01em;color:var(--ink);margin-bottom:4px}
+.comp-sub{font-size:12px;color:var(--ink2);margin-bottom:18px}
+.comp-row{display:flex;align-items:flex-start;gap:8px;margin-bottom:9px;font-size:13px}
+.comp-icon{width:16px;height:16px;border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:9px;flex-shrink:0;margin-top:1px}
+.comp-icon.bad{background:rgba(185,28,28,.1);color:var(--red)}
+.comp-icon.good{background:var(--green-bg);color:var(--green)}
+.comp-text{color:var(--ink2);line-height:1.45}
+.comp-text strong{color:var(--ink)}
+.comp-f1-row{margin-top:18px;padding-top:14px;border-top:1px solid var(--border);display:flex;align-items:center;gap:10px}
+.comp-f1-label{font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3)}
+.comp-f1-val{font-family:'Instrument Serif',serif;font-size:24px;letter-spacing:-.02em}
+.comp-card.theirs .comp-f1-val{color:var(--ink3)}
+.comp-card.ours .comp-f1-val{color:var(--green)}
+.comp-middle{text-align:center;padding:16px 12px}
+.comp-arrow{font-size:24px;color:var(--green);margin-bottom:6px}
+.comp-delta{font-family:'Instrument Serif',serif;font-size:28px;color:var(--green);letter-spacing:-.02em}
+.comp-delta-lbl{font-size:10px;font-family:'DM Mono',monospace;color:var(--ink3);margin-top:2px;text-transform:uppercase}
 /* ── METHODOLOGY 3-STEP ── */
 .method-steps{display:grid;grid-template-columns:repeat(3,1fr);gap:0;position:relative}
 .method-steps::before{content:'';position:absolute;top:22px;left:22px;right:22px;height:2px;background:var(--bg3);z-index:0}
 .method-step.active .ms-dot{background:var(--blue);border-color:var(--blue)}
 .ms-title{font-size:14px;font-weight:500;color:var(--ink);margin-bottom:7px}
 .ms-body{font-size:12px;color:var(--ink2);line-height:1.65}
 /* Detail panel */
 .method-detail{margin-top:32px;background:#fff;border:1px solid var(--border);border-radius:14px;padding:28px 32px;box-shadow:var(--shadow);animation:fadeUp .25s ease both}
 .md-panel{display:none}
 .ci-title{font-size:13px;font-weight:500;color:var(--amber);margin-bottom:5px;display:flex;align-items:center;gap:7px}
 .ci-body{font-size:12px;color:#92400e;line-height:1.65}
+/* ── GLOSSARY TOOLTIPS ── */
+.gloss{border-bottom:1px dashed var(--ink3);cursor:help;position:relative;display:inline}
+.gloss::after{content:attr(data-tip);position:absolute;bottom:calc(100% + 8px);left:50%;transform:translateX(-50%);background:var(--ink);color:#fff;font-size:11.5px;padding:8px 12px;border-radius:8px;width:230px;white-space:normal;line-height:1.5;font-family:'Geist',sans-serif;letter-spacing:0;text-align:left;opacity:0;pointer-events:none;transition:opacity .15s;z-index:300;box-shadow:0 4px 16px rgba(0,0,0,.18)}
+.gloss::before{content:'';position:absolute;bottom:calc(100% + 2px);left:50%;transform:translateX(-50%);border:5px solid transparent;border-top-color:var(--ink);opacity:0;transition:opacity .15s;z-index:301}
+.gloss:hover::after,.gloss:hover::before{opacity:1}
+/* ── CODE MODAL ── */
+.cm-term{border-bottom:1px dashed var(--blue);color:var(--ink);cursor:pointer;display:inline-flex;align-items:center;gap:5px;transition:color .15s}
+.cm-term:hover{color:var(--blue)}
+.cm-term::after{content:'</>';font-family:'DM Mono',monospace;font-size:9px;color:var(--blue);opacity:.7;letter-spacing:-.03em}
+.cm-overlay{position:fixed;inset:0;background:rgba(26,24,22,.45);z-index:500;display:none;align-items:center;justify-content:center;padding:20px;backdrop-filter:blur(3px)}
+.cm-overlay.open{display:flex}
+.cm-box{background:#fff;border-radius:16px;width:100%;max-width:680px;max-height:88vh;display:flex;flex-direction:column;box-shadow:0 24px 80px rgba(0,0,0,.18);overflow:hidden}
+.cm-head{padding:20px 24px 0;display:flex;align-items:flex-start;justify-content:space-between;gap:16px}
+.cm-title{font-family:'Instrument Serif',serif;font-size:22px;letter-spacing:-.02em;color:var(--ink)}
+.cm-close{width:28px;height:28px;border-radius:50%;border:1px solid var(--border);background:var(--bg2);cursor:pointer;font-size:14px;display:flex;align-items:center;justify-content:center;flex-shrink:0;color:var(--ink2)}
+.cm-close:hover{background:var(--bg3)}
+.cm-tabs{display:flex;gap:2px;padding:12px 24px 0;border-bottom:1px solid var(--border)}
+.cm-tab{font-size:11px;font-family:'DM Mono',monospace;padding:6px 14px;border-radius:6px 6px 0 0;cursor:pointer;border:1px solid transparent;border-bottom:none;color:var(--ink2);margin-bottom:-1px;background:none}
+.cm-tab.active{background:#fff;border-color:var(--border);color:var(--ink)}
+.cm-body{overflow-y:auto;padding:20px 24px 24px}
+.cm-panel{display:none}
+.cm-panel.active{display:block}
+.cm-pre{background:var(--ink);color:#e8e4dc;font-family:'DM Mono',monospace;font-size:12px;line-height:1.7;padding:16px 18px;border-radius:10px;overflow-x:auto;white-space:pre;margin-bottom:10px}
+.cm-src{font-size:11px;color:var(--ink3);font-family:'DM Mono',monospace;margin-top:6px}
+.cm-why-body{font-size:13.5px;color:var(--ink2);line-height:1.8}
+.cm-why-body strong{color:var(--ink)}
+.cm-out-row{display:flex;align-items:flex-start;gap:12px;padding:10px 14px;background:var(--bg2);border-radius:8px;margin-bottom:8px}
+.cm-out-lbl{font-size:10px;font-family:'DM Mono',monospace;color:var(--ink3);min-width:28px;text-transform:uppercase;margin-top:2px}
+.cm-out-val{font-size:13px;color:var(--ink);line-height:1.5}
+.cm-out-val em{font-family:'DM Mono',monospace;font-size:11.5px;color:var(--blue)}
+/* ── FAQ ACCORDION ── */
+.faq-section{max-width:1040px;margin:0 auto;padding:64px 48px}
+.faq-group{margin-bottom:32px}
+.faq-group-title{font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.12em;text-transform:uppercase;color:var(--blue);margin-bottom:12px}
+.faq-item{border:1px solid var(--border);border-radius:10px;margin-bottom:6px;overflow:hidden;background:#fff}
+.faq-q{width:100%;text-align:left;background:none;border:none;padding:14px 18px;font-size:13px;font-family:'Geist',sans-serif;color:var(--ink);cursor:pointer;display:flex;justify-content:space-between;align-items:center;gap:16px;line-height:1.45}
+.faq-q:hover{background:var(--bg2)}
+.faq-q .faq-chevron{font-size:10px;color:var(--ink3);flex-shrink:0;transition:transform .2s}
+.faq-item.open .faq-chevron{transform:rotate(180deg)}
+.faq-a{max-height:0;overflow:hidden;transition:max-height .25s ease}
+.faq-item.open .faq-a{max-height:500px}
+.faq-a-inner{padding:0 18px 14px;font-size:12.5px;color:var(--ink2);line-height:1.75;border-top:1px solid var(--border)}
+.faq-a-inner code{font-family:'DM Mono',monospace;font-size:11px;background:var(--bg2);padding:1px 5px;border-radius:3px;color:var(--ink)}
 footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3);line-height:1.9}
 @media(max-width:768px){
   header,.section,.demo-section,footer{padding-left:20px;padding-right:20px}
     <div class="logo-txt">Mind<em>Scan</em></div>
   </div>
   <nav class="nav-links">
+    <a href="#comparison">vs Base Paper</a>
     <a href="#methodology">Methodology</a>
     <a href="#matrix">Evidence Matrix</a>
     <a href="#findings">Findings</a>
     <a href="#verdict">Conclusions</a>
     <a href="#demo">Live Demo</a>
+    <a href="#faq">FAQ</a>
   </nav>
   <div class="nav-badge">NCI H9DAI 2026</div>
 </header>
     <div class="stats-panel">
       <div class="stat-box"><div class="stat-num" data-target="3" data-suffix="">0</div><div class="stat-lbl">Datasets</div></div>
       <div class="stat-box"><div class="stat-num" data-target="12" data-suffix="">0</div><div class="stat-lbl">Models trained</div></div>
+      <div class="stat-box"><div class="stat-num" data-target="98.1" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">D3 Accuracy (4-class)</div></div>
+      <div class="stat-box"><div class="stat-num" data-target="11.4" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">vs Base Paper ↑</div></div>
     </div>
   </div>
 </div>
 <hr class="section-divider">
+<!-- BASE PAPER COMPARISON -->
+<section class="section" id="comparison">
+  <div class="sec-eyebrow">Extending prior work</div>
+  <div class="sec-h2">Our work vs <em>Tumaliuan et al. (2024)</em></div>
+  <p class="sec-lead">Dataset 1 is structurally equivalent to the base paper's Filipino Twitter corpus — same 6-class task, same clinical annotation method — making a direct accuracy comparison valid.</p>
+  <div class="comparison-wrap">
+    <div class="comp-card theirs">
+      <div class="comp-label">Tumaliuan et al. — 2024</div>
+      <div class="comp-title">Filipino Twitter Depression</div>
+      <div class="comp-sub">Frontiers in Computer Science · word2vec pipeline</div>
+      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text">Used <strong><span class="gloss" data-tip="word2vec (2013): maps words to fixed vectors based on co-occurrence. Cannot understand negation ('not happy' ≈ 'happy') or context. Superseded by transformers.">word2vec</span></strong> (2013) — static embeddings, no negation handling</div></div>
+      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong><span class="gloss" data-tip="SVM (Support Vector Machine): finds the maximum-margin hyperplane separating classes. Very effective for high-dimensional text features like TF-IDF. Gold standard for NLP before transformers.">SVM never tested</span></strong> — absent from evaluation despite being NLP gold standard</div></div>
+      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong><span class="gloss" data-tip="XGBoost: gradient-boosted decision trees. Sequentially builds trees to correct previous errors. Handles imbalanced data well and often beats random forests on tabular/sparse features.">XGBoost never tested</span></strong> — gradient boosting entirely absent</div></div>
+      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text">Class imbalance listed as <strong>limitation — never resolved</strong></div></div>
+      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong>Restricted dataset</strong> — requires author permission to access</div></div>
+      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong>Accuracy not verified</strong> — no reproducible baseline reported</div></div>
+      <div class="comp-f1-row"><span class="comp-f1-label">Best Accuracy</span><span class="comp-f1-val">~81%</span></div>
+    </div>
+    <div class="comp-middle">
+      <div class="comp-arrow">→</div>
+      <div class="comp-delta">+11.4%</div>
+      <div class="comp-delta-lbl">accuracy gain (D1)</div>
+    </div>
+    <div class="comp-card ours">
+      <div class="comp-label">MindScan — 2026</div>
+      <div class="comp-title">English Twitter + Reddit</div>
+      <div class="comp-sub">Zenodo (Nusrat 2024) · XLM-RoBERTa + SVM + XGBoost</div>
+      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong><span class="gloss" data-tip="XLM-RoBERTa: 278M-parameter multilingual transformer. Pre-trained on 100 languages. Produces contextual embeddings — the same word gets different vectors depending on surrounding context. Understands negation, irony, and long-range dependencies.">XLM-RoBERTa</span></strong> (2019) — contextual embeddings, understands negation</div></div>
+      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>SVM added</strong> — best D1 accuracy 92.36%, beats transformer (90.52%)</div></div>
+      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>XGBoost added</strong> — accuracy 91.76%, gradient boosting for imbalanced data</div></div>
+      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><span class="gloss" data-tip="SMOTE (Synthetic Minority Oversampling Technique): generates synthetic training samples for minority classes by interpolating between existing minority-class examples in feature space. Applied to training data only — never the test set.">SMOTE</span> applied — <strong>imbalance resolved</strong>, all 6 classes equalised to 2,997</div></div>
+      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>Public dataset</strong> — fully reproducible, anyone can verify results</div></div>
+      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>Accuracy verified</strong> on held-out 20% test set, same 6-class task</div></div>
+      <div class="comp-f1-row"><span class="comp-f1-label">Best Accuracy (D1 SVM)</span><span class="comp-f1-val">92.4%</span></div>
+    </div>
+  </div>
+</section>
+<hr class="section-divider">
 <!-- METHODOLOGY -->
 <section class="section" id="methodology">
   <div class="sec-eyebrow">Methodology</div>
   <div class="sec-h2">Three-step <em>pipeline</em></div>
+  <p class="sec-lead"><span class="gloss" data-tip="CRISP-DM: Cross-Industry Standard Process for Data Mining. 6 phases: Business Understanding → Data Understanding → Data Preparation → Modelling → Evaluation → Deployment. The de facto lifecycle framework for data science projects.">CRISP-DM</span> applied across all three datasets — from raw social media text to parallel ensemble predictions.</p>
   <div class="method-steps">
     <div class="method-step active" onclick="showMethodDetail(0)">
       <div class="ms-dot">01</div>
       <div class="ms-title">Data</div>
       <div class="ms-body">3 clinical datasets spanning Twitter and Reddit, covering depression types, binary detection, and suicide risk.</div>
     </div>
     <div class="method-step" onclick="showMethodDetail(1)">
       <div class="ms-dot">02</div>
       <div class="ms-title">Preprocessing</div>
       <div class="ms-body">6-stage text cleaning pipeline + SMOTE oversampling to address class imbalance left unresolved by the base paper.</div>
     </div>
     <div class="method-step" onclick="showMethodDetail(2)">
       <div class="ms-dot">03</div>
       <div class="ms-title">Modelling</div>
       <div class="ms-body">Parallel ensemble of 12 classifiers — all run independently on every prediction, never as a sequential cascade.</div>
     </div>
   </div>
       <div class="md-title"><div class="md-title-dot"></div>Preprocessing Pipeline</div>
       <div class="md-grid">
         <div class="md-block">
+          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('clean_text')">6-Stage Text Cleaning</span></div>
           <div class="md-block-val">1. Lowercase · 2. Strip URLs &amp; http links · 3. Remove @mentions · 4. Remove # symbols · 5. Strip punctuation · 6. Collapse whitespace. Applied identically across all three datasets for consistency.</div>
         </div>
         <div class="md-block">
+          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('smote')">SMOTE — Synthetic Oversampling</span></div>
           <div class="md-block-val">Applied to D1 and D2 training sets only (D3 is pre-balanced). D1: 11,986 → <em>17,982 samples</em>. D2: 8,251 → <em>12,800 samples</em>. Creates synthetic clinical neighbours in TF-IDF feature space. Directly addresses the base paper's (Tumaliuan 2024) biggest limitation — they trained on raw imbalanced data.</div>
         </div>
         <div class="md-block">
+          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('tfidf')">Feature Extraction — TF-IDF</span></div>
+          <div class="md-block-val"><span class="gloss" data-tip="TF-IDF (Term Frequency–Inverse Document Frequency): scores each word by how often it appears in a document (TF) multiplied by how rare it is across all documents (IDF). Settings: max_features=50,000, ngram_range=(1,2), sublinear_tf=True, min_df=2.">TF-IDF</span> vectoriser with unigrams + bigrams, fitted per-dataset on training data only. Captures frequency-weighted term co-occurrence patterns, well-suited for short Twitter text.</div>
         </div>
         <div class="md-block">
+          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('tokeniser')">Feature Extraction — Tokeniser</span></div>
           <div class="md-block-val">XLM-RoBERTa tokeniser (max 128 tokens D1/D2, 256 tokens D3) with padding. Pre-trained multilingual contextual embeddings capture semantic meaning and long-range dependencies — critical for Reddit's longer posts.</div>
         </div>
       </div>
       <div class="md-grid">
         <div class="md-block">
           <div class="md-block-lbl">4 Models per Dataset (12 total)</div>
+          <div class="md-block-val"><span class="cm-term" onclick="openCM('lr')">Logistic Regression</span> — L2 regularised, max_iter=1000. <span class="cm-term" onclick="openCM('svm')">SVM</span> — LinearSVC, C=1.0. <span class="cm-term" onclick="openCM('xgb')">XGBoost</span> — 300 estimators, max_depth=6. <span class="cm-term" onclick="openCM('xlmr_ft')">XLM-RoBERTa</span> — fine-tuned multilingual transformer, <em>278M parameters</em>, lr=2e-5, 3 epochs.</div>
         </div>
         <div class="md-block">
+          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('majority_vote')">Ensemble Vote — Risk Flag Logic</span></div>
           <div class="md-block-val">All 12 models run simultaneously on every input. A sequential design (check depression first, then suicide risk) would <strong>miss masked suicidality</strong> — a clinically documented pre-crisis pattern where affect appears normal but intent is resolved. Parallelism is a safety requirement, not a design preference.</div>
         </div>
         <div class="md-block">
           <div class="md-block-val">XGBoost accuracy on D3: <em>91.6% (50K sample) → 70.5% (Full 232K) → 60.1% (H1 116K)</em>. Performance degrades as training data grows. The H1/H2 results are also inconsistent (60.1% vs 71.0%) — gradient boosting is highly sensitive to data distribution shifts at this scale, making it unreliable for large Reddit corpora.</div>
         </div>
         <div class="md-block">
+          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('split_study')">D3 Split Study (RQ2)</span></div>
           <div class="md-block-val">D3 trained on 4 configurations: Full (232K), Half 1 (116K), Half 2 (116K), Sample (50K). XLM-RoBERTa accuracy: <em>98.1% (50K) → 97.8% (H1) → 98.0% (H2/Full)</em>. Δ = 0.3% across 4× more data. Kolmogorov-Smirnov tests confirm all splits share identical distributions (p &gt; 0.49), validating the comparison.</div>
         </div>
       </div>
 <section class="section" id="matrix">
   <div class="sec-eyebrow">Core evaluation</div>
   <div class="sec-h2">Accuracy <em>Evidence Matrix</em></div>
+  <p class="sec-lead">All 4 models evaluated across all dataset splits. <strong>Bold</strong> = winner per row. <span style="color:var(--red)">Red</span> = XGBoost collapse on larger training sets. — <span class="cm-term" onclick="openCM('eval_metrics')">How metrics are computed</span></p>
   <div class="matrix-wrap">
     <table class="matrix-tbl">
       <!-- Column headers for the evidence matrix: first column names the dataset/split, the remaining
            four name the models. Each model label carries its configuration summary in data-tip.
            scope="col" ties every body cell to its column header for assistive technology. -->
       <thead>
         <tr>
           <th scope="col">Dataset / Split</th>
+          <th scope="col"><span class="gloss" data-tip="Logistic Regression: linear model trained with L2 regularisation (max_iter=1000). Fast, interpretable baseline. Outputs class probabilities via sigmoid/softmax. Works well with TF-IDF sparse vectors.">Logistic Regression</span></th>
+          <th scope="col"><span class="gloss" data-tip="SVM (Support Vector Machine): LinearSVC, C=1.0. Finds maximum-margin hyperplane in TF-IDF feature space. Best classical model on D1 — short tweets give TF-IDF enough signal to beat contextual embeddings.">SVM</span></th>
+          <th scope="col"><span class="gloss" data-tip="XGBoost: gradient-boosted trees, 300 estimators, max_depth=6. Sequentially corrects previous errors. Collapses on D3 (71%) — vocabulary overlap between depressive and suicidal language confuses boosted trees.">XGBoost</span></th>
+          <th scope="col"><span class="gloss" data-tip="XLM-RoBERTa: 278M-parameter multilingual transformer. Fine-tuned with lr=2e-5, 3 epochs. Max 128 tokens (D1/D2) or 256 tokens (D3). Best on long-form Reddit posts — contextual embeddings capture meaning beyond keyword matching.">XLM-RoBERTa</span></th>
         </tr>
       </thead>
       <tbody>
 <div class="demo-section" id="demo">
   <div class="sec-eyebrow">Live inference</div>
   <div class="sec-h2" style="margin-bottom:8px">Try it — <em>winner model per task</em></div>
+  <p class="sec-lead" style="margin-bottom:12px">Sample 3 demonstrates masked suicidality. Try typing clinical-style depressive language ("I feel exhausted, nothing feels enjoyable") to observe the Affective vs. Clinical Lexicon Gap documented in Finding 04.</p>
+  <p style="font-size:13px;color:var(--ink2);margin-bottom:24px">How the demo works: <span class="cm-term" onclick="openCM('flask_deploy')">Flask → HuggingFace proxy</span> · <span class="cm-term" onclick="openCM('predict_flow')">predict_all() inference flow</span></p>
   <div class="disclaimer"><strong>Research prototype only.</strong> Not a clinical tool. If you or someone you know is in crisis, please contact a mental health professional or emergency services immediately.</div>
   </div>
 </div>
+<!-- CODE MODAL OVERLAY
+     Empty shell for the code-explainer popup opened by the .cm-term triggers (openCM(key)).
+     #cmTitle and the three panels (#cmt0 Code, #cmt1 Why, #cmt2 Output) have no static content;
+     presumably script fills them from CM_DATA. Clicking the backdrop calls closeCMOutside(event).
+     NOTE(review): role="dialog"/aria-modal assume the overlay is not rendered (display:none) while
+     closed and that script manages focus and Esc — confirm against the .cm-overlay CSS and openCM/closeCM. -->
+<div class="cm-overlay" id="cmOverlay" onclick="closeCMOutside(event)">
+  <div class="cm-box" id="cmBox" role="dialog" aria-modal="true" aria-labelledby="cmTitle">
+    <div class="cm-head">
+      <div class="cm-title" id="cmTitle"></div>
+      <!-- Icon-only button: aria-label supplies the accessible name the "✕" glyph lacks. -->
+      <button class="cm-close" type="button" onclick="closeCM()" aria-label="Close"><span aria-hidden="true">✕</span></button>
+    </div>
+    <div class="cm-tabs">
+      <div class="cm-tab active" onclick="switchCMTab(0)">Code</div>
+      <div class="cm-tab" onclick="switchCMTab(1)">Why</div>
+      <div class="cm-tab" onclick="switchCMTab(2)">Output</div>
+    </div>
+    <div class="cm-body">
+      <div class="cm-panel active" id="cmt0"></div>
+      <div class="cm-panel" id="cmt1"></div>
+      <div class="cm-panel" id="cmt2"></div>
+    </div>
+  </div>
+</div>
+<hr class="section-divider">
+<!-- FAQ SECTION -->
+<section class="faq-section" id="faq">
+  <div class="sec-eyebrow">Defence prep</div>
+  <div class="sec-h2">Frequently asked <em>questions</em></div>
+  <p class="sec-lead">Click any question to expand the answer. Grouped by topic for quick navigation during Q&amp;A.</p>
+  <!-- FAQ group: dataset questions. Each .faq-q button calls toggleFaq(this); its answer is the
+       .faq-a element that follows it. The chevron is decorative, so it is hidden from assistive tech.
+       NOTE(review): the first answer describes D2 as suicide/non-suicide and D3 as 4-class, while the
+       code-modal data in this file labels D2 classes Depressed / Not Depressed and gives D3
+       116,037 samples per class with num_labels=2 — confirm and reconcile. -->
+  <div class="faq-group">
+    <div class="faq-group-title">Data &amp; Datasets</div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">What are the three datasets and what makes them different? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">D1 — 6-class depression type classification (atypical, bipolar, major depressive, no depression, postpartum, psychotic) from Kaggle. Twitter-length text, 11,986 samples. D2 — binary suicide/non-suicide from Twitter (10,314 samples, severe 3.46× imbalance). D3 — 4-class suicide/depression/anxiety/normal from Reddit (232K samples, pre-balanced). Each dataset has a different task, different text length, and different vocabulary domain — which is precisely why running all three in parallel is informative.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">How did you handle class imbalance? Why SMOTE and not class weighting? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">D1 had 1.89× imbalance (atypical class), D2 had 3.46× imbalance. We applied <code>SMOTE</code> to training data only — never the test set. SMOTE interpolates new synthetic samples in TF-IDF feature space between existing minority-class examples. Class weighting was also evaluated; SMOTE showed equal or better Macro F1 in cross-validation. D3 was pre-balanced and required no oversampling.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Is there any data leakage in your pipeline? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">No. The train/test split (stratified 80/20) is performed first. SMOTE is then applied only to the training portion. The TF-IDF vocabulary is fitted on training data only and applied as a read-only transform to the test set. XLM-RoBERTa uses a fixed pretrained tokeniser. No test sample was ever used to inform any training decision.</div></div>
+    </div>
+  </div>
+  <!-- FAQ group: methodology and model questions. Each .faq-q button calls toggleFaq(this); its
+       answer is the .faq-a element that follows it. The chevron is decorative.
+       NOTE(review): the XGBoost answer calls D3 "a 4-class task", but the code-modal data in this
+       file shows num_labels=2 for D3 — confirm which is correct. -->
+  <div class="faq-group">
+    <div class="faq-group-title">Methodology &amp; Models</div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Why four model types per dataset? Why not just use the best one? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">Each captures a different inductive bias: Logistic Regression (linear decision boundary), SVM (maximum-margin), Random Forest/XGBoost (non-linear tree ensembles), XLM-RoBERTa (contextual transformer). Disagreement between models is itself a signal. On D1, SVM (92.4%) beats XLM-RoBERTa (90.5%) — short tweets don't give the transformer enough context to gain advantage. On D3 (212-word Reddit posts), XLM-RoBERTa (98.1%) dominates every classical model.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">What are your TF-IDF settings and why? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner"><code>max_features=50,000</code> — covers the full relevant vocabulary without noise. <code>ngram_range=(1,2)</code> — unigrams + bigrams capture local phrases ("not happy", "kill myself") that unigrams miss. <code>sublinear_tf=True</code> — applies log(1+tf) to dampen high-frequency word dominance. <code>min_df=2</code> — removes hapax legomena (words appearing only once) that add noise.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">How was XLM-RoBERTa fine-tuned? What hyperparameters? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">Standard sequence classification fine-tuning: Adam optimiser, <code>lr=2e-5</code>, <code>3 epochs</code>, linear warmup scheduler. Max token length: 128 for D1/D2 (Twitter-length text), 256 for D3 (Reddit posts average 212 words). Cross-entropy loss. Best checkpoint restored by lowest validation loss (<code>load_best_model_at_end=True</code>). 278M parameters — multilingual pretraining covers 100 languages.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Why did XGBoost collapse on D3 to only 71%? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">This is TF-IDF lexical overfitting. D3 is a 4-class task where depressive and suicidal vocabulary heavily overlap in Reddit posts — words like "exhausted", "hopeless", "nothing matters" appear in both classes. Boosted trees overfit to these majority-class token patterns and fail to distinguish fine-grained class boundaries. XLM-RoBERTa's contextual embeddings resolve this because it reads the full sentence, not just individual tokens.</div></div>
+    </div>
+  </div>
+  <!-- FAQ group: results and evaluation questions. Each .faq-q button calls toggleFaq(this); its
+       answer is the .faq-a element that follows it. The chevron is decorative. -->
+  <div class="faq-group">
+    <div class="faq-group-title">Results &amp; Evaluation</div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Why is SVM accuracy 92.4% on D1 but XLM-RoBERTa (278M params) only gets 90.5%? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">Text length. D1 tweets average ~31 words. Transformers need rich context to outperform classical methods — contextual embeddings add little value when the whole input is only ~40 tokens. TF-IDF bigrams on short explicit text (like tweets) already capture the full signal. This is Finding 01 and one of the key research conclusions: model selection must be text-length aware.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Why show accuracy rather than Macro F1? Isn't accuracy misleading on imbalanced data? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">The dashboard shows accuracy for accessibility (non-specialist audience). After SMOTE, all training classes are equalised — so accuracy and Macro F1 are closely aligned. The full Macro F1, Cohen's Kappa, and per-class precision/recall are reported in the IEEE technical report. The evidence matrix footnote notes this explicitly.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Did adding more training data (50K → 232K) improve D3 results? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">No — only 0.1% change in XLM-RoBERTa accuracy (98.1% → 98.0%). Kolmogorov-Smirnov tests confirm all four splits (Full 232K, H1 116K, H2 116K, Sample 50K) share identical distributions (p &gt; 0.49). The 50K sample fully captures the underlying signal. This is Finding 03 and validates our choice of the 50K sample for the final model.</div></div>
+    </div>
+  </div>
+  <!-- FAQ group: architecture and live-demo questions. Each .faq-q button calls toggleFaq(this);
+       its answer is the .faq-a element that follows it. The chevron is decorative. -->
+  <div class="faq-group">
+    <div class="faq-group-title">Architecture &amp; Live Demo</div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Is the live demo using real models or hardcoded responses? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">Real models. The Flask app proxies every request to a HuggingFace Space (<code>esvanth-mindscan.hf.space</code>) which runs <code>predict.py</code> with all 12 loaded models. There is no hardcoded data — every input goes through the full pipeline. If the Space is sleeping it auto-wakes within ~60 seconds.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">What does "Ensemble Conflict" (amber banner) mean? Why not just show red? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">It means classical D3 models (LR/SVM/XGBoost) flagged suicide risk by majority vote, but XLM-RoBERTa — the best model at 98.1% accuracy — disagrees. A pure majority vote could trigger false alarms on metaphorical language ("I'm dying of embarrassment"). The amber state expresses uncertainty rather than forcing a binary decision, which maps directly to "escalate for human review" — the appropriate clinical-conservative response.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">Why does D2 under-flag clinical-style text like "I feel exhausted, nothing feels enjoyable"? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">This is the Affective vs. Clinical Lexicon Gap (Finding 04, documented in NAACL 2024). D2 was trained on Twitter emotional language — explicit distress, slang, emotional punctuation. Clinical presentations use diagnostic vocabulary: anhedonia ("nothing feels enjoyable"), psychomotor fatigue, flat affect. These words are absent from D2's training distribution. This is not a bug — it is an empirical finding about the domain gap between social media affect and clinical language.</div></div>
+    </div>
+    <div class="faq-item">
+      <button class="faq-q" type="button" onclick="toggleFaq(this)">What is the single most important future direction? <span class="faq-chevron" aria-hidden="true">▼</span></button>
+      <div class="faq-a"><div class="faq-a-inner">Replace TF-IDF classical models with <strong>MentalBERT/MentalRoBERTa</strong> (Ji et al. 2022) pretrained on mental health forum data. Combine all three tasks in a true multi-task learning setup with a shared encoder and task-specific heads — following the MTL precedent from Zogan et al. (2024). This would address both documented limitations (Affective Lexicon Gap and TF-IDF overfitting) simultaneously.</div></div>
+    </div>
+  </div>
+</section>
 <footer>
   MindScan · NCI H9DAI Research Project 2026 · Academic Prototype Only<br>
   Datasets: Zenodo 14233292 · Kaggle albertobellardini · Kaggle nikhileswarkomati<br>
 }
 /**
  * Format a fraction as a percentage string with one decimal place.
  * @param {number} v - fraction to format (e.g. 0.5)
  * @returns {string} v × 100 fixed to one decimal, followed by "%" (e.g. "50.0%")
  */
 function pct(v){
   const scaled = v * 100;
   return scaled.toFixed(1) + '%';
 }
+/* ── CODE MODAL DATA ── */
+const CM_DATA = {
+  clean_text: {
+    title: 'clean_text() — Text Preprocessing',
+    code: `def clean_text(text):
+    text = str(text).lower()
+    # remove URLs
+    text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text)
+    # remove @mentions
+    text = re.sub(r'@\\w+', '', text)
+    # remove # symbol (keep hashtag word)
+    text = re.sub(r'#', '', text)
+    # strip all punctuation
+    text = text.translate(
+        str.maketrans('', '', string.punctuation)
+    )
+    # collapse whitespace
+    text = re.sub(r'\\s+', ' ', text).strip()
+    return text`,
+    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 · notebooks/DA_2_Notebook.ipynb — cell 6 · predict.py lines 106–113',
+    why: '<strong>Why lowercase?</strong> "Sad" and "sad" must map to the same TF-IDF token. <strong>Why remove URLs?</strong> Hundreds of unique tokens, zero semantic value — pure noise. <strong>Why keep hashtag words?</strong> "#depressed" → "depressed" preserves the semantic signal, removes the markup. <strong>Why no stemming?</strong> Stemming degrades bigram quality — "kill myself" would become "kill myself" but "killing" → "kill" breaks n-gram boundaries. Same function is used at both training time (notebook) and inference time (predict.py) to guarantee identical preprocessing.',
+    outputs: [
+      {label:'Input', val:'"I been going through #Depression after @user check https://t.co/xyz!!"'},
+      {label:'Output', val:'"i been going through depression after check"'},
+      {label:'Note', val:'Applied to all 3 datasets before TF-IDF and before XLM-RoBERTa tokenisation'},
+    ]
+  },
+  smote: {
+    title: 'SMOTE — Synthetic Minority Oversampling',
+    code: `def apply_smote(X_train, y_train):
+    before = Counter(y_train)
+    smote = SMOTE(random_state=42)
+    X_bal, y_bal = smote.fit_resample(X_train, y_train)
+    after = Counter(y_bal)
+    print(f'SMOTE: {sum(before.values())} → {sum(after.values())}')
+    return X_bal, y_bal
+# Called AFTER TF-IDF vectorisation, AFTER train/test split
+X1_bal, y1_bal = apply_smote(X1_tr_tf, y1_tr)`,
+    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 (apply_smote def) · cell 10 (D1 call) · cell 17 (D2 call) · D3 skipped',
+    why: '<strong>Why after TF-IDF?</strong> SMOTE interpolates in feature space — it creates synthetic TF-IDF vectors, not synthetic text. <strong>Why not before the split?</strong> Applying SMOTE before splitting would let synthetic samples leak into the test set — the test set must contain only real data. <strong>Why not class_weight instead?</strong> Class weighting reweights the loss function — it doesn\'t add new training examples. SMOTE was chosen because it physically fills the minority-class region of feature space, giving tree-based models (RF, XGB) more to learn from. <strong>D3 skipped:</strong> D3 is pre-balanced (116K each class) — no intervention needed.',
+    outputs: [
+      {label:'D1', val:'11,986 → 17,982 samples (atypical: 1,584 → 2,997, each class equalised)'},
+      {label:'D2', val:'8,251 → 12,800 samples (Depressed: 1,851 → 6,400)'},
+      {label:'D3', val:'Skipped — pre-balanced at 116,037 per class'},
+    ]
+  },
+  tfidf: {
+    title: 'TfidfVectorizer — Feature Extraction',
+    code: `def make_tfidf(X_train, X_test, max_features=50000):
+    tfidf = TfidfVectorizer(
+        max_features=50000,   # top 50K tokens by corpus frequency
+        ngram_range=(1, 2),   # unigrams AND bigrams
+        sublinear_tf=True,    # log(1+tf) instead of raw tf
+        min_df=2              # ignore tokens appearing < 2 times
+    )
+    Xtr = tfidf.fit_transform(X_train)  # fit on train only
+    Xte = tfidf.transform(X_test)       # apply to test (no fit)
+    return tfidf, Xtr, Xte`,
+    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 (make_tfidf def) · cell 10 (D1) · cell 17 (D2) · cell 22 (D3)',
+    why: '<strong>ngram_range=(1,2):</strong> Bigrams capture "kill myself", "not happy", "feeling better" — critical signals that unigrams miss entirely. <strong>sublinear_tf=True:</strong> Applies log(1+tf) to dampen high-frequency word dominance. Without this, common words like "i", "feel" swamp the features. <strong>min_df=2:</strong> Removes hapax legomena (words appearing only once) — they add 0 generalisable information. <strong>fit only on train:</strong> Vocabulary is locked on training data — the test set is transformed using this fixed vocabulary, preventing any data leakage.',
+    outputs: [
+      {label:'D1 shape', val:'11,986 × 50,000 sparse matrix (tweets × features)'},
+      {label:'D2 shape', val:'8,251 × 50,000 sparse matrix'},
+      {label:'D3 shape', val:'40,000 × 50,000 sparse matrix'},
+      {label:'After SMOTE', val:'D1 becomes 17,982 × 50,000, D2 becomes 12,800 × 50,000'},
+    ]
+  },
+  tokeniser: {
+    title: 'XLM-RoBERTa Tokeniser',
+    code: `tokenizer = AutoTokenizer.from_pretrained(
+    'FacebookAI/xlm-roberta-base'
+)
+def tokenize_tweets(examples):
+    return tokenizer(
+        examples['text'],
+        max_length=128,      # 128 for D1/D2 (tweets avg ~40 tokens)
+        truncation=True,     # cut anything beyond max_length
+        padding='max_length' # pad shorter inputs to fixed length
+    )
+# D3 uses max_length=256 — Reddit posts avg 212 words (~280 tokens)
+def tokenize_reddit(examples):
+    return tokenizer(
+        examples['text'],
+        max_length=256,
+        truncation=True,
+        padding='max_length'
+    )`,
+    src: 'notebooks/DA_2_Notebook.ipynb — cell 9 (tokenize_tweets, max_length=128, D1/D2) · cell 21 (tokenize_reddit, max_length=256, D3)',
+    why: '<strong>SentencePiece subword tokenisation:</strong> Splits unknown words into subword pieces — "suicidal" might become ["su", "ici", "dal"]. No word is truly out-of-vocabulary. <strong>max_length=128 for D1/D2:</strong> Tweets average ~31 words ≈ 40 tokens. 128 is 3× headroom. <strong>max_length=256 for D3:</strong> Reddit posts average 212 words ≈ 280 tokens — 128 would truncate most of the signal. <strong>padding=\'max_length\':</strong> All batches must be identical length for GPU tensor operations — shorter inputs are padded with [PAD] tokens. The attention mask tells the model to ignore padding.',
+    outputs: [
+      {label:'D1/D2 shape', val:'Each input → tensor of shape [128] (input_ids) + [128] (attention_mask)'},
+      {label:'D3 shape', val:'Each input → tensor of shape [256] × 2'},
+      {label:'Example', val:'"i feel hopeless" → input_ids: [0, 444, 7809, 73542, 2, 1, 1, ...]'},
+    ]
+  },
+  lr: {
+    title: 'Logistic Regression',
+    code: `LogisticRegression(
+    max_iter=1000,            # enough iterations to converge on 50K features
+    class_weight='balanced',  # backup alongside SMOTE
+    random_state=42,
+    n_jobs=-1                 # use all CPU cores
+)`,
+    src: 'notebooks/DA_Notebook_One.ipynb — cell 11 (D1) · cell 18 (D2) · cell 23 (D3)',
+    why: '<strong>Why use it?</strong> Fast, interpretable linear baseline. On 50,000 TF-IDF features, L2 regularisation prevents overfitting by shrinking large weights toward zero. Outputs calibrated probabilities via softmax — important for confidence scores in the UI. <strong>class_weight=\'balanced\':</strong> Secondary guard alongside SMOTE — the model pays proportionally more attention to minority classes during gradient updates.',
+    outputs: [
+      {label:'D1', val:'91.5% accuracy — solid baseline, beaten by SVM'},
+      {label:'D2', val:'98.9% accuracy'},
+      {label:'D3', val:'93.2% accuracy'},
+    ]
+  },
+  svm: {
+    title: 'SVM — LinearSVC',
+    code: `LinearSVC(
+    C=1.0,                    # regularisation strength (lower = more reg)
+    class_weight='balanced',
+    max_iter=2000,
+    random_state=42
+)
+# LinearSVC has no predict_proba — use decision_function + softmax
+scores = model.decision_function(vec)[0]
+e = np.exp(scores - scores.max())
+conf = float(e[pred_idx] / e.sum())`,
+    src: 'notebooks/DA_Notebook_One.ipynb — cell 11 (D1) · cell 18 (D2) · cell 23 (D3) · predict.py lines 147–154 (confidence fallback)',
+    why: '<strong>Why SVM wins on D1?</strong> LinearSVC finds the maximum-margin hyperplane in TF-IDF feature space — the optimal linear decision boundary for sparse high-dimensional data. Tweets (31 words avg) produce sparse TF-IDF vectors where the margin is well-defined. Contextual embeddings (XLM-RoBERTa) add no value at this sentence length. <strong>Why LinearSVC over SVC(kernel=\'rbf\')?</strong> Linear kernel scales to 50,000 features. RBF kernel would be O(n²) — computationally infeasible.',
+    outputs: [
+      {label:'D1', val:'92.4% accuracy — best model on D1, beats XLM-RoBERTa (90.5%)'},
+      {label:'D2', val:'97.1% accuracy'},
+      {label:'D3', val:'77.8% accuracy'},
+    ]
+  },
+  xgb: {
+    title: 'XGBoost — XGBClassifier',
+    code: `XGBClassifier(
+    n_estimators=300,         # 300 trees built sequentially
+    learning_rate=0.1,        # each tree contributes 10% of its weight
+    max_depth=6,              # max tree depth — controls complexity
+    eval_metric='logloss',
+    random_state=42,
+    n_jobs=-1
+)`,
+    src: 'notebooks/DA_Notebook_One.ipynb — cell 11 (D1) · cell 18 (D2) · cell 23 (D3)',
+    why: '<strong>Gradient boosting principle:</strong> Each new tree is trained to correct the residual errors of all previous trees. 300 trees × learning_rate=0.1 = strong ensemble. <strong>Why does it collapse on D3?</strong> Vocabulary overlap between depressive and suicidal language in Reddit posts — words like "exhausted", "hopeless" appear in both classes. Boosted trees memorise these majority-class token patterns and fail at full scale (232K). XGBoost is highly sensitive to distribution shifts at this scale, shown by inconsistent H1/H2 results (60.1% vs 71.0%).',
+    outputs: [
+      {label:'D1', val:'91.8% accuracy'},
+      {label:'D2', val:'99.3% accuracy'},
+      {label:'D3 (50K)', val:'91.6% — performs well on sample'},
+      {label:'D3 (Full 232K)', val:'70.5% — collapse (lexical overfitting)'},
+    ]
+  },
+  xlmr_ft: {
+    title: 'XLM-RoBERTa Fine-Tuning',
+    code: `xlmr = AutoModelForSequenceClassification.from_pretrained(
+    'FacebookAI/xlm-roberta-base',
+    num_labels=NUM_LABELS   # 6 for D1, 2 for D2, 2 for D3
+)
+args = TrainingArguments(
+    num_train_epochs=3,
+    learning_rate=2e-5,           # standard BERT fine-tuning rate
+    per_device_train_batch_size=16,  # 8 for D3 (longer sequences)
+    gradient_accumulation_steps=2,   # D3 only — simulates batch=16
+    warmup_steps=200,             # gradual LR increase at start
+    weight_decay=0.01,            # L2 regularisation on weights
+    load_best_model_at_end=True,  # save epoch with lowest val loss
+    fp16=torch.cuda.is_available()  # half precision — 2× faster on GPU
+)
+trainer = Trainer(
+    model=xlmr, args=args,
+    train_dataset=train_tok,
+    eval_dataset=test_tok
+)
+trainer.train()`,
+    src: 'notebooks/DA_2_Notebook.ipynb — cell 10 (model init D1) · cell 11 (TrainingArguments D1) · cell 17 (D2) · cell 22 (model init D3) · cell 23 (TrainingArguments D3)',
+    why: '<strong>lr=2e-5:</strong> Standard for fine-tuning BERT-family models. Too high destroys pretrained weights (catastrophic forgetting). Too low fails to converge in 3 epochs. <strong>warmup_steps=200:</strong> LR starts at 0 and linearly ramps — prevents early instability when weights are far from the task optimum. <strong>load_best_model_at_end:</strong> Epoch 3 is not always best — we restore the checkpoint with the lowest validation loss. <strong>D3 batch=8 + accumulation=2:</strong> max_length=256 uses 2× GPU memory vs 128. Accumulation simulates batch=16 without OOM.',
+    outputs: [
+      {label:'D1', val:'90.5% accuracy (Macro F1: 0.9117, κ=0.8852)'},
+      {label:'D2', val:'99.95% accuracy (Macro F1: 0.9993)'},
+      {label:'D3', val:'98.1% accuracy (Macro F1: 0.9810, κ=0.9620)'},
+    ]
+  },
+  majority_vote: {
+    title: 'Ensemble Vote — Risk Flag Logic',
+    code: `# From predict.py — predict_all() function
+suicide_count = sum(
+    1 for r in d3.values()
+    if 'suicide' in r['label'].lower()
+    and 'non' not in r['label'].lower()
+)
+risk_flag = suicide_count >= 3  # majority = ≥3 of 4 models
+# d3.values() = results from LR, SVM, XGBoost, XLM-RoBERTa
+# XLM-RoBERTa is also checked separately for banner state:
+isSuicide = d3['XLM-RoBERTa'].label includes 'suicide' (JS)
+# Three UI states:
+# risk_flag=True  AND XLM-R agrees  → RED   (High Suicide Risk)
+# risk_flag=True  AND XLM-R dissents → AMBER (Ensemble Conflict)
+# risk_flag=False                   → GREEN (Low Risk)`,
+    src: 'predict.py lines 266–270 (suicide_count + risk_flag) · predict.py line 296 (suicide_votes string) · templates/index.html JS render() — banner state logic',
+    why: '<strong>Why ≥3/4 threshold?</strong> 1–2 flagging models could be TF-IDF false positives (lexical overfitting). 3+ represents genuine consensus — meaningful signal. <strong>Why check XLM-RoBERTa separately for the banner?</strong> XLM-RoBERTa has the highest D3 accuracy (98.1%) and understands context. If XLM-R disagrees with the majority, the amber "Ensemble Conflict" state is safer than a red alert — it flags uncertainty rather than over-alarming on metaphorical language ("I\'m dying of embarrassment").',
+    outputs: [
+      {label:'Threshold', val:'≥ 3/4 D3 models output "suicide" (not "non-suicide")'},
+      {label:'Red banner', val:'risk_flag=True AND XLM-RoBERTa confirms suicide'},
+      {label:'Amber banner', val:'risk_flag=True but XLM-RoBERTa says non-suicide'},
+      {label:'Green banner', val:'risk_flag=False — fewer than 3 models flagged'},
+    ]
+  },
+  eval_metrics: {
+    title: 'Evaluation — How Metrics Are Computed',
+    code: `def evaluate_transformer(name, y_true, y_pred,
+                           label_names, ds_key, results_store):
+    acc   = accuracy_score(y_true, y_pred)
+    macro = f1_score(y_true, y_pred, average='macro')
+    kappa = cohen_kappa_score(y_true, y_pred)
+    print(f'Accuracy     : {acc*100:.2f}%')
+    print(f'Macro F1     : {macro:.4f}')
+    print(f"Cohen's Kappa: {kappa:.4f}")
+    print(classification_report(y_true, y_pred,
+                                target_names=label_names))
+    results_store[name] = {
+        'accuracy': round(acc, 4),
+        'macro_f1': round(macro, 4),
+        'kappa':    round(kappa, 4)
+    }
+# Same function used for classical models in Notebook 1:
+for name, model in models_d1.items():
+    model.fit(X1_bal, y1_bal)         # train on SMOTE-balanced data
+    preds = model.predict(X1_te_tf)   # test on original held-out set
+    evaluate(name, y1_te, preds, le1.classes_, 'd1', d1_results)`,
+    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 (evaluate def, classical) · notebooks/DA_2_Notebook.ipynb — cell 6 (evaluate_transformer def)',
+    why: '<strong>Accuracy:</strong> (correct predictions) / (total predictions). Simple but misleading on imbalanced data — a model predicting majority class always gets high accuracy. Valid here because SMOTE balanced the training set and D3 is pre-balanced. <strong>Macro F1:</strong> Averages F1 per class without weighting by class size — penalises models that ignore minority classes. This is the primary metric in the IEEE report. <strong>Cohen\'s Kappa:</strong> Measures agreement beyond what chance alone would produce. Formula: (observed − expected) / (1 − expected). κ > 0.8 = almost perfect agreement. Reported because the base paper (Tumaliuan 2024) did not report it — we added it as an improvement. <strong>classification_report:</strong> Shows per-class precision, recall, F1 — the full picture behind the headline number.',
+    outputs: [
+      {label:'D1 SVM', val:'Accuracy 92.4%, Macro F1 0.9269, κ=0.9072'},
+      {label:'D2 XLM-R', val:'Accuracy 99.95%, Macro F1 0.9993, κ=0.9986'},
+      {label:'D3 XLM-R', val:'Accuracy 98.1%, Macro F1 0.9810, κ=0.9620'},
+      {label:'Atypical F1', val:'0.992 — highest per-class score in the project (D1, after SMOTE)'},
+    ]
+  },
+  flask_deploy: {
+    title: 'Flask App — Deployment & Proxy Mode',
+    code: `# app.py — auto-detects LOCAL vs PROXY mode at startup
+_LOCAL_MODELS = os.path.join(BASE_DIR, 'models', 'classical')
+_use_local    = os.path.isdir(_LOCAL_MODELS)
+@app.route('/predict', methods=['POST'])
+def predict():
+    data = request.get_json()
+    text = data['text'].strip()
+    if len(text) > 5000:
+        return jsonify({'error': 'Text too long'}), 400
+    if _use_local:
+        # LOCAL mode — models loaded in memory
+        result = predict_all(text)
+        return jsonify(result)
+    else:
+        # PROXY mode — forward to HuggingFace Space
+        r = requests.post(
+            f'{HF_SPACE_URL}/predict',
+            json={'text': text},
+            timeout=120
+        )
+        return r.content, r.status_code
+# HF_SPACE_URL = 'https://esvanth-mindscan.hf.space'
+# Overridable via environment variable`,
+    src: 'app.py lines 25–27 (mode detection) · lines 61–97 (/predict endpoint) · line 70 (5000-char limit) · line 91 (timeout=120)',
+    why: '<strong>Why two modes?</strong> The 12 models total ~2GB on disk. Running locally requires the models folder. The HuggingFace Space hosts the same predict.py and models — the proxy just forwards requests there. <strong>Why timeout=120?</strong> The HF Space sleeps after inactivity and takes ~60s to wake. 120s gives headroom. <strong>Why 5000 char limit?</strong> XLM-RoBERTa max_length=256 tokens ≈ ~1500 characters. 5000 chars is a safe upper bound that prevents abuse without being restrictive. <strong>How the browser talks to Flask:</strong> JavaScript fetch() → POST /predict (localhost:5001) → Flask → HF Space → predict_all() → JSON response → render() updates the UI.',
+    outputs: [
+      {label:'LOCAL mode', val:'Triggered when models/classical/ directory exists. Loads all 12 models at startup (~30s on CPU).'},
+      {label:'PROXY mode', val:'Default — no local models needed. Forwards to esvanth-mindscan.hf.space'},
+      {label:'Timeout', val:'504 returned after 120s if HF Space is sleeping. Auto-wakes in ~60s.'},
+      {label:'Port', val:'localhost:5001 (overridable via PORT env var)'},
+    ]
+  },
+  predict_flow: {
+    title: 'predict_all() — Full Inference Flow',
+    code: `def predict_all(raw_text):
+    # Step 1 — clean text (same function as training)
+    clean = clean_text(raw_text)
+    # Step 2 — run all 3 classical models per dataset
+    #          (LR, SVM, XGBoost share the same TF-IDF vector)
+    def predict_classical(text_clean, ds):
+        tfidf = _models[f'tfidf_{ds}']
+        vec   = tfidf.transform([text_clean])  # sparse vector
+        for model_name in ['logistic_regression','svm','xgboost']:
+            model    = _models[f'{model_name}_{ds}']
+            pred_idx = model.predict(vec)[0]
+            label    = le.classes_[pred_idx]
+            # SVM has no predict_proba — use softmax(decision_function)
+            if hasattr(model, 'predict_proba'):
+                conf = model.predict_proba(vec)[0][pred_idx]
+            else:
+                scores = model.decision_function(vec)[0]
+                e = np.exp(scores - scores.max())
+                conf = e[pred_idx] / e.sum()
+    # Step 3 — run XLM-RoBERTa per dataset
+    def predict_transformer(text_raw, ds):
+        inputs = tokenizer(text_raw, max_length=max_len,
+                           truncation=True, padding='max_length')
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        probs    = torch.softmax(logits, dim=1)[0]
+        pred_idx = probs.argmax()
+    # Step 4 — majority vote for risk_flag
+    suicide_count = sum(1 for r in d3.values()
+        if 'suicide' in r['label'] and 'non' not in r['label'])
+    risk_flag = suicide_count >= 3`,
+    src: 'predict.py — clean_text lines 106–113 · predict_classical lines 119–163 · predict_transformer lines 166–215 · predict_all lines 221–302',
+    why: '<strong>Why clean the text first?</strong> The TF-IDF vocabulary was built on clean text — passing raw text would miss tokens. XLM-RoBERTa receives the raw text because its SentencePiece tokeniser handles punctuation/URLs natively. <strong>Why one TF-IDF vector for 3 classical models?</strong> All three (LR, SVM, XGBoost) use the same vectoriser — the vector is computed once and reused, saving 2 redundant transformations per dataset. <strong>Why torch.no_grad()?</strong> Inference doesn\'t need gradients — disabling them halves memory usage and speeds up the forward pass. <strong>Why softmax on logits?</strong> The model outputs raw logit scores (unbounded). Softmax converts them to probabilities that sum to 1 — required for the confidence percentage shown in the UI.',
+    outputs: [
+      {label:'Input', val:'"I feel exhausted, nothing feels enjoyable"'},
+      {label:'After clean', val:'"i feel exhausted nothing feels enjoyable"'},
+      {label:'D1 winner', val:'SVM → Major Depressive (highest confidence)'},
+      {label:'D2 winner', val:'XLM-RoBERTa → Not Depressed (Twitter Affect Bias — clinical text)'},
+      {label:'D3 result', val:'risk_flag computed from 4 model votes; XLM-R checked separately for banner'},
+      {label:'Response time', val:'~200ms local (GPU) · ~2–5s proxy (HF Space warm)'},
+    ]
+  },
+  split_study: {
+    title: 'D3 Split Study — RQ2',
+    code: `# Sample 25K per class (50K total) for the baseline
+df3_sample = df3.groupby('label').apply(
+    lambda x: x.sample(25000, random_state=42)
+).reset_index(drop=True)
+# Half splits — 12.5K per class each
+df3_h1 = df3.groupby('label').apply(
+    lambda x: x.iloc[:12500]
+).reset_index(drop=True)
+df3_h2 = df3.groupby('label').apply(
+    lambda x: x.iloc[12500:25000]
+).reset_index(drop=True)
+# Full dataset — 116K per class (232K total)
+df3_full = df3  # no sampling
+# KS test to confirm splits share same distribution
+from scipy.stats import ks_2samp
+stat, p = ks_2samp(len_sample, len_full)
+# p > 0.49 across all splits — identical distributions confirmed`,
+    src: 'notebooks/DA_3_SplitStudy.ipynb — cell 28 (sampling) · cell 4 (TrainingArguments) · cell 14 (KS test)',
+    why: '<strong>What is RQ2?</strong> "Does more training data improve performance?" The split study trains 4 separate XLM-RoBERTa models on 50K, 116K (×2), and 232K samples. <strong>KS test:</strong> Kolmogorov-Smirnov test verifies all splits come from the same distribution (p > 0.49) — ruling out that one split has easier examples. <strong>Finding:</strong> Accuracy changes by only 0.3% (98.1% → 98.0%) across 4× more data. The 50K sample fully captures the underlying signal distribution.',
+    outputs: [
+      {label:'50K sample', val:'98.1% accuracy (XLM-RoBERTa)'},
+      {label:'H1 (116K)', val:'97.8% accuracy'},
+      {label:'H2 (116K)', val:'98.0% accuracy'},
+      {label:'Full (232K)', val:'98.0% accuracy — Δ=0.1% vs 50K'},
+      {label:'KS p-value', val:'p > 0.49 across all split pairs — identical distributions'},
+    ]
+  }
+};
+function openCM(key){
+  const d = CM_DATA[key];
+  if(!d) return;
+  document.getElementById('cmTitle').textContent = d.title;
+  // Code tab
+  document.getElementById('cmt0').innerHTML =
+    '<pre class="cm-pre">'+escHTML(d.code)+'</pre>'+
+    '<div class="cm-src">Source: '+escHTML(d.src)+'</div>';
+  // Why tab
+  document.getElementById('cmt1').innerHTML =
+    '<div class="cm-why-body">'+d.why+'</div>';
+  // Output tab
+  const rows = d.outputs.map(o=>
+    '<div class="cm-out-row"><div class="cm-out-lbl">'+escHTML(o.label)+'</div>'+
+    '<div class="cm-out-val"><em>'+escHTML(o.val)+'</em></div></div>'
+  ).join('');
+  document.getElementById('cmt2').innerHTML = rows;
+  // Reset to code tab
+  switchCMTab(0);
+  document.getElementById('cmOverlay').classList.add('open');
+  document.body.style.overflow='hidden';
+}
+function closeCM(){
+  document.getElementById('cmOverlay').classList.remove('open');
+  document.body.style.overflow='';
+}
+function closeCMOutside(e){
+  if(e.target===document.getElementById('cmOverlay')) closeCM();
+}
+function switchCMTab(idx){
+  document.querySelectorAll('.cm-tab').forEach((t,i)=>t.classList.toggle('active',i===idx));
+  document.querySelectorAll('.cm-panel').forEach((p,i)=>p.classList.toggle('active',i===idx));
+}
+function escHTML(s){
+  return String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
+}
+// Close on Escape key
+document.addEventListener('keydown',e=>{ if(e.key==='Escape') closeCM(); });
+function toggleFaq(btn){
+  const item=btn.closest('.faq-item');
+  const wasOpen=item.classList.contains('open');
+  item.classList.toggle('open',!wasOpen);
+}
 </script>
 </body>
 </html>