<!-- Spaces: Esvanth / mindscan · Running · File size: 97,360 Bytes -->

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<title>MindScan — Multi-Model Framework for Depression &amp; Suicide Risk Detection</title>
<meta name="description" content="MindScan: a parallel ensemble of 12 classifiers across 3 clinical datasets for depression classification and suicide risk detection from social media text.">
<!-- Warm up both Google Fonts origins before the stylesheet is parsed: the CSS is served from
     fonts.googleapis.com, the font files it points to from fonts.gstatic.com (fetched in CORS
     mode, hence the crossorigin attribute on that hint). -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500;600&family=DM+Mono:wght@400;500&display=swap" rel="stylesheet">
<style>
/* Design tokens: every colour and shadow used by the rules below is defined once here. */
:root{
  /* Page surfaces, lightest to darkest */
  --bg:#f7f5f0;
  --bg2:#efece8;
  --bg3:#e6e2da;
  --bg4:#dedad1;

  /* Text colours, strongest to faintest */
  --ink:#1a1816;
  --ink2:#5c5750;
  --ink3:#9c9790;

  /* Hairlines: the --ink colour at 9% and 16% opacity */
  --border:rgba(26,24,22,0.09);
  --border2:rgba(26,24,22,0.16);

  /* Accent families: foreground, tinted background and (where defined) a mid tone */
  --blue:#1d4ed8;
  --blue-bg:#eff6ff;
  --blue-mid:#3b82f6;

  --amber:#b45309;
  --amber-bg:#fffbeb;
  --amber-mid:#d97706;

  --red:#b91c1c;
  --red-bg:#fef2f2;

  --green:#15803d;
  --green-bg:#f0fdf4;

  --purple:#6d28d9;
  --purple-bg:#f5f3ff;

  /* Elevation: default card shadow and a stronger variant */
  --shadow:0 1px 3px rgba(26,24,22,0.06),0 4px 16px rgba(26,24,22,0.04);
  --shadow-md:0 2px 8px rgba(26,24,22,0.08),0 8px 32px rgba(26,24,22,0.06);
}
/* Global reset: border-box sizing and no default margin/padding on any element. */
*{box-sizing:border-box;margin:0;padding:0}
/* Animate in-page anchor jumps only for users who have not asked the OS to reduce motion;
   everyone else gets the browser's default instant jump. */
@media (prefers-reduced-motion:no-preference){
  html{scroll-behavior:smooth}
}
/* Page defaults: base surface, text colour and typography; horizontal overflow is clipped. */
body{background:var(--bg);color:var(--ink);font-family:'Geist',sans-serif;font-size:15px;line-height:1.6;overflow-x:hidden}

/* ── HEADER ── */
/* Sticky page header: a flex row pinned to the top of the viewport with a translucent,
   blurred background so scrolled content shows through softly. */
header{
  padding:16px 48px;display:flex;align-items:center;justify-content:space-between;
  border-bottom:1px solid var(--border);background:rgba(247,245,240,0.94);
  position:sticky;top:0;z-index:100;
  /* Prefixed declaration first: Safari releases before 18 only recognise -webkit-backdrop-filter. */
  -webkit-backdrop-filter:blur(10px);backdrop-filter:blur(10px);
}
.logo{display:flex;align-items:center;gap:10px}
.logo-mark{width:28px;height:28px;background:var(--ink);border-radius:7px;display:flex;align-items:center;justify-content:center}
.logo-mark svg{width:14px;height:14px}
.logo-txt{font-family:'Instrument Serif',serif;font-size:18px;letter-spacing:-.02em}
.logo-txt em{font-style:italic;color:var(--ink2)}
.nav-links{display:flex;gap:2px}
.nav-links a{font-size:12px;color:var(--ink2);padding:5px 10px;border-radius:6px;text-decoration:none;transition:all .15s;font-family:'DM Mono',monospace}
.nav-links a:hover{background:var(--bg2);color:var(--ink)}
.nav-badge{font-size:10px;font-family:'DM Mono',monospace;background:var(--amber-bg);color:var(--amber);border:1px solid rgba(180,83,9,.2);padding:4px 10px;border-radius:20px}

/* ── HERO ── */
.hero{padding:80px 48px 64px;max-width:1040px;margin:0 auto}
.hero-top{display:grid;grid-template-columns:1fr 360px;gap:48px;align-items:start;margin-bottom:52px}
.hero-eyebrow{display:flex;align-items:center;gap:8px;margin-bottom:18px}
.eyebrow-dot{width:6px;height:6px;border-radius:50%;background:var(--green);animation:blink 2.5s infinite}
@keyframes blink{0%,100%{opacity:1}50%{opacity:.2}}
.eyebrow-txt{font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3);letter-spacing:.1em;text-transform:uppercase}
.hero h1{font-family:'Instrument Serif',serif;font-size:clamp(28px,3.6vw,42px);font-weight:400;line-height:1.13;letter-spacing:-.03em;color:var(--ink);margin-bottom:18px}
.hero h1 em{font-style:italic;color:var(--ink2)}
.hero-sub{font-size:15px;color:var(--ink2);line-height:1.7;margin-bottom:24px;max-width:500px}

/* RQ Cards */
.rq-cards{display:flex;flex-direction:column;gap:10px}
.rq-card{border-radius:12px;padding:16px 18px;border:1px solid}
.rq-card.rq1{background:var(--blue-bg);border-color:rgba(29,78,216,.2)}
.rq-card.rq2{background:var(--amber-bg);border-color:rgba(180,83,9,.2)}
.rq-label{font-size:9px;font-family:'DM Mono',monospace;letter-spacing:.14em;text-transform:uppercase;font-weight:500;margin-bottom:5px}
.rq-card.rq1 .rq-label{color:var(--blue)}
.rq-card.rq2 .rq-label{color:var(--amber)}
.rq-text{font-size:13px;color:var(--ink);line-height:1.5}

/* Stats panel */
.stats-panel{background:var(--bg2);border:1px solid var(--border);border-radius:16px;padding:24px;display:grid;grid-template-columns:1fr 1fr;gap:16px;box-shadow:var(--shadow)}
.stat-box{text-align:center;padding:12px;background:var(--bg);border-radius:10px;border:1px solid var(--border)}
.stat-num{font-family:'Instrument Serif',serif;font-size:28px;letter-spacing:-.02em;color:var(--ink);line-height:1}
.stat-lbl{font-size:10px;font-family:'DM Mono',monospace;color:var(--ink3);margin-top:4px;text-transform:uppercase;letter-spacing:.08em}

/* ── SECTION SHARED ── */
.section{max-width:1040px;margin:0 auto;padding:64px 48px}
.sec-eyebrow{font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.12em;text-transform:uppercase;color:var(--blue);margin-bottom:10px}
.sec-h2{font-family:'Instrument Serif',serif;font-size:clamp(24px,3.5vw,38px);font-weight:400;letter-spacing:-.02em;line-height:1.15;margin-bottom:8px}
.sec-h2 em{font-style:italic;color:var(--ink2)}
.sec-lead{font-size:14px;color:var(--ink2);max-width:560px;line-height:1.7;margin-bottom:36px}
.section-divider{border:none;border-top:1px solid var(--border);margin:0}

/* ── BASE PAPER COMPARISON ── */
.comparison-wrap{display:grid;grid-template-columns:1fr auto 1fr;gap:16px;align-items:center}
.comp-card{border-radius:14px;padding:26px;border:1px solid;box-shadow:var(--shadow)}
.comp-card.theirs{background:var(--bg2);border-color:var(--border2)}
.comp-card.ours{background:#fff;border-color:rgba(21,128,61,.25);box-shadow:var(--shadow-md)}
.comp-label{font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.1em;text-transform:uppercase;margin-bottom:12px;padding:4px 10px;border-radius:4px;display:inline-block}
.comp-card.theirs .comp-label{background:var(--bg3);color:var(--ink3)}
.comp-card.ours .comp-label{background:var(--green-bg);color:var(--green)}
.comp-title{font-family:'Instrument Serif',serif;font-size:18px;letter-spacing:-.01em;color:var(--ink);margin-bottom:4px}
.comp-sub{font-size:12px;color:var(--ink2);margin-bottom:18px}
.comp-row{display:flex;align-items:flex-start;gap:8px;margin-bottom:9px;font-size:13px}
.comp-icon{width:16px;height:16px;border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:9px;flex-shrink:0;margin-top:1px}
.comp-icon.bad{background:rgba(185,28,28,.1);color:var(--red)}
.comp-icon.good{background:var(--green-bg);color:var(--green)}
.comp-text{color:var(--ink2);line-height:1.45}
.comp-text strong{color:var(--ink)}
.comp-f1-row{margin-top:18px;padding-top:14px;border-top:1px solid var(--border);display:flex;align-items:center;gap:10px}
.comp-f1-label{font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3)}
.comp-f1-val{font-family:'Instrument Serif',serif;font-size:24px;letter-spacing:-.02em}
.comp-card.theirs .comp-f1-val{color:var(--ink3)}
.comp-card.ours .comp-f1-val{color:var(--green)}
.comp-middle{text-align:center;padding:16px 12px}
.comp-arrow{font-size:24px;color:var(--green);margin-bottom:6px}
.comp-delta{font-family:'Instrument Serif',serif;font-size:28px;color:var(--green);letter-spacing:-.02em}
.comp-delta-lbl{font-size:10px;font-family:'DM Mono',monospace;color:var(--ink3);margin-top:2px;text-transform:uppercase}

/* ── METHODOLOGY 3-STEP ── */
.method-steps{display:grid;grid-template-columns:repeat(3,1fr);gap:0;position:relative}
.method-steps::before{content:'';position:absolute;top:22px;left:22px;right:22px;height:2px;background:var(--bg3);z-index:0}
.method-step{text-align:center;position:relative;z-index:2;padding:0 20px;cursor:pointer;transition:opacity .15s}
.method-step:hover{opacity:.8}
.ms-dot{width:44px;height:44px;border-radius:50%;background:var(--ink);border:2px solid var(--ink);display:flex;align-items:center;justify-content:center;margin:0 auto 14px;font-size:12px;font-family:'DM Mono',monospace;color:#fff;transition:background .2s,border-color .2s}
.method-step.active .ms-dot{background:var(--blue);border-color:var(--blue)}
.ms-title{font-size:14px;font-weight:500;color:var(--ink);margin-bottom:7px}
.ms-body{font-size:12px;color:var(--ink2);line-height:1.65}
/* Detail panel */
.method-detail{margin-top:32px;background:#fff;border:1px solid var(--border);border-radius:14px;padding:28px 32px;box-shadow:var(--shadow);animation:fadeUp .25s ease both}
.md-panel{display:none}
.md-panel.active{display:block}
.md-title{font-size:13px;font-weight:600;color:var(--ink);margin-bottom:14px;display:flex;align-items:center;gap:8px}
.md-title-dot{width:8px;height:8px;border-radius:50%;background:var(--blue);flex-shrink:0}
.md-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px}
.md-block{padding:14px;background:var(--bg);border-radius:8px;border:1px solid var(--border)}
.md-block-lbl{font-size:9px;font-family:'DM Mono',monospace;letter-spacing:.1em;text-transform:uppercase;color:var(--ink3);margin-bottom:5px}
.md-block-val{font-size:12px;color:var(--ink);line-height:1.6}
.md-block-val strong{color:var(--red)}
.md-block-val em{color:var(--blue);font-style:normal;font-weight:500}

/* ── EVIDENCE MATRIX ── */
.matrix-wrap{overflow-x:auto;margin-top:28px}
.matrix-tbl{width:100%;border-collapse:collapse;font-size:13px}
.matrix-tbl th{text-align:center;padding:11px 16px;background:var(--bg2);border-bottom:2px solid var(--border2);font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.1em;text-transform:uppercase;color:var(--ink3);font-weight:500}
.matrix-tbl th:first-child{text-align:left;min-width:200px}
.matrix-tbl td{padding:12px 16px;border-bottom:1px solid var(--border);text-align:center;font-family:'DM Mono',monospace;font-size:13px;color:var(--ink2);vertical-align:middle}
.matrix-tbl td:first-child{text-align:left;font-family:'Geist',sans-serif;font-size:13px;color:var(--ink)}
.matrix-tbl tr:hover td{background:var(--bg2)}
.matrix-tbl tr:last-child td{border-bottom:none}
.matrix-tbl td.winner{font-weight:600;color:var(--ink)}
.matrix-tbl td.collapsed{background:rgba(185,28,28,.07);color:var(--red)}
.ds-label{display:inline-flex;align-items:center;gap:8px}
.ds-badge{font-size:9px;font-family:'DM Mono',monospace;padding:2px 7px;border-radius:3px;font-weight:500;flex-shrink:0}

/* ── MATRIX FOOTNOTE ── */
.matrix-footnote{margin-top:14px;font-size:11px;color:var(--ink3);line-height:1.6;padding:10px 14px;background:var(--bg2);border:1px solid var(--border);border-radius:7px;font-family:'DM Mono',monospace}
.matrix-footnote strong{color:var(--ink2)}

/* ── FINDINGS ── */
.findings-grid{display:grid;grid-template-columns:1fr 1fr;gap:14px}
.finding:nth-child(5){grid-column:1/-1;}
.finding{background:#fff;border:1px solid var(--border);border-radius:12px;padding:22px;box-shadow:var(--shadow)}
.finding-n{font-family:'Instrument Serif',serif;font-size:36px;color:var(--bg3);line-height:1;margin-bottom:8px}
.finding-t{font-size:13px;font-weight:500;color:var(--ink);margin-bottom:6px}
.finding-b{font-size:12px;color:var(--ink2);line-height:1.65}
.finding-chip{display:inline-block;font-family:'DM Mono',monospace;font-size:10px;background:var(--bg2);border:1px solid var(--border);padding:3px 8px;border-radius:4px;margin-top:8px;color:var(--ink2)}

/* ── VERDICT ── */
.verdict-grid{display:grid;grid-template-columns:1fr 1fr;gap:14px;margin-bottom:24px}
.verdict-card{background:#fff;border:1px solid var(--border);border-radius:12px;padding:20px;box-shadow:var(--shadow)}
.verdict-card.rq{border-left:3px solid var(--blue-mid)}
.verdict-card.lim{border-left:3px solid var(--amber-mid)}
.vc-eyebrow{font-size:9px;font-family:'DM Mono',monospace;letter-spacing:.12em;text-transform:uppercase;margin-bottom:6px;font-weight:500}
.verdict-card.rq .vc-eyebrow{color:var(--blue)}
.verdict-card.lim .vc-eyebrow{color:var(--amber)}
.vc-title{font-size:13px;font-weight:500;color:var(--ink);margin-bottom:6px}
.vc-body{font-size:12px;color:var(--ink2);line-height:1.65}
.vc-chip{display:inline-block;font-family:'DM Mono',monospace;font-size:10px;background:var(--blue-bg);border:1px solid rgba(29,78,216,.2);color:var(--blue);padding:3px 8px;border-radius:4px;margin-top:8px}
.verdict-card.lim .vc-chip{background:var(--amber-bg);border-color:rgba(180,83,9,.2);color:var(--amber)}

/* ── DEMO ── */
.demo-section{max-width:1040px;margin:0 auto;padding:64px 48px}
.input-card{background:#fff;border:1px solid var(--border);border-radius:14px;padding:24px;box-shadow:var(--shadow);margin-bottom:16px}
textarea{
  width:100%;background:var(--bg);border:1px solid var(--border2);border-radius:8px;
  padding:13px 15px;font-family:'Geist',sans-serif;font-size:14px;color:var(--ink);
  resize:vertical;min-height:100px;outline:none;line-height:1.6;transition:border-color .15s,box-shadow .15s;
}
textarea:focus{border-color:rgba(29,78,216,.4);box-shadow:0 0 0 3px rgba(29,78,216,.07)}
textarea::placeholder{color:var(--ink3)}
.input-foot{display:flex;align-items:center;justify-content:space-between;margin-top:10px;flex-wrap:wrap;gap:8px}
.char-count{font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3)}
.samples{display:flex;gap:5px;flex-wrap:wrap}
.sbtn{font-size:10px;font-family:'DM Mono',monospace;background:var(--bg2);border:1px solid var(--border);border-radius:5px;padding:5px 10px;cursor:pointer;color:var(--ink2);transition:all .15s}
.sbtn:hover{border-color:var(--border2);color:var(--ink)}
.sbtn.danger{border-color:rgba(185,28,28,.25);color:var(--red);background:var(--red-bg);display:flex;align-items:center;gap:5px}
.sbtn-pulse{width:5px;height:5px;border-radius:50%;background:var(--red);animation:blink 1.5s infinite}
.run-btn{
  width:100%;margin-top:12px;background:var(--ink);color:#fff;border:none;
  border-radius:9px;padding:13px 24px;font-family:'Geist',sans-serif;font-size:14px;
  font-weight:500;cursor:pointer;display:flex;align-items:center;justify-content:center;
  gap:8px;transition:opacity .15s,transform .1s;letter-spacing:-.01em;
}
.run-btn:hover{opacity:.87}
.run-btn:active{transform:scale(.99)}
.run-btn:disabled{opacity:.45;cursor:not-allowed}
.spinner{width:14px;height:14px;border:2px solid rgba(255,255,255,.3);border-top-color:#fff;border-radius:50%;animation:spin .7s linear infinite;display:none}
@keyframes spin{to{transform:rotate(360deg)}}
.disclaimer{font-size:11px;color:var(--ink3);line-height:1.6;padding:10px 14px;background:var(--amber-bg);border:1px solid rgba(180,83,9,.15);border-radius:7px;margin-bottom:20px}

/* Results */
.results{display:none;animation:fadeUp .35s ease both}
@keyframes fadeUp{from{opacity:0;transform:translateY(8px)}to{opacity:1;transform:translateY(0)}}
.risk-banner{border-radius:10px;padding:14px 18px;margin-bottom:16px;border:1px solid;display:flex;align-items:flex-start;gap:12px}
.risk-banner.danger{background:var(--red-bg);border-color:rgba(185,28,28,.25)}
.risk-banner.safe{background:var(--green-bg);border-color:rgba(21,128,61,.2)}
.risk-banner.warn{background:var(--amber-bg);border-color:rgba(180,83,9,.25)}
.rb-icon{font-size:18px;flex-shrink:0;margin-top:1px}
.rb-title{font-size:13px;font-weight:500;margin-bottom:3px}
.rb-body{font-size:12px;line-height:1.55;color:var(--ink2)}
.risk-banner.danger .rb-title{color:var(--red)}
.risk-banner.safe .rb-title{color:var(--green)}
.risk-banner.warn .rb-title{color:var(--amber)}
.masked-callout{background:var(--amber-bg);border:1px solid rgba(180,83,9,.2);border-radius:10px;padding:14px 18px;margin-bottom:16px;display:none;animation:fadeUp .3s ease both}
.mc-callout-title{font-size:13px;font-weight:500;color:var(--amber);margin-bottom:4px;display:flex;align-items:center;gap:7px}
.mc-callout-body{font-size:12px;color:#92400e;line-height:1.6}

.results-hdr{display:flex;align-items:center;justify-content:space-between;margin-bottom:14px}
.results-hdr-title{font-size:14px;font-weight:500;color:var(--ink)}
.elapsed-chip{font-size:10px;font-family:'DM Mono',monospace;color:var(--ink3);background:var(--bg2);border:1px solid var(--border);padding:3px 9px;border-radius:20px}
.winner-grid{display:grid;grid-template-columns:repeat(3,1fr);gap:12px}
.win-card{border-radius:12px;padding:18px;border:1px solid;animation:fadeUp .4s ease both;background:#fff;box-shadow:var(--shadow);transition:all .3s ease}
.win-card.d1{border-color:rgba(29,78,216,.2);animation-delay:.14s}
.win-card.d2{border-color:rgba(180,83,9,.2);animation-delay:.07s}
.win-card.d3{border-color:rgba(185,28,28,.2)}
/* D3 dominant state when suicide risk is flagged */
.win-card.d3.risk-active{
  border-color:var(--red);border-width:2px;
  background:var(--red-bg);box-shadow:0 0 0 4px rgba(185,28,28,.08),var(--shadow-md);
}
.win-card.d3.risk-active .wc-pred{color:var(--red)}
.wc-lbl{font-size:9px;font-family:'DM Mono',monospace;letter-spacing:.1em;text-transform:uppercase;margin-bottom:8px;line-height:1.5}
.win-card.d1 .wc-lbl{color:var(--blue)}
.win-card.d2 .wc-lbl{color:var(--amber)}
.win-card.d3 .wc-lbl{color:var(--red)}
.wc-pred{font-family:'Instrument Serif',serif;font-size:20px;letter-spacing:-.02em;color:var(--ink);margin-bottom:6px;min-height:48px;display:flex;align-items:flex-end}
.conf-row{display:flex;align-items:center;gap:8px;margin-bottom:5px}
.conf-track{flex:1;height:4px;background:var(--bg2);border-radius:2px;overflow:hidden}
.conf-fill{height:100%;border-radius:2px;transition:width .8s cubic-bezier(.4,0,.2,1);width:0}
.win-card.d1 .conf-fill{background:var(--blue-mid)}
.win-card.d2 .conf-fill{background:var(--amber-mid)}
.win-card.d3 .conf-fill{background:var(--red)}
.conf-pct{font-size:11px;font-family:'DM Mono',monospace;min-width:34px;text-align:right}
.win-card.d1 .conf-pct{color:var(--blue)}
.win-card.d2 .conf-pct{color:var(--amber)}
.win-card.d3 .conf-pct{color:var(--red)}
.win-card.d3.risk-active .conf-pct{font-size:14px;font-weight:500}
.wc-meta{font-size:11px;color:var(--ink3)}
/* Clinical Insight Alert */
.clinical-insight{background:var(--amber-bg);border:1px solid rgba(180,83,9,.2);border-left:3px solid var(--amber-mid);border-radius:10px;padding:14px 18px;margin-bottom:16px;display:none;animation:fadeUp .3s ease both}
.ci-title{font-size:13px;font-weight:500;color:var(--amber);margin-bottom:5px;display:flex;align-items:center;gap:7px}
.ci-body{font-size:12px;color:#92400e;line-height:1.65}

/* ── GLOSSARY TOOLTIPS ── */
/* Glossary term: dashed underline + help cursor. position:relative makes the term the
   positioning anchor for the two pseudo-elements below. */
.gloss{border-bottom:1px dashed var(--ink3);cursor:help;position:relative;display:inline}
/* Tooltip bubble: its text is taken from the element's data-tip attribute. Fixed 230px wide,
   centred horizontally and placed 8px above the term; starts fully transparent and never
   intercepts pointer events. */
.gloss::after{content:attr(data-tip);position:absolute;bottom:calc(100% + 8px);left:50%;transform:translateX(-50%);background:var(--ink);color:#fff;font-size:11.5px;padding:8px 12px;border-radius:8px;width:230px;white-space:normal;line-height:1.5;font-family:'Geist',sans-serif;letter-spacing:0;text-align:left;opacity:0;pointer-events:none;transition:opacity .15s;z-index:300;box-shadow:0 4px 16px rgba(0,0,0,.18)}
/* Tooltip arrow: a 5px border triangle whose only coloured side is the top, so it points
   down at the term from just below the bubble. */
.gloss::before{content:'';position:absolute;bottom:calc(100% + 2px);left:50%;transform:translateX(-50%);border:5px solid transparent;border-top-color:var(--ink);opacity:0;transition:opacity .15s;z-index:301}
/* Reveal bubble and arrow together while the term is hovered.
   NOTE(review): hover is the only trigger defined here, so keyboard and touch users have no
   way to open these tips from CSS alone; consider making the terms focusable and adding a
   matching :focus-visible selector. */
.gloss:hover::after,.gloss:hover::before{opacity:1}

/* ── CODE MODAL ── */
.cm-term{border-bottom:1px dashed var(--blue);color:var(--ink);cursor:pointer;display:inline-flex;align-items:center;gap:5px;transition:color .15s}
.cm-term:hover{color:var(--blue)}
.cm-term::after{content:'</>';font-family:'DM Mono',monospace;font-size:9px;color:var(--blue);opacity:.7;letter-spacing:-.03em}
/* Code-modal backdrop: covers the whole viewport with a dimmed, blurred layer and centres its
   child. Hidden by default; adding the .open class switches it to a flex container.
   The -webkit- prefixed blur precedes the standard one for Safari releases before 18. */
.cm-overlay{position:fixed;inset:0;background:rgba(26,24,22,.45);z-index:500;display:none;align-items:center;justify-content:center;padding:20px;-webkit-backdrop-filter:blur(3px);backdrop-filter:blur(3px)}
.cm-overlay.open{display:flex}
.cm-box{background:#fff;border-radius:16px;width:100%;max-width:680px;max-height:88vh;display:flex;flex-direction:column;box-shadow:0 24px 80px rgba(0,0,0,.18);overflow:hidden}
.cm-head{padding:20px 24px 0;display:flex;align-items:flex-start;justify-content:space-between;gap:16px}
.cm-title{font-family:'Instrument Serif',serif;font-size:22px;letter-spacing:-.02em;color:var(--ink)}
.cm-close{width:28px;height:28px;border-radius:50%;border:1px solid var(--border);background:var(--bg2);cursor:pointer;font-size:14px;display:flex;align-items:center;justify-content:center;flex-shrink:0;color:var(--ink2)}
.cm-close:hover{background:var(--bg3)}
.cm-tabs{display:flex;gap:2px;padding:12px 24px 0;border-bottom:1px solid var(--border)}
.cm-tab{font-size:11px;font-family:'DM Mono',monospace;padding:6px 14px;border-radius:6px 6px 0 0;cursor:pointer;border:1px solid transparent;border-bottom:none;color:var(--ink2);margin-bottom:-1px;background:none}
.cm-tab.active{background:#fff;border-color:var(--border);color:var(--ink)}
.cm-body{overflow-y:auto;padding:20px 24px 24px}
.cm-panel{display:none}
.cm-panel.active{display:block}
.cm-pre{background:var(--ink);color:#e8e4dc;font-family:'DM Mono',monospace;font-size:12px;line-height:1.7;padding:16px 18px;border-radius:10px;overflow-x:auto;white-space:pre;margin-bottom:10px}
.cm-src{font-size:11px;color:var(--ink3);font-family:'DM Mono',monospace;margin-top:6px}
.cm-why-body{font-size:13.5px;color:var(--ink2);line-height:1.8}
.cm-why-body strong{color:var(--ink)}
.cm-out-row{display:flex;align-items:flex-start;gap:12px;padding:10px 14px;background:var(--bg2);border-radius:8px;margin-bottom:8px}
.cm-out-lbl{font-size:10px;font-family:'DM Mono',monospace;color:var(--ink3);min-width:28px;text-transform:uppercase;margin-top:2px}
.cm-out-val{font-size:13px;color:var(--ink);line-height:1.5}
.cm-out-val em{font-family:'DM Mono',monospace;font-size:11.5px;color:var(--blue)}

/* ── FAQ ACCORDION ── */
.faq-section{max-width:1040px;margin:0 auto;padding:64px 48px}
.faq-group{margin-bottom:32px}
.faq-group-title{font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.12em;text-transform:uppercase;color:var(--blue);margin-bottom:12px}
.faq-item{border:1px solid var(--border);border-radius:10px;margin-bottom:6px;overflow:hidden;background:#fff}
.faq-q{width:100%;text-align:left;background:none;border:none;padding:14px 18px;font-size:13px;font-family:'Geist',sans-serif;color:var(--ink);cursor:pointer;display:flex;justify-content:space-between;align-items:center;gap:16px;line-height:1.45}
.faq-q:hover{background:var(--bg2)}
.faq-q .faq-chevron{font-size:10px;color:var(--ink3);flex-shrink:0;transition:transform .2s}
.faq-item.open .faq-chevron{transform:rotate(180deg)}
.faq-a{max-height:0;overflow:hidden;transition:max-height .25s ease}
.faq-item.open .faq-a{max-height:500px}
.faq-a-inner{padding:0 18px 14px;font-size:12.5px;color:var(--ink2);line-height:1.75;border-top:1px solid var(--border)}
.faq-a-inner code{font-family:'DM Mono',monospace;font-size:11px;background:var(--bg2);padding:1px 5px;border-radius:3px;color:var(--ink)}

footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3);line-height:1.9}
/* Small screens (768px and below). */
@media(max-width:768px){
  /* Narrow the 48px side gutters. .faq-section sets its own padding rather than reusing
     .section, so it has to be listed explicitly. */
  header,.section,.demo-section,.faq-section,footer{padding-left:20px;padding-right:20px}
  .hero{padding:48px 20px 40px}
  /* Collapse every multi-column grid to a single column, including the three-track
     comparison layout and the two-column methodology and verdict grids. */
  .hero-top,.comparison-wrap,.md-grid,.findings-grid,.verdict-grid,.winner-grid{grid-template-columns:1fr}
  /* Stack the three methodology steps and drop the horizontal connector line behind them. */
  .method-steps{grid-template-columns:1fr;gap:24px}
  .method-steps::before{display:none}
}
/* The status dots run the blink animation forever; stop them for users who have asked the
   operating system to reduce motion. */
@media(prefers-reduced-motion:reduce){
  .eyebrow-dot,.sbtn-pulse{animation:none}
}
</style>
</head>
<body>

<!-- HEADER -->
<header>
  <div class="logo">
    <div class="logo-mark">
      <!-- Decorative mark only: the wordmark next to it already names the site, so the
           graphic is hidden from assistive technology. -->
      <svg viewBox="0 0 14 14" fill="none" aria-hidden="true">
        <circle cx="7" cy="7" r="5.5" stroke="white" stroke-width="1.3"/>
        <path d="M5 7c0-1.2.8-2 2-2s2 .8 2 2-.8 2-2 2" stroke="white" stroke-width="1.3" stroke-linecap="round"/>
        <circle cx="7" cy="7" r="1.2" fill="white"/>
      </svg>
    </div>
    <div class="logo-txt">Mind<em>Scan</em></div>
  </div>
  <!-- In-page anchors, one per section id, plus one link to a separate page. -->
  <nav class="nav-links">
    <a href="#comparison">vs Base Paper</a>
    <a href="#methodology">Methodology</a>
    <a href="#matrix">Evidence Matrix</a>
    <a href="#findings">Findings</a>
    <a href="#verdict">Conclusions</a>
    <a href="#demo">Live Demo</a>
    <a href="#faq">FAQ</a>
    <!-- Opens in a new tab; rel="noopener" states explicitly that the new page gets no
         window.opener handle back to this one. -->
    <a href="/flow" target="_blank" rel="noopener" style="background:var(--bg3);color:var(--ink)">System Flow ↗</a>
  </nav>
  <div class="nav-badge">NCI H9DAI 2026</div>
</header>

<!-- HERO -->
<!-- Hero: page title and summary on the left, headline figures on the right. -->
<div class="hero">
  <div class="hero-top">
    <!-- Left column: eyebrow line, the page's single h1, summary and the two research questions. -->
    <div>
      <div class="hero-eyebrow"><div class="eyebrow-dot"></div><span class="eyebrow-txt">Mental health NLP research · NCI H9DAI</span></div>
      <h1>Multi-Model Framework for Depression Classification and <em>Suicide Risk Detection</em> from Social Media Text</h1>
      <p class="hero-sub">A parallel ensemble of 12 classifiers across 3 clinical datasets — extending Tumaliuan et al. (2024) with modern transformers and SMOTE balancing.</p>
      <div class="rq-cards">
        <div class="rq-card rq1">
          <div class="rq-label">RQ1</div>
          <div class="rq-text">Which machine learning model provides the highest Accuracy for identifying depression and suicide risk?</div>
        </div>
        <div class="rq-card rq2">
          <div class="rq-label">RQ2</div>
          <div class="rq-text">Does training on the full dataset (232K), a half split (116K), or a sample (50K) provide a significant boost in Accuracy?</div>
        </div>
      </div>
    </div>
    <!-- Right column: four headline figures. Each .stat-num is written as a literal 0 and
         carries its real value in data-target, with data-suffix and (for decimals) data-dec.
         NOTE(review): presumably a count-up script outside this view fills these in; confirm,
         because without it every figure would read 0. -->
    <div class="stats-panel">
      <div class="stat-box"><div class="stat-num" data-target="3" data-suffix="">0</div><div class="stat-lbl">Datasets</div></div>
      <div class="stat-box"><div class="stat-num" data-target="12" data-suffix="">0</div><div class="stat-lbl">Models trained</div></div>
      <div class="stat-box"><div class="stat-num" data-target="98.1" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">D3 Accuracy (binary)</div></div>
      <div class="stat-box"><div class="stat-num" data-target="11.4" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">vs Base Paper ↑</div></div>
    </div>
  </div>
</div>

<hr class="section-divider">

<!-- BASE PAPER COMPARISON -->
<section class="section" id="comparison">
  <div class="sec-eyebrow">Extending prior work</div>
  <!-- A real heading element so the section appears in the document outline; the .sec-h2
       class sets its font, size, weight and spacing and the global reset clears default
       margins, so it renders exactly as the former div did. -->
  <h2 class="sec-h2">Our work vs <em>Tumaliuan et al. (2024)</em></h2>
  <p class="sec-lead">Dataset 1 is structurally equivalent to the base paper's Filipino Twitter corpus — same 6-class task, same clinical annotation method — making a direct F1 comparison valid.</p>

  <!-- Three-track layout: base paper card, delta column, MindScan card. -->
  <div class="comparison-wrap">
    <div class="comp-card theirs">
      <div class="comp-label">Tumaliuan et al. — 2024</div>
      <div class="comp-title">Filipino Twitter Depression</div>
      <div class="comp-sub">Frontiers in Computer Science · word2vec pipeline</div>
      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text">Used <strong><span class="gloss" data-tip="word2vec (2013): maps words to fixed vectors based on co-occurrence. Cannot understand negation ('not happy' ≈ 'happy') or context. Superseded by transformers.">word2vec</span></strong> (2013) — static embeddings, no negation handling</div></div>
      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong><span class="gloss" data-tip="SVM (Support Vector Machine): finds the maximum-margin hyperplane separating classes. Very effective for high-dimensional text features like TF-IDF. Gold standard for NLP before transformers.">SVM never tested</span></strong> — absent from evaluation despite being NLP gold standard</div></div>
      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong><span class="gloss" data-tip="XGBoost: gradient-boosted decision trees. Sequentially builds trees to correct previous errors. Handles imbalanced data well and often beats random forests on tabular/sparse features.">XGBoost never tested</span></strong> — gradient boosting entirely absent</div></div>
      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text">Class imbalance listed as <strong>limitation — never resolved</strong></div></div>
      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong>Restricted dataset</strong> — requires author permission to access</div></div>
      <div class="comp-row"><div class="comp-icon bad">✕</div><div class="comp-text"><strong>Accuracy not verified</strong> — no reproducible baseline reported</div></div>
      <div class="comp-f1-row"><span class="comp-f1-label">Best Accuracy</span><span class="comp-f1-val">~81%</span></div>
    </div>

    <div class="comp-middle">
      <!-- The arrow is ornamental; the figure and label below it carry the meaning. -->
      <div class="comp-arrow" aria-hidden="true">→</div>
      <div class="comp-delta">+11.4%</div>
      <div class="comp-delta-lbl">accuracy gain (D1)</div>
    </div>

    <div class="comp-card ours">
      <div class="comp-label">MindScan — 2026</div>
      <div class="comp-title">English Twitter + Reddit</div>
      <div class="comp-sub">Zenodo (Nusrat 2024) · XLM-RoBERTa + SVM + XGBoost</div>
      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong><span class="gloss" data-tip="XLM-RoBERTa: 278M-parameter multilingual transformer. Fine-tuned on 100 languages. Produces contextual embeddings — the same word gets different vectors depending on surrounding context. Understands negation, irony, and long-range dependencies.">XLM-RoBERTa</span></strong> (2019) — contextual embeddings, understands negation</div></div>
      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>SVM added</strong> — best D1 accuracy 92.36%, beats transformer (90.52%)</div></div>
      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>XGBoost added</strong> — accuracy 91.76%, gradient boosting for imbalanced data</div></div>
      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><span class="gloss" data-tip="SMOTE (Synthetic Minority Oversampling Technique): generates synthetic training samples for minority classes by interpolating between existing minority-class examples in feature space. Applied to training data only — never the test set.">SMOTE</span> applied — <strong>imbalance resolved</strong>, all 6 classes equalised to 2,997</div></div>
      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>Public dataset</strong> — fully reproducible, anyone can verify results</div></div>
      <div class="comp-row"><div class="comp-icon good">✓</div><div class="comp-text"><strong>Accuracy verified</strong> on held-out 20% test set, same 6-class task</div></div>
      <div class="comp-f1-row"><span class="comp-f1-label">Best Accuracy (D1 SVM)</span><span class="comp-f1-val">92.4%</span></div>
    </div>
  </div>
</section>

<hr class="section-divider">

<!-- METHODOLOGY -->
<section class="section" id="methodology">
  <div class="sec-eyebrow">Methodology</div>
  <div class="sec-h2">Three-step <em>pipeline</em></div>
  <p class="sec-lead"><span class="gloss" data-tip="CRISP-DM: Cross-Industry Standard Process for Data Mining. 6 phases: Business Understanding → Data Understanding → Data Preparation → Modelling → Evaluation → Deployment. The de facto lifecycle framework for data science projects.">CRISP-DM</span> applied across all three datasets — from raw social media text to parallel ensemble predictions.</p>

  <div class="method-steps">
    <div class="method-step active" onclick="showMethodDetail(0)">
      <div class="ms-dot">01</div>
      <div class="ms-title">Data</div>
      <div class="ms-body">3 clinical datasets spanning Twitter and Reddit, covering depression types, binary detection, and suicide risk.</div>
    </div>
    <div class="method-step" onclick="showMethodDetail(1)">
      <div class="ms-dot">02</div>
      <div class="ms-title">Preprocessing</div>
      <div class="ms-body">6-stage text cleaning pipeline + SMOTE oversampling to address class imbalance left unresolved by the base paper.</div>
    </div>
    <div class="method-step" onclick="showMethodDetail(2)">
      <div class="ms-dot">03</div>
      <div class="ms-title">Modelling</div>
      <div class="ms-body">Parallel ensemble of 12 classifiers — all run independently on every prediction, never as a sequential cascade.</div>
    </div>
  </div>

  <div class="method-detail">
    <!-- Panel 0: Data -->
    <div class="md-panel active" id="mdp0">
      <div class="md-title"><div class="md-title-dot"></div>Dataset Overview</div>
      <div class="md-grid">
        <div class="md-block">
          <div class="md-block-lbl">D1 — Depression Types (Zenodo 14233292)</div>
          <div class="md-block-val"><em>14,983 tweets · 6 classes</em> — Postpartum (3,746), Major Depressive (2,517), Bipolar (2,443), Psychotic (2,312), No Depression (1,985), Atypical (1,980). Psychiatrist-verified labels. Class imbalance ratio: 1.89×.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl">D2 — Binary Depression (Kaggle: albertobellardini)</div>
          <div class="md-block-val"><em>10,314 tweets · 2 classes</em> — Not Depressed (8,000) / Depressed (2,314). Severe class imbalance: <strong>3.46×</strong>. Twitter short-form text. SMOTE applied to training set (8,251 → 12,800 samples). Trained on Twitter affect patterns — may underdetect atypical presentations.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl">D3 — Suicide Risk (Kaggle: nikhileswarkomati)</div>
          <div class="md-block-val"><em>232,074 Reddit posts · 2 classes</em> — Suicide / Non-Suicide (perfectly balanced, 116,037 each). Suicide posts average <em>200.8 words</em> (mean), non-suicide posts 63 words. We sample <em>50K posts</em> and compare against full/half splits to answer RQ2.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl">Business Context</div>
          <div class="md-block-val">A clinically-motivated framework for social media monitoring — applicable to platform-level moderation, mental health triage, and early intervention systems. Complements rather than replaces clinical assessment.</div>
        </div>
      </div>
    </div>
    <!-- Panel 1: Preprocessing -->
    <div class="md-panel" id="mdp1">
      <div class="md-title"><div class="md-title-dot"></div>Preprocessing Pipeline</div>
      <div class="md-grid">
        <div class="md-block">
          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('clean_text')">6-Stage Text Cleaning</span></div>
          <div class="md-block-val">1. Lowercase · 2. Strip URLs &amp; http links · 3. Remove @mentions · 4. Remove # symbols · 5. Strip punctuation · 6. Collapse whitespace. Applied identically across all three datasets for consistency.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('smote')">SMOTE — Synthetic Oversampling</span></div>
          <div class="md-block-val">Applied to D1 and D2 training sets only (D3 is pre-balanced). D1: 11,986 → <em>17,982 samples</em>. D2: 8,251 → <em>12,800 samples</em>. Creates synthetic clinical neighbours in TF-IDF feature space. Directly addresses the base paper's (Tumaliuan 2024) biggest limitation — they trained on raw imbalanced data.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('tfidf')">Feature Extraction — TF-IDF</span></div>
          <div class="md-block-val"><span class="gloss" data-tip="TF-IDF (Term Frequency–Inverse Document Frequency): scores each word by how often it appears in a document (TF) divided by how common it is across all documents (IDF). Settings: max_features=50,000, ngram_range=(1,2), sublinear_tf=True, min_df=2.">TF-IDF</span> vectoriser with unigrams + bigrams, fitted per-dataset on training data only. Captures frequency-weighted term co-occurrence patterns, well-suited for short Twitter text.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('tokeniser')">Feature Extraction — Tokeniser</span></div>
          <div class="md-block-val">XLM-RoBERTa tokeniser (max 128 tokens D1/D2, 256 tokens D3) with padding. Pre-trained multilingual contextual embeddings capture semantic meaning and long-range dependencies — critical for Reddit's longer posts.</div>
        </div>
      </div>
    </div>
    <!-- Panel 2: Modelling -->
    <div class="md-panel" id="mdp2">
      <div class="md-title"><div class="md-title-dot"></div>Ensemble Strategy &amp; Architecture</div>
      <div class="md-grid">
        <div class="md-block">
          <div class="md-block-lbl">4 Models per Dataset (12 total)</div>
          <div class="md-block-val"><span class="cm-term" onclick="openCM('lr')">Logistic Regression</span> — L2 regularised, max_iter=1000. <span class="cm-term" onclick="openCM('svm')">SVM</span> — LinearSVC, C=1.0. <span class="cm-term" onclick="openCM('xgb')">XGBoost</span> — 300 estimators, max_depth=6. <span class="cm-term" onclick="openCM('xlmr_ft')">XLM-RoBERTa</span> — fine-tuned multilingual transformer, <em>278M parameters</em>, lr=2e-5, 3 epochs.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('majority_vote')">Ensemble Vote — Risk Flag Logic</span></div>
          <div class="md-block-val">All 12 models run simultaneously on every input. A sequential design (check depression first, then suicide risk) would <strong>miss masked suicidality</strong> — a clinically documented pre-crisis pattern where affect appears normal but intent is resolved. Parallelism is a safety requirement, not a design preference.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl">XGBoost Algorithm Collapse</div>
          <div class="md-block-val">XGBoost accuracy on D3: <em>91.6% (50K sample) → 70.5% (Full 232K) → 60.1% (H1 116K)</em>. Performance degrades as training data grows. The H1/H2 results are also inconsistent (60.1% vs 71.0%) — gradient boosting is highly sensitive to data distribution shifts at this scale, making it unreliable for large Reddit corpora.</div>
        </div>
        <div class="md-block">
          <div class="md-block-lbl"><span class="cm-term" onclick="openCM('split_study')">D3 Split Study (RQ2)</span></div>
          <div class="md-block-val">D3 trained on 4 configurations: Full (232K), Half 1 (116K), Half 2 (116K), Sample (50K). XLM-RoBERTa accuracy: <em>98.1% (50K) → 97.8% (H1) → 98.0% (H2/Full)</em>. Δ = 0.3% across 4× more data. Kolmogorov-Smirnov tests confirm all splits share identical distributions (p &gt; 0.49), validating the comparison.</div>
        </div>
      </div>
    </div>
  </div>
</section>

<hr class="section-divider">

<!-- ACCURACY EVIDENCE MATRIX -->
<section class="section" id="matrix">
  <div class="sec-eyebrow">Core evaluation</div>
  <div class="sec-h2">Accuracy <em>Evidence Matrix</em></div>
  <p class="sec-lead">All 4 models evaluated across all dataset splits. <strong>Bold</strong> = winner per row. <span style="color:var(--red)">Red</span> = XGBoost collapse on larger training sets. — <span class="cm-term" onclick="openCM('eval_metrics')">How metrics are computed</span></p>

  <div class="matrix-wrap">
    <table class="matrix-tbl">
      <thead>
        <tr>
          <th>Dataset / Split</th>
          <th><span class="gloss" data-tip="Logistic Regression: linear model trained with L2 regularisation (max_iter=1000). Fast, interpretable baseline. Outputs class probabilities via sigmoid/softmax. Works well with TF-IDF sparse vectors.">Logistic Regression</span></th>
          <th><span class="gloss" data-tip="SVM (Support Vector Machine): LinearSVC, C=1.0. Finds maximum-margin hyperplane in TF-IDF feature space. Best classical model on D1 — short tweets give TF-IDF enough signal to beat contextual embeddings.">SVM</span></th>
          <th><span class="gloss" data-tip="XGBoost: gradient-boosted trees, 300 estimators, max_depth=6. Sequentially corrects previous errors. Collapses on D3 (71%) — vocabulary overlap between depressive and suicidal language confuses boosted trees.">XGBoost</span></th>
          <th><span class="gloss" data-tip="XLM-RoBERTa: 278M-parameter multilingual transformer. Fine-tuned with lr=2e-5, 3 epochs. Max 128 tokens (D1/D2) or 256 tokens (D3). Best on long-form Reddit posts — contextual embeddings capture meaning beyond keyword matching.">XLM-RoBERTa</span></th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td><div class="ds-label"><span class="ds-badge" style="background:var(--blue-bg);color:var(--blue)">D1</span> Depression Types</div></td>
          <td>91.5%</td>
          <td class="winner">92.4%</td>
          <td>91.8%</td>
          <td>90.5%</td>
        </tr>
        <tr>
          <td><div class="ds-label"><span class="ds-badge" style="background:var(--amber-bg);color:var(--amber)">D2</span> Binary Depression</div></td>
          <td>98.9%</td>
          <td>97.1%</td>
          <td>99.3%</td>
          <td class="winner">99.9%</td>
        </tr>
        <tr>
          <td><div class="ds-label"><span class="ds-badge" style="background:var(--red-bg);color:var(--red)">D3</span> Full (232K)</div></td>
          <td>94.3%</td>
          <td>94.6%</td>
          <td class="collapsed">70.5%</td>
          <td class="winner">98.0%</td>
        </tr>
        <tr>
          <td><div class="ds-label"><span class="ds-badge" style="background:var(--blue-bg);color:var(--blue)">D3</span> Half 1 (116K)</div></td>
          <td>93.8%</td>
          <td>94.2%</td>
          <td class="collapsed">60.1%</td>
          <td class="winner">97.8%</td>
        </tr>
        <tr>
          <td><div class="ds-label"><span class="ds-badge" style="background:var(--amber-bg);color:var(--amber)">D3</span> Half 2 (116K)</div></td>
          <td>93.7%</td>
          <td>94.2%</td>
          <td class="collapsed">71.0%</td>
          <td class="winner">98.0%</td>
        </tr>
        <tr>
          <td><div class="ds-label"><span class="ds-badge" style="background:var(--green-bg);color:var(--green)">D3</span> Sample (50K) ★</div></td>
          <td>93.2%</td>
          <td>93.7%</td>
          <td>91.6%</td>
          <td class="winner">98.1%</td>
        </tr>
      </tbody>
    </table>
  </div>
  <p class="matrix-footnote"><strong>Note:</strong> The full performance evaluation — including Macro F1-Score, Cohen's Kappa, and per-class metrics — is documented in the Final IEEE Report. Accuracy is shown here as the primary comparative metric for cross-dataset validation.</p>
</section>

<hr class="section-divider">

<!-- FINDINGS -->
<section class="section" id="findings">
  <div class="sec-eyebrow">Key findings</div>
  <div class="sec-h2">What the results <em>show</em></div>
  <p class="sec-lead">Five insights that directly answer the research questions.</p>

  <div class="findings-grid">
    <div class="finding">
      <div class="finding-n">01</div>
      <div class="finding-t">SVM is the best model for short-form text</div>
      <div class="finding-b">On 6-class depression type classification (D1), SVM achieves the highest Accuracy of 92.4%. Tweets average 31 words — too short for transformer contextual embeddings to gain advantage over TF-IDF bigrams.</div>
      <div class="finding-chip">D1 Accuracy: SVM 92.4%</div>
    </div>
    <div class="finding">
      <div class="finding-n">02</div>
      <div class="finding-t">XLM-RoBERTa is the best model for long-form text</div>
      <div class="finding-b">On Reddit suicide risk posts (D3), XLM-RoBERTa achieves 98.1% Accuracy with the 50K sample. Suicide posts average 200.8 words — rich enough context for transformer embeddings to dominate every competitor. D1 (Twitter, ~31 words) tells the opposite story.</div>
      <div class="finding-chip">D3 Accuracy: XLM-RoBERTa 98.1%</div>
    </div>
    <div class="finding">
      <div class="finding-n">03</div>
      <div class="finding-t">Increasing data size provided no significant gain</div>
      <div class="finding-b">Scaling from 50K to 232K samples produced only a 0.1% change in XLM-RoBERTa Accuracy (98.1% → 98.0%). Adding 182,000 more training examples gave no meaningful improvement, validating the 50K sample.</div>
      <div class="finding-chip">50K → 232K: Δ Accuracy = 0.1%</div>
    </div>
    <div class="finding">
      <div class="finding-n">04</div>
      <div class="finding-t">Social media affect ≠ clinical presentation</div>
      <div class="finding-b">D2 was trained on Twitter-style emotional language (explicit distress, slang). Clinical presentations — anhedonia ("nothing feels enjoyable"), fatigue, flat affect — use a different lexicon and are systematically under-flagged. This is the documented <em>Affective vs. Clinical Lexicon Gap</em>: models trained on social media affect fail to recognise diagnostic-criteria language.</div>
      <div class="finding-chip">D2 limitation — documented failure mode</div>
    </div>
    <div class="finding">
      <div class="finding-n">05</div>
      <div class="finding-t">Parallel architecture is the safety net</div>
      <div class="finding-b">When D2 misses a clinical presentation, D1 and D3 can still catch it. When classical D3 models over-flag depressive vocabulary, XLM-RoBERTa's contextual understanding overrides them. No single model is sufficient — the parallel ensemble exists precisely because each model's failure mode is different and partially compensated by the others.</div>
      <div class="finding-chip">Multi-task learning precedent — Zogan et al. 2024</div>
    </div>
  </div>
</section>

<hr class="section-divider">

<!-- CONCLUSIONS & VERDICT -->
<section class="section" id="verdict">
  <div class="sec-eyebrow">Conclusions</div>
  <div class="sec-h2">Research <em>verdict</em></div>
  <p class="sec-lead">Direct answers to both research questions, and the key limitations of the study.</p>

  <div class="verdict-grid">
    <div class="verdict-card rq">
      <div class="vc-eyebrow">RQ1 — Best model</div>
      <div class="vc-title">No single model wins across all tasks</div>
      <div class="vc-body">SVM (92.4%) wins on short-form Twitter text (D1) where TF-IDF bigrams capture enough signal. XLM-RoBERTa wins on long-form Reddit posts (D3: 98.1%), where contextual embeddings dominate, and also on the binary Twitter task (D2: 99.9%). Model selection must be text-length aware.</div>
      <div class="vc-chip">SVM for short text · XLM-RoBERTa for long text</div>
    </div>
    <div class="verdict-card rq">
      <div class="vc-eyebrow">RQ2 — Dataset size</div>
      <div class="vc-title">More data gave no meaningful gain</div>
      <div class="vc-body">Scaling from 50K to 232K training samples produced only a 0.1% change in XLM-RoBERTa Accuracy (98.1% → 98.0%). For this task and model, the 50K sample captures the full signal — there is no statistically significant benefit from 4× more data.</div>
      <div class="vc-chip">50K sample is sufficient · Δ = 0.1%</div>
    </div>
    <div class="verdict-card lim">
      <div class="vc-eyebrow">Limitation 1 — Affective vs. Clinical Lexicon Gap</div>
      <div class="vc-title">Social media affect ≠ clinical diagnostic criteria</div>
      <div class="vc-body">D2 was trained on Twitter explicit emotional language. Clinical presentations using diagnostic vocabulary — anhedonia ("nothing feels enjoyable"), psychomotor fatigue, flat affect — do not match that training distribution and are systematically under-flagged. This is empirical evidence of the domain gap between self-reported social media affect and clinical language, not a model defect.</div>
      <div class="vc-chip">Documented domain gap — Finding 04</div>
    </div>
    <div class="verdict-card lim">
      <div class="vc-eyebrow">Limitation 2 — Classical model lexical overfitting</div>
      <div class="vc-title">TF-IDF ignores word order and context</div>
      <div class="vc-body">Classical D3 models (LR, SVM, XGBoost) use TF-IDF bag-of-words features. Vocabulary overlapping with r/SuicideWatch posts (e.g. "exhausted", "nothing feels enjoyable") triggers false-positive suicide flags — the model sees matching tokens without understanding the sentence context. XLM-RoBERTa's contextual embeddings override these false positives, demonstrating why the transformer is the reliable D3 winner.</div>
      <div class="vc-chip">TF-IDF lexical overfitting — defer to XLM-RoBERTa</div>
    </div>
  </div>
</section>

<hr class="section-divider">

<!-- LIVE DEMO -->
<div class="demo-section" id="demo">
  <div class="sec-eyebrow">Live inference</div>
  <div class="sec-h2" style="margin-bottom:8px">Try it — <em>winner model per task</em></div>
  <p class="sec-lead" style="margin-bottom:12px">Sample 3 demonstrates masked suicidality. Try typing clinical-style depressive language ("I feel exhausted, nothing feels enjoyable") to observe the Affective vs. Clinical Lexicon Gap documented in Finding 04.</p>
  <p style="font-size:13px;color:var(--ink2);margin-bottom:24px">How the demo works: <span class="cm-term" onclick="openCM('flask_deploy')">Flask → HuggingFace proxy</span> · <span class="cm-term" onclick="openCM('predict_flow')">predict_all() inference flow</span></p>

  <div class="disclaimer"><strong>Research prototype only.</strong> Not a clinical tool. If you or someone you know is in crisis, please contact a mental health professional or emergency services immediately.</div>

  <div class="input-card">
    <textarea id="textInput" placeholder="Enter any text — tweet, Reddit post, or sentence..."></textarea>
    <div class="input-foot">
      <div class="char-count" id="charCount">0 characters</div>
      <div class="samples">
        <button class="sbtn" onclick="loadSample(0)">Sample 1 — Postpartum</button>
        <button class="sbtn" onclick="loadSample(1)">Sample 2 — Psychotic</button>
        <button class="sbtn danger" onclick="loadSample(2)"><div class="sbtn-pulse"></div>Sample 3 — Masked risk</button>
        <button class="sbtn" onclick="loadSample(3)">Sample 4 — No issue</button>
      </div>
    </div>
    <button class="run-btn" id="runBtn" onclick="runAnalysis()">
      <div class="spinner" id="spinner"></div>
      <span id="btnTxt">Run analysis</span>
    </button>
  </div>

  <div class="results" id="results">
    <div class="risk-banner" id="riskBanner">
      <div class="rb-icon" id="rbIcon"></div>
      <div><div class="rb-title" id="rbTitle"></div><div class="rb-body" id="rbBody"></div></div>
    </div>

    <!-- Clinical Insight Alert — shown when patterns warrant clinical interpretation -->
    <div class="clinical-insight" id="clinicalInsight">
      <div class="ci-title" id="ciTitle"></div>
      <div class="ci-body" id="ciBody"></div>
    </div>

    <div class="results-hdr">
      <div class="results-hdr-title">Analysis results</div>
      <div class="elapsed-chip" id="elapsed"></div>
    </div>

    <!-- Cards ordered D3 → D2 → D1 (safety-first triage) -->
    <div class="winner-grid" id="winnerGrid">
      <div class="win-card d3" id="cardD3">
        <div class="wc-lbl">D3 — Immediate Risk · XLM-RoBERTa</div>
        <div class="wc-pred" id="wpC">—</div>
        <div class="conf-row"><div class="conf-track"><div class="conf-fill" id="wbC"></div></div><div class="conf-pct" id="wcC">—</div></div>
        <div class="wc-meta">98.1% Accuracy on D3</div>
      </div>
      <div class="win-card d2" id="cardD2">
        <div class="wc-lbl">D2 — Depressed? · XLM-RoBERTa</div>
        <div class="wc-pred" id="wpB">—</div>
        <div class="conf-row"><div class="conf-track"><div class="conf-fill" id="wbB"></div></div><div class="conf-pct" id="wcB">—</div></div>
        <div class="wc-meta">99.9% Accuracy on D2</div>
      </div>
      <div class="win-card d1" id="cardD1">
        <div class="wc-lbl">D1 — Depression type · SVM</div>
        <div class="wc-pred" id="wpA">—</div>
        <div class="conf-row"><div class="conf-track"><div class="conf-fill" id="wbA"></div></div><div class="conf-pct" id="wcA">—</div></div>
        <div class="wc-meta">92.4% Accuracy on D1</div>
      </div>
    </div>
  </div>
</div>

<!-- CODE MODAL OVERLAY -->
<div class="cm-overlay" id="cmOverlay" onclick="closeCMOutside(event)">
  <div class="cm-box" id="cmBox">
    <div class="cm-head">
      <div class="cm-title" id="cmTitle"></div>
      <button class="cm-close" onclick="closeCM()">✕</button>
    </div>
    <div class="cm-tabs">
      <div class="cm-tab active" onclick="switchCMTab(0)">Code</div>
      <div class="cm-tab" onclick="switchCMTab(1)">Why</div>
      <div class="cm-tab" onclick="switchCMTab(2)">Output</div>
    </div>
    <div class="cm-body">
      <div class="cm-panel active" id="cmt0"></div>
      <div class="cm-panel" id="cmt1"></div>
      <div class="cm-panel" id="cmt2"></div>
    </div>
  </div>
</div>

<hr class="section-divider">

<!-- FAQ SECTION -->
<section class="faq-section" id="faq">
  <div class="sec-eyebrow">Defence prep</div>
  <div class="sec-h2">Frequently asked <em>questions</em></div>
  <p class="sec-lead">Click any question to expand the answer. Grouped by topic for quick navigation during Q&amp;A.</p>

  <div class="faq-group">
    <div class="faq-group-title">Data &amp; Datasets</div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">What are the three datasets and what makes them different? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">D1 — 6-class depression type classification (atypical, bipolar, major depressive, no depression, postpartum, psychotic) from Zenodo. Twitter-length text, 14,983 samples (11,986 in the training split). D2 — binary depressed/not-depressed from Twitter (10,314 samples, severe 3.46× imbalance). D3 — binary suicide/non-suicide from Reddit (232K samples, perfectly balanced 116,037 each — we use a 50K sample of 25K per class). Each dataset has a different task, different text length, and different vocabulary domain — which is precisely why running all three in parallel is informative.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">How did you handle class imbalance? Why SMOTE and not class weighting? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">D1 had 1.89× imbalance (atypical class), D2 had 3.46× imbalance. We applied <code>SMOTE</code> to training data only — never the test set. SMOTE interpolates new synthetic samples in TF-IDF feature space between existing minority-class examples. Class weighting was also evaluated; SMOTE showed equal or better Macro F1 in cross-validation. D3 was pre-balanced and required no oversampling.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Is there any data leakage in your pipeline? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">No. The train/test split (stratified 80/20) is performed first. SMOTE is then applied only to the training portion. The TF-IDF vocabulary is fitted on training data only and applied as a read-only transform to the test set. XLM-RoBERTa uses a fixed pretrained tokeniser. No test sample was ever used to inform any training decision.</div></div>
    </div>
  </div>

  <div class="faq-group">
    <div class="faq-group-title">Methodology &amp; Models</div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Why four model types per dataset? Why not just use the best one? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">Each captures a different inductive bias: Logistic Regression (linear decision boundary), SVM (maximum-margin), XGBoost (non-linear gradient-boosted trees), XLM-RoBERTa (contextual transformer). Disagreement between models is itself a signal. On D1, SVM (92.4%) beats XLM-RoBERTa (90.5%) — short tweets don't give the transformer enough context to gain advantage. On D3 (200.8-word Reddit posts), XLM-RoBERTa (98.1%) dominates every classical model.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">What are your TF-IDF settings and why? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner"><code>max_features=50,000</code> — covers the full relevant vocabulary without noise. <code>ngram_range=(1,2)</code> — unigrams + bigrams capture local phrases ("not happy", "kill myself") that unigrams miss. <code>sublinear_tf=True</code> — applies log(1+tf) to dampen high-frequency word dominance. <code>min_df=2</code> — removes hapax legomena (words appearing only once) that add noise.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">How was XLM-RoBERTa fine-tuned? What hyperparameters? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">Standard sequence classification fine-tuning: Adam optimiser, <code>lr=2e-5</code>, <code>3 epochs</code>, linear warmup scheduler. Max token length: 128 for D1/D2 (Twitter-length text), 256 for D3 (Reddit posts average 200.8 words). Cross-entropy loss. Best checkpoint saved by validation accuracy. 278M parameters — multilingual pretraining covers 100 languages.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Why did XGBoost collapse on D3 at full scale? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">On the 50K sample, XGBoost achieves 91.6% — competitive. At full scale (232K), it collapses to 70.52% (Macro F1: 0.6998). This is TF-IDF lexical overfitting: vocabulary overlap between "suicide" and "non-suicide" Reddit posts increases with scale — words like "exhausted", "hopeless", "nothing matters" appear in both classes. Boosted trees memorise these majority-class token patterns instead of learning discriminative boundaries. H1 (116K) drops further to 60.1%, and H1 vs H2 are inconsistent (60.1% vs 71.0%), confirming XGBoost is unstable at this data scale. XLM-RoBERTa stays between 97.8% and 98.1% across all splits.</div></div>
    </div>
  </div>

  <div class="faq-group">
    <div class="faq-group-title">Results &amp; Evaluation</div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Why is SVM accuracy 92.4% on D1 but XLM-RoBERTa (278M params) only gets 90.5%? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">Text length. D1 tweets average ~31 words. Transformers need rich context to outperform classical methods — contextual embeddings add little value on ~40-token inputs. TF-IDF bigrams on short explicit text (like tweets) already capture the full signal. This is Finding 01 and one of the key research conclusions: model selection must be text-length aware.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Why show accuracy rather than Macro F1? Isn't accuracy misleading on imbalanced data? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">The dashboard shows accuracy for accessibility (non-specialist audience). After SMOTE, all training classes are equalised — so accuracy and Macro F1 are closely aligned. The full Macro F1, Cohen's Kappa, and per-class precision/recall are reported in the IEEE technical report. The evidence matrix footnote notes this explicitly.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Did adding more training data (50K → 232K) improve D3 results? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">No — XLM-RoBERTa: 98.1% (50K, NB2) · 98.02% (Full 232K) · 97.78% (H1) · 98.02% (H2). Maximum delta = 0.32%. KS tests across the three split-study configurations (Full, H1, H2) confirm identical distributions: suicide class p=0.4967 (H1 vs H2), p=0.9758 (Full vs H1); non-suicide class p=0.8125 (H1 vs H2), p=0.9992 (Full vs H1). All well above the p=0.05 threshold — distribution shift is not driving the results. This is Finding 03.</div></div>
    </div>
  </div>

  <div class="faq-group">
    <div class="faq-group-title">Architecture &amp; Live Demo</div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Is the live demo using real models or hardcoded responses? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">Real models. The Flask app proxies every request to a HuggingFace Space (<code>esvanth-mindscan.hf.space</code>) which runs <code>predict.py</code> with all 12 loaded models. There is no hardcoded data — every input goes through the full pipeline. If the Space is sleeping it auto-wakes within ~60 seconds.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">What does "Ensemble Conflict" (amber banner) mean? Why not just show red? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">It means classical D3 models (LR/SVM/XGBoost) flagged suicide risk by majority vote, but XLM-RoBERTa — the best model at 98.1% accuracy — disagrees. A pure majority vote could trigger false alarms on metaphorical language ("I'm dying of embarrassment"). The amber state expresses uncertainty rather than forcing a binary decision, which maps directly to "escalate for human review" — the appropriate clinical-conservative response.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">Why does D2 under-flag clinical-style text like "I feel exhausted, nothing feels enjoyable"? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">This is the Affective vs. Clinical Lexicon Gap (Finding 04, documented in NAACL 2024). D2 was trained on Twitter emotional language — explicit distress, slang, emotional punctuation. Clinical presentations use diagnostic vocabulary: anhedonia ("nothing feels enjoyable"), psychomotor fatigue, flat affect. These words are absent from D2's training distribution. This is not a bug — it is an empirical finding about the domain gap between social media affect and clinical language.</div></div>
    </div>
    <div class="faq-item">
      <button class="faq-q" onclick="toggleFaq(this)">What is the single most important future direction? <span class="faq-chevron">▼</span></button>
      <div class="faq-a"><div class="faq-a-inner">Replace TF-IDF classical models with <strong>MentalBERT/MentalRoBERTa</strong> (Ji et al. 2022) pretrained on mental health forum data. Combine all three tasks in a true multi-task learning setup with a shared encoder and task-specific heads — following the MTL precedent from Zogan et al. (2024). This would address both documented limitations (Affective Lexicon Gap and TF-IDF overfitting) simultaneously.</div></div>
    </div>
  </div>
</section>

<footer>
  MindScan · NCI H9DAI Research Project 2026 · Academic Prototype Only<br>
  Datasets: Zenodo 14233292 · Kaggle albertobellardini · Kaggle nikhileswarkomati<br>
  Not for clinical use · MSc Artificial Intelligence coursework
</footer>

<script>
// ── METHODOLOGY PANEL SWITCH ──────────────────────────────────────
/**
 * Switch the methodology stepper to the step at position `idx`.
 *
 * The `.method-step` element and the `.md-panel` element at that position
 * receive the `active` class; every other step and panel loses it.
 *
 * @param {number} idx Zero-based position of the step/panel to activate.
 */
function showMethodDetail(idx){
  // Give `active` only to the element at position `idx` within the selection.
  const activateOnly=function(selector){
    document.querySelectorAll(selector).forEach(function(node,pos){
      node.classList.toggle('active',pos===idx);
    });
  };
  activateOnly('.method-step');
  activateOnly('.md-panel');

  // Re-trigger the container animation: force it off now, then drop the
  // inline override on the next frame.
  const detail=document.querySelector('.method-detail');
  detail.style.animation='none';
  requestAnimationFrame(function(){
    detail.style.animation='';
  });
}

// ── COUNTER ANIMATION ─────────────────────────────────────────────
/**
 * Animate every `.stat-num[data-target]` element from 0 up to its target value.
 *
 * Attributes read from each element:
 *   - data-target: the final number (parsed with parseFloat)
 *   - data-dec:    number of decimal places to show (defaults to 0)
 *   - data-suffix: text appended after the number (defaults to '')
 *
 * Elements whose data-target is not a finite number are left untouched, so the
 * text authored in the markup stays visible instead of being replaced by "NaN".
 */
function animateCounters(){
  const duration=1400; // total animation time in ms
  document.querySelectorAll('.stat-num[data-target]').forEach(el=>{
    const target=parseFloat(el.getAttribute('data-target'));
    // A malformed data-target would otherwise be written out as "NaN" on every frame.
    if(!Number.isFinite(target))return;
    const dec=parseInt(el.getAttribute('data-dec')||'0',10);
    const suffix=el.getAttribute('data-suffix')||'';
    const start=performance.now();
    function step(now){
      // p: linear progress clamped to [0,1]; ease: cubic ease-out of p.
      const p=Math.min((now-start)/duration,1);
      const ease=1-Math.pow(1-p,3);
      const val=target*ease;
      el.textContent=(dec>0?val.toFixed(dec):Math.floor(val))+suffix;
      // Keep scheduling frames until the final value has been written.
      if(p<1)requestAnimationFrame(step);
    }
    requestAnimationFrame(step);
  });
}
// Start the counter animation 300 ms after the window's load event fires.
window.addEventListener('load',function(){
  setTimeout(animateCounters,300);
});

// ── SAMPLES ───────────────────────────────────────────────────────
// Example texts for the demo textarea; loadSample(i) copies SAMPLES[i] into it.
// NOTE(review): the order presumably matches the sample buttons in the markup — confirm before reordering.
const SAMPLES=[
  "I been going through depression after having my baby. I didn't even realise it till recently. I always said I'm strong but your emotions change, your body changes. I need time.",
  "The universe is sending me signals I should follow. I know it's the psychosis. I know it isn't real. But it feels so real every single day.",
  "I've sorted everything out. Told my friends I love them. Finally feel at peace with my decision. I'm not sad anymore — just ready.",
  "Had such a great day today! Went hiking with friends and saw the most amazing sunset. Feeling really grateful and happy to be alive."
];
// Textarea holding the text to analyse; runAnalysis() reads it as well.
const ta=document.getElementById('textInput');

// Refresh the "N characters" readout from the textarea's current value.
function updateCharCount(){
  document.getElementById('charCount').textContent=ta.value.length+' characters';
}
ta.addEventListener('input',updateCharCount);

/**
 * Copy sample text number `i` into the textarea and refresh the counter.
 * Assigning to `value` from script does not fire an `input` event, so the
 * counter is updated explicitly.
 * @param {number} i index into SAMPLES; anything that is not a valid index is ignored
 */
function loadSample(i){
  const sample=SAMPLES[i];
  // Without this guard an out-of-range index would put the literal text
  // "undefined" into the textarea.
  if(typeof sample!=='string')return;
  ta.value=sample;
  updateCharCount();
}


// ── RUN ANALYSIS ──────────────────────────────────────────────────
/**
 * POST the textarea contents to /predict and render the JSON response.
 *
 * While the request is in flight the run button is disabled, the spinner is
 * shown and the previous results are hidden; the button and spinner are
 * always restored in `finally`.
 *
 * Each failure mode gets its own message, so a rendering bug or a malformed
 * response is no longer reported as "Cannot reach the inference backend":
 *   - fetch() rejects (network error)             → backend-unreachable message
 *   - non-2xx with a JSON `error` field           → that error text
 *   - non-2xx 502–504 without a JSON `error`      → backend-unreachable message
 *   - other non-2xx                               → generic failure + HTTP status
 *   - 2xx with an unreadable body                 → unreadable-response message
 *   - render() throws                             → display-failure message (logged)
 */
async function runAnalysis(){
  const text=ta.value.trim();if(!text)return;
  const btn=document.getElementById('runBtn');
  const sp=document.getElementById('spinner');
  const bt=document.getElementById('btnTxt');
  const UNREACHABLE='Cannot reach the inference backend. The HuggingFace Space may be waking up — wait 30 seconds and try again.';
  btn.disabled=true;sp.style.display='block';bt.textContent='Running models...';
  document.getElementById('results').style.display='none';
  try{
    let r;
    try{
      r=await fetch('/predict',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({text})});
    }catch(e){
      alert(UNREACHABLE);
      return;
    }

    // The body may not be JSON (e.g. an HTML gateway error page).
    let d=null;
    try{d=await r.json()}catch(e){d=null}

    if(!r.ok){
      if(d&&d.error)alert('Error: '+d.error);
      else if(r.status>=502&&r.status<=504)alert(UNREACHABLE);
      else alert('Error: failed (HTTP '+r.status+')');
      return;
    }
    if(d===null||typeof d!=='object'){
      alert('Error: the backend returned a response that could not be read.');
      return;
    }

    try{
      render(d);
    }catch(e){
      // Keep the real cause visible to developers; the results panel stays hidden.
      console.error('render() failed',e);
      alert('Error: the backend response could not be displayed (unexpected format).');
    }
  }finally{
    btn.disabled=false;sp.style.display='none';bt.textContent='Run analysis';
  }
}

/**
 * Paint a /predict response into the results panel.
 *
 * Fields read from `d`:
 *   d.dataset1.models['SVM']          → D1 result shown in card A
 *   d.dataset2.models['XLM-RoBERTa']  → D2 result shown in card B
 *   d.dataset3.models['XLM-RoBERTa']  → D3 result shown in card C
 *   d.risk_flag                       → ensemble flag for D3
 *   d.suicide_votes                   → vote summary inserted into messages
 *   d.processing_time_ms              → elapsed-time readout
 * Each model result supplies `label` (string) and `confidence` (0–1 number).
 *
 * Banner states:
 *   danger — risk_flag set and XLM-RoBERTa (D3) predicts suicide
 *   warn   — risk_flag and XLM-RoBERTa disagree, in either direction
 *   safe   — neither the flag nor XLM-RoBERTa indicates suicide risk
 *
 * Previously the "XLM-RoBERTa predicts suicide but risk_flag is false" case
 * showed the green banner with "D3 did not detect suicidal ideation markers"
 * while card C displayed the suicide label; it is now an amber conflict.
 *
 * @param {object} d parsed JSON body returned by POST /predict
 */
function render(d){
  const byId = id => document.getElementById(id);

  const d1res = d.dataset1.models['SVM'];
  const d2res = d.dataset2.models['XLM-RoBERTa'];
  const d3res = d.dataset3.models['XLM-RoBERTa'];
  const d1Label = d1res.label.toLowerCase();
  const d2Label = d2res.label.toLowerCase();
  const d3Label = d3res.label.toLowerCase();
  const isRisk  = Boolean(d.risk_flag);
  const notDepressed = d2Label.includes('not');
  // "no depression" may come back as "no" from some model versions — none of the 5 disorder classes contain "no"
  const hasDisorder  = !d1Label.includes('no');
  const isSuicide    = d3Label.includes('suicide') && !d3Label.includes('non');
  // Disagreement: the ensemble flag is set but XLM-RoBERTa (best model) says non-suicide
  const majorityVsWinner = isRisk && !isSuicide;
  // Disagreement the other way: XLM-RoBERTa says suicide but the ensemble flag is not set
  const winnerVsMajority = !isRisk && isSuicide;

  // ── Risk banner ───────────────────────────────────────────────
  const rb=byId('riskBanner');
  if(isRisk && isSuicide){
    // XLM-RoBERTa (best model) confirms suicide risk
    rb.className='risk-banner danger';
    byId('rbIcon').textContent='⚠';
    byId('rbTitle').textContent='High Suicide Risk Detected';
    byId('rbBody').textContent='D3 flagged this text ('+d.suicide_votes+'). This is a research prototype — seek professional help if needed.';
  }else if(majorityVsWinner){
    // Classical models flagged risk but XLM-RoBERTa (best model) disagrees
    rb.className='risk-banner warn';
    byId('rbIcon').textContent='⚡';
    byId('rbTitle').textContent='Ensemble Conflict — Classical Models Flagged Risk';
    byId('rbBody').textContent=d.suicide_votes+', but XLM-RoBERTa (best model, 98.1% accuracy) rates this as '+d3res.label+'. Classical TF-IDF models may be over-flagging depressive language.';
  }else if(winnerVsMajority){
    // XLM-RoBERTa alone predicts suicide: never present this as an all-clear
    rb.className='risk-banner warn';
    byId('rbIcon').textContent='⚡';
    byId('rbTitle').textContent='Ensemble Conflict — XLM-RoBERTa Flagged Risk';
    byId('rbBody').textContent='XLM-RoBERTa (best model, 98.1% accuracy) rates this as '+d3res.label+' ('+pct(d3res.confidence)+' confidence), but the ensemble vote did not reach the risk threshold ('+d.suicide_votes+'). Human review recommended. This is a research prototype — seek professional help if needed.';
  }else{
    rb.className='risk-banner safe';
    byId('rbIcon').textContent='✓';
    byId('rbTitle').textContent='No immediate crisis risk detected';
    byId('rbBody').textContent='D3 did not detect suicidal ideation markers. ('+d.suicide_votes+')';
  }

  // ── D3 card dominant state ────────────────────────────────────
  const cardD3=byId('cardD3');
  const d3lbl=cardD3.querySelector('.wc-lbl');
  // The card is highlighted only for confirmed risk (flag set and XLM-RoBERTa agrees)
  cardD3.classList.toggle('risk-active', isRisk && isSuicide);
  d3lbl.textContent = majorityVsWinner
    ? 'D3 — '+d.suicide_votes+' (classical) · XLM-RoBERTa dissents'
    : 'D3 — Immediate Risk · XLM-RoBERTa';

  // ── Clinical Insight Alert ────────────────────────────────────
  const ci=byId('clinicalInsight');
  const ciTitle=byId('ciTitle');
  const ciBody=byId('ciBody');

  // Masked suicidality requires XLM-RoBERTa (best D3 model) to also flag suicide,
  // not just the classical models — prevents false positives on plain depressive text
  if(isRisk && isSuicide && notDepressed){
    // Masked suicidality — confirmed by XLM-RoBERTa + majority vote
    ciTitle.textContent='⚡ Clinical Insight — Masked Suicidality Pattern Detected';
    ciBody.textContent='This text shows low depressive affect (D2: '+d2res.label+') but high intent resolution (D3: Suicide Risk). This is a clinically documented pre-crisis pattern where a person appears calm and resolved rather than distressed. A sequential pipeline gating D3 behind D2 would have missed this entirely — demonstrating the necessity of the parallel architecture.';
    ci.style.display='block';
  }else if(majorityVsWinner && notDepressed){
    // Classical models flag risk but XLM-RoBERTa disagrees — model disagreement
    ciTitle.textContent='⚠ Clinical Insight — Ensemble Disagreement';
    ciBody.textContent=d.suicide_votes+' (classical models), but XLM-RoBERTa rates this as '+d3res.label+' ('+pct(d3res.confidence)+' confidence). XLM-RoBERTa (98.1% accuracy) likely correct here — classical TF-IDF models can over-flag depressive language as suicide risk. Human review recommended.';
    ci.style.display='block';
  }else if(hasDisorder && notDepressed){
    // Disorder type detected but no depressive affect — affect mismatch
    ciTitle.textContent='⚠ Clinical Insight — Affect Mismatch Detected';
    ciBody.textContent='D1 identifies '+d1res.label+' presentation, yet D2 finds no classic depressive affect. This is expected: D2 detects Twitter-style depressive language patterns, while psychotic, atypical, and bipolar presentations often do not match that affect profile. The patient is not presenting with classic depressive symptoms but the disorder classification remains clinically valid.';
    ci.style.display='block';
  }else{
    ci.style.display='none';
  }

  byId('elapsed').textContent=d.processing_time_ms+'ms';

  // Fixed winner per task: SVM for D1, XLM-RoBERTa for D2 and D3
  setW('A', d1res);
  setW('B', d2res);
  setW('C', d3res);

  const results=byId('results');
  results.style.display='block';
  results.scrollIntoView({behavior:'smooth',block:'start'});
}

/**
 * Fill one winner card: label text, confidence text and, 100 ms later,
 * the width of the confidence bar.
 * @param {string} id   card suffix used in the element ids `wp`+id, `wc`+id, `wb`+id
 * @param {{label:string, confidence:number}} res model result to display
 */
function setW(id,res){
  const labelEl=document.getElementById('wp'+id);
  labelEl.textContent=res.label;
  const confEl=document.getElementById('wc'+id);
  confEl.textContent=pct(res.confidence);
  setTimeout(function(){
    const bar=document.getElementById('wb'+id);
    const widthPct=(res.confidence*100).toFixed(1);
    bar.style.width=widthPct+'%';
  },100);
}

/**
 * Format a 0–1 fraction as a percentage with one decimal place.
 * @param {number} v fraction, e.g. 0.981
 * @returns {string} e.g. "98.1%"
 */
function pct(v){
  const scaled=v*100;
  return `${scaled.toFixed(1)}%`;
}

/* ── CODE MODAL DATA ── */
// Content for the code modal, keyed by the id passed to openCM(key).
// Every entry has the same shape:
//   title   — modal heading (set via textContent)
//   code    — snippet shown in the Code tab (HTML-escaped before display)
//   src     — where the snippet lives in the project (HTML-escaped)
//   why     — rationale shown in the Why tab; inserted as raw HTML, so it may
//             contain <strong> tags and must only ever hold trusted text
//   outputs — rows for the Output tab: [{label, val}] (both HTML-escaped)
const CM_DATA = {
  clean_text: {
    title: 'clean_text() — Text Preprocessing',
    code: `def clean_text(text):
    text = str(text).lower()
    # remove URLs
    text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text)
    # remove @mentions
    text = re.sub(r'@\\w+', '', text)
    # remove # symbol (keep hashtag word)
    text = re.sub(r'#', '', text)
    # strip all punctuation
    text = text.translate(
        str.maketrans('', '', string.punctuation)
    )
    # collapse whitespace
    text = re.sub(r'\\s+', ' ', text).strip()
    return text`,
    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 · notebooks/DA_2_Notebook.ipynb — cell 6 · predict.py lines 106–113',
    why: '<strong>Why lowercase?</strong> "Sad" and "sad" must map to the same TF-IDF token. <strong>Why remove URLs?</strong> Hundreds of unique tokens, zero semantic value — pure noise. <strong>Why keep hashtag words?</strong> "#depressed" → "depressed" preserves the semantic signal, removes the markup. <strong>Why no stemming?</strong> Stemming degrades bigram quality — "kill myself" would become "kill myself" but "killing" → "kill" breaks n-gram boundaries. Same function is used at both training time (notebook) and inference time (predict.py) to guarantee identical preprocessing.',
    outputs: [
      {label:'Input', val:'"I been going through #Depression after @user check https://t.co/xyz!!"'},
      {label:'Output', val:'"i been going through depression after check"'},
      {label:'Note', val:'Applied to all 3 datasets before TF-IDF and before XLM-RoBERTa tokenisation'},
    ]
  },
  smote: {
    title: 'SMOTE — Synthetic Minority Oversampling',
    code: `def apply_smote(X_train, y_train):
    before = Counter(y_train)
    smote = SMOTE(random_state=42)
    X_bal, y_bal = smote.fit_resample(X_train, y_train)
    after = Counter(y_bal)
    print(f'SMOTE: {sum(before.values())} → {sum(after.values())}')
    return X_bal, y_bal

# Called AFTER TF-IDF vectorisation, AFTER train/test split
X1_bal, y1_bal = apply_smote(X1_tr_tf, y1_tr)`,
    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 (apply_smote def) · cell 10 (D1 call) · cell 17 (D2 call) · D3 skipped',
    why: '<strong>Why after TF-IDF?</strong> SMOTE interpolates in feature space — it creates synthetic TF-IDF vectors, not synthetic text. <strong>Why not before the split?</strong> Applying SMOTE before splitting would let synthetic samples leak into the test set — the test set must contain only real data. <strong>Why not class_weight instead?</strong> Class weighting reweights the loss function — it doesn\'t add new training examples. SMOTE was chosen because it physically fills the minority-class region of feature space, giving tree-based models (RF, XGB) more to learn from. <strong>D3 skipped:</strong> D3 is pre-balanced (116K each class) — no intervention needed.',
    outputs: [
      {label:'D1', val:'11,986 → 17,982 samples (atypical: 1,584 → 2,997, each class equalised)'},
      {label:'D2', val:'8,251 → 12,800 samples (Depressed: 1,851 → 6,400)'},
      {label:'D3', val:'Skipped — pre-balanced at 116,037 per class'},
    ]
  },
  tfidf: {
    title: 'TfidfVectorizer — Feature Extraction',
    code: `def make_tfidf(X_train, X_test, max_features=50000):
    tfidf = TfidfVectorizer(
        max_features=50000,   # top 50K tokens by corpus frequency
        ngram_range=(1, 2),   # unigrams AND bigrams
        sublinear_tf=True,    # log(1+tf) instead of raw tf
        min_df=2              # ignore tokens appearing < 2 times
    )
    Xtr = tfidf.fit_transform(X_train)  # fit on train only
    Xte = tfidf.transform(X_test)       # apply to test (no fit)
    return tfidf, Xtr, Xte`,
    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 (make_tfidf def) · cell 10 (D1) · cell 17 (D2) · cell 22 (D3)',
    why: '<strong>ngram_range=(1,2):</strong> Bigrams capture "kill myself", "not happy", "feeling better" — critical signals that unigrams miss entirely. <strong>sublinear_tf=True:</strong> Applies log(1+tf) to dampen high-frequency word dominance. Without this, common words like "i", "feel" swamp the features. <strong>min_df=2:</strong> Removes hapax legomena (words appearing only once) — they add 0 generalisable information. <strong>fit only on train:</strong> Vocabulary is locked on training data — the test set is transformed using this fixed vocabulary, preventing any data leakage.',
    outputs: [
      {label:'D1 shape', val:'11,986 × 50,000 sparse matrix (tweets × features)'},
      {label:'D2 shape', val:'8,251 × 50,000 sparse matrix'},
      {label:'D3 shape', val:'40,000 × 50,000 sparse matrix'},
      {label:'After SMOTE', val:'D1 becomes 17,982 × 50,000, D2 becomes 12,800 × 50,000'},
    ]
  },
  tokeniser: {
    title: 'XLM-RoBERTa Tokeniser',
    code: `tokenizer = AutoTokenizer.from_pretrained(
    'FacebookAI/xlm-roberta-base'
)

def tokenize_tweets(examples):
    return tokenizer(
        examples['text'],
        max_length=128,      # 128 for D1/D2 (tweets avg ~40 tokens)
        truncation=True,     # cut anything beyond max_length
        padding='max_length' # pad shorter inputs to fixed length
    )

# D3 uses max_length=256 — Reddit posts avg 200.8 words (~280 tokens)
def tokenize_reddit(examples):
    return tokenizer(
        examples['text'],
        max_length=256,
        truncation=True,
        padding='max_length'
    )`,
    src: 'notebooks/DA_2_Notebook.ipynb — cell 9 (tokenize_tweets, max_length=128, D1/D2) · cell 21 (tokenize_reddit, max_length=256, D3)',
    why: '<strong>SentencePiece subword tokenisation:</strong> Splits unknown words into subword pieces — "suicidal" might become ["su", "ici", "dal"]. No word is truly out-of-vocabulary. <strong>max_length=128 for D1/D2:</strong> Tweets average ~31 words ≈ 40 tokens. 128 is 3× headroom. <strong>max_length=256 for D3:</strong> Reddit posts average 200.8 words ≈ 280 tokens — 128 would truncate most of the signal. <strong>padding=\'max_length\':</strong> All batches must be identical length for GPU tensor operations — shorter inputs are padded with [PAD] tokens. The attention mask tells the model to ignore padding.',
    outputs: [
      {label:'D1/D2 shape', val:'Each input → tensor of shape [128] (input_ids) + [128] (attention_mask)'},
      {label:'D3 shape', val:'Each input → tensor of shape [256] × 2'},
      {label:'Example', val:'"i feel hopeless" → input_ids: [0, 444, 7809, 73542, 2, 1, 1, ...]'},
    ]
  },
  lr: {
    title: 'Logistic Regression',
    code: `LogisticRegression(
    max_iter=1000,            # enough iterations to converge on 50K features
    class_weight='balanced',  # backup alongside SMOTE
    random_state=42,
    n_jobs=-1                 # use all CPU cores
)`,
    src: 'notebooks/DA_Notebook_One.ipynb — cell 11 (D1) · cell 18 (D2) · cell 23 (D3)',
    why: '<strong>Why use it?</strong> Fast, interpretable linear baseline. On 50,000 TF-IDF features, L2 regularisation prevents overfitting by shrinking large weights toward zero. Outputs calibrated probabilities via softmax — important for confidence scores in the UI. <strong>class_weight=\'balanced\':</strong> Secondary guard alongside SMOTE — the model pays proportionally more attention to minority classes during gradient updates.',
    outputs: [
      {label:'D1', val:'91.5% accuracy — solid baseline, beaten by SVM'},
      {label:'D2', val:'98.9% accuracy'},
      {label:'D3', val:'93.2% accuracy'},
    ]
  },
  svm: {
    title: 'SVM — LinearSVC',
    code: `LinearSVC(
    C=1.0,                    # regularisation strength (lower = more reg)
    class_weight='balanced',
    max_iter=2000,
    random_state=42
)

# LinearSVC has no predict_proba — use decision_function + softmax
scores = model.decision_function(vec)[0]
e = np.exp(scores - scores.max())
conf = float(e[pred_idx] / e.sum())`,
    src: 'notebooks/DA_Notebook_One.ipynb — cell 11 (D1) · cell 18 (D2) · cell 23 (D3) · predict.py lines 147–154 (confidence fallback)',
    why: '<strong>Why SVM wins on D1?</strong> LinearSVC finds the maximum-margin hyperplane in TF-IDF feature space — the optimal linear decision boundary for sparse high-dimensional data. Tweets (31 words avg) produce sparse TF-IDF vectors where the margin is well-defined. Contextual embeddings (XLM-RoBERTa) add no value at this sentence length. <strong>Why LinearSVC over SVC(kernel=\'rbf\')?</strong> Linear kernel scales to 50,000 features. RBF kernel would be O(n²) — computationally infeasible.',
    outputs: [
      {label:'D1', val:'92.4% accuracy — best model on D1, beats XLM-RoBERTa (90.5%)'},
      {label:'D2', val:'97.1% accuracy'},
      {label:'D3', val:'77.8% accuracy'},
    ]
  },
  xgb: {
    title: 'XGBoost — XGBClassifier',
    code: `XGBClassifier(
    n_estimators=300,         # 300 trees built sequentially
    learning_rate=0.1,        # each tree contributes 10% of its weight
    max_depth=6,              # max tree depth — controls complexity
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)`,
    src: 'notebooks/DA_Notebook_One.ipynb — cell 11 (D1) · cell 18 (D2) · cell 23 (D3)',
    why: '<strong>Gradient boosting principle:</strong> Each new tree is trained to correct the residual errors of all previous trees. 300 trees × learning_rate=0.1 = strong ensemble. <strong>Why does it collapse on D3?</strong> Vocabulary overlap between depressive and suicidal language in Reddit posts — words like "exhausted", "hopeless" appear in both classes. Boosted trees memorise these majority-class token patterns and fail at full scale (232K). XGBoost is highly sensitive to distribution shifts at this scale, shown by inconsistent H1/H2 results (60.1% vs 71.0%).',
    outputs: [
      {label:'D1', val:'91.8% accuracy'},
      {label:'D2', val:'99.3% accuracy'},
      {label:'D3 (50K)', val:'91.6% — performs well on sample'},
      {label:'D3 (Full 232K)', val:'70.5% — collapse (lexical overfitting)'},
    ]
  },
  xlmr_ft: {
    title: 'XLM-RoBERTa Fine-Tuning',
    code: `xlmr = AutoModelForSequenceClassification.from_pretrained(
    'FacebookAI/xlm-roberta-base',
    num_labels=NUM_LABELS   # 6 for D1, 2 for D2, 2 for D3
)

args = TrainingArguments(
    num_train_epochs=3,
    learning_rate=2e-5,           # standard BERT fine-tuning rate
    per_device_train_batch_size=16,  # 8 for D3 (longer sequences)
    gradient_accumulation_steps=2,   # D3 only — simulates batch=16
    warmup_steps=200,             # gradual LR increase at start
    weight_decay=0.01,            # L2 regularisation on weights
    load_best_model_at_end=True,  # save epoch with lowest val loss
    fp16=torch.cuda.is_available()  # half precision — 2× faster on GPU
)

trainer = Trainer(
    model=xlmr, args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok
)
trainer.train()`,
    src: 'notebooks/DA_2_Notebook.ipynb — cell 10 (model init D1) · cell 11 (TrainingArguments D1) · cell 17 (D2) · cell 22 (model init D3) · cell 23 (TrainingArguments D3)',
    why: '<strong>lr=2e-5:</strong> Standard for fine-tuning BERT-family models. Too high destroys pretrained weights (catastrophic forgetting). Too low fails to converge in 3 epochs. <strong>warmup_steps=200:</strong> LR starts at 0 and linearly ramps — prevents early instability when weights are far from the task optimum. <strong>load_best_model_at_end:</strong> Epoch 3 is not always best — we restore the checkpoint with the lowest validation loss. <strong>D3 batch=8 + accumulation=2:</strong> max_length=256 uses 2× GPU memory vs 128. Accumulation simulates batch=16 without OOM.',
    outputs: [
      {label:'D1', val:'90.5% accuracy (Macro F1: 0.9117, κ=0.8852)'},
      {label:'D2', val:'99.95% accuracy (Macro F1: 0.9993)'},
      {label:'D3', val:'98.1% accuracy (Macro F1: 0.9810, κ=0.9620)'},
    ]
  },
  majority_vote: {
    title: 'Ensemble Vote — Risk Flag Logic',
    code: `# From predict.py — predict_all() function
suicide_count = sum(
    1 for r in d3.values()
    if 'suicide' in r['label'].lower()
    and 'non' not in r['label'].lower()
)
risk_flag = suicide_count >= 3  # majority = ≥3 of 4 models

# d3.values() = results from LR, SVM, XGBoost, XLM-RoBERTa
# XLM-RoBERTa is also checked separately for banner state:
isSuicide = d3['XLM-RoBERTa'].label includes 'suicide' (JS)

# Three UI states:
# risk_flag=True  AND XLM-R agrees  → RED   (High Suicide Risk)
# risk_flag=True  AND XLM-R dissents → AMBER (Ensemble Conflict)
# risk_flag=False                   → GREEN (Low Risk)`,
    src: 'predict.py lines 266–270 (suicide_count + risk_flag) · predict.py line 296 (suicide_votes string) · templates/index.html JS render() — banner state logic',
    why: '<strong>Why ≥3/4 threshold?</strong> 1–2 flagging models could be TF-IDF false positives (lexical overfitting). 3+ represents genuine consensus — meaningful signal. <strong>Why check XLM-RoBERTa separately for the banner?</strong> XLM-RoBERTa has the highest D3 accuracy (98.1%) and understands context. If XLM-R disagrees with the majority, the amber "Ensemble Conflict" state is safer than a red alert — it flags uncertainty rather than over-alarming on metaphorical language ("I\'m dying of embarrassment").',
    outputs: [
      {label:'Threshold', val:'≥ 3/4 D3 models output "suicide" (not "non-suicide")'},
      {label:'Red banner', val:'risk_flag=True AND XLM-RoBERTa confirms suicide'},
      {label:'Amber banner', val:'risk_flag=True but XLM-RoBERTa says non-suicide'},
      {label:'Green banner', val:'risk_flag=False — fewer than 3 models flagged'},
    ]
  },
  eval_metrics: {
    title: 'Evaluation — How Metrics Are Computed',
    code: `def evaluate_transformer(name, y_true, y_pred,
                           label_names, ds_key, results_store):
    acc   = accuracy_score(y_true, y_pred)
    macro = f1_score(y_true, y_pred, average='macro')
    kappa = cohen_kappa_score(y_true, y_pred)

    print(f'Accuracy     : {acc*100:.2f}%')
    print(f'Macro F1     : {macro:.4f}')
    print(f"Cohen's Kappa: {kappa:.4f}")
    print(classification_report(y_true, y_pred,
                                target_names=label_names))

    results_store[name] = {
        'accuracy': round(acc, 4),
        'macro_f1': round(macro, 4),
        'kappa':    round(kappa, 4)
    }

# Same function used for classical models in Notebook 1:
for name, model in models_d1.items():
    model.fit(X1_bal, y1_bal)         # train on SMOTE-balanced data
    preds = model.predict(X1_te_tf)   # test on original held-out set
    evaluate(name, y1_te, preds, le1.classes_, 'd1', d1_results)`,
    src: 'notebooks/DA_Notebook_One.ipynb — cell 5 (evaluate def, classical) · notebooks/DA_2_Notebook.ipynb — cell 6 (evaluate_transformer def)',
    why: '<strong>Accuracy:</strong> (correct predictions) / (total predictions). Simple but misleading on imbalanced data — a model predicting majority class always gets high accuracy. Valid here because SMOTE balanced the training set and D3 is pre-balanced. <strong>Macro F1:</strong> Averages F1 per class without weighting by class size — penalises models that ignore minority classes. This is the primary metric in the IEEE report. <strong>Cohen\'s Kappa:</strong> Measures agreement beyond what chance alone would produce. Formula: (observed − expected) / (1 − expected). κ > 0.8 = almost perfect agreement. Reported because the base paper (Tumaliuan 2024) did not report it — we added it as an improvement. <strong>classification_report:</strong> Shows per-class precision, recall, F1 — the full picture behind the headline number.',
    outputs: [
      {label:'D1 SVM', val:'Accuracy 92.4%, Macro F1 0.9269, κ=0.9072'},
      {label:'D2 XLM-R', val:'Accuracy 99.95%, Macro F1 0.9993, κ=0.9986'},
      {label:'D3 XLM-R', val:'Accuracy 98.1%, Macro F1 0.9810, κ=0.9620'},
      {label:'Atypical F1', val:'0.992 — highest per-class score in the project (D1, after SMOTE)'},
    ]
  },
  flask_deploy: {
    title: 'Flask App — Deployment & Proxy Mode',
    code: `# app.py — auto-detects LOCAL vs PROXY mode at startup
_LOCAL_MODELS = os.path.join(BASE_DIR, 'models', 'classical')
_use_local    = os.path.isdir(_LOCAL_MODELS)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    text = data['text'].strip()

    if len(text) > 5000:
        return jsonify({'error': 'Text too long'}), 400

    if _use_local:
        # LOCAL mode — models loaded in memory
        result = predict_all(text)
        return jsonify(result)
    else:
        # PROXY mode — forward to HuggingFace Space
        r = requests.post(
            f'{HF_SPACE_URL}/predict',
            json={'text': text},
            timeout=120
        )
        return r.content, r.status_code

# HF_SPACE_URL = 'https://esvanth-mindscan.hf.space'
# Overridable via environment variable`,
    src: 'app.py lines 25–27 (mode detection) · lines 61–97 (/predict endpoint) · line 70 (5000-char limit) · line 91 (timeout=120)',
    why: '<strong>Why two modes?</strong> The 12 models total ~2GB on disk. Running locally requires the models folder. The HuggingFace Space hosts the same predict.py and models — the proxy just forwards requests there. <strong>Why timeout=120?</strong> The HF Space sleeps after inactivity and takes ~60s to wake. 120s gives headroom. <strong>Why 5000 char limit?</strong> XLM-RoBERTa max_length=256 tokens ≈ ~1500 characters. 5000 chars is a safe upper bound that prevents abuse without being restrictive. <strong>How the browser talks to Flask:</strong> JavaScript fetch() → POST /predict (localhost:5001) → Flask → HF Space → predict_all() → JSON response → render() updates the UI.',
    outputs: [
      {label:'LOCAL mode', val:'Triggered when models/classical/ directory exists. Loads all 12 models at startup (~30s on CPU).'},
      {label:'PROXY mode', val:'Default — no local models needed. Forwards to esvanth-mindscan.hf.space'},
      {label:'Timeout', val:'504 returned after 120s if HF Space is sleeping. Auto-wakes in ~60s.'},
      {label:'Port', val:'localhost:5001 (overridable via PORT env var)'},
    ]
  },
  predict_flow: {
    title: 'predict_all() — Full Inference Flow',
    code: `def predict_all(raw_text):
    # Step 1 — clean text (same function as training)
    clean = clean_text(raw_text)

    # Step 2 — run all 3 classical models per dataset
    #          (LR, SVM, XGBoost share the same TF-IDF vector)
    def predict_classical(text_clean, ds):
        tfidf = _models[f'tfidf_{ds}']
        vec   = tfidf.transform([text_clean])  # sparse vector
        for model_name in ['logistic_regression','svm','xgboost']:
            model    = _models[f'{model_name}_{ds}']
            pred_idx = model.predict(vec)[0]
            label    = le.classes_[pred_idx]
            # SVM has no predict_proba — use softmax(decision_function)
            if hasattr(model, 'predict_proba'):
                conf = model.predict_proba(vec)[0][pred_idx]
            else:
                scores = model.decision_function(vec)[0]
                e = np.exp(scores - scores.max())
                conf = e[pred_idx] / e.sum()

    # Step 3 — run XLM-RoBERTa per dataset
    def predict_transformer(text_raw, ds):
        inputs = tokenizer(text_raw, max_length=max_len,
                           truncation=True, padding='max_length')
        with torch.no_grad():
            logits = model(**inputs).logits
        probs    = torch.softmax(logits, dim=1)[0]
        pred_idx = probs.argmax()

    # Step 4 — majority vote for risk_flag
    suicide_count = sum(1 for r in d3.values()
        if 'suicide' in r['label'] and 'non' not in r['label'])
    risk_flag = suicide_count >= 3`,
    src: 'predict.py — clean_text lines 106–113 · predict_classical lines 119–163 · predict_transformer lines 166–215 · predict_all lines 221–302',
    why: '<strong>Why clean the text first?</strong> The TF-IDF vocabulary was built on clean text — passing raw text would miss tokens. XLM-RoBERTa receives the raw text because its SentencePiece tokeniser handles punctuation/URLs natively. <strong>Why one TF-IDF vector for 3 classical models?</strong> All three (LR, SVM, XGBoost) use the same vectoriser — the vector is computed once and reused, saving 2 redundant transformations per dataset. <strong>Why torch.no_grad()?</strong> Inference doesn\'t need gradients — disabling them halves memory usage and speeds up the forward pass. <strong>Why softmax on logits?</strong> The model outputs raw logit scores (unbounded). Softmax converts them to probabilities that sum to 1 — required for the confidence percentage shown in the UI.',
    outputs: [
      {label:'Input', val:'"I feel exhausted, nothing feels enjoyable"'},
      {label:'After clean', val:'"i feel exhausted nothing feels enjoyable"'},
      {label:'D1 winner', val:'SVM → Major Depressive (highest confidence)'},
      {label:'D2 winner', val:'XLM-RoBERTa → Not Depressed (Twitter Affect Bias — clinical text)'},
      {label:'D3 result', val:'risk_flag computed from 4 model votes; XLM-R checked separately for banner'},
      {label:'Response time', val:'~200ms local (GPU) · ~2–5s proxy (HF Space warm)'},
    ]
  },
  // NOTE(review): the snippet below builds half splits of 12.5K per class, while
  // the outputs label H1/H2 as 116K — confirm which sizes the study actually used.
  split_study: {
    title: 'D3 Split Study — RQ2',
    code: `# Sample 25K per class (50K total) for the baseline
df3_sample = df3.groupby('label').apply(
    lambda x: x.sample(25000, random_state=42)
).reset_index(drop=True)

# Half splits — 12.5K per class each
df3_h1 = df3.groupby('label').apply(
    lambda x: x.iloc[:12500]
).reset_index(drop=True)
df3_h2 = df3.groupby('label').apply(
    lambda x: x.iloc[12500:25000]
).reset_index(drop=True)

# Full dataset — 116K per class (232K total)
df3_full = df3  # no sampling

# KS test to confirm splits share same distribution
from scipy.stats import ks_2samp
stat, p = ks_2samp(len_sample, len_full)
# p > 0.49 across all splits — identical distributions confirmed`,
    src: 'notebooks/DA_3_SplitStudy.ipynb — cell 28 (sampling) · cell 4 (TrainingArguments) · cell 14 (KS test)',
    why: '<strong>What is RQ2?</strong> "Does more training data improve performance?" The split study trains 4 separate XLM-RoBERTa models on 50K, 116K (×2), and 232K samples. <strong>KS test:</strong> Kolmogorov-Smirnov test verifies all splits come from the same distribution (p > 0.49) — ruling out that one split has easier examples. <strong>Finding:</strong> Accuracy varies by at most 0.3% across all four runs (97.8%–98.1%) and changes by only 0.1% between the 50K sample and the full dataset (98.1% → 98.0%), despite 4× more data. The 50K sample fully captures the underlying signal distribution.',
    outputs: [
      {label:'50K sample', val:'98.1% accuracy (XLM-RoBERTa)'},
      {label:'H1 (116K)', val:'97.8% accuracy'},
      {label:'H2 (116K)', val:'98.0% accuracy'},
      {label:'Full (232K)', val:'98.0% accuracy — Δ=0.1% vs 50K'},
      {label:'KS p-value', val:'p > 0.49 across all split pairs — identical distributions'},
    ]
  }
};

/**
 * Open the code modal for one CM_DATA entry.
 * Fills the title and the three tab panels (code, why, outputs), selects
 * the first tab, shows the overlay and locks page scrolling.
 * Does nothing if `key` has no entry in CM_DATA.
 * @param {string} key property name in CM_DATA
 */
function openCM(key){
  const entry = CM_DATA[key];
  if(!entry) return;

  const byId = id => document.getElementById(id);
  byId('cmTitle').textContent = entry.title;

  // Code tab: escaped snippet followed by its source location
  byId('cmt0').innerHTML =
    `<pre class="cm-pre">${escHTML(entry.code)}</pre><div class="cm-src">Source: ${escHTML(entry.src)}</div>`;

  // Why tab: inserted unescaped because the text carries its own <strong> markup
  byId('cmt1').innerHTML = `<div class="cm-why-body">${entry.why}</div>`;

  // Output tab: one escaped label/value row per output
  let rowsHtml = '';
  for(const out of entry.outputs){
    rowsHtml += `<div class="cm-out-row"><div class="cm-out-lbl">${escHTML(out.label)}</div><div class="cm-out-val"><em>${escHTML(out.val)}</em></div></div>`;
  }
  byId('cmt2').innerHTML = rowsHtml;

  // Always start on the code tab
  switchCMTab(0);
  byId('cmOverlay').classList.add('open');
  document.body.style.overflow='hidden';
}

/** Hide the code modal overlay and restore page scrolling. */
function closeCM(){
  const overlay=document.getElementById('cmOverlay');
  overlay.classList.remove('open');
  const pageStyle=document.body.style;
  pageStyle.overflow='';
}

/**
 * Close the modal only when the event target is the overlay itself,
 * not an element inside it.
 * @param {Event} e event whose `target` is compared with the overlay
 */
function closeCMOutside(e){
  const overlay=document.getElementById('cmOverlay');
  if(e.target!==overlay) return;
  closeCM();
}

/**
 * Activate the modal tab and panel at position `idx`; deactivate the rest.
 * @param {number} idx zero-based tab position
 */
function switchCMTab(idx){
  // Set `active` on the idx-th node of the list and clear it on the others.
  const selectOnly=function(nodes){
    let pos=0;
    for(const node of nodes){
      node.classList.toggle('active',pos===idx);
      pos+=1;
    }
  };
  selectOnly(document.querySelectorAll('.cm-tab'));
  selectOnly(document.querySelectorAll('.cm-panel'));
}

/**
 * Escape a value for insertion into HTML.
 * Replaces & < > " and ' with entities, so the result is safe in element
 * content and in quoted attribute values. In element content the quote
 * entities render as the same characters as before.
 * @param {*} s value to escape; coerced with String()
 * @returns {string} escaped text
 */
function escHTML(s){
  const ENTITIES={'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'};
  // Single pass, so the "&" of a freshly inserted entity is never re-escaped.
  return String(s).replace(/[&<>"']/g,ch=>ENTITIES[ch]);
}

// Close on Escape key
// Pressing Escape anywhere in the document closes the code modal.
document.addEventListener('keydown',function(evt){
  if(evt.key!=='Escape') return;
  closeCM();
});

/**
 * Expand or collapse the FAQ item that contains `btn`.
 * Flips the `open` class on the nearest `.faq-item` ancestor and mirrors
 * the new state in the trigger's `aria-expanded` attribute so assistive
 * technology can announce it. Does nothing if `btn` is not inside a
 * `.faq-item`.
 * @param {Element} btn the control that was activated
 */
function toggleFaq(btn){
  const item=btn.closest('.faq-item');
  if(!item)return;
  const nowOpen=!item.classList.contains('open');
  item.classList.toggle('open',nowOpen);
  btn.setAttribute('aria-expanded',String(nowOpen));
}
</script>
</body>
</html>