Update templates (index.html + flow_diagram.html)
Browse files- templates/flow_diagram.html +705 -0
- templates/index.html +14 -13
templates/flow_diagram.html
ADDED
|
@@ -0,0 +1,705 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width,initial-scale=1.0">
|
| 6 |
+
<title>MindScan — How It Works (Team Reference)</title>
|
| 7 |
+
<link href="https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500&family=DM+Mono:wght@400;500&display=swap" rel="stylesheet">
|
| 8 |
+
<style>
|
| 9 |
+
:root{
|
| 10 |
+
--bg:#f7f5f0;--bg2:#efece8;--bg3:#e6e2da;
|
| 11 |
+
--ink:#1a1816;--ink2:#5c5750;--ink3:#9c9790;
|
| 12 |
+
--border:rgba(26,24,22,0.09);--border2:rgba(26,24,22,0.16);
|
| 13 |
+
--blue:#1d4ed8;--blue-bg:#eff6ff;
|
| 14 |
+
--amber:#b45309;--amber-bg:#fffbeb;
|
| 15 |
+
--red:#b91c1c;--red-bg:#fef2f2;
|
| 16 |
+
--green:#15803d;--green-bg:#f0fdf4;
|
| 17 |
+
--purple:#6d28d9;--purple-bg:#f5f3ff;
|
| 18 |
+
--teal:#0f766e;--teal-bg:#f0fdfa;
|
| 19 |
+
}
|
| 20 |
+
*{box-sizing:border-box;margin:0;padding:0}
|
| 21 |
+
body{background:var(--bg);color:var(--ink);font-family:'Geist',sans-serif;font-size:14px;line-height:1.6;min-height:100vh}
|
| 22 |
+
header{padding:16px 40px;display:flex;align-items:center;justify-content:space-between;border-bottom:1px solid var(--border);background:rgba(247,245,240,.95);position:sticky;top:0;z-index:20;backdrop-filter:blur(8px)}
|
| 23 |
+
.logo{font-family:'Instrument Serif',serif;font-size:17px;letter-spacing:-.02em}.logo em{font-style:italic;color:var(--ink2)}
|
| 24 |
+
.hbadge{font-size:10px;font-family:'DM Mono',monospace;background:var(--blue-bg);color:var(--blue);border:1px solid rgba(29,78,216,.2);padding:3px 9px;border-radius:20px}
|
| 25 |
+
|
| 26 |
+
/* LAYOUT */
|
| 27 |
+
.layout{display:grid;grid-template-columns:320px 1fr;min-height:calc(100vh - 53px)}
|
| 28 |
+
.sidebar{background:#fff;border-right:1px solid var(--border);padding:24px 20px;overflow-y:auto;position:sticky;top:53px;height:calc(100vh - 53px)}
|
| 29 |
+
.main{padding:32px 36px}
|
| 30 |
+
|
| 31 |
+
/* SIDEBAR */
|
| 32 |
+
.sb-title{font-size:11px;font-family:'DM Mono',monospace;letter-spacing:.1em;text-transform:uppercase;color:var(--ink3);margin-bottom:14px}
|
| 33 |
+
.step-list{display:flex;flex-direction:column;gap:4px;margin-bottom:24px}
|
| 34 |
+
.step-btn{display:flex;align-items:center;gap:10px;padding:10px 12px;border-radius:8px;cursor:pointer;border:1px solid transparent;transition:all .15s;text-align:left;background:none;width:100%}
|
| 35 |
+
.step-btn:hover{background:var(--bg2);border-color:var(--border)}
|
| 36 |
+
.step-btn.active{background:var(--ink);border-color:var(--ink)}
|
| 37 |
+
.step-btn.active .sb-num{background:rgba(255,255,255,.15);color:#fff}
|
| 38 |
+
.step-btn.active .sb-name{color:#fff}
|
| 39 |
+
.step-btn.active .sb-loc{color:rgba(255,255,255,.55)}
|
| 40 |
+
.sb-num{width:26px;height:26px;border-radius:6px;background:var(--bg2);display:flex;align-items:center;justify-content:center;font-size:11px;font-family:'DM Mono',monospace;font-weight:500;color:var(--ink2);flex-shrink:0}
|
| 41 |
+
.sb-info{min-width:0}
|
| 42 |
+
.sb-name{font-size:13px;font-weight:500;color:var(--ink);white-space:nowrap;overflow:hidden;text-overflow:ellipsis}
|
| 43 |
+
.sb-loc{font-size:10px;font-family:'DM Mono',monospace;color:var(--ink3);margin-top:1px}
|
| 44 |
+
.sb-divider{height:1px;background:var(--border);margin:12px 0}
|
| 45 |
+
.sb-section{font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.1em;text-transform:uppercase;color:var(--ink3);margin-bottom:8px;margin-top:4px}
|
| 46 |
+
|
| 47 |
+
/* KEY LEGEND */
|
| 48 |
+
.legend{background:var(--bg2);border-radius:8px;padding:12px 14px}
|
| 49 |
+
.legend-title{font-size:10px;font-family:'DM Mono',monospace;letter-spacing:.08em;text-transform:uppercase;color:var(--ink3);margin-bottom:8px}
|
| 50 |
+
.leg-row{display:flex;align-items:center;gap:8px;margin-bottom:5px;font-size:11px;color:var(--ink2)}
|
| 51 |
+
.leg-dot{width:10px;height:10px;border-radius:50%;flex-shrink:0}
|
| 52 |
+
|
| 53 |
+
/* MAIN CONTENT */
|
| 54 |
+
.step-content{display:none;animation:fadeIn .2s ease}
|
| 55 |
+
.step-content.active{display:block}
|
| 56 |
+
@keyframes fadeIn{from{opacity:0;transform:translateY(4px)}to{opacity:1;transform:translateY(0)}}
|
| 57 |
+
|
| 58 |
+
.step-header{display:flex;align-items:flex-start;gap:16px;margin-bottom:24px}
|
| 59 |
+
.step-icon{width:44px;height:44px;border-radius:10px;display:flex;align-items:center;justify-content:center;font-size:20px;flex-shrink:0}
|
| 60 |
+
.step-num-big{font-family:'DM Mono',monospace;font-size:11px;font-weight:500;margin-bottom:4px}
|
| 61 |
+
.step-title{font-family:'Instrument Serif',serif;font-size:26px;letter-spacing:-.02em;color:var(--ink);margin-bottom:4px}
|
| 62 |
+
.step-file{font-size:11px;font-family:'DM Mono',monospace;color:var(--ink3)}
|
| 63 |
+
|
| 64 |
+
.code-block{background:var(--ink);border-radius:10px;padding:18px 20px;font-family:'DM Mono',monospace;font-size:12px;line-height:1.9;color:#e2dfd8;margin-bottom:16px;overflow-x:auto}
|
| 65 |
+
.code-block .kw{color:#79b8ff}
|
| 66 |
+
.code-block .fn{color:#b3d9ff}
|
| 67 |
+
.code-block .str{color:#9ecf72}
|
| 68 |
+
.code-block .cm{color:#6b7d8a}
|
| 69 |
+
.code-block .num{color:#f0c479}
|
| 70 |
+
.code-block .cls{color:#e2b36a}
|
| 71 |
+
|
| 72 |
+
.what-box{background:var(--bg2);border:1px solid var(--border);border-radius:10px;padding:16px 18px;margin-bottom:16px}
|
| 73 |
+
.what-title{font-size:11px;font-family:'DM Mono',monospace;letter-spacing:.08em;text-transform:uppercase;color:var(--ink3);margin-bottom:8px}
|
| 74 |
+
.what-body{font-size:13px;color:var(--ink2);line-height:1.65}
|
| 75 |
+
.what-body strong{color:var(--ink)}
|
| 76 |
+
|
| 77 |
+
.important-box{border-radius:10px;padding:14px 16px;margin-bottom:16px;border:1px solid;font-size:12px;line-height:1.6}
|
| 78 |
+
.imp-blue{background:var(--blue-bg);border-color:rgba(29,78,216,.2);color:#1e3a8a}
|
| 79 |
+
.imp-amber{background:var(--amber-bg);border-color:rgba(180,83,9,.2);color:#78350f}
|
| 80 |
+
.imp-green{background:var(--green-bg);border-color:rgba(21,128,61,.2);color:#14532d}
|
| 81 |
+
.imp-red{background:var(--red-bg);border-color:rgba(185,28,28,.2);color:#7f1d1d}
|
| 82 |
+
.imp-purple{background:var(--purple-bg);border-color:rgba(109,40,217,.2);color:#3b0764}
|
| 83 |
+
.imp-title{font-weight:500;margin-bottom:3px}
|
| 84 |
+
|
| 85 |
+
.returns-box{background:#fff;border:1px solid var(--border);border-radius:10px;padding:16px 18px;margin-bottom:16px}
|
| 86 |
+
.ret-title{font-size:11px;font-family:'DM Mono',monospace;letter-spacing:.08em;text-transform:uppercase;color:var(--ink3);margin-bottom:10px}
|
| 87 |
+
.ret-row{display:flex;align-items:flex-start;gap:10px;padding:7px 0;border-bottom:1px solid var(--border);font-size:12px}
|
| 88 |
+
.ret-row:last-child{border-bottom:none}
|
| 89 |
+
.ret-key{font-family:'DM Mono',monospace;font-weight:500;min-width:180px;color:var(--ink)}
|
| 90 |
+
.ret-val{color:var(--ink2)}
|
| 91 |
+
|
| 92 |
+
/* FLOW MAP */
|
| 93 |
+
.flow-mini{display:flex;align-items:center;gap:8px;flex-wrap:wrap;margin-bottom:20px;background:var(--bg2);border-radius:10px;padding:12px 16px}
|
| 94 |
+
.flow-node{font-size:10px;font-family:'DM Mono',monospace;padding:4px 9px;border-radius:5px;border:1px solid var(--border);background:#fff;white-space:nowrap}
|
| 95 |
+
.flow-node.active-node{background:var(--ink);color:#fff;border-color:var(--ink)}
|
| 96 |
+
.flow-arrow{color:var(--ink3);font-size:12px}
|
| 97 |
+
|
| 98 |
+
/* NAV BUTTONS */
|
| 99 |
+
.nav-btns{display:flex;justify-content:space-between;margin-top:24px;padding-top:16px;border-top:1px solid var(--border)}
|
| 100 |
+
.nav-btn{font-size:12px;font-family:'DM Mono',monospace;padding:7px 16px;border-radius:7px;border:1px solid var(--border2);background:var(--bg2);color:var(--ink2);cursor:pointer;transition:all .15s}
|
| 101 |
+
.nav-btn:hover{background:var(--ink);color:#fff;border-color:var(--ink)}
|
| 102 |
+
.nav-btn:disabled{opacity:.35;cursor:not-allowed}
|
| 103 |
+
|
| 104 |
+
@media(max-width:768px){.layout{grid-template-columns:1fr}.sidebar{position:static;height:auto;border-right:none;border-bottom:1px solid var(--border)}}
|
| 105 |
+
</style>
|
| 106 |
+
</head>
|
| 107 |
+
<body>
|
| 108 |
+
|
| 109 |
+
<header>
|
| 110 |
+
<div class="logo">Mind<em>Scan</em> — <em>System Flow</em></div>
|
| 111 |
+
<div class="hbadge">Team Reference · GitHub</div>
|
| 112 |
+
</header>
|
| 113 |
+
|
| 114 |
+
<div class="layout">
|
| 115 |
+
|
| 116 |
+
<!-- SIDEBAR -->
|
| 117 |
+
<div class="sidebar">
|
| 118 |
+
<div class="sb-title">What happens when you click Run?</div>
|
| 119 |
+
|
| 120 |
+
<div class="sb-section">Frontend — index.html</div>
|
| 121 |
+
<div class="step-list">
|
| 122 |
+
<button class="step-btn active" onclick="goTo(0)">
|
| 123 |
+
<div class="sb-num" style="background:var(--blue-bg);color:var(--blue)">1</div>
|
| 124 |
+
<div class="sb-info"><div class="sb-name">Button click</div><div class="sb-loc">index.html → runAnalysis()</div></div>
|
| 125 |
+
</button>
|
| 126 |
+
<button class="step-btn" onclick="goTo(1)">
|
| 127 |
+
<div class="sb-num" style="background:var(--blue-bg);color:var(--blue)">2</div>
|
| 128 |
+
<div class="sb-info"><div class="sb-name">fetch('/predict')</div><div class="sb-loc">index.html → POST request</div></div>
|
| 129 |
+
</button>
|
| 130 |
+
</div>
|
| 131 |
+
|
| 132 |
+
<div class="sb-section">Backend — app.py</div>
|
| 133 |
+
<div class="step-list">
|
| 134 |
+
<button class="step-btn" onclick="goTo(2)">
|
| 135 |
+
<div class="sb-num" style="background:var(--amber-bg);color:var(--amber)">3</div>
|
| 136 |
+
<div class="sb-info"><div class="sb-name">Flask receives it</div><div class="sb-loc">app.py → predict()</div></div>
|
| 137 |
+
</button>
|
| 138 |
+
</div>
|
| 139 |
+
|
| 140 |
+
<div class="sb-section">Prediction logic — predict.py</div>
|
| 141 |
+
<div class="step-list">
|
| 142 |
+
<button class="step-btn" onclick="goTo(3)">
|
| 143 |
+
<div class="sb-num" style="background:var(--green-bg);color:var(--green)">4</div>
|
| 144 |
+
<div class="sb-info"><div class="sb-name">clean_text()</div><div class="sb-loc">predict.py → text cleaning</div></div>
|
| 145 |
+
</button>
|
| 146 |
+
<button class="step-btn" onclick="goTo(4)">
|
| 147 |
+
<div class="sb-num" style="background:var(--green-bg);color:var(--green)">5</div>
|
| 148 |
+
<div class="sb-info"><div class="sb-name">predict_classical()</div><div class="sb-loc">predict.py → LR · SVM · XGBoost</div></div>
|
| 149 |
+
</button>
|
| 150 |
+
<button class="step-btn" onclick="goTo(5)">
|
| 151 |
+
<div class="sb-num" style="background:var(--purple-bg);color:var(--purple)">6</div>
|
| 152 |
+
<div class="sb-info"><div class="sb-name">predict_xlmr()</div><div class="sb-loc">predict.py → transformer</div></div>
|
| 153 |
+
</button>
|
| 154 |
+
<button class="step-btn" onclick="goTo(6)">
|
| 155 |
+
<div class="sb-num" style="background:var(--green-bg);color:var(--green)">7</div>
|
| 156 |
+
<div class="sb-info"><div class="sb-name">predict_all()</div><div class="sb-loc">predict.py → assembles all 12</div></div>
|
| 157 |
+
</button>
|
| 158 |
+
</div>
|
| 159 |
+
|
| 160 |
+
<div class="sb-section">Response — back to browser</div>
|
| 161 |
+
<div class="step-list">
|
| 162 |
+
<button class="step-btn" onclick="goTo(7)">
|
| 163 |
+
<div class="sb-num" style="background:var(--amber-bg);color:var(--amber)">8</div>
|
| 164 |
+
<div class="sb-info"><div class="sb-name">JSON response</div><div class="sb-loc">app.py → jsonify() → browser</div></div>
|
| 165 |
+
</button>
|
| 166 |
+
<button class="step-btn" onclick="goTo(8)">
|
| 167 |
+
<div class="sb-num" style="background:var(--blue-bg);color:var(--blue)">9</div>
|
| 168 |
+
<div class="sb-info"><div class="sb-name">render() + buildPanel()</div><div class="sb-loc">index.html → shows results</div></div>
|
| 169 |
+
</button>
|
| 170 |
+
<button class="step-btn" onclick="goTo(9)">
|
| 171 |
+
<div class="sb-num" style="background:var(--blue-bg);color:var(--blue)">10</div>
|
| 172 |
+
<div class="sb-info"><div class="sb-name">CSS bar animation</div><div class="sb-loc">index.html → confidence bars</div></div>
|
| 173 |
+
</button>
|
| 174 |
+
</div>
|
| 175 |
+
|
| 176 |
+
<div class="sb-divider"></div>
|
| 177 |
+
<div class="legend">
|
| 178 |
+
<div class="legend-title">File colours</div>
|
| 179 |
+
<div class="leg-row"><div class="leg-dot" style="background:var(--blue)"></div>index.html — frontend JS</div>
|
| 180 |
+
<div class="leg-row"><div class="leg-dot" style="background:var(--amber)"></div>app.py — Flask server</div>
|
| 181 |
+
<div class="leg-row"><div class="leg-dot" style="background:var(--green)"></div>predict.py — model logic</div>
|
| 182 |
+
<div class="leg-row"><div class="leg-dot" style="background:var(--purple)"></div>XLM-RoBERTa specific</div>
|
| 183 |
+
</div>
|
| 184 |
+
</div>
|
| 185 |
+
|
| 186 |
+
<!-- MAIN CONTENT -->
|
| 187 |
+
<div class="main">
|
| 188 |
+
|
| 189 |
+
<!-- STEP 0 — Button click -->
|
| 190 |
+
<div class="step-content active" id="step0">
|
| 191 |
+
<div class="flow-mini">
|
| 192 |
+
<div class="flow-node active-node">1 · Button click</div><div class="flow-arrow">→</div>
|
| 193 |
+
<div class="flow-node">2 · fetch()</div><div class="flow-arrow">→</div>
|
| 194 |
+
<div class="flow-node">3 · Flask</div><div class="flow-arrow">→</div>
|
| 195 |
+
<div class="flow-node">4–7 · predict.py</div><div class="flow-arrow">→</div>
|
| 196 |
+
<div class="flow-node">8 · JSON</div><div class="flow-arrow">→</div>
|
| 197 |
+
<div class="flow-node">9–10 · UI</div>
|
| 198 |
+
</div>
|
| 199 |
+
<div class="step-header">
|
| 200 |
+
<div class="step-icon" style="background:var(--blue-bg)">🖱</div>
|
| 201 |
+
<div>
|
| 202 |
+
<div class="step-num-big" style="color:var(--blue)">Step 01 · index.html</div>
|
| 203 |
+
<div class="step-title">User clicks "Run all 12 models"</div>
|
| 204 |
+
<div class="step-file">onclick="runAnalysis()" — defined in index.html &lt;script&gt;</div>
|
| 205 |
+
</div>
|
| 206 |
+
</div>
|
| 207 |
+
<div class="what-box">
|
| 208 |
+
<div class="what-title">What happens</div>
|
| 209 |
+
<div class="what-body">The button has an <strong>onclick</strong> attribute pointing to <strong>runAnalysis()</strong>. This function grabs whatever text is in the textarea, disables the button, shows a spinning animation, and starts the process. Nothing touches any model yet — this is purely UI setup.</div>
|
| 210 |
+
</div>
|
| 211 |
+
<div class="code-block"><span class="cm">// Button in HTML</span>
|
| 212 |
+
&lt;button <span class="kw">onclick</span>=<span class="str">"runAnalysis()"</span>&gt;
|
| 213 |
+
Run all <span class="num">12</span> models
|
| 214 |
+
&lt;/button&gt;
|
| 215 |
+
|
| 216 |
+
<span class="cm">// Function in &lt;script&gt; at bottom of index.html</span>
|
| 217 |
+
<span class="kw">async function</span> <span class="fn">runAnalysis</span>() {
|
| 218 |
+
<span class="kw">const</span> text = document.<span class="fn">getElementById</span>(<span class="str">'textInput'</span>).value.<span class="fn">trim</span>();
|
| 219 |
+
<span class="kw">if</span> (!text) <span class="kw">return</span>; <span class="cm">// do nothing if textarea is empty</span>
|
| 220 |
+
|
| 221 |
+
btn.disabled = <span class="kw">true</span>; <span class="cm">// disable button while running</span>
|
| 222 |
+
spinner.style.display = <span class="str">'block'</span>; <span class="cm">// show spinning circle</span>
|
| 223 |
+
btnTxt.textContent = <span class="str">'Running 12 models...'</span>;
|
| 224 |
+
<span class="cm">// next: send to backend →</span>
|
| 225 |
+
}</div>
|
| 226 |
+
<div class="important-box imp-blue">
|
| 227 |
+
<div class="imp-title">Important for teammates</div>
|
| 228 |
+
The function is async (uses await). This means the browser does NOT freeze while waiting for the server — the user can still scroll the page. async/await is just cleaner syntax for working with Promises.
|
| 229 |
+
</div>
|
| 230 |
+
<div class="nav-btns">
|
| 231 |
+
<button class="nav-btn" disabled>← Previous</button>
|
| 232 |
+
<button class="nav-btn" onclick="goTo(1)">Next: fetch('/predict') →</button>
|
| 233 |
+
</div>
|
| 234 |
+
</div>
|
| 235 |
+
|
| 236 |
+
<!-- STEP 1 — fetch -->
|
| 237 |
+
<div class="step-content" id="step1">
|
| 238 |
+
<div class="flow-mini">
|
| 239 |
+
<div class="flow-node">1 · Button click</div><div class="flow-arrow">→</div>
|
| 240 |
+
<div class="flow-node active-node">2 · fetch()</div><div class="flow-arrow">→</div>
|
| 241 |
+
<div class="flow-node">3 · Flask</div><div class="flow-arrow">→</div>
|
| 242 |
+
<div class="flow-node">4–7 · predict.py</div><div class="flow-arrow">→</div>
|
| 243 |
+
<div class="flow-node">8 · JSON</div><div class="flow-arrow">→</div>
|
| 244 |
+
<div class="flow-node">9–10 · UI</div>
|
| 245 |
+
</div>
|
| 246 |
+
<div class="step-header">
|
| 247 |
+
<div class="step-icon" style="background:var(--blue-bg)">📡</div>
|
| 248 |
+
<div>
|
| 249 |
+
<div class="step-num-big" style="color:var(--blue)">Step 02 · index.html</div>
|
| 250 |
+
<div class="step-title">HTTP request sent to Flask</div>
|
| 251 |
+
<div class="step-file">fetch('/predict') — browser's built-in HTTP function</div>
|
| 252 |
+
</div>
|
| 253 |
+
</div>
|
| 254 |
+
<div class="what-box">
|
| 255 |
+
<div class="what-title">What happens</div>
|
| 256 |
+
<div class="what-body">The browser sends an <strong>HTTP POST request</strong> to the Flask server at <strong>/predict</strong>. The text is sent as JSON in the request body. The browser then waits for a response — this is when the ~2 second loading spinner appears.</div>
|
| 257 |
+
</div>
|
| 258 |
+
<div class="code-block"><span class="cm">// Still inside runAnalysis() in index.html</span>
|
| 259 |
+
<span class="kw">const</span> r = <span class="kw">await</span> <span class="fn">fetch</span>(<span class="str">'/predict'</span>, {
|
| 260 |
+
method: <span class="str">'POST'</span>,
|
| 261 |
+
headers: { <span class="str">'Content-Type'</span>: <span class="str">'application/json'</span> },
|
| 262 |
+
body: <span class="cls">JSON</span>.<span class="fn">stringify</span>({ text })
|
| 263 |
+
<span class="cm">// sends: { "text": "I feel hopeless..." }</span>
|
| 264 |
+
});
|
| 265 |
+
|
| 266 |
+
<span class="kw">const</span> d = <span class="kw">await</span> r.<span class="fn">json</span>(); <span class="cm">// parse the JSON response</span>
|
| 267 |
+
<span class="fn">render</span>(d, text); <span class="cm">// draw results on screen</span></div>
|
| 268 |
+
<div class="important-box imp-amber">
|
| 269 |
+
<div class="imp-title">Why /predict and not a full URL?</div>
|
| 270 |
+
Because the frontend and backend run on the same server (localhost:5000). Flask serves both the HTML page and the API endpoint. A relative URL like /predict automatically goes to the same host.
|
| 271 |
+
</div>
|
| 272 |
+
<div class="nav-btns">
|
| 273 |
+
<button class="nav-btn" onclick="goTo(0)">← Button click</button>
|
| 274 |
+
<button class="nav-btn" onclick="goTo(2)">Next: Flask receives it →</button>
|
| 275 |
+
</div>
|
| 276 |
+
</div>
|
| 277 |
+
|
| 278 |
+
<!-- STEP 2 — Flask -->
|
| 279 |
+
<div class="step-content" id="step2">
|
| 280 |
+
<div class="flow-mini">
|
| 281 |
+
<div class="flow-node">1 · Button click</div><div class="flow-arrow">→</div>
|
| 282 |
+
<div class="flow-node">2 · fetch()</div><div class="flow-arrow">→</div>
|
| 283 |
+
<div class="flow-node active-node">3 · Flask</div><div class="flow-arrow">→</div>
|
| 284 |
+
<div class="flow-node">4–7 · predict.py</div><div class="flow-arrow">→</div>
|
| 285 |
+
<div class="flow-node">8 · JSON</div><div class="flow-arrow">→</div>
|
| 286 |
+
<div class="flow-node">9–10 · UI</div>
|
| 287 |
+
</div>
|
| 288 |
+
<div class="step-header">
|
| 289 |
+
<div class="step-icon" style="background:var(--amber-bg)">π</div>
|
| 290 |
+
<div>
|
| 291 |
+
<div class="step-num-big" style="color:var(--amber)">Step 03 · app.py</div>
|
| 292 |
+
<div class="step-title">Flask receives the POST request</div>
|
| 293 |
+
<div class="step-file">app.py → @app.route('/predict') → predict()</div>
|
| 294 |
+
</div>
|
| 295 |
+
</div>
|
| 296 |
+
<div class="what-box">
|
| 297 |
+
<div class="what-title">What happens</div>
|
| 298 |
+
<div class="what-body">Flask matches the incoming request to the <strong>@app.route('/predict')</strong> decorator. The predict() function extracts the text from the request body, validates it (not empty, not too long), then calls predict_all() from predict.py. It wraps the result with processing time and sends it back as JSON.</div>
|
| 299 |
+
</div>
|
| 300 |
+
<div class="code-block"><span class="cm"># app.py</span>
|
| 301 |
+
<span class="kw">from</span> predict <span class="kw">import</span> predict_all
|
| 302 |
+
|
| 303 |
+
<span class="kw">@app.route</span>(<span class="str">'/predict'</span>, methods=[<span class="str">'POST'</span>])
|
| 304 |
+
<span class="kw">def</span> <span class="fn">predict</span>():
|
| 305 |
+
data = request.<span class="fn">get_json</span>()
|
| 306 |
+
text = data[<span class="str">'text'</span>].<span class="fn">strip</span>()
|
| 307 |
+
|
| 308 |
+
<span class="cm"># validation</span>
|
| 309 |
+
<span class="kw">if not</span> text:
|
| 310 |
+
<span class="kw">return</span> <span class="fn">jsonify</span>({<span class="str">'error'</span>: <span class="str">'Text cannot be empty'</span>}), <span class="num">400</span>
|
| 311 |
+
<span class="kw">if</span> <span class="fn">len</span>(text) > <span class="num">5000</span>:
|
| 312 |
+
<span class="kw">return</span> <span class="fn">jsonify</span>({<span class="str">'error'</span>: <span class="str">'Too long'</span>}), <span class="num">400</span>
|
| 313 |
+
|
| 314 |
+
t0 = time.<span class="fn">time</span>()
|
| 315 |
+
result = <span class="fn">predict_all</span>(text) <span class="cm"># ← the big function (next steps)</span>
|
| 316 |
+
result[<span class="str">'processing_time_ms'</span>] = <span class="fn">round</span>((time.<span class="fn">time</span>() - t0) * <span class="num">1000</span>)
|
| 317 |
+
<span class="kw">return</span> <span class="fn">jsonify</span>(result) <span class="cm"># sends JSON back to browser</span></div>
|
| 318 |
+
<div class="important-box imp-amber">
|
| 319 |
+
<div class="imp-title">Models load at STARTUP not per request</div>
|
| 320 |
+
The 12 models are loaded once when you run python app.py (takes ~30s). Every subsequent request reuses them from RAM. If models loaded per request it would take 30s per click.
|
| 321 |
+
</div>
|
| 322 |
+
<div class="nav-btns">
|
| 323 |
+
<button class="nav-btn" onclick="goTo(1)">← fetch()</button>
|
| 324 |
+
<button class="nav-btn" onclick="goTo(3)">Next: clean_text() →</button>
|
| 325 |
+
</div>
|
| 326 |
+
</div>
|
| 327 |
+
|
| 328 |
+
<!-- STEP 3 — clean_text -->
|
| 329 |
+
<div class="step-content" id="step3">
|
| 330 |
+
<div class="flow-mini">
|
| 331 |
+
<div class="flow-node">1–3 · Browser → Flask</div><div class="flow-arrow">→</div>
|
| 332 |
+
<div class="flow-node active-node">4 · clean_text()</div><div class="flow-arrow">→</div>
|
| 333 |
+
<div class="flow-node">5 · classical</div><div class="flow-arrow">→</div>
|
| 334 |
+
<div class="flow-node">6 · XLM-R</div><div class="flow-arrow">→</div>
|
| 335 |
+
<div class="flow-node">7 · assemble</div>
|
| 336 |
+
</div>
|
| 337 |
+
<div class="step-header">
|
| 338 |
+
<div class="step-icon" style="background:var(--green-bg)">🧹</div>
|
| 339 |
+
<div>
|
| 340 |
+
<div class="step-num-big" style="color:var(--green)">Step 04 · predict.py</div>
|
| 341 |
+
<div class="step-title">Text cleaning</div>
|
| 342 |
+
<div class="step-file">predict.py → clean_text(raw_text)</div>
|
| 343 |
+
</div>
|
| 344 |
+
</div>
|
| 345 |
+
<div class="what-box">
|
| 346 |
+
<div class="what-title">What happens</div>
|
| 347 |
+
<div class="what-body">The raw text is cleaned with the <strong>same pipeline used in training</strong>. This is critical — if you trained on cleaned text, you must clean the same way at prediction time. The classical models (LR, SVM, XGBoost) use the cleaned version. XLM-RoBERTa uses the original raw text because its tokeniser handles formatting itself.</div>
|
| 348 |
+
</div>
|
| 349 |
+
<div class="code-block"><span class="cm"># predict.py</span>
|
| 350 |
+
<span class="kw">def</span> <span class="fn">clean_text</span>(text):
|
| 351 |
+
text = <span class="fn">str</span>(text).<span class="fn">lower</span>() <span class="cm"># UPPERCASE → lowercase</span>
|
| 352 |
+
text = re.<span class="fn">sub</span>(<span class="str">r'http\S+|www\S+'</span>, <span class="str">''</span>, text) <span class="cm"># remove URLs</span>
|
| 353 |
+
text = re.<span class="fn">sub</span>(<span class="str">r'@\w+'</span>, <span class="str">''</span>, text) <span class="cm"># remove @mentions</span>
|
| 354 |
+
text = re.<span class="fn">sub</span>(<span class="str">r'#'</span>, <span class="str">''</span>, text) <span class="cm"># remove # (keep word)</span>
|
| 355 |
+
text = text.<span class="fn">translate</span>(str.<span class="fn">maketrans</span>(<span class="str">''</span>,<span class="str">''</span>,punctuation)) <span class="cm"># remove !.,?etc</span>
|
| 356 |
+
text = re.<span class="fn">sub</span>(<span class="str">r'\s+'</span>, <span class="str">' '</span>, text).<span class="fn">strip</span>() <span class="cm"># collapse spaces</span>
|
| 357 |
+
<span class="kw">return</span> text
|
| 358 |
+
|
| 359 |
+
<span class="cm"># Example:</span>
|
| 360 |
+
<span class="cm"># IN: "@user I've been SO depressed https://t.co #mentalhealth 😢"</span>
|
| 361 |
+
<span class="cm"># OUT: "ive been so depressed mentalhealth"</span></div>
|
| 362 |
+
<div class="returns-box">
|
| 363 |
+
<div class="ret-title">Used by</div>
|
| 364 |
+
<div class="ret-row"><div class="ret-key">Classical models (LR/SVM/XGB)</div><div class="ret-val">Use the cleaned version — TF-IDF cannot handle URLs, emojis, punctuation</div></div>
|
| 365 |
+
<div class="ret-row"><div class="ret-key">XLM-RoBERTa</div><div class="ret-val">Uses the ORIGINAL raw_text — the transformer's tokeniser handles it better</div></div>
|
| 366 |
+
</div>
|
| 367 |
+
<div class="nav-btns">
|
| 368 |
+
<button class="nav-btn" onclick="goTo(2)">← Flask</button>
|
| 369 |
+
<button class="nav-btn" onclick="goTo(4)">Next: Classical models →</button>
|
| 370 |
+
</div>
|
| 371 |
+
</div>
|
| 372 |
+
|
| 373 |
+
<!-- STEP 4 — classical -->
|
| 374 |
+
<div class="step-content" id="step4">
|
| 375 |
+
<div class="flow-mini">
|
| 376 |
+
<div class="flow-node">1–4 · Browser → clean</div><div class="flow-arrow">→</div>
|
| 377 |
+
<div class="flow-node active-node">5 · predict_classical()</div><div class="flow-arrow">→</div>
|
| 378 |
+
<div class="flow-node">6 · XLM-R</div><div class="flow-arrow">→</div>
|
| 379 |
+
<div class="flow-node">7 · assemble</div>
|
| 380 |
+
</div>
|
| 381 |
+
<div class="step-header">
|
| 382 |
+
<div class="step-icon" style="background:var(--green-bg)">⚙️</div>
|
| 383 |
+
<div>
|
| 384 |
+
<div class="step-num-big" style="color:var(--green)">Step 05 · predict.py</div>
|
| 385 |
+
<div class="step-title">Classical model predictions</div>
|
| 386 |
+
<div class="step-file">predict.py → predict_classical(text_clean, ds)</div>
|
| 387 |
+
</div>
|
| 388 |
+
</div>
|
| 389 |
+
<div class="what-box">
|
| 390 |
+
<div class="what-title">What happens — 3 steps inside this function</div>
|
| 391 |
+
<div class="what-body"><strong>1. TF-IDF transform:</strong> Converts the cleaned text into a vector of 50,000 numbers using the same vectoriser fitted during training.<br><br><strong>2. Model.predict:</strong> Each of the 3 classical models takes the vector and outputs a class index (e.g. 4 = "postpartum").<br><br><strong>3. Confidence score:</strong> Different method per model — LR and XGBoost use predict_proba(), SVM uses decision_function() converted via softmax.</div>
|
| 392 |
+
</div>
|
| 393 |
+
<div class="code-block"><span class="cm"># predict.py — called 3× (once per dataset)</span>
|
| 394 |
+
<span class="kw">def</span> <span class="fn">predict_classical</span>(text_clean, ds):
|
| 395 |
+
tfidf = _models[<span class="str">f'tfidf_{ds}'</span>]
|
| 396 |
+
le = _models[<span class="str">f'le_{ds}'</span>]
|
| 397 |
+
vec = tfidf.<span class="fn">transform</span>([text_clean]) <span class="cm"># text → 50K-dim vector</span>
|
| 398 |
+
|
| 399 |
+
<span class="kw">for</span> model_name <span class="kw">in</span> [<span class="str">'logistic_regression'</span>, <span class="str">'svm'</span>, <span class="str">'xgboost'</span>]:
|
| 400 |
+
model = _models[<span class="str">f'{model_name}_{ds}'</span>]
|
| 401 |
+
pred_idx = model.<span class="fn">predict</span>(vec)[<span class="num">0</span>] <span class="cm"># → e.g. 4</span>
|
| 402 |
+
label = le.classes_[pred_idx] <span class="cm"># 4 → "postpartum"</span>
|
| 403 |
+
|
| 404 |
+
<span class="cm"># LR / XGBoost: direct probability</span>
|
| 405 |
+
<span class="kw">if</span> <span class="fn">hasattr</span>(model, <span class="str">'predict_proba'</span>):
|
| 406 |
+
conf = model.<span class="fn">predict_proba</span>(vec)[<span class="num">0</span>][pred_idx]
|
| 407 |
+
|
| 408 |
+
<span class="cm"># SVM: no predict_proba → use softmax of decision scores</span>
|
| 409 |
+
<span class="kw">elif</span> <span class="fn">hasattr</span>(model, <span class="str">'decision_function'</span>):
|
| 410 |
+
scores = model.<span class="fn">decision_function</span>(vec)[<span class="num">0</span>]
|
| 411 |
+
e = np.<span class="fn">exp</span>(scores - scores.<span class="fn">max</span>())
|
| 412 |
+
conf = e[pred_idx] / e.<span class="fn">sum</span>() <span class="cm"># normalise to 0–1</span></div>
|
| 413 |
+
<div class="important-box imp-green">
|
| 414 |
+
<div class="imp-title">Why SVM needs special treatment</div>
|
| 415 |
+
SVM (LinearSVC) finds a decision boundary but does not model probabilities — it just says "which side of the line?" Converting decision_function scores with softmax gives a reasonable confidence proxy. It is not a true probability but works well enough for display.
|
| 416 |
+
</div>
|
| 417 |
+
<div class="nav-btns">
|
| 418 |
+
<button class="nav-btn" onclick="goTo(3)">← clean_text()</button>
|
| 419 |
+
<button class="nav-btn" onclick="goTo(5)">Next: XLM-RoBERTa →</button>
|
| 420 |
+
</div>
|
| 421 |
+
</div>
|
| 422 |
+
|
| 423 |
+
<!-- STEP 5 — XLM-RoBERTa -->
|
| 424 |
+
<div class="step-content" id="step5">
|
| 425 |
+
<div class="flow-mini">
|
| 426 |
+
<div class="flow-node">1–5 · Browser → classical</div><div class="flow-arrow">→</div>
|
| 427 |
+
<div class="flow-node active-node">6 · predict_xlmr()</div><div class="flow-arrow">→</div>
|
| 428 |
+
<div class="flow-node">7 · assemble</div>
|
| 429 |
+
</div>
|
| 430 |
+
<div class="step-header">
|
| 431 |
+
<div class="step-icon" style="background:var(--purple-bg)">🤖</div>
|
| 432 |
+
<div>
|
| 433 |
+
<div class="step-num-big" style="color:var(--purple)">Step 06 · predict.py</div>
|
| 434 |
+
<div class="step-title">XLM-RoBERTa prediction</div>
|
| 435 |
+
<div class="step-file">predict.py → predict_xlmr(raw_text, model, le, max_len)</div>
|
| 436 |
+
</div>
|
| 437 |
+
</div>
|
| 438 |
+
<div class="what-box">
|
| 439 |
+
<div class="what-title">What happens — 4 steps</div>
|
| 440 |
+
<div class="what-body"><strong>1. Tokenise:</strong> The tokeniser splits text into sub-word pieces and converts them to integer IDs (e.g. "hopeless" might become [1234, 5678]).<br><br><strong>2. Forward pass:</strong> The 278M parameter model processes the token IDs and produces raw logit scores for each class.<br><br><strong>3. Softmax:</strong> Converts logits to proper probabilities that sum to 1.0.<br><br><strong>4. All class probs:</strong> Returns every class probability, not just the winner — this feeds the 6-class breakdown bars in Dataset 1.</div>
|
| 441 |
+
</div>
|
| 442 |
+
<div class="code-block"><span class="cm"># predict.py — called 3× (once per dataset)</span>
|
| 443 |
+
<span class="kw">def</span> <span class="fn">predict_xlmr</span>(raw_text, xlmr_model, le, max_len=<span class="num">128</span>):
|
| 444 |
+
inputs = tokenizer(
|
| 445 |
+
raw_text,
|
| 446 |
+
return_tensors=<span class="str">'pt'</span>, <span class="cm"># PyTorch tensors</span>
|
| 447 |
+
max_length=max_len, <span class="cm"># 128 for tweets, 256 for Reddit</span>
|
| 448 |
+
truncation=<span class="kw">True</span>,
|
| 449 |
+
padding=<span class="str">'max_length'</span>
|
| 450 |
+
).<span class="fn">to</span>(device) <span class="cm"># send to GPU if available</span>
|
| 451 |
+
|
| 452 |
+
<span class="kw">with</span> torch.<span class="fn">no_grad</span>(): <span class="cm"># no_grad saves memory (not training)</span>
|
| 453 |
+
logits = xlmr_model(**inputs).logits
|
| 454 |
+
|
| 455 |
+
probs = torch.<span class="fn">softmax</span>(logits, dim=<span class="num">1</span>)[<span class="num">0</span>] <span class="cm"># → [0.91, 0.04, 0.02, ...]</span>
|
| 456 |
+
pred_idx = <span class="fn">int</span>(probs.<span class="fn">argmax</span>()) <span class="cm"># index of highest</span>
|
| 457 |
+
label = le.classes_[pred_idx]
|
| 458 |
+
|
| 459 |
+
all_probs = {le.classes_[i]: <span class="fn">float</span>(p) <span class="kw">for</span> i, p <span class="kw">in</span> <span class="fn">enumerate</span>(probs)}
|
| 460 |
+
<span class="cm"># all_probs = {"postpartum":0.913, "bipolar":0.041, ...}</span>
|
| 461 |
+
<span class="cm"># only D1 uses this for the breakdown chart</span>
|
| 462 |
+
|
| 463 |
+
<span class="kw">return</span> {<span class="str">'label'</span>: label, <span class="str">'confidence'</span>: <span class="fn">float</span>(probs[pred_idx]), <span class="str">'all_probs'</span>: all_probs}</div>
|
| 464 |
+
<div class="important-box imp-purple">
|
| 465 |
+
<div class="imp-title">max_length differs per dataset</div>
|
| 466 |
+
D1 and D2 are tweets (avg 31 words ≈ 40 tokens) → max_length=128. D3 is Reddit posts (avg 200 words ≈ 260 tokens) → max_length=256. This doubles memory usage for D3, which is why batch_size was halved during training.
|
| 467 |
+
</div>
|
| 468 |
+
<div class="nav-btns">
|
| 469 |
+
<button class="nav-btn" onclick="goTo(4)">← Classical models</button>
|
| 470 |
+
<button class="nav-btn" onclick="goTo(6)">Next: predict_all() →</button>
|
| 471 |
+
</div>
|
| 472 |
+
</div>
|
| 473 |
+
|
| 474 |
+
<!-- STEP 6 — predict_all -->
|
| 475 |
+
<div class="step-content" id="step6">
|
| 476 |
+
<div class="flow-mini">
|
| 477 |
+
<div class="flow-node">1–6 · all models run</div><div class="flow-arrow">→</div>
|
| 478 |
+
<div class="flow-node active-node">7 · predict_all()</div><div class="flow-arrow">→</div>
|
| 479 |
+
<div class="flow-node">8 · JSON</div>
|
| 480 |
+
</div>
|
| 481 |
+
<div class="step-header">
|
| 482 |
+
<div class="step-icon" style="background:var(--green-bg)">🧠</div>
|
| 483 |
+
<div>
|
| 484 |
+
<div class="step-num-big" style="color:var(--green)">Step 07 · predict.py</div>
|
| 485 |
+
<div class="step-title">predict_all() assembles everything</div>
|
| 486 |
+
<div class="step-file">predict.py → predict_all(raw_text) — the main function</div>
|
| 487 |
+
</div>
|
| 488 |
+
</div>
|
| 489 |
+
<div class="what-box">
|
| 490 |
+
<div class="what-title">What happens</div>
|
| 491 |
+
<div class="what-body">predict_all() is the orchestrator. It calls predict_classical() 3 times (once per dataset) and predict_xlmr() 3 times. Then it finds the winner per dataset (highest confidence), runs the suicide majority vote across D3's 4 models, and packages everything into a single JSON-ready dictionary.</div>
|
| 492 |
+
</div>
|
| 493 |
+
<div class="code-block"><span class="cm"># predict.py — the main function Flask calls</span>
|
| 494 |
+
<span class="kw">def</span> <span class="fn">predict_all</span>(raw_text):
|
| 495 |
+
clean = <span class="fn">clean_text</span>(raw_text)
|
| 496 |
+
|
| 497 |
+
<span class="cm"># Run all 4 models per dataset</span>
|
| 498 |
+
d1 = <span class="fn">predict_classical</span>(clean, <span class="str">'d1'</span>) <span class="cm"># → {LR:{}, SVM:{}, XGB:{}}</span>
|
| 499 |
+
d1[<span class="str">'XLM-RoBERTa'</span>] = <span class="fn">predict_xlmr</span>(raw_text, xlmr1, le1, <span class="num">128</span>)
|
| 500 |
+
<span class="cm"># same for d2, d3...</span>
|
| 501 |
+
|
| 502 |
+
<span class="cm"># Winner = model with highest confidence</span>
|
| 503 |
+
d1_winner = <span class="fn">max</span>(d1.items(), key=<span class="kw">lambda</span> x: x[<span class="num">1</span>][<span class="str">'confidence'</span>])
|
| 504 |
+
<span class="cm"># → ('XGBoost', {'label': 'postpartum', 'confidence': 0.999})</span>
|
| 505 |
+
|
| 506 |
+
<span class="cm"># Suicide risk = majority vote across 4 D3 models</span>
|
| 507 |
+
suicide_count = <span class="fn">sum</span>(
|
| 508 |
+
<span class="num">1</span> <span class="kw">for</span> r <span class="kw">in</span> d3.values()
|
| 509 |
+
<span class="kw">if</span> <span class="str">'suicide'</span> <span class="kw">in</span> r[<span class="str">'label'</span>] <span class="kw">and</span> <span class="str">'non'</span> <span class="kw">not in</span> r[<span class="str">'label'</span>]
|
| 510 |
+
)
|
| 511 |
+
risk_flag = suicide_count >= <span class="num">3</span> <span class="cm"># ≥3 of 4 models → HIGH RISK</span>
|
| 512 |
+
|
| 513 |
+
<span class="kw">return</span> {
|
| 514 |
+
<span class="str">'dataset1'</span>: {<span class="str">'models'</span>: d1, <span class="str">'winner_model'</span>: d1_winner[<span class="num">0</span>], ...},
|
| 515 |
+
<span class="str">'dataset2'</span>: {...},
|
| 516 |
+
<span class="str">'dataset3'</span>: {...},
|
| 517 |
+
<span class="str">'risk_flag'</span>: risk_flag,
|
| 518 |
+
<span class="str">'suicide_votes'</span>: <span class="str">f'{suicide_count}/4 models flagged'</span>
|
| 519 |
+
}</div>
|
| 520 |
+
<div class="important-box imp-red">
|
| 521 |
+
<div class="imp-title">The majority vote threshold — why 3 of 4?</div>
|
| 522 |
+
We chose 3/4 (75%) as the threshold for the high-risk alert. 2/4 (50%) would be too sensitive — an evenly split vote would already trigger an alert. 4/4 (100%) would be too strict — if one model misses it, no alert. 3/4 balances sensitivity against false alarms for a research prototype.
|
| 523 |
+
</div>
|
| 524 |
+
<div class="nav-btns">
|
| 525 |
+
<button class="nav-btn" onclick="goTo(5)">← XLM-RoBERTa</button>
|
| 526 |
+
<button class="nav-btn" onclick="goTo(7)">Next: JSON response →</button>
|
| 527 |
+
</div>
|
| 528 |
+
</div>
|
| 529 |
+
|
| 530 |
+
<!-- STEP 7 — JSON response -->
|
| 531 |
+
<div class="step-content" id="step7">
|
| 532 |
+
<div class="flow-mini">
|
| 533 |
+
<div class="flow-node">1–7 · All predictions done</div><div class="flow-arrow">→</div>
|
| 534 |
+
<div class="flow-node active-node">8 · JSON response</div><div class="flow-arrow">→</div>
|
| 535 |
+
<div class="flow-node">9–10 · UI renders</div>
|
| 536 |
+
</div>
|
| 537 |
+
<div class="step-header">
|
| 538 |
+
<div class="step-icon" style="background:var(--amber-bg)">📦</div>
|
| 539 |
+
<div>
|
| 540 |
+
<div class="step-num-big" style="color:var(--amber)">Step 08 · app.py → browser</div>
|
| 541 |
+
<div class="step-title">JSON sent back to browser</div>
|
| 542 |
+
<div class="step-file">app.py → jsonify(result) → HTTP 200 response</div>
|
| 543 |
+
</div>
|
| 544 |
+
</div>
|
| 545 |
+
<div class="what-box">
|
| 546 |
+
<div class="what-title">What the browser receives</div>
|
| 547 |
+
<div class="what-body">Flask wraps the predict_all() result in a JSON HTTP response. The browser's fetch() receives this and parses it. The structure below is exactly what flows into the render() function next.</div>
|
| 548 |
+
</div>
|
| 549 |
+
<div class="code-block">{
|
| 550 |
+
<span class="str">"dataset1"</span>: {
|
| 551 |
+
<span class="str">"task"</span>: <span class="str">"Depression Type (6 Classes)"</span>,
|
| 552 |
+
<span class="str">"models"</span>: {
|
| 553 |
+
<span class="str">"Logistic Regression"</span>: { <span class="str">"label"</span>: <span class="str">"postpartum"</span>, <span class="str">"confidence"</span>: <span class="num">0.958</span> },
|
| 554 |
+
<span class="str">"SVM"</span>: { <span class="str">"label"</span>: <span class="str">"postpartum"</span>, <span class="str">"confidence"</span>: <span class="num">0.828</span> },
|
| 555 |
+
<span class="str">"XGBoost"</span>: { <span class="str">"label"</span>: <span class="str">"postpartum"</span>, <span class="str">"confidence"</span>: <span class="num">0.999</span> },
|
| 556 |
+
<span class="str">"XLM-RoBERTa"</span>: { <span class="str">"label"</span>: <span class="str">"postpartum"</span>, <span class="str">"confidence"</span>: <span class="num">0.997</span> }
|
| 557 |
+
},
|
| 558 |
+
<span class="str">"winner_model"</span>: <span class="str">"XGBoost"</span>,
|
| 559 |
+
<span class="str">"winner_prediction"</span>: <span class="str">"postpartum"</span>,
|
| 560 |
+
<span class="str">"winner_confidence"</span>: <span class="num">0.999</span>,
|
| 561 |
+
<span class="str">"class_probs"</span>: { <span class="str">"postpartum"</span>: <span class="num">0.997</span>, <span class="str">"bipolar"</span>: <span class="num">0.001</span>, ... }
|
| 562 |
+
},
|
| 563 |
+
<span class="str">"dataset2"</span>: { ... },
|
| 564 |
+
<span class="str">"dataset3"</span>: { ... },
|
| 565 |
+
<span class="str">"risk_flag"</span>: <span class="kw">false</span>,
|
| 566 |
+
<span class="str">"suicide_votes"</span>: <span class="str">"0/4 models flagged suicide risk"</span>,
|
| 567 |
+
<span class="str">"processing_time_ms"</span>: <span class="num">2341</span>
|
| 568 |
+
}</div>
|
| 569 |
+
<div class="nav-btns">
|
| 570 |
+
<button class="nav-btn" onclick="goTo(6)">← predict_all()</button>
|
| 571 |
+
<button class="nav-btn" onclick="goTo(8)">Next: render() →</button>
|
| 572 |
+
</div>
|
| 573 |
+
</div>
|
| 574 |
+
|
| 575 |
+
<!-- STEP 8 — render -->
|
| 576 |
+
<div class="step-content" id="step8">
|
| 577 |
+
<div class="flow-mini">
|
| 578 |
+
<div class="flow-node">1–8 · JSON received</div><div class="flow-arrow">→</div>
|
| 579 |
+
<div class="flow-node active-node">9 · render() + buildPanel()</div><div class="flow-arrow">→</div>
|
| 580 |
+
<div class="flow-node">10 · CSS animation</div>
|
| 581 |
+
</div>
|
| 582 |
+
<div class="step-header">
|
| 583 |
+
<div class="step-icon" style="background:var(--blue-bg)">🎨</div>
|
| 584 |
+
<div>
|
| 585 |
+
<div class="step-num-big" style="color:var(--blue)">Step 09 · index.html</div>
|
| 586 |
+
<div class="step-title">render() draws the results</div>
|
| 587 |
+
<div class="step-file">index.html → render(data) → buildPanel() × 3</div>
|
| 588 |
+
</div>
|
| 589 |
+
</div>
|
| 590 |
+
<div class="what-box">
|
| 591 |
+
<div class="what-title">What happens</div>
|
| 592 |
+
<div class="what-body">render() fills in the three winner cards (depression type, depressed?, suicide risk) and then calls buildPanel() three times — once per dataset — to build the model comparison rows. Each row shows the model name, its prediction, a confidence bar, and a ★ if it's the winner.</div>
|
| 593 |
+
</div>
|
| 594 |
+
<div class="code-block"><span class="cm">// index.html — called after fetch() returns</span>
|
| 595 |
+
<span class="kw">function</span> <span class="fn">render</span>(d, text) {
|
| 596 |
+
|
| 597 |
+
<span class="cm">// 1. Fill winner cards</span>
|
| 598 |
+
document.<span class="fn">getElementById</span>(<span class="str">'wpA'</span>).textContent = d.dataset1.winner_prediction;
|
| 599 |
+
document.<span class="fn">getElementById</span>(<span class="str">'wcA'</span>).textContent = (d.dataset1.winner_confidence * <span class="num">100</span>).<span class="fn">toFixed</span>(<span class="num">1</span>) + <span class="str">'%'</span>;
|
| 600 |
+
|
| 601 |
+
<span class="cm">// 2. Build per-model rows for each dataset</span>
|
| 602 |
+
<span class="fn">buildPanel</span>(<span class="str">'p1'</span>, d.dataset1.models, d.dataset1.winner_model);
|
| 603 |
+
<span class="fn">buildPanel</span>(<span class="str">'p2'</span>, d.dataset2.models, d.dataset2.winner_model);
|
| 604 |
+
<span class="fn">buildPanel</span>(<span class="str">'p3'</span>, d.dataset3.models, d.dataset3.winner_model);
|
| 605 |
+
|
| 606 |
+
<span class="cm">// 3. Risk banner</span>
|
| 607 |
+
<span class="kw">if</span> (d.risk_flag) {
|
| 608 |
+
riskBanner.className = <span class="str">'risk-banner danger'</span>;
|
| 609 |
+
} <span class="kw">else</span> {
|
| 610 |
+
riskBanner.className = <span class="str">'risk-banner safe'</span>;
|
| 611 |
+
}
|
| 612 |
+
|
| 613 |
+
<span class="cm">// 4. Show results section</span>
|
| 614 |
+
document.<span class="fn">getElementById</span>(<span class="str">'results'</span>).style.display = <span class="str">'block'</span>;
|
| 615 |
+
}
|
| 616 |
+
|
| 617 |
+
<span class="kw">function</span> <span class="fn">buildPanel</span>(panelId, models, winner) {
|
| 618 |
+
<span class="kw">let</span> html = <span class="str">''</span>;
|
| 619 |
+
<span class="cls">Object</span>.<span class="fn">entries</span>(models).<span class="fn">forEach</span>(([name, res]) => {
|
| 620 |
+
html += <span class="str">`&lt;div class="mr ${name===winner?'winner':''}"&gt;
|
| 621 |
+
&lt;div class="mr-name"&gt;${name}&lt;/div&gt;
|
| 622 |
+
&lt;div class="mr-pred"&gt;${res.label}&lt;/div&gt;
|
| 623 |
+
&lt;div class="mr-fill" data-w="${(res.confidence*100).toFixed(1)}"&gt;&lt;/div&gt;
|
| 624 |
+
&lt;div class="mr-pct"&gt;${(res.confidence*100).toFixed(1)}%&lt;/div&gt;
|
| 625 |
+
&lt;/div&gt;`</span>;
|
| 626 |
+
});
|
| 627 |
+
panel.innerHTML = html; <span class="cm">// inject HTML</span>
|
| 628 |
+
<span class="cm">// bars animate next step →</span>
|
| 629 |
+
}</div>
|
| 630 |
+
<div class="nav-btns">
|
| 631 |
+
<button class="nav-btn" onclick="goTo(7)">← JSON response</button>
|
| 632 |
+
<button class="nav-btn" onclick="goTo(9)">Next: CSS animation →</button>
|
| 633 |
+
</div>
|
| 634 |
+
</div>
|
| 635 |
+
|
| 636 |
+
<!-- STEP 9 — CSS animation -->
|
| 637 |
+
<div class="step-content" id="step9">
|
| 638 |
+
<div class="flow-mini">
|
| 639 |
+
<div class="flow-node">1–9 · HTML rows created</div><div class="flow-arrow">→</div>
|
| 640 |
+
<div class="flow-node active-node">10 · CSS animation</div>
|
| 641 |
+
</div>
|
| 642 |
+
<div class="step-header">
|
| 643 |
+
<div class="step-icon" style="background:var(--blue-bg)">✨</div>
|
| 644 |
+
<div>
|
| 645 |
+
<div class="step-num-big" style="color:var(--blue)">Step 10 · index.html + CSS</div>
|
| 646 |
+
<div class="step-title">Confidence bars animate</div>
|
| 647 |
+
<div class="step-file">setTimeout(80ms) → style.width → CSS transition</div>
|
| 648 |
+
</div>
|
| 649 |
+
</div>
|
| 650 |
+
<div class="what-box">
|
| 651 |
+
<div class="what-title">What happens</div>
|
| 652 |
+
<div class="what-body">The bars are created with <strong>width: 0%</strong>. An 80ms delay gives the browser time to paint the DOM first. Then JavaScript sets each bar's width from its <strong>data-w attribute</strong> (e.g. "82.8"). The CSS <strong>transition</strong> property smoothly animates from 0% → 82.8% over 0.8 seconds. That's the fill animation you see.</div>
|
| 653 |
+
</div>
|
| 654 |
+
<div class="code-block"><span class="cm">/* CSS — transition defined in &lt;style&gt; */</span>
|
| 655 |
+
.mr-fill {
|
| 656 |
+
width: <span class="num">0%</span>; <span class="cm">/* starts invisible */</span>
|
| 657 |
+
<span class="kw">transition</span>: width <span class="num">0.8s</span> cubic-bezier(.4,0,.2,1); <span class="cm">/* smooth ease-out */</span>
|
| 658 |
+
}
|
| 659 |
+
.mr.winner .mr-fill { background: var(--purple); } <span class="cm">/* winner = purple */</span>
|
| 660 |
+
|
| 661 |
+
<span class="cm">// JavaScript β in buildPanel()</span>
|
| 662 |
+
<span class="fn">setTimeout</span>(() => {
|
| 663 |
+
panel.<span class="fn">querySelectorAll</span>(<span class="str">'.mr-fill'</span>).<span class="fn">forEach</span>(el => {
|
| 664 |
+
el.style.width = el.<span class="fn">getAttribute</span>(<span class="str">'data-w'</span>) + <span class="str">'%'</span>;
|
| 665 |
+
<span class="cm">// sets e.g. "82.8%" β CSS transition plays automatically</span>
|
| 666 |
+
});
|
| 667 |
+
}, <span class="num">80</span>); <span class="cm">// 80ms wait for DOM to paint first</span>
|
| 668 |
+
|
| 669 |
+
<span class="cm">// The 6-class breakdown bars work the same way</span>
|
| 670 |
+
<span class="cm">// but use 200ms delay and .cp-fill class</span></div>
|
| 671 |
+
<div class="important-box imp-blue">
|
| 672 |
+
<div class="imp-title">Why the 80ms delay?</div>
|
| 673 |
+
If you set style.width immediately after setting innerHTML, the browser hasn't painted the elements yet. The transition has no starting value to animate from — the bars jump to their final width instantly with no animation. The 80ms gives the browser one render frame to establish the 0% starting state, so the transition has a clean start point.
|
| 674 |
+
</div>
|
| 675 |
+
<div class="returns-box">
|
| 676 |
+
<div class="ret-title">Complete flow summary</div>
|
| 677 |
+
<div class="ret-row"><div class="ret-key">Total round trip time</div><div class="ret-val">~2β4 seconds (dominated by XLM-RoBERTa inference on CPU)</div></div>
|
| 678 |
+
<div class="ret-row"><div class="ret-key">Files involved</div><div class="ret-val">index.html β app.py β predict.py β back to index.html</div></div>
|
| 679 |
+
<div class="ret-row"><div class="ret-key">Models called</div><div class="ret-val">12 total: (LR + SVM + XGBoost + XLM-R) × 3 datasets</div></div>
|
| 680 |
+
<div class="ret-row"><div class="ret-key">Winner selection</div><div class="ret-val">Highest confidence per dataset — pure Python max()</div></div>
|
| 681 |
+
<div class="ret-row"><div class="ret-key">Risk flag</div><div class="ret-val">Majority vote — ≥3 of 4 Dataset 3 models predict "suicide"</div></div>
|
| 682 |
+
</div>
|
| 683 |
+
<div class="nav-btns">
|
| 684 |
+
<button class="nav-btn" onclick="goTo(8)">← render()</button>
|
| 685 |
+
<button class="nav-btn" onclick="goTo(0)">↑ Start over</button>
|
| 686 |
+
</div>
|
| 687 |
+
</div>
|
| 688 |
+
|
| 689 |
+
</div>
|
| 690 |
+
</div>
|
| 691 |
+
|
| 692 |
+
<script>
|
| 693 |
+
function goTo(n) {
|
| 694 |
+
document.querySelectorAll('.step-content').forEach((el,i) => {
|
| 695 |
+
el.classList.toggle('active', i === n);
|
| 696 |
+
});
|
| 697 |
+
document.querySelectorAll('.step-btn').forEach((el,i) => {
|
| 698 |
+
el.classList.toggle('active', i === n);
|
| 699 |
+
});
|
| 700 |
+
document.querySelector('.main').scrollTop = 0;
|
| 701 |
+
window.scrollTo(0, 0);
|
| 702 |
+
}
|
| 703 |
+
</script>
|
| 704 |
+
</body>
|
| 705 |
+
</html>
|
templates/index.html
CHANGED
|
@@ -322,6 +322,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 322 |
<a href="#verdict">Conclusions</a>
|
| 323 |
<a href="#demo">Live Demo</a>
|
| 324 |
<a href="#faq">FAQ</a>
|
|
|
|
| 325 |
</nav>
|
| 326 |
<div class="nav-badge">NCI H9DAI 2026</div>
|
| 327 |
</header>
|
|
@@ -347,8 +348,8 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 347 |
<div class="stats-panel">
|
| 348 |
<div class="stat-box"><div class="stat-num" data-target="3" data-suffix="">0</div><div class="stat-lbl">Datasets</div></div>
|
| 349 |
<div class="stat-box"><div class="stat-num" data-target="12" data-suffix="">0</div><div class="stat-lbl">Models trained</div></div>
|
| 350 |
-
<div class="stat-box"><div class="stat-num" data-target="98.1" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">D3 Accuracy (
|
| 351 |
-
<div class="stat-box"><div class="stat-num" data-target="
|
| 352 |
</div>
|
| 353 |
</div>
|
| 354 |
</div>
|
|
@@ -437,7 +438,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 437 |
</div>
|
| 438 |
<div class="md-block">
|
| 439 |
<div class="md-block-lbl">D3 — Suicide Risk (Kaggle: nikhileswarkomati)</div>
|
| 440 |
-
<div class="md-block-val"><em>232,074 Reddit posts · 2 classes</em> — Suicide / Non-Suicide (perfectly balanced, 116,037 each). Suicide posts average <em>
|
| 441 |
</div>
|
| 442 |
<div class="md-block">
|
| 443 |
<div class="md-block-lbl">Business Context</div>
|
|
@@ -578,7 +579,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 578 |
<div class="finding">
|
| 579 |
<div class="finding-n">02</div>
|
| 580 |
<div class="finding-t">XLM-RoBERTa is the best model for long-form text</div>
|
| 581 |
-
<div class="finding-b">On Reddit suicide risk posts (D3), XLM-RoBERTa achieves 98.1% Accuracy with the 50K sample. Suicide posts average
|
| 582 |
<div class="finding-chip">D3 Accuracy: XLM-RoBERTa 98.1%</div>
|
| 583 |
</div>
|
| 584 |
<div class="finding">
|
|
@@ -739,7 +740,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 739 |
<div class="faq-group-title">Data & Datasets</div>
|
| 740 |
<div class="faq-item">
|
| 741 |
<button class="faq-q" onclick="toggleFaq(this)">What are the three datasets and what makes them different? <span class="faq-chevron">▼</span></button>
|
| 742 |
-
<div class="faq-a"><div class="faq-a-inner">D1 — 6-class depression type classification (atypical, bipolar, major depressive, no depression, postpartum, psychotic) from Kaggle. Twitter-length text, 11,986 samples. D2 — binary
|
| 743 |
</div>
|
| 744 |
<div class="faq-item">
|
| 745 |
<button class="faq-q" onclick="toggleFaq(this)">How did you handle class imbalance? Why SMOTE and not class weighting? <span class="faq-chevron">▼</span></button>
|
|
@@ -755,7 +756,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 755 |
<div class="faq-group-title">Methodology & Models</div>
|
| 756 |
<div class="faq-item">
|
| 757 |
<button class="faq-q" onclick="toggleFaq(this)">Why four model types per dataset? Why not just use the best one? <span class="faq-chevron">▼</span></button>
|
| 758 |
-
<div class="faq-a"><div class="faq-a-inner">Each captures a different inductive bias: Logistic Regression (linear decision boundary), SVM (maximum-margin), Random Forest/XGBoost (non-linear tree ensembles), XLM-RoBERTa (contextual transformer). Disagreement between models is itself a signal. On D1, SVM (92.4%) beats XLM-RoBERTa (90.5%) — short tweets don't give the transformer enough context to gain advantage. On D3 (
|
| 759 |
</div>
|
| 760 |
<div class="faq-item">
|
| 761 |
<button class="faq-q" onclick="toggleFaq(this)">What are your TF-IDF settings and why? <span class="faq-chevron">▼</span></button>
|
|
@@ -763,11 +764,11 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 763 |
</div>
|
| 764 |
<div class="faq-item">
|
| 765 |
<button class="faq-q" onclick="toggleFaq(this)">How was XLM-RoBERTa fine-tuned? What hyperparameters? <span class="faq-chevron">▼</span></button>
|
| 766 |
-
<div class="faq-a"><div class="faq-a-inner">Standard sequence classification fine-tuning: Adam optimiser, <code>lr=2e-5</code>, <code>3 epochs</code>, linear warmup scheduler. Max token length: 128 for D1/D2 (Twitter-length text), 256 for D3 (Reddit posts average
|
| 767 |
</div>
|
| 768 |
<div class="faq-item">
|
| 769 |
-
<button class="faq-q" onclick="toggleFaq(this)">Why did XGBoost collapse on D3
|
| 770 |
-
<div class="faq-a"><div class="faq-a-inner">
|
| 771 |
</div>
|
| 772 |
</div>
|
| 773 |
|
|
@@ -775,7 +776,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 775 |
<div class="faq-group-title">Results & Evaluation</div>
|
| 776 |
<div class="faq-item">
|
| 777 |
<button class="faq-q" onclick="toggleFaq(this)">Why is SVM accuracy 92.4% on D1 but XLM-RoBERTa (278M params) only gets 90.5%? <span class="faq-chevron">▼</span></button>
|
| 778 |
-
<div class="faq-a"><div class="faq-a-inner">Text length. D1 tweets average ~31 words. Transformers need rich context to outperform classical methods — contextual embeddings add little value
|
| 779 |
</div>
|
| 780 |
<div class="faq-item">
|
| 781 |
<button class="faq-q" onclick="toggleFaq(this)">Why show accuracy rather than Macro F1? Isn't accuracy misleading on imbalanced data? <span class="faq-chevron">▼</span></button>
|
|
@@ -783,7 +784,7 @@ footer{text-align:center;padding:28px 48px;border-top:1px solid var(--border);fo
|
|
| 783 |
</div>
|
| 784 |
<div class="faq-item">
|
| 785 |
<button class="faq-q" onclick="toggleFaq(this)">Did adding more training data (50K → 232K) improve D3 results? <span class="faq-chevron">▼</span></button>
|
| 786 |
-
<div class="faq-a"><div class="faq-a-inner">No —
|
| 787 |
</div>
|
| 788 |
</div>
|
| 789 |
|
|
@@ -1058,7 +1059,7 @@ def tokenize_tweets(examples):
|
|
| 1058 |
padding='max_length' # pad shorter inputs to fixed length
|
| 1059 |
)
|
| 1060 |
|
| 1061 |
-
# D3 uses max_length=256 — Reddit posts avg
|
| 1062 |
def tokenize_reddit(examples):
|
| 1063 |
return tokenizer(
|
| 1064 |
examples['text'],
|
|
@@ -1067,7 +1068,7 @@ def tokenize_reddit(examples):
|
|
| 1067 |
padding='max_length'
|
| 1068 |
)`,
|
| 1069 |
src: 'notebooks/DA_2_Notebook.ipynb — cell 9 (tokenize_tweets, max_length=128, D1/D2) · cell 21 (tokenize_reddit, max_length=256, D3)',
|
| 1070 |
-
why: '<strong>SentencePiece subword tokenisation:</strong> Splits unknown words into subword pieces — "suicidal" might become ["su", "ici", "dal"]. No word is truly out-of-vocabulary. <strong>max_length=128 for D1/D2:</strong> Tweets average ~31 words ≈ 40 tokens. 128 is 3× headroom. <strong>max_length=256 for D3:</strong> Reddit posts average
|
| 1071 |
outputs: [
|
| 1072 |
{label:'D1/D2 shape', val:'Each input → tensor of shape [128] (input_ids) + [128] (attention_mask)'},
|
| 1073 |
{label:'D3 shape', val:'Each input → tensor of shape [256] × 2'},
|
|
|
|
| 322 |
<a href="#verdict">Conclusions</a>
|
| 323 |
<a href="#demo">Live Demo</a>
|
| 324 |
<a href="#faq">FAQ</a>
|
| 325 |
+
<a href="/flow" target="_blank" style="background:var(--bg3);color:var(--ink)">System Flow ↗</a>
|
| 326 |
</nav>
|
| 327 |
<div class="nav-badge">NCI H9DAI 2026</div>
|
| 328 |
</header>
|
|
|
|
| 348 |
<div class="stats-panel">
|
| 349 |
<div class="stat-box"><div class="stat-num" data-target="3" data-suffix="">0</div><div class="stat-lbl">Datasets</div></div>
|
| 350 |
<div class="stat-box"><div class="stat-num" data-target="12" data-suffix="">0</div><div class="stat-lbl">Models trained</div></div>
|
| 351 |
+
<div class="stat-box"><div class="stat-num" data-target="98.1" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">D3 Accuracy (binary)</div></div>
|
| 352 |
+
<div class="stat-box"><div class="stat-num" data-target="11.4" data-suffix="%" data-dec="1">0</div><div class="stat-lbl">vs Base Paper ↑</div></div>
|
| 353 |
</div>
|
| 354 |
</div>
|
| 355 |
</div>
|
|
|
|
| 438 |
</div>
|
| 439 |
<div class="md-block">
|
| 440 |
<div class="md-block-lbl">D3 — Suicide Risk (Kaggle: nikhileswarkomati)</div>
|
| 441 |
+
<div class="md-block-val"><em>232,074 Reddit posts · 2 classes</em> — Suicide / Non-Suicide (perfectly balanced, 116,037 each). Suicide posts average <em>200.8 words</em> (mean), non-suicide posts 63 words. We sample <em>50K posts</em> and compare against full/half splits to answer RQ2.</div>
|
| 442 |
</div>
|
| 443 |
<div class="md-block">
|
| 444 |
<div class="md-block-lbl">Business Context</div>
|
|
|
|
| 579 |
<div class="finding">
|
| 580 |
<div class="finding-n">02</div>
|
| 581 |
<div class="finding-t">XLM-RoBERTa is the best model for long-form text</div>
|
| 582 |
+
<div class="finding-b">On Reddit suicide risk posts (D3), XLM-RoBERTa achieves 98.1% Accuracy with the 50K sample. Suicide posts average 200.8 words — rich enough context for transformer embeddings to dominate every competitor. D2 (Twitter, ~31 words) tells the opposite story.</div>
|
| 583 |
<div class="finding-chip">D3 Accuracy: XLM-RoBERTa 98.1%</div>
|
| 584 |
</div>
|
| 585 |
<div class="finding">
|
|
|
|
| 740 |
<div class="faq-group-title">Data & Datasets</div>
|
| 741 |
<div class="faq-item">
|
| 742 |
<button class="faq-q" onclick="toggleFaq(this)">What are the three datasets and what makes them different? <span class="faq-chevron">▼</span></button>
|
| 743 |
+
<div class="faq-a"><div class="faq-a-inner">D1 — 6-class depression type classification (atypical, bipolar, major depressive, no depression, postpartum, psychotic) from Kaggle. Twitter-length text, 11,986 samples. D2 — binary depressed/not-depressed from Twitter (10,314 samples, severe 3.46× imbalance). D3 — binary suicide/non-suicide from Reddit (232K samples, perfectly balanced 116,037 each — we use a 50K sample of 25K per class). Each dataset has a different task, different text length, and different vocabulary domain — which is precisely why running all three in parallel is informative.</div></div>
|
| 744 |
</div>
|
| 745 |
<div class="faq-item">
|
| 746 |
<button class="faq-q" onclick="toggleFaq(this)">How did you handle class imbalance? Why SMOTE and not class weighting? <span class="faq-chevron">▼</span></button>
|
|
|
|
| 756 |
<div class="faq-group-title">Methodology & Models</div>
|
| 757 |
<div class="faq-item">
|
| 758 |
<button class="faq-q" onclick="toggleFaq(this)">Why four model types per dataset? Why not just use the best one? <span class="faq-chevron">▼</span></button>
|
| 759 |
+
<div class="faq-a"><div class="faq-a-inner">Each captures a different inductive bias: Logistic Regression (linear decision boundary), SVM (maximum-margin), Random Forest/XGBoost (non-linear tree ensembles), XLM-RoBERTa (contextual transformer). Disagreement between models is itself a signal. On D1, SVM (92.4%) beats XLM-RoBERTa (90.5%) — short tweets don't give the transformer enough context to gain an advantage. On D3 (200.8-word Reddit posts), XLM-RoBERTa (98.1%) dominates every classical model.</div></div>
|
| 760 |
</div>
|
| 761 |
<div class="faq-item">
|
| 762 |
<button class="faq-q" onclick="toggleFaq(this)">What are your TF-IDF settings and why? <span class="faq-chevron">▼</span></button>
|
|
|
|
| 764 |
</div>
|
| 765 |
<div class="faq-item">
|
| 766 |
<button class="faq-q" onclick="toggleFaq(this)">How was XLM-RoBERTa fine-tuned? What hyperparameters? <span class="faq-chevron">▼</span></button>
|
| 767 |
+
<div class="faq-a"><div class="faq-a-inner">Standard sequence classification fine-tuning: Adam optimiser, <code>lr=2e-5</code>, <code>3 epochs</code>, linear warmup scheduler. Max token length: 128 for D1/D2 (Twitter-length text), 256 for D3 (Reddit posts average 200.8 words). Cross-entropy loss. Best checkpoint saved by validation accuracy. 278M parameters — multilingual pretraining covers 100 languages.</div></div>
|
| 768 |
</div>
|
| 769 |
<div class="faq-item">
|
| 770 |
+
<button class="faq-q" onclick="toggleFaq(this)">Why did XGBoost collapse on D3 at full scale? <span class="faq-chevron">▼</span></button>
|
| 771 |
+
<div class="faq-a"><div class="faq-a-inner">On the 50K sample, XGBoost achieves 91.6% — competitive. At full scale (232K), it collapses to 70.52% (Macro F1: 0.6998). This is TF-IDF lexical overfitting: vocabulary overlap between "suicide" and "non-suicide" Reddit posts increases with scale — words like "exhausted", "hopeless", "nothing matters" appear in both classes. Boosted trees memorise these majority-class token patterns instead of learning discriminative boundaries. H1 (116K) drops further to 60.1%, and H1 vs H2 are inconsistent (60.1% vs 70.9%), confirming XGBoost is unstable at this data scale. XLM-RoBERTa stays at roughly 98% across all splits (97.78%–98.1%).</div></div>
|
| 772 |
</div>
|
| 773 |
</div>
|
| 774 |
|
|
|
|
| 776 |
<div class="faq-group-title">Results & Evaluation</div>
|
| 777 |
<div class="faq-item">
|
| 778 |
<button class="faq-q" onclick="toggleFaq(this)">Why is SVM accuracy 92.4% on D1 but XLM-RoBERTa (278M params) only gets 90.5%? <span class="faq-chevron">▼</span></button>
|
| 779 |
+
<div class="faq-a"><div class="faq-a-inner">Text length. D1 tweets average ~31 words. Transformers need rich context to outperform classical methods — contextual embeddings add little value on ~40-token inputs. TF-IDF bigrams on short explicit text (like tweets) already capture the full signal. This is Finding 01 and one of the key research conclusions: model selection must be text-length aware.</div></div>
|
| 780 |
</div>
|
| 781 |
<div class="faq-item">
|
| 782 |
<button class="faq-q" onclick="toggleFaq(this)">Why show accuracy rather than Macro F1? Isn't accuracy misleading on imbalanced data? <span class="faq-chevron">▼</span></button>
|
|
|
|
| 784 |
</div>
|
| 785 |
<div class="faq-item">
|
| 786 |
<button class="faq-q" onclick="toggleFaq(this)">Did adding more training data (50K → 232K) improve D3 results? <span class="faq-chevron">▼</span></button>
|
| 787 |
+
<div class="faq-a"><div class="faq-a-inner">No — XLM-RoBERTa: 98.1% (50K, NB2) · 98.02% (Full 232K) · 97.78% (H1) · 98.02% (H2). Maximum delta = 0.32%. KS tests across the three split-study partitions (Full, H1, H2) find no significant difference between distributions: suicide class p=0.4967 (H1 vs H2), p=0.9758 (Full vs H1); non-suicide class p=0.8125 (H1 vs H2), p=0.9992 (Full vs H1). All well above the p=0.05 threshold — distribution shift is not driving the results. This is Finding 03.</div></div>
|
| 788 |
</div>
|
| 789 |
</div>
|
| 790 |
|
|
|
|
| 1059 |
padding='max_length' # pad shorter inputs to fixed length
|
| 1060 |
)
|
| 1061 |
|
| 1062 |
+
# D3 uses max_length=256 — Reddit posts avg 200.8 words (~280 tokens)
|
| 1063 |
def tokenize_reddit(examples):
|
| 1064 |
return tokenizer(
|
| 1065 |
examples['text'],
|
|
|
|
| 1068 |
padding='max_length'
|
| 1069 |
)`,
|
| 1070 |
src: 'notebooks/DA_2_Notebook.ipynb — cell 9 (tokenize_tweets, max_length=128, D1/D2) · cell 21 (tokenize_reddit, max_length=256, D3)',
|
| 1071 |
+
why: '<strong>SentencePiece subword tokenisation:</strong> Splits unknown words into subword pieces — "suicidal" might become ["su", "ici", "dal"]. No word is truly out-of-vocabulary. <strong>max_length=128 for D1/D2:</strong> Tweets average ~31 words ≈ 40 tokens. 128 is 3× headroom. <strong>max_length=256 for D3:</strong> Reddit posts average 200.8 words ≈ 280 tokens — 128 would truncate most of the signal. <strong>padding=\'max_length\':</strong> All batches must be identical length for GPU tensor operations — shorter inputs are padded with [PAD] tokens. The attention mask tells the model to ignore padding.',
|
| 1072 |
outputs: [
|
| 1073 |
{label:'D1/D2 shape', val:'Each input → tensor of shape [128] (input_ids) + [128] (attention_mask)'},
|
| 1074 |
{label:'D3 shape', val:'Each input → tensor of shape [256] × 2'},
|