File size: 12,769 Bytes
016c645
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
"""
MindScan — Prediction Logic
NCI H9DAI Research Project 2026

All model loading and prediction functions.
Imported by app.py — do not run directly.

Datasets:
  D1 — Zenodo (Nusrat 2024) — 6-class depression type
  D2 — Kaggle (albertobellardini) — binary depression (labels: '0'/'1')
  D3 — Kaggle (nikhileswarkomati) — binary suicide risk

Models per dataset:
  Logistic Regression, SVM, XGBoost, XLM-RoBERTa
  (Random Forest excluded — 646 MB, worst performer on D1/D3)
"""

import os, re, string, joblib
import numpy as np

# ─────────────────────────────────────────────────────────────────
# PATHS
# All model artefacts live under <directory of this file>/models/.
# ─────────────────────────────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
CLASSICAL_DIR, TRANSFORMER_DIR = (
    os.path.join(BASE_DIR, 'models', subdir)
    for subdir in ('classical', 'transformers')
)

# ─────────────────────────────────────────────────────────────────
# D2 LABEL MAPPING
# The dataset uses '0' and '1' as labels; the UI shows readable text.
# Every code is registered twice: first as a string key, then as an
# integer key, so a lookup works with either representation.
# ─────────────────────────────────────────────────────────────────
D2_LABEL_MAP = {
    as_key(code): readable
    for as_key in (str, int)
    for code, readable in enumerate(('Not Depressed', 'Depressed'))
}

# ─────────────────────────────────────────────────────────────────
# MODEL STORAGE — populated by load_all_models()
# ─────────────────────────────────────────────────────────────────
_models = dict()
_loaded = False


def models_loaded() -> bool:
    """Return the module-level ``_loaded`` flag, which ``load_all_models()`` sets to True."""
    return _loaded


def load_all_models():
    """
    Load every model artefact into the module-level ``_models`` registry.

    Loads all 12 models (4 per dataset × 3 datasets) into memory.
    Called once at server startup. Takes ~30s on CPU due to XLM-RoBERTa.

    Loading happens in three stages:
      1. label encoder + TF-IDF vectoriser for d1, d2 and d3;
      2. Logistic Regression, SVM and XGBoost for each dataset;
      3. the shared tokeniser and one XLM-RoBERTa checkpoint per dataset.

    Stages 1 and 2 are mandatory: an error while reading those files
    propagates to the caller. Stage 3 is best-effort: any exception is
    reported on stdout and swallowed, so the classical models remain
    usable (checkpoints registered before the failure stay registered).

    Side effects:
        Fills ``_models``, prints progress to stdout and sets the global
        ``_loaded`` flag to True.
    """
    global _loaded

    datasets = ('d1', 'd2', 'd3')

    # ── Classical support files ───────────────────────────────────
    for ds in datasets:
        _models[f'le_{ds}']    = joblib.load(os.path.join(CLASSICAL_DIR, f'le_{ds}.pkl'))
        _models[f'tfidf_{ds}'] = joblib.load(os.path.join(CLASSICAL_DIR, f'tfidf_{ds}.pkl'))
        print(f"  ✓ Loaded encoders/tfidf for {ds}")

    # ── Classical models ──────────────────────────────────────────
    for model_name in ['logistic_regression', 'svm', 'xgboost']:
        for ds in datasets:
            key  = f'{model_name}_{ds}'
            path = os.path.join(CLASSICAL_DIR, f'{key}.pkl')
            _models[key] = joblib.load(path)
            print(f"  ✓ Loaded {key}")

    # ── XLM-RoBERTa transformers ──────────────────────────────────
    transformers_ready = False
    try:
        # Imported lazily so the module still works without torch/transformers.
        import torch
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        _models['device'] = device
        print(f"  ✓ Using device: {device}")

        # Shared tokenizer (all 3 models use the same base tokeniser)
        tokenizer_path = os.path.join(TRANSFORMER_DIR, 'xlmr_d1_final')
        _models['tokenizer'] = AutoTokenizer.from_pretrained(tokenizer_path)
        print("  ✓ Tokeniser loaded")

        for ds, max_len in [('d1', 128), ('d2', 128), ('d3', 256)]:
            folder = os.path.join(TRANSFORMER_DIR, f'xlmr_{ds}_final')
            model  = AutoModelForSequenceClassification.from_pretrained(folder)
            model  = model.to(device)
            model.eval()
            _models[f'xlmr_{ds}']     = model
            _models[f'xlmr_{ds}_len'] = max_len
            print(f"  ✓ Loaded XLM-RoBERTa {ds} (max_length={max_len})")

        transformers_ready = True

    except Exception as e:
        # Deliberately broad: a missing dependency, a missing checkpoint
        # folder or a device error must not prevent the server from starting.
        print(f"  ⚠ XLM-RoBERTa failed to load: {e}")
        print("    Classical models will still work.")

    _loaded = True
    # Only claim that everything is ready when the transformer stage finished.
    if transformers_ready:
        print("  ✅ All models ready")
    else:
        print("  ✅ Classical models ready (XLM-RoBERTa unavailable or only partially loaded)")


# ─────────────────────────────────────────────────────────────────
# TEXT CLEANING — same function used in both notebooks
# ─────────────────────────────────────────────────────────────────
def clean_text(text):
    """
    Normalise raw text for the TF-IDF based models.

    The input is converted with ``str()`` and lower-cased, then, in order:
    URLs (``http...``, ``www...``), ``@mentions`` and ``#`` characters are
    removed, every ASCII punctuation character is dropped, and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Returns the cleaned string (possibly empty).
    """
    cleaned = str(text).lower()

    # Order matters: URLs and mentions must go before punctuation is stripped,
    # otherwise the patterns would no longer recognise them.
    for pattern in (r'http\S+|www\S+|https\S+', r'@\w+', r'#'):
        cleaned = re.sub(pattern, '', cleaned)

    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(drop_punctuation)

    return re.sub(r'\s+', ' ', cleaned).strip()


# ─────────────────────────────────────────────────────────────────
# PREDICTION HELPERS
# ─────────────────────────────────────────────────────────────────
def predict_classical(text_clean, ds):
    """
    Classify one cleaned text with the three classical models of a dataset.

    Args:
        text_clean: text to classify; it is vectorised with the dataset's
            TF-IDF vectoriser stored in ``_models``.
        ds: dataset key used to build the ``_models`` lookups
            (e.g. ``'d1'``, ``'d2'``, ``'d3'``).

    Returns:
        dict mapping display name ('Logistic Regression', 'SVM', 'XGBoost')
        to ``{'label': str, 'confidence': float}``; confidence is rounded to
        4 decimals.

    Raises:
        KeyError: if a required entry is missing from ``_models``.
    """
    vectoriser = _models[f'tfidf_{ds}']
    encoder = _models[f'le_{ds}']
    features = vectoriser.transform([text_clean])

    outputs = {}
    for model_key, shown_as in (
        ('logistic_regression', 'Logistic Regression'),
        ('svm', 'SVM'),
        ('xgboost', 'XGBoost'),
    ):
        clf = _models[f'{model_key}_{ds}']
        class_idx = clf.predict(features)[0]
        raw = encoder.classes_[class_idx]

        # Only D2 labels go through the readable-name mapping.
        label = D2_LABEL_MAP.get(raw, str(raw)) if ds == 'd2' else str(raw)

        if hasattr(clf, 'predict_proba'):
            # Probability the model assigns to the class it predicted.
            confidence = float(clf.predict_proba(features)[0][class_idx])
        elif hasattr(clf, 'decision_function'):
            margins = clf.decision_function(features)[0]
            if np.ndim(margins) == 0:
                # A single score s is expanded to the pair [-s, s] so the
                # softmax below can be indexed by the predicted class.
                margins = np.array([float(-margins), float(margins)])
            # Softmax, shifted by the maximum for numerical stability.
            weights = np.exp(margins - margins.max())
            confidence = float(weights[class_idx] / weights.sum())
        else:
            # No score available at all: report full confidence.
            confidence = 1.0

        outputs[shown_as] = {
            'label': label,
            'confidence': round(confidence, 4),
        }

    return outputs


def predict_transformer(text_raw, ds):
    """
    Classify one text with the XLM-RoBERTa model registered for ``ds``.

    Args:
        text_raw: text handed to the tokenizer as-is; it is truncated and
            padded to the per-dataset maximum length stored in ``_models``.
        ds: dataset key used to build the ``_models`` lookups.

    Returns:
        None when no ``xlmr_<ds>`` model is registered; otherwise a dict
        with ``label`` (predicted class name), ``confidence`` (its softmax
        probability) and ``all_probs`` ({class name: probability} for every
        class, used for the class breakdown bars in the UI). Probabilities
        are rounded to 4 decimals.
    """
    model_key = f'xlmr_{ds}'
    if model_key not in _models:
        return None

    import torch

    network = _models[model_key]
    tokenizer = _models['tokenizer']
    encoder = _models[f'le_{ds}']
    token_limit = _models[f'{model_key}_len']
    device = _models.get('device', 'cpu')

    def readable(raw):
        # Only D2 labels go through the readable-name mapping.
        if ds == 'd2':
            return D2_LABEL_MAP.get(raw, str(raw))
        return str(raw)

    encoded = tokenizer(
        text_raw,
        return_tensors='pt',
        max_length=token_limit,
        truncation=True,
        padding='max_length',
    )
    encoded = encoded.to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = network(**encoded).logits

    # One input text, so take row 0 of the softmax output.
    class_probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    top = int(np.argmax(class_probs))

    return {
        'label': readable(encoder.classes_[top]),
        'confidence': round(float(class_probs[top]), 4),
        'all_probs': {
            readable(encoder.classes_[i]): round(float(p), 4)
            for i, p in enumerate(class_probs)
        },
    }


# ─────────────────────────────────────────────────────────────────
# MAIN FUNCTION — called by Flask /predict endpoint
# ─────────────────────────────────────────────────────────────────
def _run_dataset(clean, raw_text, ds):
    """Run every available model for one dataset; return (results, winner, xlmr_output)."""
    results = predict_classical(clean, ds)
    xlmr = predict_transformer(raw_text, ds)
    if xlmr:
        # Keep only the fields shared with the classical results.
        results['XLM-RoBERTa'] = {k: xlmr[k] for k in ('label', 'confidence')}
    # Winner = the (model name, result) pair with the highest confidence.
    winner = max(results.items(), key=lambda item: item[1]['confidence'])
    return results, winner, xlmr


def _dataset_section(task, results, winner):
    """Build the per-dataset part of the response from its results and winner."""
    winner_name, best = winner
    return {
        'task':               task,
        'models':             results,
        'winner_model':       winner_name,
        'winner_prediction':  best['label'],
        'winner_confidence':  best['confidence'],
    }


def _winner_line(winner):
    """Format a winner as 'label (confidence% — model name)'."""
    winner_name, best = winner
    return f"{best['label']} ({best['confidence']*100:.1f}% — {winner_name})"


def predict_all(raw_text):
    """
    Run one text through every loaded model across the 3 datasets.

    The classical models receive ``clean_text(raw_text)``; XLM-RoBERTa
    receives ``raw_text`` unchanged. For each dataset the "winner" is the
    model reporting the highest confidence.

    Returns dict:
    {
      dataset1: {
        task, models: {Logistic Regression, SVM, XGBoost[, XLM-RoBERTa]},
        winner_model, winner_prediction, winner_confidence,
        class_probs   <- only D1, per-class breakdown from XLM-RoBERTa
                         ({} when XLM-RoBERTa is unavailable)
      },
      dataset2: { same structure without class_probs },
      dataset3: { same structure without class_probs },
      risk_flag: bool,   <- True if at least 3 D3 models say "suicide"
      suicide_votes: "N/M models flagged suicide risk"
                         (M = number of D3 models that actually voted),
      winner_summary: { depression_type, depressed, suicide_risk }
    }

    The XLM-RoBERTa entry is absent for a dataset whose transformer model
    is not loaded; with only the 3 classical models voting, ``risk_flag``
    therefore requires all of them to agree.
    """
    clean = clean_text(raw_text)

    # ── Dataset 1: Depression type ────────────────────────────────
    d1, d1_winner, xlmr1 = _run_dataset(clean, raw_text, 'd1')

    # ── Dataset 2: Binary depression ─────────────────────────────
    d2, d2_winner, _ = _run_dataset(clean, raw_text, 'd2')

    # ── Dataset 3: Suicide risk ───────────────────────────────────
    d3, d3_winner, _ = _run_dataset(clean, raw_text, 'd3')

    # ── Suicide risk flag: vote across the D3 models ──────────────
    # A label counts as a suicide vote when it mentions "suicide" but
    # not "non" (e.g. "non-suicide" is not a vote).
    suicide_count = sum(
        1 for r in d3.values()
        if 'suicide' in r['label'].lower() and 'non' not in r['label'].lower()
    )
    risk_flag = suicide_count >= 3

    dataset1 = _dataset_section('Depression Type (6 Classes)', d1, d1_winner)
    dataset1['class_probs'] = xlmr1.get('all_probs', {}) if xlmr1 else {}

    return {
        'dataset1': dataset1,
        'dataset2': _dataset_section('Depressed or Not?', d2, d2_winner),
        'dataset3': _dataset_section('Suicide Risk Detection', d3, d3_winner),
        'risk_flag':     risk_flag,
        # Report the real number of voters: XLM-RoBERTa may be missing.
        'suicide_votes': f'{suicide_count}/{len(d3)} models flagged suicide risk',
        'winner_summary': {
            'depression_type': _winner_line(d1_winner),
            'depressed':       _winner_line(d2_winner),
            'suicide_risk':    _winner_line(d3_winner),
        }
    }