| """ |
MindScan — Prediction Logic
NCI H9DAI Research Project 2026

All model loading and prediction functions.
Imported by app.py — do not run directly.

Datasets:
    D1 — Zenodo (Nusrat 2024) — 6-class depression type
    D2 — Kaggle (albertobellardini) — binary depression (labels: '0'/'1')
    D3 — Kaggle (nikhileswarkomati) — binary suicide risk

Models per dataset:
    Logistic Regression, SVM, XGBoost, XLM-RoBERTa
    (Random Forest excluded — 646 MB, worst performer on D1/D3)
| """ |
|
|
| import os, re, string, joblib |
| import numpy as np |
|
|
| |
| |
| |
# Absolute directory that contains this module; model folders live beneath it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# <BASE_DIR>/models/classical and <BASE_DIR>/models/transformers respectively.
CLASSICAL_DIR, TRANSFORMER_DIR = (
    os.path.join(BASE_DIR, 'models', subfolder)
    for subfolder in ('classical', 'transformers')
)
|
|
| |
| |
| |
# Hugging Face Hub repository id; load_all_models() downloads a snapshot of it
# when the local XLM-RoBERTa folder for D1 is missing.
HF_XLMR_REPO = 'Esvanth/mindscan-xlmr'
|
|
| |
| |
| |
| |
| |
# Maps raw D2 class labels to display strings. Each code is registered both as
# a string ('0'/'1') and as an int (0/1) so either encoder label type resolves.
D2_LABEL_MAP = {
    key_type(code): readable
    for key_type in (str, int)
    for code, readable in enumerate(('Not Depressed', 'Depressed'))
}
|
|
| |
| |
| |
# Registry of loaded artefacts (encoders, vectorisers, classifiers, tokenizer,
# transformer models, device), filled in by load_all_models().
_models = dict()
# Set to True by load_all_models() once it has finished.
_loaded = False
|
|
|
|
def models_loaded():
    """Return the module-level ``_loaded`` flag (True once ``load_all_models`` has finished)."""
    ready = _loaded
    return ready
|
|
|
|
def load_all_models():
    """
    Load all 12 models (4 per dataset x 3 datasets) into the ``_models`` registry.

    For each dataset (d1, d2, d3) this loads the label encoder and TF-IDF
    vectoriser, then the three classical classifiers (logistic regression,
    SVM, XGBoost) from ``CLASSICAL_DIR``. It then tries to load the shared
    tokenizer and the three XLM-RoBERTa models from ``TRANSFORMER_DIR``,
    first downloading a snapshot of ``HF_XLMR_REPO`` when the local
    ``xlmr_d1_final`` folder does not exist.

    Transformer loading is best-effort: any exception raised there is printed
    and swallowed so the classical models stay usable. Failures while loading
    the classical artefacts are not caught and propagate to the caller.

    Intended to be called once at server startup; the original author notes it
    takes roughly 30 s on CPU because of XLM-RoBERTa.

    Side effects:
        Populates ``_models``, sets ``_loaded`` to True and prints progress.
    """
    global _loaded

    dataset_ids = ('d1', 'd2', 'd3')

    # NOTE: joblib.load unpickles arbitrary Python objects, so every .pkl file
    # read below must come from a trusted source.

    # Label encoder + TF-IDF vectoriser, one pair per dataset.
    for ds in dataset_ids:
        _models[f'le_{ds}'] = joblib.load(os.path.join(CLASSICAL_DIR, f'le_{ds}.pkl'))
        _models[f'tfidf_{ds}'] = joblib.load(os.path.join(CLASSICAL_DIR, f'tfidf_{ds}.pkl'))
        print(f" β Loaded encoders/tfidf for {ds}")

    # Classical classifiers, stored under '<model_name>_<dataset>'.
    for model_name in ('logistic_regression', 'svm', 'xgboost'):
        for ds in dataset_ids:
            key = f'{model_name}_{ds}'
            _models[key] = joblib.load(os.path.join(CLASSICAL_DIR, f'{key}.pkl'))
            print(f" β Loaded {key}")

    # Transformers are optional: a failure here must not stop the classical path.
    try:
        # Imported lazily so a missing torch/transformers install is caught below.
        import torch
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        _models['device'] = device
        print(f" β Using device: {device}")

        # Only the D1 folder is checked; if it is absent the whole repo
        # snapshot is downloaded into TRANSFORMER_DIR.
        d1_local = os.path.join(TRANSFORMER_DIR, 'xlmr_d1_final')
        if not os.path.isdir(d1_local):
            from huggingface_hub import snapshot_download
            print(f" β Downloading transformers from {HF_XLMR_REPO} ...")
            snapshot_download(
                repo_id=HF_XLMR_REPO,
                repo_type="model",
                local_dir=TRANSFORMER_DIR,
                local_dir_use_symlinks=False,
            )
            print(" β Transformers downloaded")

        # One tokenizer, taken from the D1 folder, is shared by all three models.
        _models['tokenizer'] = AutoTokenizer.from_pretrained(d1_local)
        print(" β Tokeniser loaded")

        # Each dataset's model is stored with the max token length used at inference.
        for ds, max_len in [('d1', 128), ('d2', 128), ('d3', 256)]:
            folder = os.path.join(TRANSFORMER_DIR, f'xlmr_{ds}_final')
            model = AutoModelForSequenceClassification.from_pretrained(folder)
            model = model.to(device)
            model.eval()
            _models[f'xlmr_{ds}'] = model
            _models[f'xlmr_{ds}_len'] = max_len
            print(f" β Loaded XLM-RoBERTa {ds} (max_length={max_len})")

    except Exception as e:
        # Deliberate best-effort boundary: report and continue without transformers.
        print(f" β XLM-RoBERTa failed to load: {e}")
        print(" Classical models will still work.")

    _loaded = True
    print(" ✅ All models ready")
|
|
|
|
| |
| |
| |
def clean_text(text):
    """
    Normalise raw text for the TF-IDF based classical models.

    Steps, in order: coerce to ``str`` and lower-case; delete URL-like tokens
    (``http...``/``www...``), ``@mentions`` and ``#`` characters; delete every
    character in ``string.punctuation``; collapse whitespace runs to a single
    space and trim both ends.

    Returns the cleaned string (possibly empty).
    """
    cleaned = str(text).lower()
    for pattern in (r'http\S+|www\S+|https\S+', r'@\w+', r'#'):
        cleaned = re.sub(pattern, '', cleaned)
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)
    return re.sub(r'\s+', ' ', cleaned).strip()
|
|
|
|
| |
| |
| |
def predict_classical(text_clean, ds):
    """
    Score one cleaned text with the three classical models of dataset ``ds``.

    The text is vectorised with the dataset's TF-IDF vectoriser and passed to
    logistic regression, SVM and XGBoost. The predicted value is used as an
    index into the label encoder's ``classes_``; for ``'d2'`` the raw label is
    mapped through ``D2_LABEL_MAP``.

    Confidence is taken from ``predict_proba`` for the predicted class when
    the model has it; otherwise from a softmax over ``decision_function``
    scores (a scalar binary score ``s`` is expanded to ``[-s, s]``); otherwise
    it is 1.0.

    Returns:
        dict: ``{display_name: {'label': str, 'confidence': float}}`` with
        confidence rounded to 4 decimals.
    """
    vectoriser = _models[f'tfidf_{ds}']
    encoder = _models[f'le_{ds}']
    features = vectoriser.transform([text_clean])

    outputs = {}
    for slug, title in (
        ('logistic_regression', 'Logistic Regression'),
        ('svm', 'SVM'),
        ('xgboost', 'XGBoost'),
    ):
        clf = _models[f'{slug}_{ds}']
        idx = clf.predict(features)[0]
        raw = encoder.classes_[idx]

        # Only D2 stores opaque '0'/'1' labels that need a readable name.
        label = D2_LABEL_MAP.get(raw, str(raw)) if ds == 'd2' else str(raw)

        if hasattr(clf, 'predict_proba'):
            conf = float(clf.predict_proba(features)[0][idx])
        elif hasattr(clf, 'decision_function'):
            margins = clf.decision_function(features)[0]
            if np.ndim(margins) == 0:
                # Binary case: one signed score; build a two-class score vector.
                margins = np.array([float(-margins), float(margins)])
            # Subtracting the max keeps exp() from overflowing.
            weights = np.exp(margins - margins.max())
            conf = float(weights[idx] / weights.sum())
        else:
            conf = 1.0

        outputs[title] = {'label': label, 'confidence': round(conf, 4)}

    return outputs
|
|
|
|
def predict_transformer(text_raw, ds):
    """
    Score one raw text with the XLM-RoBERTa model of dataset ``ds``.

    Returns ``None`` when no ``xlmr_<ds>`` model is registered in ``_models``.
    Otherwise the text is tokenised (truncated and padded to the dataset's
    stored max length), run through the model without gradients, and the
    logits are soft-maxed into class probabilities.

    Returns:
        dict | None: ``{'label', 'confidence', 'all_probs'}`` where
        ``all_probs`` maps every class name to its probability (rounded to
        4 decimals). For ``'d2'`` class names go through ``D2_LABEL_MAP``.
    """
    model_key = f'xlmr_{ds}'
    if model_key not in _models:
        return None

    import torch

    net = _models[model_key]
    tokenizer = _models['tokenizer']
    encoder = _models[f'le_{ds}']
    seq_len = _models[f'xlmr_{ds}_len']
    device = _models.get('device', 'cpu')

    encoded = tokenizer(
        text_raw,
        return_tensors='pt',
        max_length=seq_len,
        truncation=True,
        padding='max_length',
    ).to(device)

    with torch.no_grad():
        logits = net(**encoded).logits

    # Single input, so take row 0 of the (1, n_classes) probability matrix.
    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    best = int(probs.argmax())

    def readable(raw):
        # Only D2 labels are remapped; everything else is stringified as-is.
        return D2_LABEL_MAP.get(raw, str(raw)) if ds == 'd2' else str(raw)

    label = readable(encoder.classes_[best])
    all_probs = {
        readable(encoder.classes_[i]): round(float(p), 4)
        for i, p in enumerate(probs)
    }

    return {
        'label': label,
        'confidence': round(float(probs[best]), 4),
        'all_probs': all_probs,
    }
|
|
|
|
| |
| |
| |
def _run_dataset(clean, raw_text, ds):
    """Run every available model for one dataset; return (models, winner, xlmr_result)."""
    models = predict_classical(clean, ds)
    xlmr = predict_transformer(raw_text, ds)
    if xlmr:
        models['XLM-RoBERTa'] = {k: xlmr[k] for k in ('label', 'confidence')}
    # Winner = the (name, result) pair with the highest reported confidence.
    winner = max(models.items(), key=lambda item: item[1]['confidence'])
    return models, winner, xlmr


def _dataset_entry(task, models, winner):
    """Build the per-dataset section of the predict_all() response."""
    winner_name, winner_result = winner
    return {
        'task': task,
        'models': models,
        'winner_model': winner_name,
        'winner_prediction': winner_result['label'],
        'winner_confidence': winner_result['confidence'],
    }


def _winner_line(winner):
    """Format a winner as 'label (confidence% <sep> model name)' for the summary."""
    winner_name, winner_result = winner
    return f"{winner_result['label']} ({winner_result['confidence']*100:.1f}% β {winner_name})"


def predict_all(raw_text):
    """
    Run a text through every loaded model across the three datasets.

    The classical models receive the output of ``clean_text``; XLM-RoBERTa
    receives ``raw_text`` unchanged. When a transformer is not loaded its
    entry is simply absent from that dataset's ``models`` dict.

    Args:
        raw_text: The user-supplied text to analyse.

    Returns:
        dict with keys:
            dataset1 / dataset2 / dataset3: each
                ``{task, models, winner_model, winner_prediction,
                winner_confidence}``; ``models`` maps model name to
                ``{label, confidence}`` and the winner is the model with the
                highest confidence. ``dataset1`` additionally carries
                ``class_probs``, the per-class breakdown from XLM-RoBERTa
                (empty dict when it is unavailable).
            risk_flag: True when at least 3 D3 models predict suicide.
            suicide_votes: ``"N/M models flagged suicide risk"`` where M is
                the number of D3 models that actually ran.
            winner_summary: ``{depression_type, depressed, suicide_risk}``
                one-line strings describing each dataset's winner.
    """
    clean = clean_text(raw_text)

    d1, d1_winner, xlmr1 = _run_dataset(clean, raw_text, 'd1')
    d2, d2_winner, _ = _run_dataset(clean, raw_text, 'd2')
    d3, d3_winner, _ = _run_dataset(clean, raw_text, 'd3')

    # A vote counts when the label mentions "suicide" but not "non"
    # (which excludes labels such as "non-suicide").
    suicide_count = sum(
        1 for result in d3.values()
        if 'suicide' in result['label'].lower() and 'non' not in result['label'].lower()
    )
    # The threshold stays at 3 votes even when fewer than 4 models ran.
    risk_flag = suicide_count >= 3

    dataset1 = _dataset_entry('Depression Type (6 Classes)', d1, d1_winner)
    dataset1['class_probs'] = xlmr1.get('all_probs', {}) if xlmr1 else {}

    return {
        'dataset1': dataset1,
        'dataset2': _dataset_entry('Depressed or Not?', d2, d2_winner),
        'dataset3': _dataset_entry('Suicide Risk Detection', d3, d3_winner),
        'risk_flag': risk_flag,
        # Report the real number of voters: only 3 when XLM-RoBERTa is unavailable.
        'suicide_votes': f'{suicide_count}/{len(d3)} models flagged suicide risk',
        'winner_summary': {
            'depression_type': _winner_line(d1_winner),
            'depressed': _winner_line(d2_winner),
            'suicide_risk': _winner_line(d3_winner),
        }
    }
|
|