File size: 12,769 Bytes

016c645

"""
MindScan — Prediction Logic
NCI H9DAI Research Project 2026

All model loading and prediction functions.
Imported by app.py — do not run directly.

Datasets:
  D1 — Zenodo (Nusrat 2024) — 6-class depression type
  D2 — Kaggle (albertobellardini) — binary depression (labels: '0'/'1')
  D3 — Kaggle (nikhileswarkomati) — binary suicide risk

Models per dataset:
  Logistic Regression, SVM, XGBoost, XLM-RoBERTa
  (Random Forest excluded — 646 MB, worst performer on D1/D3)
"""

import os, re, string, joblib
import numpy as np

# ─────────────────────────────────────────────────────────────────
# PATHS
# ─────────────────────────────────────────────────────────────────
# Directory containing this module. The model directories below are built
# from it, so they do not depend on the process's working directory.
BASE_DIR       = os.path.dirname(os.path.abspath(__file__))
# Pickled artefacts read by load_all_models(): le_<ds>.pkl, tfidf_<ds>.pkl
# and one <model_name>_<ds>.pkl per classical model and dataset.
CLASSICAL_DIR  = os.path.join(BASE_DIR, 'models', 'classical')
# Holds one xlmr_<ds>_final folder per XLM-RoBERTa model; the shared
# tokenizer is read from xlmr_d1_final.
TRANSFORMER_DIR = os.path.join(BASE_DIR, 'models', 'transformers')

# ─────────────────────────────────────────────────────────────────
# D2 LABEL MAPPING
# The dataset uses '0' and '1' as labels.
# We map them to human-readable strings for the UI.
# ─────────────────────────────────────────────────────────────────
# Looked up with the raw class taken from the D2 label encoder's classes_.
# The string keys match the dataset's own labels; the integer keys presumably
# cover an encoder that stores the classes as ints — TODO confirm.
# Lookups use .get(raw, str(raw)), so a label missing from this table falls
# back to its plain string form instead of raising.
D2_LABEL_MAP = {
    '0': 'Not Depressed',
    '1': 'Depressed',
    0: 'Not Depressed',
    1: 'Depressed',
}

# ─────────────────────────────────────────────────────────────────
# MODEL STORAGE — populated by load_all_models()
# ─────────────────────────────────────────────────────────────────
# Registry read by the prediction functions. Keys written by
# load_all_models(), where <ds> is one of d1, d2, d3:
#   'le_<ds>', 'tfidf_<ds>'         label encoder / TF-IDF vectorizer
#   '<model_name>_<ds>'             logistic_regression, svm, xgboost
#   'device', 'tokenizer'           transformer device string, shared tokenizer
#   'xlmr_<ds>', 'xlmr_<ds>_len'    XLM-RoBERTa model and its max token length
# The transformer keys may be missing (wholly or partly) when that loading
# stage failed.
_models = {}
# Set to True as the last step of load_all_models(); exposed through
# models_loaded().
_loaded = False


def models_loaded() -> bool:
    """
    Report whether load_all_models() has run to completion.

    True does not guarantee that the XLM-RoBERTa models are available:
    load_all_models() catches a failure in that stage and still sets the flag.
    """
    return _loaded


def load_all_models():
    """
    Fill the module-level model registry and mark it as loaded.

    Loading order:
      1. label encoder + TF-IDF vectorizer for each of d1, d2, d3
      2. the nine classical classifiers (3 algorithms × 3 datasets)
      3. the shared tokenizer and the three XLM-RoBERTa models

    Steps 1 and 2 are mandatory: any error raised while unpickling
    propagates to the caller. Step 3 is best-effort: any exception is
    caught and printed, and the function still finishes normally, so the
    registry may end up without some or all transformer entries.

    Meant to be called once at server startup; the transformer stage is
    slow on CPU.
    """
    global _loaded

    datasets = ('d1', 'd2', 'd3')

    def load_pickle(filename):
        # Every classical artefact lives flat inside CLASSICAL_DIR.
        return joblib.load(os.path.join(CLASSICAL_DIR, filename))

    # ── Classical support files ───────────────────────────────────
    for ds in datasets:
        _models[f'le_{ds}'] = load_pickle(f'le_{ds}.pkl')
        _models[f'tfidf_{ds}'] = load_pickle(f'tfidf_{ds}.pkl')
        print(f"  ✓ Loaded encoders/tfidf for {ds}")

    # ── Classical models ──────────────────────────────────────────
    for model_name in ('logistic_regression', 'svm', 'xgboost'):
        for ds in datasets:
            key = f'{model_name}_{ds}'
            _models[key] = load_pickle(f'{key}.pkl')
            print(f"  ✓ Loaded {key}")

    # ── XLM-RoBERTa transformers ──────────────────────────────────
    try:
        # Imported lazily so the classical models keep working on a machine
        # where torch / transformers cannot be imported.
        import torch
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'
        _models['device'] = device
        print(f"  ✓ Using device: {device}")

        # One tokenizer, read from the d1 folder, serves all three models.
        _models['tokenizer'] = AutoTokenizer.from_pretrained(
            os.path.join(TRANSFORMER_DIR, 'xlmr_d1_final')
        )
        print("  ✓ Tokeniser loaded")

        max_lengths = {'d1': 128, 'd2': 128, 'd3': 256}
        for ds, max_len in max_lengths.items():
            checkpoint = os.path.join(TRANSFORMER_DIR, f'xlmr_{ds}_final')
            net = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
            net.eval()
            # Register only after the model is fully prepared.
            _models[f'xlmr_{ds}'] = net
            _models[f'xlmr_{ds}_len'] = max_len
            print(f"  ✓ Loaded XLM-RoBERTa {ds} (max_length={max_len})")

    except Exception as err:
        print(f"  ⚠ XLM-RoBERTa failed to load: {err}")
        print("    Classical models will still work.")

    _loaded = True
    print("  ✅ All models ready")


# ─────────────────────────────────────────────────────────────────
# TEXT CLEANING — same function used in both notebooks
# ─────────────────────────────────────────────────────────────────
def clean_text(text):
    """
    Normalise text for the TF-IDF based models.

    The input is converted with str() and lower-cased, then URLs,
    @mentions and '#' characters are removed, all ASCII punctuation is
    stripped, and whitespace runs are collapsed to single spaces with the
    ends trimmed. Non-string input is not rejected: None becomes 'none'.

    This must stay identical to the cleaning used when the models were
    trained, so do not alter the steps or their order.
    """
    cleaned = str(text).lower()

    # URLs and @mentions have to go first, while the punctuation that
    # delimits them is still present.
    for pattern in (r'http\S+|www\S+|https\S+', r'@\w+', r'#'):
        cleaned = re.sub(pattern, '', cleaned)

    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    return re.sub(r'\s+', ' ', cleaned).strip()


# ─────────────────────────────────────────────────────────────────
# PREDICTION HELPERS
# ─────────────────────────────────────────────────────────────────
def predict_classical(text_clean, ds):
    """
    Score one cleaned text with the three classical models of a dataset.

    text_clean: text already passed through clean_text().
    ds:         dataset key ('d1', 'd2' or 'd3') selecting the vectorizer,
                label encoder and models from the registry.

    Returns { display_name: {'label': str, 'confidence': float} } with the
    display names 'Logistic Regression', 'SVM' and 'XGBoost'. Confidence
    is rounded to 4 decimals.

    Raises KeyError when a required registry entry has not been loaded.
    """
    vectorizer = _models[f'tfidf_{ds}']
    encoder    = _models[f'le_{ds}']
    features   = vectorizer.transform([text_clean])

    output = {}
    for model_key, shown_as in (
        ('logistic_regression', 'Logistic Regression'),
        ('svm',                 'SVM'),
        ('xgboost',             'XGBoost'),
    ):
        clf = _models[f'{model_key}_{ds}']
        # The prediction is used as an index into the encoder's classes_.
        winner = clf.predict(features)[0]
        class_name = encoder.classes_[winner]

        # Only D2 labels are translated to readable strings.
        if ds != 'd2':
            label = str(class_name)
        else:
            label = D2_LABEL_MAP.get(class_name, str(class_name))

        if hasattr(clf, 'predict_proba'):
            confidence = float(clf.predict_proba(features)[0][winner])
        elif hasattr(clf, 'decision_function'):
            # No probabilities: turn the decision scores into a
            # pseudo-probability with a numerically stable softmax.
            margins = clf.decision_function(features)[0]
            if np.ndim(margins) == 0:
                # A single score: expand it to one score per class.
                margins = np.array([float(-margins), float(margins)])
            weights = np.exp(margins - margins.max())
            confidence = float(weights[winner] / weights.sum())
        else:
            # Nothing to derive a score from.
            confidence = 1.0

        output[shown_as] = {
            'label':      label,
            'confidence': round(confidence, 4),
        }

    return output


def predict_transformer(text_raw, ds):
    """
    Score one raw (uncleaned) text with the XLM-RoBERTa model of a dataset.

    Returns None when no transformer is registered for `ds`, for example
    because that loading stage failed. Otherwise returns
        {'label': str, 'confidence': float, 'all_probs': {class: prob}}
    where all_probs holds one entry per model output, keyed by the same
    readable labels used for 'label', and feeds the class breakdown bars
    in the UI. Probabilities are rounded to 4 decimals.

    NOTE(review): class names come from the dataset's label encoder, which
    assumes the model's output order matches encoder.classes_ — confirm
    against the training notebook.
    """
    model_key = f'xlmr_{ds}'
    if model_key not in _models:
        return None

    import torch

    net       = _models[model_key]
    tokenizer = _models['tokenizer']
    encoder   = _models[f'le_{ds}']
    limit     = _models[f'xlmr_{ds}_len']
    device    = _models.get('device', 'cpu')

    def readable(raw):
        # Only D2 labels are translated to readable strings.
        if ds == 'd2':
            return D2_LABEL_MAP.get(raw, str(raw))
        return str(raw)

    encoded = tokenizer(
        text_raw,
        return_tensors='pt',
        max_length=limit,
        truncation=True,
        padding='max_length'
    ).to(device)

    with torch.no_grad():
        logits = net(**encoded).logits

    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    best  = int(probs.argmax())
    label = readable(encoder.classes_[best])

    all_probs = {
        readable(encoder.classes_[i]): round(float(p), 4)
        for i, p in enumerate(probs)
    }

    return {
        'label':      label,
        'confidence': round(float(probs[best]), 4),
        'all_probs':  all_probs,
    }


# ─────────────────────────────────────────────────────────────────
# MAIN FUNCTION — called by Flask /predict endpoint
# ─────────────────────────────────────────────────────────────────
def _run_dataset(raw_text, clean, ds):
    """Run every available model for one dataset and pick the most confident.

    Returns (results, xlmr, winner):
      results — { model display name: {label, confidence} }
      xlmr    — full predict_transformer() output, or None when unavailable
      winner  — (display name, {label, confidence}) with the highest
                confidence; ties go to the model listed first
    """
    results = predict_classical(clean, ds)
    # The transformer receives the raw text; only the classical models use
    # the cleaned version.
    xlmr = predict_transformer(raw_text, ds)
    if xlmr:
        results['XLM-RoBERTa'] = {k: xlmr[k] for k in ('label', 'confidence')}
    winner = max(results.items(), key=lambda item: item[1]['confidence'])
    return results, xlmr, winner


def _dataset_section(task, results, winner):
    """Build the per-dataset part of the predict_all() response."""
    winner_name, winner_pred = winner
    return {
        'task':               task,
        'models':             results,
        'winner_model':       winner_name,
        'winner_prediction':  winner_pred['label'],
        'winner_confidence':  winner_pred['confidence'],
    }


def _winner_line(winner):
    """Format a winner as 'label (NN.N% — model name)' for the summary."""
    winner_name, winner_pred = winner
    return f"{winner_pred['label']} ({winner_pred['confidence']*100:.1f}% — {winner_name})"


def predict_all(raw_text):
    """
    Runs text through every loaded model across the 3 datasets.

    With everything loaded that is 12 models (4 per dataset). When
    XLM-RoBERTa is unavailable for a dataset, that dataset reports only
    its 3 classical models.

    Returns dict:
    {
      dataset1: {
        task, models: {LR, SVM, XGBoost, XLM-RoBERTa},
        winner_model, winner_prediction, winner_confidence,
        class_probs   ← only D1, per-class breakdown from XLM-RoBERTa
                        ({} when XLM-RoBERTa is unavailable)
      },
      dataset2: { same structure, D2 labels mapped to readable strings },
      dataset3: { same structure },
      risk_flag: bool,   ← True if ≥3 D3 models say "suicide"
      suicide_votes: "N/M models flagged suicide risk" (M = D3 models run),
      winner_summary: { depression_type, depressed, suicide_risk }
    }

    Raises KeyError when the classical models have not been loaded.
    """
    clean = clean_text(raw_text)

    d1, xlmr1, d1_winner = _run_dataset(raw_text, clean, 'd1')   # depression type
    d2, _, d2_winner = _run_dataset(raw_text, clean, 'd2')       # binary depression
    d3, _, d3_winner = _run_dataset(raw_text, clean, 'd3')       # suicide risk

    # ── Suicide risk flag — vote across the D3 models ─────────────
    # A label counts as a vote when it mentions "suicide" without "non".
    suicide_count = sum(
        1 for r in d3.values()
        if 'suicide' in r['label'].lower() and 'non' not in r['label'].lower()
    )
    # The threshold stays at 3 votes. With only the 3 classical models
    # available this means the flag needs a unanimous vote.
    risk_flag = suicide_count >= 3

    dataset1 = _dataset_section('Depression Type (6 Classes)', d1, d1_winner)
    dataset1['class_probs'] = xlmr1.get('all_probs', {}) if xlmr1 else {}

    return {
        'dataset1': dataset1,
        'dataset2': _dataset_section('Depressed or Not?', d2, d2_winner),
        'dataset3': _dataset_section('Suicide Risk Detection', d3, d3_winner),
        'risk_flag':     risk_flag,
        # Report the number of models that actually voted instead of a
        # hard-coded 4, which was wrong whenever XLM-RoBERTa was missing.
        'suicide_votes': f'{suicide_count}/{len(d3)} models flagged suicide risk',
        'winner_summary': {
            'depression_type': _winner_line(d1_winner),
            'depressed':       _winner_line(d2_winner),
            'suicide_risk':    _winner_line(d3_winner),
        }
    }