Spaces:

Esvanth
/

mindscan

Running

App Files Files Community

Esvanth committed on Apr 20

Commit

202132a

verified ·

1 Parent(s): 78264fe

Add predict.py

Browse files

Files changed (1) hide show

predict.py +321 -0

predict.py ADDED Viewed

	@@ -0,0 +1,321 @@

+"""
+MindScan — Prediction Logic
+NCI H9DAI Research Project 2026
+All model loading and prediction functions.
+Imported by app.py — do not run directly.
+Datasets:
+  D1 — Zenodo (Nusrat 2024) — 6-class depression type
+  D2 — Kaggle (albertobellardini) — binary depression (labels: '0'/'1')
+  D3 — Kaggle (nikhileswarkomati) — binary suicide risk
+Models per dataset:
+  Logistic Regression, SVM, XGBoost, XLM-RoBERTa
+  (Random Forest excluded — 646 MB, worst performer on D1/D3)
+"""
+import os, re, string, joblib
+import numpy as np
# ─────────────────────────────────────────────────────────────────
# PATHS
# All locations are resolved relative to this file, not the current
# working directory.
# ─────────────────────────────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
_MODELS_ROOT = os.path.join(BASE_DIR, 'models')
CLASSICAL_DIR = os.path.join(_MODELS_ROOT, 'classical')
TRANSFORMER_DIR = os.path.join(_MODELS_ROOT, 'transformers')

# Model repo holding the XLM-RoBERTa weights. On HF Spaces only the app and
# the classical models are pushed; the heavy transformer weights live in this
# separate repo (to stay within Space LFS limits) and are fetched from it
# when they are not present locally.
HF_XLMR_REPO = "Esvanth/mindscan-xlmr"

# ─────────────────────────────────────────────────────────────────
# D2 LABEL MAPPING
# The dataset labels its classes '0' and '1'. Both the string and the
# integer spelling are mapped to the human-readable text shown in the UI.
# ─────────────────────────────────────────────────────────────────
_D2_READABLE = ('Not Depressed', 'Depressed')
D2_LABEL_MAP = {str(code): name for code, name in enumerate(_D2_READABLE)}
D2_LABEL_MAP.update(enumerate(_D2_READABLE))

# ─────────────────────────────────────────────────────────────────
# MODEL STORAGE — populated by load_all_models()
# _models : registry of every loaded artefact, keyed by name
# _loaded : flipped to True once load_all_models() has finished
# ─────────────────────────────────────────────────────────────────
_models = dict()
_loaded = False
def models_loaded():
    """Return the module-level ``_loaded`` flag.

    The flag starts as False and is set to True at the end of
    load_all_models().
    """
    ready = _loaded
    return ready
def load_all_models():
    """
    Loads all 12 models (4 per dataset × 3 datasets) into the module-level
    ``_models`` registry.
    Called once at server startup. Takes ~30s on CPU due to XLM-RoBERTa.

    Stages:
      1. Label encoder + TF-IDF vectoriser per dataset   (CLASSICAL_DIR)
      2. Logistic Regression, SVM, XGBoost per dataset   (CLASSICAL_DIR)
      3. XLM-RoBERTa per dataset + one shared tokeniser  (TRANSFORMER_DIR),
         downloading the weights from HF_XLMR_REPO first when the local
         ``xlmr_d1_final`` folder is missing.

    Stage 3 is best-effort: any exception is printed and swallowed so the
    classical models keep working. Errors in stages 1–2 propagate to the
    caller.

    Calling this again after a completed load is a no-op, so the models are
    never loaded twice.
    """
    global _loaded
    if _loaded:
        return

    datasets = ('d1', 'd2', 'd3')

    # NOTE: joblib.load unpickles its input and can execute arbitrary code —
    # only ever point CLASSICAL_DIR at artefacts produced by this project.

    # ── Classical support files ───────────────────────────────────
    for ds in datasets:
        _models[f'le_{ds}']    = joblib.load(os.path.join(CLASSICAL_DIR, f'le_{ds}.pkl'))
        _models[f'tfidf_{ds}'] = joblib.load(os.path.join(CLASSICAL_DIR, f'tfidf_{ds}.pkl'))
        print(f"  ✓ Loaded encoders/tfidf for {ds}")

    # ── Classical models ──────────────────────────────────────────
    for model_name in ('logistic_regression', 'svm', 'xgboost'):
        for ds in datasets:
            key = f'{model_name}_{ds}'
            _models[key] = joblib.load(os.path.join(CLASSICAL_DIR, f'{key}.pkl'))
            print(f"  ✓ Loaded {key}")

    # ── XLM-RoBERTa transformers ──────────────────────────────────
    transformers_ready = False
    try:
        # Imported lazily so the module still imports (and the classical
        # models still load) on machines without torch/transformers.
        import torch
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        _models['device'] = device
        print(f"  ✓ Using device: {device}")

        # The D1 folder doubles as the "are the weights here?" marker and as
        # the source of the shared tokeniser.
        d1_local = os.path.join(TRANSFORMER_DIR, 'xlmr_d1_final')

        # On HF Spaces the weights aren't bundled with the app — fetch them
        # from the model repo into TRANSFORMER_DIR on first startup.
        if not os.path.isdir(d1_local):
            from huggingface_hub import snapshot_download
            print(f"  ↓ Downloading transformers from {HF_XLMR_REPO} ...")
            snapshot_download(
                repo_id=HF_XLMR_REPO,
                repo_type="model",
                local_dir=TRANSFORMER_DIR,
                local_dir_use_symlinks=False,
            )
            print("  ✓ Transformers downloaded")

        # Shared tokenizer (all 3 models use the same base tokeniser)
        _models['tokenizer'] = AutoTokenizer.from_pretrained(d1_local)
        print("  ✓ Tokeniser loaded")

        # Each dataset's model is stored with the max sequence length used
        # when tokenising its inputs at prediction time.
        for ds, max_len in [('d1', 128), ('d2', 128), ('d3', 256)]:
            folder = os.path.join(TRANSFORMER_DIR, f'xlmr_{ds}_final')
            model  = AutoModelForSequenceClassification.from_pretrained(folder)
            model  = model.to(device)
            model.eval()
            _models[f'xlmr_{ds}']     = model
            _models[f'xlmr_{ds}_len'] = max_len
            print(f"  ✓ Loaded XLM-RoBERTa {ds} (max_length={max_len})")

        transformers_ready = True
    except Exception as e:
        # Deliberate best-effort: the app degrades to classical-only.
        print(f"  ⚠ XLM-RoBERTa failed to load: {e}")
        print("    Classical models will still work.")

    _loaded = True
    if transformers_ready:
        print("  ✅ All models ready")
    else:
        # Don't claim "all models" when the transformer stage was skipped.
        print("  ✅ Classical models ready (XLM-RoBERTa not fully loaded)")
+# ─────────────────────────────────────────────────────────────────
+# TEXT CLEANING — same function used in both notebooks
+# ─────────────────────────────────────────────────────────────────
def clean_text(text):
    """Normalise raw text before TF-IDF vectorisation.

    Steps, in order:
      1. cast to ``str`` and lower-case
      2. drop URLs (anything starting with http / www up to the next space)
      3. drop @mentions
      4. drop the '#' marker (the hashtag word itself is kept)
      5. strip every character in ``string.punctuation`` (ASCII only)
      6. collapse whitespace runs to one space and trim both ends

    Intended to mirror the cleaning used in the notebooks, so neither the
    steps nor their order should be changed casually.
    """
    cleaned = str(text).lower()

    # Every pattern here is simply deleted; order matters because mentions
    # must go before '@' is removed as punctuation.
    for pattern in (r'http\S+|www\S+|https\S+', r'@\w+', r'#'):
        cleaned = re.sub(pattern, '', cleaned)

    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    return re.sub(r'\s+', ' ', cleaned).strip()
+# ─────────────────────────────────────────────────────────────────
+# PREDICTION HELPERS
+# ─────────────────────────────────────────────────────────────────
def predict_classical(text_clean, ds):
    """
    Score one already-cleaned text with the three classical models of a dataset.

    text_clean : output of clean_text()
    ds         : dataset key ('d1', 'd2' or 'd3') selecting the TF-IDF
                 vectoriser, label encoder and models from ``_models``

    Returns { display_name: {'label': str, 'confidence': float} } with the
    display names 'Logistic Regression', 'SVM' and 'XGBoost'.

    The value returned by each model's predict() is used as an index into
    the label encoder's ``classes_``. For 'd2' the raw label is translated
    through D2_LABEL_MAP.
    """
    vectorizer = _models[f'tfidf_{ds}']
    encoder    = _models[f'le_{ds}']
    features   = vectorizer.transform([text_clean])

    def _confidence(clf, class_idx):
        # Prefer real probabilities when the estimator exposes them.
        if hasattr(clf, 'predict_proba'):
            return float(clf.predict_proba(features)[0][class_idx])
        # Otherwise squash the decision scores with a softmax.
        if hasattr(clf, 'decision_function'):
            margins = clf.decision_function(features)[0]
            if np.ndim(margins) == 0:
                # Binary case yields a single score: expand it to a
                # two-entry vector (negative class, positive class).
                margins = np.array([float(-margins), float(margins)])
            weights = np.exp(margins - margins.max())  # shift for stability
            return float(weights[class_idx] / weights.sum())
        # No score available at all.
        return 1.0

    model_table = (
        ('logistic_regression', 'Logistic Regression'),
        ('svm',                 'SVM'),
        ('xgboost',             'XGBoost'),
    )

    outcome = {}
    for slug, title in model_table:
        clf       = _models[f'{slug}_{ds}']
        class_idx = clf.predict(features)[0]
        raw       = encoder.classes_[class_idx]
        # Map D2 numeric labels to readable strings
        readable  = D2_LABEL_MAP.get(raw, str(raw)) if ds == 'd2' else str(raw)
        outcome[title] = {
            'label':      readable,
            'confidence': round(_confidence(clf, class_idx), 4),
        }
    return outcome
def predict_transformer(text_raw, ds):
    """
    Score one raw (uncleaned) text with the XLM-RoBERTa model of a dataset.

    Returns None when no transformer is registered for ``ds`` in ``_models``.
    Otherwise returns:
      {
        'label':      predicted class name,
        'confidence': softmax probability of that class,
        'all_probs':  { class_name: probability } for every class,
      }
    ``all_probs`` feeds the class breakdown bars in the UI. For 'd2' the
    class names are translated through D2_LABEL_MAP.
    """
    model_key = f'xlmr_{ds}'
    if model_key not in _models:
        return None

    import torch

    net       = _models[model_key]
    tokenizer = _models['tokenizer']
    encoder   = _models[f'le_{ds}']
    seq_len   = _models[f'{model_key}_len']
    device    = _models.get('device', 'cpu')

    def _readable(raw):
        # Only D2 needs its '0'/'1' labels turned into display text.
        return D2_LABEL_MAP.get(raw, str(raw)) if ds == 'd2' else str(raw)

    # Truncate/pad to the per-dataset length stored alongside the model.
    batch = tokenizer(
        text_raw,
        return_tensors='pt',
        max_length=seq_len,
        truncation=True,
        padding='max_length',
    ).to(device)

    with torch.no_grad():
        logits = net(**batch).logits

    # Single input, so take row 0 of the (1, n_classes) probability matrix.
    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    best  = int(probs.argmax())

    return {
        'label':      _readable(encoder.classes_[best]),
        'confidence': round(float(probs[best]), 4),
        'all_probs':  {
            _readable(encoder.classes_[i]): round(float(p), 4)
            for i, p in enumerate(probs)
        },
    }
+# ─────────────────────────────────────────────────────────────────
+# MAIN FUNCTION — called by Flask /predict endpoint
+# ─────────────────────────────────────────────────────────────────
def _predict_dataset(clean, raw_text, ds):
    """Run the classical models plus XLM-RoBERTa (when loaded) for one dataset."""
    models = predict_classical(clean, ds)
    xlmr = predict_transformer(raw_text, ds)
    if xlmr:
        models['XLM-RoBERTa'] = {k: xlmr[k] for k in ('label', 'confidence')}
    # Winner = the model reporting the highest confidence (first one on ties).
    winner = max(models.items(), key=lambda item: item[1]['confidence'])
    return models, winner, xlmr


def _dataset_section(task, models, winner):
    """Build the per-dataset part of the response from a (name, result) winner."""
    winner_name, winner_result = winner
    return {
        'task':               task,
        'models':             models,
        'winner_model':       winner_name,
        'winner_prediction':  winner_result['label'],
        'winner_confidence':  winner_result['confidence'],
    }


def _winner_line(winner):
    """Format a winner as 'label (NN.N% — model name)' for the summary."""
    winner_name, winner_result = winner
    return f"{winner_result['label']} ({winner_result['confidence']*100:.1f}% — {winner_name})"


def predict_all(raw_text):
    """
    Runs text through every loaded model across the 3 datasets
    (12 models when XLM-RoBERTa is available, 9 otherwise).

    raw_text : the user's text; None is treated as an empty string.

    Returns dict:
    {
      dataset1: {
        task, models: {LR, SVM, XGBoost, XLM-RoBERTa},
        winner_model, winner_prediction, winner_confidence,
        class_probs   ← only D1, 6-class breakdown from XLM-RoBERTa
                        (empty dict when XLM-RoBERTa is not loaded)
      },
      dataset2: { same structure, D2 labels mapped to readable strings },
      dataset3: { same structure },
      risk_flag: bool,   ← True if a strict majority of the D3 models that
                           ran say "suicide" (≥3 of 4; ≥2 of 3 when
                           XLM-RoBERTa is unavailable)
      suicide_votes: "N/M models flagged suicide risk"  (M = models that ran),
      winner_summary: { depression_type, depressed, suicide_risk }
    }
    """
    # None would otherwise be cleaned to the literal word "none" and break
    # the tokenizer; treat it as empty input instead.
    raw_text = '' if raw_text is None else str(raw_text)
    clean = clean_text(raw_text)

    # Classical models get the cleaned text, the transformer the raw text.
    # ── Dataset 1: Depression type ────────────────────────────────
    d1, d1_winner, xlmr1 = _predict_dataset(clean, raw_text, 'd1')
    # ── Dataset 2: Binary depression ─────────────────────────────
    d2, d2_winner, _ = _predict_dataset(clean, raw_text, 'd2')
    # ── Dataset 3: Suicide risk ───────────────────────────────────
    d3, d3_winner, _ = _predict_dataset(clean, raw_text, 'd3')

    # ── Suicide risk flag — majority vote across the D3 models ────
    # A label counts as a vote when it mentions "suicide" but not "non"
    # (so a "non-suicide" style label is not a vote).
    suicide_count = sum(
        1 for r in d3.values()
        if 'suicide' in r['label'].lower() and 'non' not in r['label'].lower()
    )
    # Use the number of models that actually voted: with XLM-RoBERTa
    # missing there are only 3, and a fixed "3 of 4" rule would silently
    # demand unanimity and report a misleading "/4".
    voters = len(d3)
    risk_flag = suicide_count >= voters // 2 + 1

    dataset1 = _dataset_section('Depression Type (6 Classes)', d1, d1_winner)
    dataset1['class_probs'] = xlmr1.get('all_probs', {}) if xlmr1 else {}

    return {
        'dataset1':      dataset1,
        'dataset2':      _dataset_section('Depressed or Not?', d2, d2_winner),
        'dataset3':      _dataset_section('Suicide Risk Detection', d3, d3_winner),
        'risk_flag':     risk_flag,
        'suicide_votes': f'{suicide_count}/{voters} models flagged suicide risk',
        'winner_summary': {
            'depression_type': _winner_line(d1_winner),
            'depressed':       _winner_line(d2_winner),
            'suicide_risk':    _winner_line(d3_winner),
        }
    }