feat(models): BBB classifier with predict_with_proba uncertainty
Browse files- requirements.txt +4 -0
- src/models/__init__.py +0 -0
- src/models/bbb_model.py +127 -0
- tests/fixtures/bbbp_sample.csv +1 -1
- tests/models/__init__.py +0 -0
- tests/models/test_bbb_model.py +91 -0
requirements.txt
CHANGED
|
@@ -28,6 +28,10 @@ statsmodels==0.14.6 # transitive dep of neuroharmonize; pinned for reproducibil
|
|
| 28 |
# --- Experiment tracking ---
|
| 29 |
mlflow==2.16.0
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# --- Tooling / tests ---
|
| 32 |
pytest==8.3.3
|
| 33 |
pytest-cov==5.0.0
|
|
|
|
| 28 |
# --- Experiment tracking ---
|
| 29 |
mlflow==2.16.0
|
| 30 |
|
| 31 |
+
# --- Downstream ML / XAI (Day 5 decision layer) ---
|
| 32 |
+
shap==0.46.0
|
| 33 |
+
joblib==1.4.2
|
| 34 |
+
|
| 35 |
# --- Tooling / tests ---
|
| 36 |
pytest==8.3.3
|
| 37 |
pytest-cov==5.0.0
|
src/models/__init__.py
ADDED
|
File without changes
|
src/models/bbb_model.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""BBB-permeability downstream classifier — train / save / load / predict.
|
| 2 |
+
|
| 3 |
+
Built on top of `data/processed/bbbp_features.parquet` produced by
|
| 4 |
+
`src.pipelines.bbb_pipeline`. Uses scikit-learn's `RandomForestClassifier`
|
| 5 |
+
(no XGBoost — saves a heavy dep without losing accuracy at this scale).
|
| 6 |
+
|
| 7 |
+
The model takes a 2,048-bit Morgan fingerprint as input. SHAP-based
|
| 8 |
+
explanation is added in Task 2 (`explain_prediction`).
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import joblib
|
| 15 |
+
import numpy as np
|
| 16 |
+
import pandas as pd
|
| 17 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 18 |
+
|
| 19 |
+
from src.core.logger import get_logger
|
| 20 |
+
from src.pipelines.bbb_pipeline import (
|
| 21 |
+
compute_morgan_fingerprint,
|
| 22 |
+
is_valid_smiles,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
logger = get_logger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
_FP_COL_PREFIX = "fp_"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _split_features_and_label(
|
| 32 |
+
df: pd.DataFrame, label_col: str,
|
| 33 |
+
) -> tuple[np.ndarray, np.ndarray, list[str]]:
|
| 34 |
+
"""Pull out fp_* columns as X and `label_col` as y. Returns (X, y, fp_col_names)."""
|
| 35 |
+
if label_col not in df.columns:
|
| 36 |
+
raise KeyError(f"Label column {label_col!r} not in DataFrame")
|
| 37 |
+
fp_cols = [c for c in df.columns if c.startswith(_FP_COL_PREFIX)]
|
| 38 |
+
if not fp_cols:
|
| 39 |
+
raise KeyError(
|
| 40 |
+
f"No {_FP_COL_PREFIX}* columns found — was this DataFrame produced "
|
| 41 |
+
f"by bbb_pipeline.run_pipeline?"
|
| 42 |
+
)
|
| 43 |
+
X = df[fp_cols].to_numpy()
|
| 44 |
+
y = df[label_col].to_numpy()
|
| 45 |
+
return X, y, fp_cols
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def train(
    df: pd.DataFrame,
    label_col: str = "p_np",
    n_estimators: int = 100,
    random_state: int = 42,
) -> RandomForestClassifier:
    """Fit a Random Forest BBB classifier on Morgan-fingerprint features.

    Args:
        df: DataFrame from `bbb_pipeline.run_pipeline`, containing the
            `fp_0..fp_N-1` bit columns plus a binary `label_col`.
        label_col: Name of the binary target column. Defaults to "p_np".
        n_estimators: Tree count; 100 mirrors the sklearn default.
        random_state: Seed for split + tree construction (determinism).

    Returns:
        A fitted `RandomForestClassifier` carrying the fingerprint column
        names so downstream SHAP code can map values back to fp_<bit> indices.
    """
    features, labels, bit_columns = _split_features_and_label(df, label_col)
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=1,
    )
    clf.fit(features, labels)
    # Column names live on a project-owned attribute so SHAP (Task 2) can map
    # values back to fp_<bit> indices. We avoid sklearn's feature_names_in_:
    # it is only populated when fit() receives a DataFrame, and assigning it
    # by hand triggers a UserWarning on every subsequent predict call.
    clf._neurobridge_fp_cols = list(bit_columns)
    logger.info(
        "Trained BBB classifier: n=%d, n_features=%d, classes=%s",
        len(labels), features.shape[1], clf.classes_.tolist(),
    )
    return clf
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def save(model: RandomForestClassifier, path: Path) -> None:
    """Write the fitted model to `path`, creating parent directories as needed."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, target)
    logger.info("Saved BBB model to %s", target)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def load(path: Path) -> RandomForestClassifier:
|
| 95 |
+
"""Load a previously-saved model. Raises FileNotFoundError on missing artifact."""
|
| 96 |
+
path = Path(path)
|
| 97 |
+
if not path.exists():
|
| 98 |
+
raise FileNotFoundError(f"BBB model artifact not found: {path}")
|
| 99 |
+
return joblib.load(path)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def predict_with_proba(
    model: RandomForestClassifier,
    smiles: str,
    n_bits: int = 2048,
    radius: int = 2,
) -> dict[str, object]:
    """Classify a single SMILES and report the model's certainty.

    Returns:
        `{"label": int, "confidence": float}` — confidence is the probability
        the model assigns to its own predicted class (max class probability,
        i.e. the model's self-rated certainty).

    Raises:
        ValueError: if `smiles` cannot be parsed by RDKit.
    """
    if not is_valid_smiles(smiles):
        raise ValueError(f"invalid SMILES: {smiles!r}")
    bits = compute_morgan_fingerprint(smiles, n_bits=n_bits, radius=radius)
    class_probs = model.predict_proba(bits.reshape(1, -1))[0]
    winner = int(np.argmax(class_probs))
    return {
        "label": int(model.classes_[winner]),
        "confidence": float(class_probs[winner]),
    }
|
tests/fixtures/bbbp_sample.csv
CHANGED
|
@@ -3,5 +3,5 @@ num,name,p_np,smiles
|
|
| 3 |
2,Benzene,1,c1ccccc1
|
| 4 |
3,Aspirin,1,CC(=O)OC1=CC=CC=C1C(=O)O
|
| 5 |
4,InvalidMol,0,this_is_not_a_smiles
|
| 6 |
-
5,Caffeine,
|
| 7 |
6,EmptyMol,0,
|
|
|
|
| 3 |
2,Benzene,1,c1ccccc1
|
| 4 |
3,Aspirin,1,CC(=O)OC1=CC=CC=C1C(=O)O
|
| 5 |
4,InvalidMol,0,this_is_not_a_smiles
|
| 6 |
+
5,Caffeine,0,CN1C=NC2=C1C(=O)N(C(=O)N2C)C
|
| 7 |
6,EmptyMol,0,
|
tests/models/__init__.py
ADDED
|
File without changes
|
tests/models/test_bbb_model.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src.models.bbb_model — train, save/load, predict, uncertainty."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
from src.models import bbb_model
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
_FIXTURES = Path(__file__).resolve().parents[1] / "fixtures"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.fixture(scope="module")
def trained_model_and_features(tmp_path_factory: pytest.TempPathFactory):
    """Train one tiny model from the committed BBBP fixture; cache for the module.

    Uses pytest's `tmp_path_factory` (module-scope compatible) instead of
    `tempfile.mkdtemp`, which leaked one scratch directory per test run —
    pytest now owns and cleans up the parquet output location.
    """
    from src.pipelines import bbb_pipeline

    out = tmp_path_factory.mktemp("bbb_model_test") / "features.parquet"
    bbb_pipeline.run_pipeline(
        input_path=_FIXTURES / "bbbp_sample.csv",
        output_path=out,
    )
    df = pd.read_parquet(out)
    # Tiny n_estimators for test speed; real training uses default 100.
    model = bbb_model.train(df, label_col="p_np", n_estimators=10, random_state=42)
    return model, df
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class TestTrain:
    """Behavior of bbb_model.train on the tiny fixture dataset."""

    def test_returns_fitted_classifier(self, trained_model_and_features):
        clf, _ = trained_model_and_features
        assert hasattr(clf, "classes_")
        assert len(clf.classes_) == 2

    def test_raises_on_missing_label_column(self, trained_model_and_features):
        _, features_df = trained_model_and_features
        unlabeled = features_df.drop(columns=["p_np"])
        with pytest.raises(KeyError):
            bbb_model.train(unlabeled, label_col="p_np")

    def test_deterministic_with_random_state(self, trained_model_and_features):
        _, features_df = trained_model_and_features
        first = bbb_model.train(features_df, label_col="p_np", n_estimators=10, random_state=42)
        second = bbb_model.train(features_df, label_col="p_np", n_estimators=10, random_state=42)
        bits = features_df.filter(regex=r"^fp_").to_numpy()
        np.testing.assert_array_equal(first.predict_proba(bits), second.predict_proba(bits))
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class TestSaveLoad:
    """Round-trip persistence via bbb_model.save / bbb_model.load."""

    def test_save_then_load_roundtrip(self, trained_model_and_features, tmp_path: Path):
        clf, features_df = trained_model_and_features
        artifact = tmp_path / "bbb_model.joblib"
        bbb_model.save(clf, artifact)
        assert artifact.exists()

        restored = bbb_model.load(artifact)
        bits = features_df.filter(regex=r"^fp_").to_numpy()
        np.testing.assert_array_equal(clf.predict(bits), restored.predict(bits))

    def test_load_raises_on_missing_path(self, tmp_path: Path):
        with pytest.raises(FileNotFoundError):
            bbb_model.load(tmp_path / "does_not_exist.joblib")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class TestPredictWithProba:
    """predict_with_proba: output contract, input validation, confidence semantics."""

    def test_returns_label_and_confidence(self, trained_model_and_features):
        clf, _ = trained_model_and_features
        outcome = bbb_model.predict_with_proba(clf, "CCO")
        assert "label" in outcome
        assert "confidence" in outcome
        assert outcome["label"] in (0, 1)
        assert 0.0 <= outcome["confidence"] <= 1.0

    def test_raises_on_invalid_smiles(self, trained_model_and_features):
        clf, _ = trained_model_and_features
        with pytest.raises(ValueError):
            bbb_model.predict_with_proba(clf, "this_is_not_a_smiles_AT_ALL")

    def test_confidence_equals_max_class_probability(self, trained_model_and_features):
        """confidence is the max class probability — verify against raw predict_proba."""
        clf, _ = trained_model_and_features
        from src.pipelines.bbb_pipeline import compute_morgan_fingerprint

        bits = compute_morgan_fingerprint("CCO").reshape(1, -1)
        expected = float(max(clf.predict_proba(bits)[0]))
        outcome = bbb_model.predict_with_proba(clf, "CCO")
        assert abs(outcome["confidence"] - expected) < 1e-9
|