feat(bbb): add Morgan fingerprint extraction with shape/dtype guarantees
Browse files
src/pipelines/bbb_pipeline.py
CHANGED
|
@@ -12,7 +12,10 @@ from __future__ import annotations
|
|
| 12 |
|
| 13 |
import math
|
| 14 |
|
|
|
|
| 15 |
from rdkit import Chem, RDLogger
|
|
|
|
|
|
|
| 16 |
|
| 17 |
from src.core.logger import get_logger
|
| 18 |
|
|
@@ -42,3 +45,34 @@ def is_valid_smiles(smiles: str | float | None) -> bool:
|
|
| 42 |
if not isinstance(smiles, str) or not smiles.strip():
|
| 43 |
return False
|
| 44 |
return Chem.MolFromSmiles(smiles) is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
import math
|
| 14 |
|
| 15 |
+
import numpy as np
|
| 16 |
from rdkit import Chem, RDLogger
|
| 17 |
+
from rdkit.Chem import AllChem
|
| 18 |
+
from rdkit.DataStructs import ConvertToNumpyArray
|
| 19 |
|
| 20 |
from src.core.logger import get_logger
|
| 21 |
|
|
|
|
| 45 |
if not isinstance(smiles, str) or not smiles.strip():
|
| 46 |
return False
|
| 47 |
return Chem.MolFromSmiles(smiles) is not None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def compute_morgan_fingerprint(
    smiles: str,
    n_bits: int = 2048,
    radius: int = 2,
) -> np.ndarray:
    """Compute the Morgan (ECFP-like) circular fingerprint for a SMILES.

    Args:
        smiles: The SMILES string to fingerprint. Non-string and blank
            values are rejected up front with the same guard that
            `is_valid_smiles` applies, so callers get a `ValueError`
            instead of a silent all-zero vector.
        n_bits: Length of the bit vector; must be positive. 2048 is the
            de-facto default for downstream scikit-learn classifiers.
        radius: Morgan radius (2 ≈ ECFP4); must be non-negative.

    Returns:
        A 1-D `np.ndarray` of length `n_bits` and dtype `uint8`, where
        each element is 0 or 1.

    Raises:
        ValueError: if `n_bits` is not positive, if `radius` is negative,
            or if `smiles` is not a non-blank string that RDKit can parse.
    """
    # Validate the numeric parameters first so a bad configuration is
    # reported clearly rather than as an opaque RDKit/NumPy error.
    if n_bits <= 0:
        raise ValueError(f"n_bits must be a positive integer, got {n_bits!r}")
    if radius < 0:
        raise ValueError(f"radius must be a non-negative integer, got {radius!r}")

    # Same guard as `is_valid_smiles`: a blank string must not reach RDKit,
    # which is expected to hand back an empty molecule (not None) for it,
    # and non-string input (None, NaN) must not leak a binding-level error.
    if not isinstance(smiles, str) or not smiles.strip():
        raise ValueError(f"invalid SMILES: {smiles!r}")

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"invalid SMILES: {smiles!r}")

    # NOTE(review): newer RDKit releases appear to deprecate this helper in
    # favour of rdFingerprintGenerator — confirm against the pinned version.
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)

    # ConvertToNumpyArray fills the destination in place; pre-allocating it
    # here is what pins the returned shape and dtype.
    arr = np.zeros((n_bits,), dtype=np.uint8)
    ConvertToNumpyArray(bit_vect, arr)
    return arr
|
tests/pipelines/test_bbb_pipeline.py
CHANGED
|
@@ -31,3 +31,29 @@ class TestIsValidSmiles:
|
|
| 31 |
def test_rejects_nan(self) -> None:
|
| 32 |
import math
|
| 33 |
assert is_valid_smiles(math.nan) is False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def test_rejects_nan(self) -> None:
|
| 32 |
import math
|
| 33 |
assert is_valid_smiles(math.nan) is False
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
import numpy as np
|
| 37 |
+
|
| 38 |
+
from src.pipelines.bbb_pipeline import compute_morgan_fingerprint
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class TestComputeMorganFingerprint:
    """Behavioural checks for `compute_morgan_fingerprint`."""

    def test_returns_numpy_array_of_correct_length(self) -> None:
        """The result is a 1-D uint8 ndarray with exactly `n_bits` entries."""
        fingerprint = compute_morgan_fingerprint("CCCO", n_bits=2048, radius=2)

        assert isinstance(fingerprint, np.ndarray)
        assert fingerprint.shape == (2048,)
        assert fingerprint.dtype == np.uint8

    def test_only_zero_or_one(self) -> None:
        """Every element of the fingerprint is either 0 or 1."""
        fingerprint = compute_morgan_fingerprint("c1ccccc1", n_bits=1024, radius=2)
        observed_values = set(np.unique(fingerprint).tolist())

        assert observed_values.issubset({0, 1})

    def test_different_molecules_yield_different_fingerprints(self) -> None:
        """Two structurally different SMILES do not produce the same bits."""
        first = compute_morgan_fingerprint("CCCO", n_bits=2048, radius=2)
        second = compute_morgan_fingerprint("c1ccccc1", n_bits=2048, radius=2)

        assert not np.array_equal(first, second)

    def test_invalid_smiles_raises_value_error(self) -> None:
        """An unparseable SMILES is reported as a ValueError."""
        with pytest.raises(ValueError, match="invalid SMILES"):
            compute_morgan_fingerprint("not_a_smiles", n_bits=2048, radius=2)
|