mekosotto Claude Sonnet 4.6 committed on
Commit
b1bd8db
·
1 Parent(s): 0236e73

feat(bbb): add Morgan fingerprint extraction with shape/dtype guarantees

Browse files
src/pipelines/bbb_pipeline.py CHANGED
@@ -12,7 +12,10 @@ from __future__ import annotations
12
 
13
  import math
14
 
 
15
  from rdkit import Chem, RDLogger
 
 
16
 
17
  from src.core.logger import get_logger
18
 
@@ -42,3 +45,34 @@ def is_valid_smiles(smiles: str | float | None) -> bool:
42
  if not isinstance(smiles, str) or not smiles.strip():
43
  return False
44
  return Chem.MolFromSmiles(smiles) is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  import math
14
 
15
+ import numpy as np
16
  from rdkit import Chem, RDLogger
17
+ from rdkit.Chem import AllChem
18
+ from rdkit.DataStructs import ConvertToNumpyArray
19
 
20
  from src.core.logger import get_logger
21
 
 
45
  if not isinstance(smiles, str) or not smiles.strip():
46
  return False
47
  return Chem.MolFromSmiles(smiles) is not None
48
+
49
+
50
def compute_morgan_fingerprint(
    smiles: str,
    n_bits: int = 2048,
    radius: int = 2,
) -> np.ndarray:
    """Compute the Morgan (ECFP-like) circular fingerprint for a SMILES.

    Args:
        smiles: A SMILES string. Non-string values and blank strings are
            rejected up front using the same rule as `is_valid_smiles`,
            so a blank input can never yield a silent all-zero vector.
        n_bits: Length of the bit vector; must be at least 1. 2048 is the
            de-facto default for downstream scikit-learn classifiers.
        radius: Morgan radius (2 ≈ ECFP4); must be non-negative.

    Returns:
        A 1-D `np.ndarray` of length `n_bits` and dtype `uint8`, where
        each element is 0 or 1.

    Raises:
        ValueError: if `smiles` is not a non-blank string, if it cannot be
            parsed by RDKit, if `n_bits` is smaller than 1, or if `radius`
            is negative.
    """
    # Same guard as `is_valid_smiles`. RDKit turns "" into an empty molecule
    # rather than returning None, so the `mol is None` check below would not
    # catch a blank string on its own.
    if not isinstance(smiles, str) or not smiles.strip():
        raise ValueError(f"invalid SMILES: {smiles!r}")

    # Fail with a clear message before handing out-of-range sizes to RDKit.
    if n_bits < 1:
        raise ValueError(f"n_bits must be a positive integer, got {n_bits!r}")
    if radius < 0:
        raise ValueError(f"radius must be non-negative, got {radius!r}")

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"invalid SMILES: {smiles!r}")

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    # ConvertToNumpyArray fills `arr` in place, so preallocate it with the
    # dtype the function promises to return.
    arr = np.zeros((n_bits,), dtype=np.uint8)
    ConvertToNumpyArray(bit_vect, arr)
    return arr
tests/pipelines/test_bbb_pipeline.py CHANGED
@@ -31,3 +31,29 @@ class TestIsValidSmiles:
31
  def test_rejects_nan(self) -> None:
32
  import math
33
  assert is_valid_smiles(math.nan) is False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def test_rejects_nan(self) -> None:
32
  import math
33
  assert is_valid_smiles(math.nan) is False
34
+
35
+
36
+ import numpy as np
37
+
38
+ from src.pipelines.bbb_pipeline import compute_morgan_fingerprint
39
+
40
+
41
class TestComputeMorganFingerprint:
    """Checks the shape, dtype, content and error contract of the fingerprint helper."""

    def test_returns_numpy_array_of_correct_length(self) -> None:
        """The result is a 1-D uint8 NumPy array with exactly `n_bits` entries."""
        fingerprint = compute_morgan_fingerprint("CCCO", n_bits=2048, radius=2)
        assert isinstance(fingerprint, np.ndarray)
        assert fingerprint.shape == (2048,)
        assert fingerprint.dtype == np.uint8

    def test_only_zero_or_one(self) -> None:
        """Every element of the vector is a bit value, never a count."""
        fingerprint = compute_morgan_fingerprint("c1ccccc1", n_bits=1024, radius=2)
        distinct_values = set(np.unique(fingerprint).tolist())
        assert distinct_values <= {0, 1}

    def test_different_molecules_yield_different_fingerprints(self) -> None:
        """Two different molecules (CCCO and benzene) must not produce equal vectors."""
        propanol_bits = compute_morgan_fingerprint("CCCO", n_bits=2048, radius=2)
        benzene_bits = compute_morgan_fingerprint("c1ccccc1", n_bits=2048, radius=2)
        assert not np.array_equal(propanol_bits, benzene_bits)

    def test_invalid_smiles_raises_value_error(self) -> None:
        """Unparseable input is reported as a ValueError mentioning the SMILES."""
        with pytest.raises(ValueError, match="invalid SMILES"):
            compute_morgan_fingerprint("not_a_smiles", n_bits=2048, radius=2)