mekosotto committed on
Commit
c4c8d1e
·
1 Parent(s): ed5752e

feat(bbb): add SMILES validity guard with RDKit + test fixture

Browse files
src/pipelines/bbb_pipeline.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """BBB (Blood-Brain Barrier) molecule pipeline.
2
+
3
+ Reads the Kaggle BBBP dataset (SMILES strings + binary penetration label),
4
+ filters chemically invalid SMILES, computes Morgan circular fingerprints with
5
+ RDKit, and writes a model-ready feature table to `data/processed/`.
6
+
7
+ This module follows the Data Readiness contract in AGENTS.md §4:
8
+ schema validity, domain validity (drop invalid SMILES), determinism,
9
+ traceability (row count in / out / dropped), and idempotent output.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import math
14
+ from typing import Any
15
+
16
+ from rdkit import Chem, RDLogger
17
+
18
+ from src.core.logger import get_logger
19
+
20
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Suppress RDKit's noisy C++-level warning stream so unparseable SMILES do not
# flood the console; this runs once, as an import-time side effect.
# NOTE(review): the intent is to surface our own structured warnings through
# `logger` instead, but nothing visible in this module logs a failed parse
# yet -- presumably the caller of `is_valid_smiles` reports dropped rows;
# confirm before relying on it.
RDLogger.DisableLog("rdApp.*")
25
+
26
+
27
def is_valid_smiles(smiles: Any) -> bool:
    """Return True iff `smiles` is a non-blank string that RDKit can parse.

    Intended for raw CSV cells, so it accepts a value of any type and
    rejects:

    * ``None`` and NaN floats,
    * any other non-string value,
    * empty or whitespace-only strings,
    * text for which ``Chem.MolFromSmiles`` returns ``None``.

    Args:
        smiles: Candidate SMILES value of arbitrary type.

    Returns:
        True when the value is a non-blank ``str`` and RDKit yields a
        molecule object for it; False otherwise.
    """
    # None and NaN are not `str`, so this single guard rejects them together
    # with every other non-string value.
    if not isinstance(smiles, str):
        return False
    # Blank text is rejected up front instead of being handed to the parser.
    if not smiles.strip():
        return False
    # The text is parsed as given (not stripped); a None result means RDKit
    # could not build a molecule from it.
    return Chem.MolFromSmiles(smiles) is not None
tests/fixtures/bbbp_sample.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ num,name,p_np,smiles
2
+ 1,Propanol,1,CCCO
3
+ 2,Benzene,1,c1ccccc1
4
+ 3,Aspirin,1,CC(=O)OC1=CC=CC=C1C(=O)O
5
+ 4,InvalidMol,0,this_is_not_a_smiles
6
+ 5,Caffeine,1,CN1C=NC2=C1C(=O)N(C(=O)N2C)C
7
+ 6,EmptyMol,0,
tests/pipelines/test_bbb_pipeline.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit + integration tests for the BBB (SMILES → Morgan FP) pipeline."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ import pandas as pd
7
+ import pytest
8
+
9
+ from src.pipelines.bbb_pipeline import is_valid_smiles
10
+
11
+
12
# Sample BBBP CSV located at <tests>/fixtures/, i.e. one directory above the
# directory that contains this test module.
FIXTURE = Path(__file__).parent.parent / "fixtures" / "bbbp_sample.csv"
13
+
14
+
15
class TestIsValidSmiles:
    """Behavioural checks for `is_valid_smiles`.

    Covers well-formed SMILES that must be accepted and the kinds of bad
    input that must be rejected: unparseable text, blank strings, missing
    values and non-string objects.
    """

    def test_accepts_simple_alcohol(self) -> None:
        assert is_valid_smiles("CCCO") is True

    def test_accepts_aromatic_ring(self) -> None:
        assert is_valid_smiles("c1ccccc1") is True

    def test_rejects_garbage_string(self) -> None:
        assert is_valid_smiles("this_is_not_a_smiles") is False

    def test_rejects_empty_string(self) -> None:
        assert is_valid_smiles("") is False

    def test_rejects_whitespace_only_string(self) -> None:
        # Blank-but-non-empty text must be treated like an empty cell.
        assert is_valid_smiles("   ") is False

    def test_rejects_none(self) -> None:
        assert is_valid_smiles(None) is False

    def test_rejects_nan(self) -> None:
        import math

        assert is_valid_smiles(math.nan) is False

    def test_rejects_non_string_value(self) -> None:
        # Non-string, non-missing values must be rejected as well.
        assert is_valid_smiles(123) is False