feat(bbb): add SMILES validity guard with RDKit + test fixture
Browse files
src/pipelines/bbb_pipeline.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""BBB (Blood-Brain Barrier) molecule pipeline.
|
| 2 |
+
|
| 3 |
+
Reads the Kaggle BBBP dataset (SMILES strings + binary penetration label),
|
| 4 |
+
filters chemically invalid SMILES, computes Morgan circular fingerprints with
|
| 5 |
+
RDKit, and writes a model-ready feature table to `data/processed/`.
|
| 6 |
+
|
| 7 |
+
This module follows the Data Readiness contract in AGENTS.md §4:
|
| 8 |
+
schema validity, domain validity (drop invalid SMILES), determinism,
|
| 9 |
+
traceability (row count in / out / dropped), and idempotent output.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import math
|
| 14 |
+
from typing import Any
|
| 15 |
+
|
| 16 |
+
from rdkit import Chem, RDLogger
|
| 17 |
+
|
| 18 |
+
from src.core.logger import get_logger
|
| 19 |
+
|
| 20 |
+
# Module-level logger for this pipeline, obtained from the project's
# `get_logger` helper and keyed by this module's import name.
logger = get_logger(__name__)
|
| 21 |
+
|
| 22 |
+
# Suppress RDKit's noisy C++-level warning stream; we surface our own
|
| 23 |
+
# structured warnings via the project logger when a SMILES fails to parse.
|
| 24 |
+
# Runs once at import time, i.e. before any function in this module can be
# called. The "rdApp.*" pattern is passed to RDKit unchanged.
RDLogger.DisableLog("rdApp.*")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def is_valid_smiles(smiles: Any) -> bool:
    """Check whether `smiles` is a usable SMILES string.

    Args:
        smiles: A raw cell value of any type, e.g. as read from a CSV column.

    Returns:
        True only when `smiles` is a `str` with at least one non-whitespace
        character and `Chem.MolFromSmiles` returns a molecule for it.
        None, NaN floats, any other non-string value, blank strings, and
        text RDKit cannot parse all yield False.
    """
    # Missing-value markers: None, or a float NaN.
    is_missing = smiles is None or (
        isinstance(smiles, float) and math.isnan(smiles)
    )
    if is_missing:
        return False

    # Any other non-string value is rejected without consulting RDKit.
    if not isinstance(smiles, str):
        return False

    # Empty or whitespace-only text is rejected before parsing.
    if smiles.strip() == "":
        return False

    # A None result from the parser means the text is not a valid SMILES.
    molecule = Chem.MolFromSmiles(smiles)
    return molecule is not None
|
tests/fixtures/bbbp_sample.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
num,name,p_np,smiles
|
| 2 |
+
1,Propanol,1,CCCO
|
| 3 |
+
2,Benzene,1,c1ccccc1
|
| 4 |
+
3,Aspirin,1,CC(=O)OC1=CC=CC=C1C(=O)O
|
| 5 |
+
4,InvalidMol,0,this_is_not_a_smiles
|
| 6 |
+
5,Caffeine,1,CN1C=NC2=C1C(=O)N(C(=O)N2C)C
|
| 7 |
+
6,EmptyMol,0,
|
tests/pipelines/test_bbb_pipeline.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit + integration tests for the BBB (SMILES → Morgan FP) pipeline."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from src.pipelines.bbb_pipeline import is_valid_smiles
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Sample BBBP CSV used by the tests, resolved relative to this file:
# two directories up from here, then `fixtures/bbbp_sample.csv`.
FIXTURE = Path(__file__).parent.parent / "fixtures" / "bbbp_sample.csv"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TestIsValidSmiles:
    """Checks `is_valid_smiles` against parseable and unusable inputs."""

    def test_accepts_simple_alcohol(self) -> None:
        """A short aliphatic alcohol SMILES is accepted."""
        verdict = is_valid_smiles("CCCO")
        assert verdict is True

    def test_accepts_aromatic_ring(self) -> None:
        """An aromatic ring written with lowercase atoms is accepted."""
        verdict = is_valid_smiles("c1ccccc1")
        assert verdict is True

    def test_rejects_garbage_string(self) -> None:
        """Text that is not SMILES is rejected."""
        verdict = is_valid_smiles("this_is_not_a_smiles")
        assert verdict is False

    def test_rejects_empty_string(self) -> None:
        """The empty string is rejected."""
        verdict = is_valid_smiles("")
        assert verdict is False

    def test_rejects_none(self) -> None:
        """None is rejected."""
        verdict = is_valid_smiles(None)  # type: ignore[arg-type]
        assert verdict is False

    def test_rejects_nan(self) -> None:
        """A float NaN is rejected."""
        import math

        verdict = is_valid_smiles(math.nan)  # type: ignore[arg-type]
        assert verdict is False
|