Spaces:

mekosotto
/

hackathon

Running

App Files Files Community

mekosotto Claude Sonnet 4.6 committed 8 days ago

Commit

48cf9c9

1 Parent(s): 049a352

feat(bbb): add run_pipeline orchestrator + CLI entrypoint with idempotent writes

Browse files

Files changed (2) hide show

src/pipelines/bbb_pipeline.py +58 -0
tests/pipelines/test_bbb_pipeline.py +50 -0

src/pipelines/bbb_pipeline.py CHANGED Viewed

@@ -11,6 +11,7 @@ traceability (row count in / out / dropped), and idempotent output.
 from __future__ import annotations
 import math
 import numpy as np
 import pandas as pd
@@ -177,3 +178,60 @@ def extract_features_from_dataframe(
         n_total, len(out), n_invalid, 100.0 * n_invalid / max(n_total, 1),
     )
     return out

 from __future__ import annotations
 import math
+from pathlib import Path
 import numpy as np
 import pandas as pd
         n_total, len(out), n_invalid, 100.0 * n_invalid / max(n_total, 1),
     )
     return out
+# Default locations used when run_pipeline() is called without explicit paths.
+DEFAULT_INPUT = Path("data/raw/bbbp.csv")
+DEFAULT_OUTPUT = Path("data/processed/bbbp_features.csv")
+
+
+def run_pipeline(
+    input_path: Path = DEFAULT_INPUT,
+    output_path: Path = DEFAULT_OUTPUT,
+    smiles_col: str = "smiles",
+    n_bits: int = 2048,
+    radius: int = 2,
+) -> None:
+    """Run the BBB pipeline end-to-end: raw CSV → processed feature CSV.
+
+    Reads the Kaggle BBBP CSV at `input_path`, validates and converts
+    SMILES into Morgan fingerprints, and writes the model-ready table
+    to `output_path`. Output is overwritten on every run (idempotent).
+    The CSV is first written to a temporary sibling file and then renamed
+    over `output_path`, so a run that fails mid-write never leaves a
+    truncated file where the previous good output used to be.
+
+    Args:
+        input_path: Path to the raw BBBP CSV (must include `smiles_col`).
+        output_path: Where to write the processed feature CSV. Parent
+            directory is created if missing.
+        smiles_col: SMILES column name in the raw CSV.
+        n_bits: Morgan fingerprint length.
+        radius: Morgan radius.
+
+    Raises:
+        FileNotFoundError: if `input_path` does not exist.
+        KeyError: if `smiles_col` is missing from the CSV.
+    """
+    # Accept plain strings as well as Path objects.
+    input_path = Path(input_path)
+    output_path = Path(output_path)
+
+    if not input_path.exists():
+        raise FileNotFoundError(f"Raw BBBP file not found: {input_path}")
+
+    logger.info("Reading raw BBBP from %s", input_path)
+    df = pd.read_csv(input_path)
+    logger.info("Loaded %d rows, columns=%s", len(df), list(df.columns))
+
+    # Fail fast, before any feature extraction, so the documented KeyError
+    # does not depend on how the downstream helper reports a missing column.
+    if smiles_col not in df.columns:
+        raise KeyError(
+            f"Column {smiles_col!r} not found in {input_path}; "
+            f"available columns: {list(df.columns)}"
+        )
+
+    features = extract_features_from_dataframe(
+        df, smiles_col=smiles_col, n_bits=n_bits, radius=radius,
+    )
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Write-then-rename: the temp file lives next to the target so the rename
+    # stays on one filesystem and replaces the old output in a single step.
+    tmp_output = output_path.with_name(output_path.name + ".tmp")
+    try:
+        features.to_csv(tmp_output, index=False)
+        tmp_output.replace(output_path)
+    finally:
+        # Only still present if the write or the rename failed.
+        if tmp_output.exists():
+            tmp_output.unlink()
+
+    logger.info(
+        "Wrote processed features to %s (rows=%d, cols=%d)",
+        output_path, len(features), features.shape[1],
+    )
+if __name__ == "__main__":
+    # CLI entrypoint. With no flags this behaves exactly like run_pipeline():
+    #   python -m src.pipelines.bbb_pipeline
+    # Every run_pipeline parameter can be overridden, for example:
+    #   python -m src.pipelines.bbb_pipeline --input other.csv --n-bits 1024
+    import argparse  # local import: only needed when executed as a script
+
+    parser = argparse.ArgumentParser(
+        description="Convert the raw BBBP CSV into a processed feature CSV.",
+    )
+    parser.add_argument(
+        "--input", type=Path, default=DEFAULT_INPUT,
+        help="path to the raw BBBP CSV (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--output", type=Path, default=DEFAULT_OUTPUT,
+        help="where to write the processed feature CSV (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--smiles-col", default="smiles",
+        help="name of the SMILES column in the raw CSV (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--n-bits", type=int, default=2048,
+        help="Morgan fingerprint length (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--radius", type=int, default=2,
+        help="Morgan radius (default: %(default)s)",
+    )
+    args = parser.parse_args()
+
+    run_pipeline(
+        input_path=args.input,
+        output_path=args.output,
+        smiles_col=args.smiles_col,
+        n_bits=args.n_bits,
+        radius=args.radius,
+    )

tests/pipelines/test_bbb_pipeline.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
 from pathlib import Path
 import numpy as np
 import pandas as pd
@@ -11,6 +12,7 @@ from src.pipelines.bbb_pipeline import (
     compute_morgan_fingerprint,
     extract_features_from_dataframe,
     is_valid_smiles,
 )
@@ -136,3 +138,51 @@ class TestExtractFeaturesFromDataFrame:
         output = buf.getvalue()
         assert "Dropping 2/6 rows with invalid SMILES" in output
         assert "Feature extraction complete: in=6, out=4, dropped=2" in output

 from __future__ import annotations
 from pathlib import Path
+import shutil
 import numpy as np
 import pandas as pd
     compute_morgan_fingerprint,
     extract_features_from_dataframe,
     is_valid_smiles,
+    run_pipeline,
 )
         output = buf.getvalue()
         assert "Dropping 2/6 rows with invalid SMILES" in output
         assert "Feature extraction complete: in=6, out=4, dropped=2" in output
+class TestRunPipeline:
+    """End-to-end tests for `run_pipeline`, driven by the shared CSV fixture."""
+
+    @staticmethod
+    def _stage_raw_csv(tmp_path: Path) -> tuple[Path, Path]:
+        """Copy FIXTURE into a synthetic data/raw + data/processed layout.
+
+        Returns:
+            (input_path, output_path) located under `tmp_path`. Both parent
+            directories exist; the output file itself does not.
+        """
+        raw_dir = tmp_path / "data" / "raw"
+        proc_dir = tmp_path / "data" / "processed"
+        raw_dir.mkdir(parents=True)
+        proc_dir.mkdir(parents=True)
+        input_path = raw_dir / "bbbp.csv"
+        output_path = proc_dir / "bbbp_features.csv"
+        shutil.copy(FIXTURE, input_path)
+        return input_path, output_path
+
+    def test_end_to_end_writes_processed_csv(self, tmp_path: Path) -> None:
+        # Arrange: copy fixture into a synthetic raw layout.
+        input_path, output_path = self._stage_raw_csv(tmp_path)
+        # Act
+        run_pipeline(input_path=input_path, output_path=output_path, n_bits=128, radius=2)
+        # Assert: file exists
+        assert output_path.exists(), "pipeline must write processed CSV"
+        # Assert: content is correct
+        out = pd.read_csv(output_path)
+        assert len(out) == 4  # 6 raw - 2 invalid
+        assert "p_np" in out.columns
+        assert sum(c.startswith("fp_") for c in out.columns) == 128
+        assert "smiles" not in out.columns
+
+    def test_run_pipeline_is_idempotent(self, tmp_path: Path) -> None:
+        input_path, output_path = self._stage_raw_csv(tmp_path)
+        run_pipeline(input_path=input_path, output_path=output_path, n_bits=64, radius=2)
+        first_bytes = output_path.read_bytes()
+        run_pipeline(input_path=input_path, output_path=output_path, n_bits=64, radius=2)
+        second_bytes = output_path.read_bytes()
+        assert first_bytes == second_bytes, "pipeline output must be byte-deterministic"
+
+    def test_run_pipeline_creates_missing_output_dir(self, tmp_path: Path) -> None:
+        # The other tests pre-create the processed directory; this one targets
+        # a nested directory that does not exist, to cover the documented
+        # "parent directory is created if missing" behaviour.
+        input_path, _ = self._stage_raw_csv(tmp_path)
+        output_path = tmp_path / "not" / "yet" / "created" / "bbbp_features.csv"
+        assert not output_path.parent.exists()
+        run_pipeline(input_path=input_path, output_path=output_path, n_bits=64, radius=2)
+        assert output_path.exists(), "pipeline must create missing parent directories"
+
+    def test_run_pipeline_raises_when_input_missing(self, tmp_path: Path) -> None:
+        output_path = tmp_path / "out.csv"
+        with pytest.raises(FileNotFoundError):
+            run_pipeline(
+                input_path=tmp_path / "nope.csv",
+                output_path=output_path,
+            )
+        # A failed run must not leave an output file behind.
+        assert not output_path.exists()