Spaces:

mekosotto
/

hackathon

Running

App Files Community

mekosotto Claude Sonnet 4.6 committed 8 days ago

Commit

915880e

1 Parent(s): 32d3a5f

feat(bbb): switch processed output to Parquet for dtype preservation

Browse files

Files changed (4) hide show

AGENTS.md +10 -1
requirements.txt +2 -1
src/pipelines/bbb_pipeline.py +8 -5
tests/pipelines/test_bbb_pipeline.py +22 -6

AGENTS.md CHANGED Viewed

@@ -31,7 +31,7 @@ All experiment runs are tracked in **MLflow**. All services ship as **Docker** i
 ├── pytest.ini
 ├── data/
 │   ├── raw/                  # Untouched source data. NEVER train on this directly.
-│   └── processed/            # Pipeline output. Model-ready outputs (overwritten on each run; see §4).
 ├── src/
 │   ├── api/                  # FastAPI routers, request/response schemas
 │   ├── pipelines/            # One file per modality. Pure functions + a `run_pipeline()` entry.
@@ -85,3 +85,12 @@ refactored into a pipeline.
 5. Write deterministic output to `output_path`.
 6. Document any new dependency in `requirements.txt` (pinned).
 7. Add a one-line entry to this file's pipeline table.

 ├── pytest.ini
 ├── data/
 │   ├── raw/                  # Untouched source data. NEVER train on this directly.
+│   └── processed/            # Pipeline output as Parquet (preserves dtypes; overwritten each run; see §4).
 ├── src/
 │   ├── api/                  # FastAPI routers, request/response schemas
 │   ├── pipelines/            # One file per modality. Pure functions + a `run_pipeline()` entry.
 5. Write deterministic output to `output_path`.
 6. Document any new dependency in `requirements.txt` (pinned).
 7. Add a one-line entry to this file's pipeline table.
+## 6. Storage Format Convention
+All `data/processed/` outputs MUST be **Parquet** (`pyarrow` engine, `compression="snappy"`):
+- Preserves dtypes (uint8 fingerprints stay uint8; float32 EEG features stay float32) — CSV silently widens numeric columns and is unsuitable for the high-dimensional float arrays produced by the EEG and MRI pipelines.
+- Byte-deterministic for a pinned `pyarrow` version (the writer version is embedded in the file metadata) with fixed compression and single-threaded writes (satisfies §4 Determinism).
+- Read with `pd.read_parquet(path)`; no dtype hints required.
+The raw `data/raw/` inputs may be in any vendor-supplied format (CSV for BBBP, EDF/FIF for EEG, NIfTI for MRI).

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-# Requires Python 3.10–3.12 (rdkit / numpy / pandas / scipy / scikit-learn pins ship cp310–cp312 wheels only).
 # See AGENTS.md §3 for the full coding-standards contract.
 # --- Web / API layer ---
@@ -9,6 +9,7 @@ pydantic==2.9.2
 # --- Core data stack ---
 numpy==1.26.4
 pandas==2.2.2
 scipy==1.13.1
 scikit-learn==1.5.1

+# Requires Python 3.10–3.12 (rdkit / numpy / pandas / pyarrow / scipy / scikit-learn pins ship cp310–cp312 wheels only).
 # See AGENTS.md §3 for the full coding-standards contract.
 # --- Web / API layer ---
 # --- Core data stack ---
 numpy==1.26.4
 pandas==2.2.2
+pyarrow==17.0.0
 scipy==1.13.1
 scikit-learn==1.5.1

src/pipelines/bbb_pipeline.py CHANGED Viewed

@@ -36,7 +36,7 @@ RDLogger.DisableLog("rdApp.*")
 # Default I/O paths for the BBB pipeline. Override via run_pipeline() args.
 DEFAULT_INPUT = Path("data/raw/bbbp.csv")
-DEFAULT_OUTPUT = Path("data/processed/bbbp_features.csv")
 def is_valid_smiles(smiles: str | float | None) -> bool:
@@ -196,12 +196,13 @@ def run_pipeline(
     Reads the Kaggle BBBP CSV at `input_path`, validates and converts
     SMILES into Morgan fingerprints, and writes the model-ready table
-    to `output_path`. Output is overwritten on every run (idempotent).
     Args:
         input_path: Path to the raw BBBP CSV (must include `smiles_col`).
-        output_path: Where to write the processed feature CSV. Parent
-            directory is created if missing.
         smiles_col: SMILES column name in the raw CSV.
         n_bits: Morgan fingerprint length.
         radius: Morgan radius.
@@ -230,7 +231,9 @@ def run_pipeline(
         raise IsADirectoryError(
             f"output_path must be a file, got a directory: {output_path}"
         )
-    features.to_csv(output_path, index=False, lineterminator="\n")
     logger.info(
         "Wrote processed features to %s (rows=%d, cols=%d)",
         output_path, len(features), features.shape[1],

 # Default I/O paths for the BBB pipeline. Override via run_pipeline() args.
 DEFAULT_INPUT = Path("data/raw/bbbp.csv")
+DEFAULT_OUTPUT = Path("data/processed/bbbp_features.parquet")
 def is_valid_smiles(smiles: str | float | None) -> bool:
     Reads the Kaggle BBBP CSV at `input_path`, validates and converts
     SMILES into Morgan fingerprints, and writes the model-ready table
+    as a Parquet file at `output_path`. Output is overwritten on every
+    run (idempotent) and preserves the uint8 dtype of fingerprint columns.
     Args:
         input_path: Path to the raw BBBP CSV (must include `smiles_col`).
+        output_path: Where to write the processed feature Parquet file.
+            Parent directory is created if missing.
         smiles_col: SMILES column name in the raw CSV.
         n_bits: Morgan fingerprint length.
         radius: Morgan radius.
         raise IsADirectoryError(
             f"output_path must be a file, got a directory: {output_path}"
         )
+    # Parquet preserves dtypes (uint8 stays uint8) and is byte-deterministic
+    # when compression is fixed. Used across BBB / EEG / MRI pipelines.
+    features.to_parquet(output_path, index=False, engine="pyarrow", compression="snappy")
     logger.info(
         "Wrote processed features to %s (rows=%d, cols=%d)",
         output_path, len(features), features.shape[1],

tests/pipelines/test_bbb_pipeline.py CHANGED Viewed

@@ -141,36 +141,52 @@ class TestExtractFeaturesFromDataFrame:
 class TestRunPipeline:
-    def test_end_to_end_writes_processed_csv(self, tmp_path: Path) -> None:
         # Arrange: copy fixture into a synthetic raw layout.
         raw_dir = tmp_path / "data" / "raw"
         proc_dir = tmp_path / "data" / "processed"
         raw_dir.mkdir(parents=True)
         proc_dir.mkdir(parents=True)
         input_path = raw_dir / "bbbp.csv"
-        output_path = proc_dir / "bbbp_features.csv"
         shutil.copy(FIXTURE, input_path)
         # Act
         run_pipeline(input_path=input_path, output_path=output_path, n_bits=128, radius=2)
         # Assert: file exists
-        assert output_path.exists(), "pipeline must write processed CSV"
         # Assert: content is correct
-        out = pd.read_csv(output_path)
         assert len(out) == 4  # 6 raw - 2 invalid
         assert "p_np" in out.columns
         assert sum(c.startswith("fp_") for c in out.columns) == 128
         assert "smiles" not in out.columns
     def test_run_pipeline_is_idempotent(self, tmp_path: Path) -> None:
         raw_dir = tmp_path / "data" / "raw"
         proc_dir = tmp_path / "data" / "processed"
         raw_dir.mkdir(parents=True)
         proc_dir.mkdir(parents=True)
         input_path = raw_dir / "bbbp.csv"
-        output_path = proc_dir / "bbbp_features.csv"
         shutil.copy(FIXTURE, input_path)
         run_pipeline(input_path=input_path, output_path=output_path, n_bits=64, radius=2)
@@ -184,7 +200,7 @@ class TestRunPipeline:
         with pytest.raises(FileNotFoundError):
             run_pipeline(
                 input_path=tmp_path / "nope.csv",
-                output_path=tmp_path / "out.csv",
             )
     def test_run_pipeline_rejects_directory_as_output(self, tmp_path: Path) -> None:

 class TestRunPipeline:
+    def test_end_to_end_writes_processed_parquet(self, tmp_path: Path) -> None:
         # Arrange: copy fixture into a synthetic raw layout.
         raw_dir = tmp_path / "data" / "raw"
         proc_dir = tmp_path / "data" / "processed"
         raw_dir.mkdir(parents=True)
         proc_dir.mkdir(parents=True)
         input_path = raw_dir / "bbbp.csv"
+        output_path = proc_dir / "bbbp_features.parquet"
         shutil.copy(FIXTURE, input_path)
         # Act
         run_pipeline(input_path=input_path, output_path=output_path, n_bits=128, radius=2)
         # Assert: file exists
+        assert output_path.exists(), "pipeline must write processed Parquet"
         # Assert: content is correct
+        out = pd.read_parquet(output_path)
         assert len(out) == 4  # 6 raw - 2 invalid
         assert "p_np" in out.columns
         assert sum(c.startswith("fp_") for c in out.columns) == 128
         assert "smiles" not in out.columns
+    def test_run_pipeline_preserves_uint8_dtype(self, tmp_path: Path) -> None:
+        """The Parquet round-trip must keep fp_* columns as uint8 (not widen to int64)."""
+        raw_dir = tmp_path / "data" / "raw"
+        proc_dir = tmp_path / "data" / "processed"
+        raw_dir.mkdir(parents=True)
+        proc_dir.mkdir(parents=True)
+        input_path = raw_dir / "bbbp.csv"
+        output_path = proc_dir / "bbbp_features.parquet"
+        shutil.copy(FIXTURE, input_path)
+        run_pipeline(input_path=input_path, output_path=output_path, n_bits=64, radius=2)
+        out = pd.read_parquet(output_path)
+        fp_cols = [c for c in out.columns if c.startswith("fp_")]
+        for col in fp_cols:
+            assert out[col].dtype == np.uint8, f"{col} widened to {out[col].dtype}"
     def test_run_pipeline_is_idempotent(self, tmp_path: Path) -> None:
         raw_dir = tmp_path / "data" / "raw"
         proc_dir = tmp_path / "data" / "processed"
         raw_dir.mkdir(parents=True)
         proc_dir.mkdir(parents=True)
         input_path = raw_dir / "bbbp.csv"
+        output_path = proc_dir / "bbbp_features.parquet"
         shutil.copy(FIXTURE, input_path)
         run_pipeline(input_path=input_path, output_path=output_path, n_bits=64, radius=2)
         with pytest.raises(FileNotFoundError):
             run_pipeline(
                 input_path=tmp_path / "nope.csv",
+                output_path=tmp_path / "out.parquet",
             )
     def test_run_pipeline_rejects_directory_as_output(self, tmp_path: Path) -> None: