mekosotto committed on
Commit
3670eb8
·
1 Parent(s): ce17bc7

feat(mri): add is_valid_volume guard for NaN/inf/shape/dtype on 3-D arrays

Browse files
src/pipelines/mri_pipeline.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MRI (magnetic resonance imaging) pipeline.
2
+
3
+ Loads NIfTI volumes (`.nii` / `.nii.gz`), applies a brain mask, harmonizes
4
+ across sites with ComBat (`neuroHarmonize`), and writes per-subject ROI
5
+ statistics as a model-ready Parquet at `data/processed/mri_features.parquet`.
6
+
7
+ Follows the Data Readiness contract in AGENTS.md §4 and the Parquet storage
8
+ convention in §6: schema validity, domain validity (drop NaN/inf volumes
9
+ with a logged WARNING), determinism (ComBat is RNG-free given fixed input),
10
+ traceability (in/out/dropped counts at INFO), and idempotent overwrite.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import os
15
+
16
+ import numpy as np
17
+ import pyarrow as pa
18
+
19
+ from src.core.logger import get_logger
20
+
21
+ logger = get_logger(__name__)
22
+
23
+ # Pin BLAS / OpenMP / pyarrow to single-threaded mode so byte-determinism
24
+ # (AGENTS.md §4 rule 3) holds across hardware. Without this, multi-threaded
25
+ # floating-point reductions can reorder and produce non-bit-identical output.
26
+ os.environ.setdefault("OMP_NUM_THREADS", "1")
27
+ os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
28
+ os.environ.setdefault("MKL_NUM_THREADS", "1")
29
+ pa.set_cpu_count(1)
30
+ pa.set_io_thread_count(1)
31
+
32
+
33
def is_valid_volume(volume: np.ndarray | None) -> bool:
    """Return True iff `volume` is a non-empty 3-D numeric array with no NaN/inf.

    Used to drop corrupted volumes before masking + feature extraction.
    Defensive against the full set of garbage we expect from real archives:
    lists, None, NaN/inf samples, zero-sized arrays, string-dtype arrays.

    Args:
        volume: Candidate object; anything that is not an `np.ndarray`
            (including `None` and nested lists) is rejected.

    Returns:
        `True` only when `volume` is an ndarray with exactly three
        dimensions, at least one element, a signed/unsigned integer, float
        or complex dtype, and only finite values. `False` otherwise.
    """
    if not isinstance(volume, np.ndarray):
        return False
    if volume.ndim != 3 or volume.size == 0:
        return False

    # Whitelist dtype kinds explicitly: i/u = integers, f = floats,
    # c = complex. `np.issubdtype(dtype, np.number)` would also admit
    # timedelta64, which sits under `signedinteger` in NumPy's scalar
    # hierarchy but is not voxel intensity data. bool, str, bytes, object,
    # datetime and structured dtypes are rejected as well.
    kind = volume.dtype.kind
    if kind not in "iufc":
        return False

    # Integer dtypes cannot represent NaN or inf, so skip the full-array scan.
    if kind in "iu":
        return True

    return bool(np.isfinite(volume).all())
tests/pipelines/test_mri_pipeline.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit + integration tests for the MRI ComBat pipeline."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pytest
8
+
9
+ from src.pipelines.mri_pipeline import is_valid_volume
10
+
11
+
12
# Path to the `fixtures/mri_sample` directory, resolved one level above the
# directory containing this test file. Not referenced by the tests visible
# in this chunk.
FIXTURE_DIR = Path(__file__).parent.parent / "fixtures" / "mri_sample"
13
+
14
+
15
class TestIsValidVolume:
    """Checks that `is_valid_volume` accepts clean 3-D arrays and rejects the rest."""

    def test_accepts_3d_finite_array(self) -> None:
        # An all-zero float64 cube is the baseline well-formed input.
        cube = np.zeros((8, 8, 8), dtype=np.float64)
        assert is_valid_volume(cube) is True

    def test_rejects_wrong_dimension(self) -> None:
        # One rank too few and one rank too many.
        for shape in ((8, 8), (8, 8, 8, 2)):
            assert is_valid_volume(np.zeros(shape)) is False

    def test_rejects_nan(self) -> None:
        # A single NaN voxel is enough to invalidate the volume.
        cube = np.zeros((8, 8, 8))
        cube[0, 0, 0] = np.nan
        assert is_valid_volume(cube) is False

    def test_rejects_inf(self) -> None:
        # Both signs of infinity are checked on the same voxel in turn.
        cube = np.zeros((8, 8, 8))
        for bad_value in (np.inf, -np.inf):
            cube[1, 1, 1] = bad_value
            assert is_valid_volume(cube) is False

    def test_rejects_empty(self) -> None:
        # A zero-length axis in any position yields a zero-sized array.
        for shape in ((0, 8, 8), (8, 0, 8), (8, 8, 0)):
            assert is_valid_volume(np.zeros(shape)) is False

    def test_rejects_non_numeric_dtype(self) -> None:
        # Three-dimensional array whose dtype is a string type.
        letters = np.array([[["a", "b"], ["c", "d"]]])
        assert is_valid_volume(letters) is False

    def test_rejects_non_array(self) -> None:
        # Nested lists and None are not ndarrays, whatever their shape.
        for candidate in ([[[1, 2]], [[3, 4]]], None):
            assert is_valid_volume(candidate) is False