fix(mri): pin ComBat transitive deps; clarify np.round(14) intent; guard inputs
Browse files
- requirements.txt +3 -1
- src/pipelines/mri_pipeline.py +14 -3
- tests/pipelines/test_mri_pipeline.py +12 -0
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# Requires Python 3.10–3.12 (rdkit / numpy / pandas / pyarrow / scipy / scikit-learn pins ship cp310–cp312 wheels only).
|
| 2 |
# See AGENTS.md §3 for the full coding-standards contract.
|
| 3 |
|
| 4 |
# --- Web / API layer ---
|
|
@@ -22,6 +22,8 @@ mne==1.7.1
|
|
| 22 |
# --- Modality: image (MRI pipeline) ---
|
| 23 |
nibabel==5.2.1
|
| 24 |
neuroharmonize==2.4.5 # ComBat harmonization wrapper
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# --- Experiment tracking ---
|
| 27 |
mlflow==2.16.0
|
|
|
|
| 1 |
+
# Requires Python 3.10–3.12 (rdkit / numpy / pandas / pyarrow / scipy / scikit-learn / statsmodels pins ship cp310–cp312 wheels only).
|
| 2 |
# See AGENTS.md §3 for the full coding-standards contract.
|
| 3 |
|
| 4 |
# --- Web / API layer ---
|
|
|
|
| 22 |
# --- Modality: image (MRI pipeline) ---
|
| 23 |
nibabel==5.2.1
|
| 24 |
neuroharmonize==2.4.5 # ComBat harmonization wrapper
|
| 25 |
+
neuroCombat==0.2.12 # transitive dep of neuroharmonize; pinned for reproducibility
|
| 26 |
+
statsmodels==0.14.6 # transitive dep of neuroharmonize; pinned for reproducibility
|
| 27 |
|
| 28 |
# --- Experiment tracking ---
|
| 29 |
mlflow==2.16.0
|
src/pipelines/mri_pipeline.py
CHANGED
|
@@ -224,6 +224,13 @@ def harmonize_combat(
|
|
| 224 |
"""
|
| 225 |
from neuroHarmonize import harmonizationLearn
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
if sites.nunique() < 2:
|
| 228 |
raise ValueError(
|
| 229 |
f"ComBat requires at least 2 sites; got {sites.nunique()} "
|
|
@@ -234,9 +241,13 @@ def harmonize_combat(
|
|
| 234 |
covars = pd.DataFrame({"SITE": sites.to_numpy()})
|
| 235 |
|
| 236 |
_, harmonized = harmonizationLearn(matrix, covars)
|
| 237 |
-
#
|
| 238 |
-
#
|
| 239 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
out = pd.DataFrame(
|
| 241 |
np.round(np.asarray(harmonized, dtype=np.float64), 14),
|
| 242 |
columns=list(feature_cols),
|
|
|
|
| 224 |
"""
|
| 225 |
from neuroHarmonize import harmonizationLearn
|
| 226 |
|
| 227 |
+
if not feature_cols:
|
| 228 |
+
raise ValueError("feature_cols must be a non-empty list")
|
| 229 |
+
if len(features) != len(sites):
|
| 230 |
+
raise ValueError(
|
| 231 |
+
f"features has {len(features)} rows but sites has {len(sites)} elements"
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
if sites.nunique() < 2:
|
| 235 |
raise ValueError(
|
| 236 |
f"ComBat requires at least 2 sites; got {sites.nunique()} "
|
|
|
|
| 241 |
covars = pd.DataFrame({"SITE": sites.to_numpy()})
|
| 242 |
|
| 243 |
_, harmonized = harmonizationLearn(matrix, covars)
|
| 244 |
+
# Defensive: with OMP/OPENBLAS/MKL_NUM_THREADS=1 (set at module import,
|
| 245 |
+
# per AGENTS.md §4), harmonizationLearn is already bit-identical across
|
| 246 |
+
# calls. np.round(14) provides an additional determinism boundary for
|
| 247 |
+
# environments where those env pins are overridden before module load
|
| 248 |
+
# (e.g. a sub-process that re-exports a thread count). The cutoff is
|
| 249 |
+
# absolute (1e-14): for values of order 1 it discards ~5 trailing-mantissa
|
| 250 |
+
# bits, which is well below ComBat's biological effect-size precision floor.
|
| 251 |
out = pd.DataFrame(
|
| 252 |
np.round(np.asarray(harmonized, dtype=np.float64), 14),
|
| 253 |
columns=list(feature_cols),
|
tests/pipelines/test_mri_pipeline.py
CHANGED
|
@@ -272,3 +272,15 @@ class TestHarmonizeCombat:
|
|
| 272 |
sites_one = pd.Series(["A"] * len(df), name="site")
|
| 273 |
with pytest.raises(ValueError, match="at least 2 sites"):
|
| 274 |
harmonize_combat(df, sites_one, feature_cols)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
sites_one = pd.Series(["A"] * len(df), name="site")
|
| 273 |
with pytest.raises(ValueError, match="at least 2 sites"):
|
| 274 |
harmonize_combat(df, sites_one, feature_cols)
|
| 275 |
+
|
| 276 |
+
def test_raises_on_empty_feature_cols(self) -> None:
|
| 277 |
+
df, sites, _ = self._build_two_site_features()
|
| 278 |
+
with pytest.raises(ValueError, match="feature_cols must be a non-empty list"):
|
| 279 |
+
harmonize_combat(df, sites, [])
|
| 280 |
+
|
| 281 |
+
def test_raises_on_length_mismatch(self) -> None:
|
| 282 |
+
df, sites, feature_cols = self._build_two_site_features()
|
| 283 |
+
# sites has 6 entries; truncate to 5 to force a mismatch.
|
| 284 |
+
bad_sites = sites.iloc[:5]
|
| 285 |
+
with pytest.raises(ValueError, match=r"features has 6 rows but sites has 5 elements"):
|
| 286 |
+
harmonize_combat(df, bad_sites, feature_cols)
|