mekosotto Claude Sonnet 4.6 committed on
Commit
f7e54c4
·
1 Parent(s): c68ac12

fix(mri): pin ComBat transitive deps; clarify np.round(14) intent; guard inputs

Browse files
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- # Requires Python 3.10–3.12 (rdkit / numpy / pandas / pyarrow / scipy / scikit-learn pins ship cp310–cp312 wheels only).
2
  # See AGENTS.md §3 for the full coding-standards contract.
3
 
4
  # --- Web / API layer ---
@@ -22,6 +22,8 @@ mne==1.7.1
22
  # --- Modality: image (MRI pipeline) ---
23
  nibabel==5.2.1
24
  neuroharmonize==2.4.5 # ComBat harmonization wrapper
 
 
25
 
26
  # --- Experiment tracking ---
27
  mlflow==2.16.0
 
1
+ # Requires Python 3.10–3.12 (rdkit / numpy / pandas / pyarrow / scipy / scikit-learn / statsmodels pins ship cp310–cp312 wheels only).
2
  # See AGENTS.md §3 for the full coding-standards contract.
3
 
4
  # --- Web / API layer ---
 
22
  # --- Modality: image (MRI pipeline) ---
23
  nibabel==5.2.1
24
  neuroharmonize==2.4.5 # ComBat harmonization wrapper
25
+ neuroCombat==0.2.12 # transitive dep of neuroharmonize; pinned for reproducibility
26
+ statsmodels==0.14.6 # transitive dep of neuroharmonize; pinned for reproducibility
27
 
28
  # --- Experiment tracking ---
29
  mlflow==2.16.0
src/pipelines/mri_pipeline.py CHANGED
@@ -224,6 +224,13 @@ def harmonize_combat(
224
  """
225
  from neuroHarmonize import harmonizationLearn
226
 
 
 
 
 
 
 
 
227
  if sites.nunique() < 2:
228
  raise ValueError(
229
  f"ComBat requires at least 2 sites; got {sites.nunique()} "
@@ -234,9 +241,13 @@ def harmonize_combat(
234
  covars = pd.DataFrame({"SITE": sites.to_numpy()})
235
 
236
  _, harmonized = harmonizationLearn(matrix, covars)
237
- # Round to 14 decimal places to eliminate sub-ULP floating-point noise
238
- # (neuroHarmonize's internal matrix ops can produce ±1-ULP variation
239
- # across calls; 14 d.p. retains all meaningful precision at float64).
 
 
 
 
240
  out = pd.DataFrame(
241
  np.round(np.asarray(harmonized, dtype=np.float64), 14),
242
  columns=list(feature_cols),
 
224
  """
225
  from neuroHarmonize import harmonizationLearn
226
 
227
+ if not feature_cols:
228
+ raise ValueError("feature_cols must be a non-empty list")
229
+ if len(features) != len(sites):
230
+ raise ValueError(
231
+ f"features has {len(features)} rows but sites has {len(sites)} elements"
232
+ )
233
+
234
  if sites.nunique() < 2:
235
  raise ValueError(
236
  f"ComBat requires at least 2 sites; got {sites.nunique()} "
 
241
  covars = pd.DataFrame({"SITE": sites.to_numpy()})
242
 
243
  _, harmonized = harmonizationLearn(matrix, covars)
244
+ # Defensive: with OMP/OPENBLAS/MKL_NUM_THREADS=1 (set at module import,
245
+ # per AGENTS.md §4), harmonizationLearn is already bit-identical across
246
+ # calls. np.round(14) provides an additional determinism boundary for
247
+ # environments where those env pins are overridden before module load
248
+ # (e.g. a sub-process that re-exports a thread count). It discards ~5
249
+ # trailing-mantissa bits (for values near magnitude 1), which is well below ComBat's biological
250
+ # effect-size precision floor.
251
  out = pd.DataFrame(
252
  np.round(np.asarray(harmonized, dtype=np.float64), 14),
253
  columns=list(feature_cols),
tests/pipelines/test_mri_pipeline.py CHANGED
@@ -272,3 +272,15 @@ class TestHarmonizeCombat:
272
  sites_one = pd.Series(["A"] * len(df), name="site")
273
  with pytest.raises(ValueError, match="at least 2 sites"):
274
  harmonize_combat(df, sites_one, feature_cols)
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  sites_one = pd.Series(["A"] * len(df), name="site")
273
  with pytest.raises(ValueError, match="at least 2 sites"):
274
  harmonize_combat(df, sites_one, feature_cols)
275
+
276
+ def test_raises_on_empty_feature_cols(self) -> None:
277
+ df, sites, _ = self._build_two_site_features()
278
+ with pytest.raises(ValueError, match="feature_cols must be a non-empty list"):
279
+ harmonize_combat(df, sites, [])
280
+
281
+ def test_raises_on_length_mismatch(self) -> None:
282
+ df, sites, feature_cols = self._build_two_site_features()
283
+ # sites has 6 entries; truncate to 5 to force a mismatch.
284
+ bad_sites = sites.iloc[:5]
285
+ with pytest.raises(ValueError, match=r"features has 6 rows but sites has 5 elements"):
286
+ harmonize_combat(df, bad_sites, feature_cols)