Spaces:

mekosotto
/

hackathon

Running

mekosotto Claude Sonnet 4.6 committed 7 days ago

Commit

f7e54c4

1 Parent(s): c68ac12

fix(mri): pin ComBat transitive deps; clarify np.round(14) intent; guard inputs

Files changed (3) hide show

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-# Requires Python 3.10–3.12 (rdkit / numpy / pandas / pyarrow / scipy / scikit-learn pins ship cp310–cp312 wheels only).
 # See AGENTS.md §3 for the full coding-standards contract.
 # --- Web / API layer ---
@@ -22,6 +22,8 @@ mne==1.7.1
 # --- Modality: image (MRI pipeline) ---
 nibabel==5.2.1
 neuroharmonize==2.4.5  # ComBat harmonization wrapper
 # --- Experiment tracking ---
 mlflow==2.16.0

+# Requires Python 3.10–3.12 (rdkit / numpy / pandas / pyarrow / scipy / scikit-learn / statsmodels pins ship cp310–cp312 wheels only).
 # See AGENTS.md §3 for the full coding-standards contract.
 # --- Web / API layer ---
 # --- Modality: image (MRI pipeline) ---
 nibabel==5.2.1
 neuroharmonize==2.4.5  # ComBat harmonization wrapper
+neuroCombat==0.2.12  # transitive dep of neuroharmonize; pinned for reproducibility
+statsmodels==0.14.6  # transitive dep of neuroharmonize; pinned for reproducibility
 # --- Experiment tracking ---
 mlflow==2.16.0

src/pipelines/mri_pipeline.py CHANGED Viewed

@@ -224,6 +224,13 @@ def harmonize_combat(
     """
     from neuroHarmonize import harmonizationLearn
     if sites.nunique() < 2:
         raise ValueError(
             f"ComBat requires at least 2 sites; got {sites.nunique()} "
@@ -234,9 +241,13 @@ def harmonize_combat(
     covars = pd.DataFrame({"SITE": sites.to_numpy()})
     _, harmonized = harmonizationLearn(matrix, covars)
-    # Round to 14 decimal places to eliminate sub-ULP floating-point noise
-    # (neuroHarmonize's internal matrix ops can produce ±1-ULP variation
-    # across calls; 14 d.p. retains all meaningful precision at float64).
     out = pd.DataFrame(
         np.round(np.asarray(harmonized, dtype=np.float64), 14),
         columns=list(feature_cols),

     """
     from neuroHarmonize import harmonizationLearn
+    if not feature_cols:
+        raise ValueError("feature_cols must be a non-empty list")
+    if len(features) != len(sites):
+        raise ValueError(
+            f"features has {len(features)} rows but sites has {len(sites)} elements"
+        )
     if sites.nunique() < 2:
         raise ValueError(
             f"ComBat requires at least 2 sites; got {sites.nunique()} "
     covars = pd.DataFrame({"SITE": sites.to_numpy()})
     _, harmonized = harmonizationLearn(matrix, covars)
+    # Defensive: with OMP/OPENBLAS/MKL_NUM_THREADS=1 (set at module import,
+    # per AGENTS.md §4), harmonizationLearn is already bit-identical across
+    # calls. np.round(14) provides an additional determinism boundary for
+    # environments where those env pins are overridden before module load
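+    # (e.g. a sub-process that re-exports a thread count). np.round(x, 14)
+    # rounds to absolute decimal places, not relative precision: for values
+    # of order 1 it discards ~5 trailing-mantissa bits (fewer for larger
+    # magnitudes, more for smaller ones), which for such values is well
+    # below ComBat's biological effect-size precision floor.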
     out = pd.DataFrame(
         np.round(np.asarray(harmonized, dtype=np.float64), 14),
         columns=list(feature_cols),

tests/pipelines/test_mri_pipeline.py CHANGED Viewed

@@ -272,3 +272,15 @@ class TestHarmonizeCombat:
         sites_one = pd.Series(["A"] * len(df), name="site")
         with pytest.raises(ValueError, match="at least 2 sites"):
             harmonize_combat(df, sites_one, feature_cols)

         sites_one = pd.Series(["A"] * len(df), name="site")
         with pytest.raises(ValueError, match="at least 2 sites"):
             harmonize_combat(df, sites_one, feature_cols)
+    def test_raises_on_empty_feature_cols(self) -> None:
+        df, sites, _ = self._build_two_site_features()
+        with pytest.raises(ValueError, match="feature_cols must be a non-empty list"):
+            harmonize_combat(df, sites, [])
+    def test_raises_on_length_mismatch(self) -> None:
+        df, sites, feature_cols = self._build_two_site_features()
+        # sites has 6 entries; truncate to 5 to force a mismatch.
+        bad_sites = sites.iloc[:5]
+        with pytest.raises(ValueError, match=r"features has 6 rows but sites has 5 elements"):
+            harmonize_combat(df, bad_sites, feature_cols)