Spaces:

mekosotto
/

hackathon

Running

App Files Files Community

mekosotto Claude Sonnet 4.6 committed 7 days ago

Commit

b18a079

1 Parent(s): 7215c7f

fix(mri): handle all-constant features; tighten variance threshold; reorder log

Browse files

Files changed (2) hide show

src/pipelines/mri_pipeline.py +38 -17
tests/pipelines/test_mri_pipeline.py +67 -0

src/pipelines/mri_pipeline.py CHANGED Viewed

@@ -266,6 +266,14 @@ DEFAULT_INPUT = Path("data/raw/mri")
 DEFAULT_OUTPUT = Path("data/processed/mri_features.parquet")
 def _list_nifti_volumes(input_dir: Path) -> list[Path]:
     """Return sorted list of .nii / .nii.gz files in `input_dir`."""
     return sorted(
@@ -326,8 +334,7 @@ def run_pipeline(
             continue
         mask = mask_brain(volume, intensity_threshold=intensity_threshold)
         feats = extract_features_from_volume(volume, mask, n_roi_axes=n_roi_axes)
-        feats["subject_id"] = subject_id
-        rows.append(feats)
     n_total = len(nifti_paths)
     n_dropped = len(invalid_subject_ids)
@@ -373,18 +380,31 @@ def run_pipeline(
             f"sites_csv missing site assignment for subjects: {missing}"
         )
-    # ComBat cannot handle zero-variance columns (var_pooled = 0 → NaN divide).
-    # Split feature_cols into variable (harmonize) and constant (pass through).
-    var_feature_cols = [c for c in feature_cols if raw_features[c].std() > 0]
-    zero_var_cols = [c for c in feature_cols if raw_features[c].std() == 0]
-    harmonized = harmonize_combat(
-        raw_features, raw_features["site"], var_feature_cols,
-    )
-    # Re-attach zero-variance columns (unchanged) and restore original column order.
-    for c in zero_var_cols:
-        harmonized[c] = raw_features[c].to_numpy()
-    harmonized = harmonized[feature_cols]
     final = pd.concat(
         [raw_features[["subject_id", "site"]].reset_index(drop=True),
@@ -392,6 +412,11 @@ def run_pipeline(
         axis=1,
     )
     output_path.parent.mkdir(parents=True, exist_ok=True)
     if output_path.is_dir():
         raise IsADirectoryError(
@@ -402,10 +427,6 @@ def run_pipeline(
     final.to_parquet(
         output_path, index=False, engine="pyarrow", compression="snappy",
     )
-    logger.info(
-        "Feature extraction complete: in=%d, out=%d, dropped=%d (%.2f%%)",
-        n_total, len(final), n_dropped, 100.0 * n_dropped / max(n_total, 1),
-    )
     logger.info(
         "Wrote processed features to %s (rows=%d, cols=%d)",
         output_path, len(final), final.shape[1],

 DEFAULT_OUTPUT = Path("data/processed/mri_features.parquet")
+# Floor used to decide whether a feature column is "constant" for ComBat.
+# NOTE: despite the name, this value is compared against each column's
+# standard deviation (``std()``), not its variance. A strict ``std() > 0``
+# check would still send near-constant columns (e.g. ULP-level differences)
+# into ComBat, where var_pooled ≈ 0 produces NaN. 1e-8 is well above machine
+# epsilon and far below any biologically meaningful signal spread.
+_MIN_VAR_THRESHOLD: float = 1e-8
 def _list_nifti_volumes(input_dir: Path) -> list[Path]:
     """Return sorted list of .nii / .nii.gz files in `input_dir`."""
     return sorted(
             continue
         mask = mask_brain(volume, intensity_threshold=intensity_threshold)
         feats = extract_features_from_volume(volume, mask, n_roi_axes=n_roi_axes)
+        rows.append({"subject_id": subject_id, **feats})
     n_total = len(nifti_paths)
     n_dropped = len(invalid_subject_ids)
             f"sites_csv missing site assignment for subjects: {missing}"
         )
+    # ComBat cannot handle (near-)zero-variance columns: var_pooled ≈ 0 produces
+    # NaN. Split feature_cols on a strictly-positive floor applied to each
+    # column's standard deviation, so ULP-level noise is treated as constant.
+    col_std = raw_features[feature_cols].std()
+    var_feature_cols = [c for c in feature_cols if col_std[c] > _MIN_VAR_THRESHOLD]
+    zero_var_cols = [c for c in feature_cols if col_std[c] <= _MIN_VAR_THRESHOLD]
+    if not var_feature_cols:
+        # Degenerate dataset: every feature is essentially constant. ComBat has
+        # no signal to harmonize on; pass all columns through and warn.
+        logger.warning(
+            "All %d feature columns have variance ≤ %.1e; ComBat skipped "
+            "(output contains unharmonized features).",
+            len(feature_cols), _MIN_VAR_THRESHOLD,
+        )
+        harmonized = raw_features[feature_cols].copy()
+    else:
+        harmonized = harmonize_combat(
+            raw_features, raw_features["site"], var_feature_cols,
+        )
+        # Re-attach zero-variance columns (unchanged) and restore the original
+        # column order.
+        for c in zero_var_cols:
+            harmonized[c] = raw_features[c].to_numpy()
+        harmonized = harmonized[feature_cols]
     final = pd.concat(
         [raw_features[["subject_id", "site"]].reset_index(drop=True),
         axis=1,
     )
+    logger.info(
+        "Feature extraction complete: in=%d, out=%d, dropped=%d (%.2f%%)",
+        n_total, len(final), n_dropped, 100.0 * n_dropped / max(n_total, 1),
+    )
     output_path.parent.mkdir(parents=True, exist_ok=True)
     if output_path.is_dir():
         raise IsADirectoryError(
     final.to_parquet(
         output_path, index=False, engine="pyarrow", compression="snappy",
     )
     logger.info(
         "Wrote processed features to %s (rows=%d, cols=%d)",
         output_path, len(final), final.shape[1],

tests/pipelines/test_mri_pipeline.py CHANGED Viewed

@@ -378,3 +378,70 @@ class TestRunPipeline:
         # 5 surviving valid subjects (subject_5 dropped).
         assert len(df) == 5
         assert "subject_5" not in df["subject_id"].tolist()

         # 5 surviving valid subjects (subject_5 dropped).
         assert len(df) == 5
         assert "subject_5" not in df["subject_id"].tolist()
+    def test_run_pipeline_handles_all_constant_features(self, tmp_path: Path) -> None:
+        """Degenerate dataset: every feature column is constant — ComBat must be
+        skipped gracefully with a WARNING, not crash with ValueError."""
+        import io
+        import logging
+        from src.core.logger import get_logger
+        from src.pipelines import mri_pipeline as mod
+        raw_dir, sites_csv, output_path = self._stage_inputs(tmp_path)
+        # Overwrite all volumes with the same constant intensity so every
+        # feature column is identical across subjects.
+        affine = np.eye(4)
+        for nii in sorted(raw_dir.glob("*.nii.gz")):
+            const_vol = np.full((8, 8, 8), 7.0, dtype=np.float64)
+            nib.save(nib.Nifti1Image(const_vol, affine=affine), nii)
+        logger = get_logger(mod.__name__, level=logging.INFO)
+        handler = logger.handlers[0]
+        buf = io.StringIO()
+        original_stream = handler.stream
+        handler.stream = buf
+        try:
+            run_pipeline(
+                input_dir=raw_dir, sites_csv=sites_csv,
+                output_path=output_path, intensity_threshold=1.0,
+            )
+        finally:
+            handler.stream = original_stream
+        df = pd.read_parquet(output_path)
+        assert len(df) == 6
+        feat_cols = [c for c in df.columns if c.startswith("feat_")]
+        # All-zero-variance fallback: features pass through unchanged.
+        assert df[feat_cols].notna().all().all()
+        log_output = buf.getvalue()
+        assert "ComBat skipped" in log_output
+    def test_run_pipeline_extraction_log_precedes_write(self, tmp_path: Path) -> None:
+        """The 'Feature extraction complete' INFO must fire BEFORE the
+        'Wrote processed features' INFO so that operators get a summary
+        even if to_parquet raises."""
+        import io
+        import logging
+        from src.core.logger import get_logger
+        from src.pipelines import mri_pipeline as mod
+        raw_dir, sites_csv, output_path = self._stage_inputs(tmp_path)
+        logger = get_logger(mod.__name__, level=logging.INFO)
+        handler = logger.handlers[0]
+        buf = io.StringIO()
+        original_stream = handler.stream
+        handler.stream = buf
+        try:
+            run_pipeline(
+                input_dir=raw_dir, sites_csv=sites_csv, output_path=output_path,
+            )
+        finally:
+            handler.stream = original_stream
+        log_output = buf.getvalue()
+        extract_idx = log_output.index("Feature extraction complete:")
+        wrote_idx = log_output.index("Wrote processed features to")
+        assert extract_idx < wrote_idx, "extraction summary must precede write log"