Spaces:

mekosotto
/

hackathon

Running

App Files Files Community

mekosotto Claude Opus 4.7 (1M context) committed 8 days ago

Commit

0d591d4

1 Parent(s): c26c6a2

fix(eeg): NaN-clean features for flat channels; guard zero-size epochs; assert WARNING

Browse files

Files changed (2) hide show

src/pipelines/eeg_pipeline.py +21 -2
tests/pipelines/test_eeg_pipeline.py +40 -7

src/pipelines/eeg_pipeline.py CHANGED Viewed

@@ -253,6 +253,11 @@ def compute_features_from_epoch(epoch: np.ndarray, sfreq: float) -> np.ndarray:
       - ``kurtosis`` uses ``scipy.stats.kurtosis(fisher=True, bias=True)`` —
         Fisher's *excess* kurtosis (Gaussian → 0, not 3). Add 3 if Pearson
         kurtosis is required downstream.
     Precondition: `epoch` must be finite (no NaN/inf). Filter via
     `is_valid_epoch` before calling — feature values are NaN-propagating.
@@ -274,7 +279,11 @@ def compute_features_from_epoch(epoch: np.ndarray, sfreq: float) -> np.ndarray:
             feats.append(_band_power(freqs, psd, lo, hi))
         for _name, fn in _STATS_FUNCS:
             feats.append(fn(x))
-    return np.asarray(feats, dtype=np.float64)
 def _build_feature_columns(eeg_ch_names: list[str]) -> list[str]:
@@ -315,6 +324,11 @@ def extract_features_from_recording(
     Returns:
         A `pd.DataFrame` with one row per valid epoch and ``n_eeg_channels *
         (len(EEG_BANDS) + len(STATS))`` ``feat_*`` columns.
     """
     filtered = bandpass_filter(raw, l_freq=1.0, h_freq=40.0)
     cleaned = remove_artifacts_with_ica(
@@ -326,10 +340,15 @@ def extract_features_from_recording(
     sfreq = float(cleaned.info["sfreq"])
     n_samples_per_epoch = int(round(epoch_duration_s * sfreq))
     eeg_picks = mne.pick_types(cleaned.info, eeg=True, meg=False, eog=False)
     eeg_names = [cleaned.ch_names[i] for i in eeg_picks]
     data = cleaned.get_data(picks=eeg_picks)  # shape (n_eeg, n_times)
-    n_eeg, n_times = data.shape
     n_total_epochs = n_times // n_samples_per_epoch
     feature_cols = _build_feature_columns(eeg_names)

       - ``kurtosis`` uses ``scipy.stats.kurtosis(fisher=True, bias=True)`` —
         Fisher's *excess* kurtosis (Gaussian → 0, not 3). Add 3 if Pearson
         kurtosis is required downstream.
+      - For constant-valued channels (zero variance), ``skew`` and
+        ``kurtosis`` are mathematically undefined and scipy returns NaN.
+        We post-process the feature vector with ``np.nan_to_num`` to map
+        any NaN/inf to 0.0, preserving the "no NaN survives" Parquet
+        contract from AGENTS.md §6.
     Precondition: `epoch` must be finite (no NaN/inf). Filter via
     `is_valid_epoch` before calling — feature values are NaN-propagating.
             feats.append(_band_power(freqs, psd, lo, hi))
         for _name, fn in _STATS_FUNCS:
             feats.append(fn(x))
+    arr = np.asarray(feats, dtype=np.float64)
+    # Constant-valued / zero-variance channels (e.g., disconnected electrodes)
+    # make scipy.stats.skew / kurtosis return NaN. Map those to 0.0 so the
+    # downstream Parquet contract ("no NaN in feature table") holds.
+    return np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
 def _build_feature_columns(eeg_ch_names: list[str]) -> list[str]:
     Returns:
         A `pd.DataFrame` with one row per valid epoch and ``n_eeg_channels *
         (len(EEG_BANDS) + len(STATS))`` ``feat_*`` columns.
+    Raises:
+        ValueError: if `epoch_duration_s * sfreq` rounds to less than 1 sample.
+            (Other ValueError sources can propagate from `bandpass_filter`
+            and `remove_artifacts_with_ica`; see their respective docstrings.)
     """
     filtered = bandpass_filter(raw, l_freq=1.0, h_freq=40.0)
     cleaned = remove_artifacts_with_ica(
     sfreq = float(cleaned.info["sfreq"])
     n_samples_per_epoch = int(round(epoch_duration_s * sfreq))
+    if n_samples_per_epoch < 1:
+        raise ValueError(
+            f"epoch_duration_s={epoch_duration_s!r} at sfreq={sfreq} Hz produces "
+            f"{n_samples_per_epoch} samples per epoch (must be >= 1)"
+        )
     eeg_picks = mne.pick_types(cleaned.info, eeg=True, meg=False, eog=False)
     eeg_names = [cleaned.ch_names[i] for i in eeg_picks]
     data = cleaned.get_data(picks=eeg_picks)  # shape (n_eeg, n_times)
+    _, n_times = data.shape
     n_total_epochs = n_times // n_samples_per_epoch
     feature_cols = _build_feature_columns(eeg_names)

tests/pipelines/test_eeg_pipeline.py CHANGED Viewed

@@ -229,6 +229,12 @@ class TestComputeFeaturesFromEpoch:
         derived_names = tuple(name for name, _ in _STATS_FUNCS)
         assert derived_names == STATS
 class TestExtractFeaturesFromRecording:
     def _load(self) -> mne.io.BaseRaw:
@@ -284,27 +290,54 @@ class TestExtractFeaturesFromRecording:
         assert np.isfinite(df[feat_cols].to_numpy()).all()
     def test_drops_invalid_epochs_with_warning(self) -> None:
-        """A NaN in the recording: at least one epoch dropped, no NaN survives.
         The bandpass filter is a long FIR convolution, so a single NaN sample
         spreads across many samples. The principled behavior is therefore:
         (a) drop every contaminated epoch, not just the source epoch, and
         (b) guarantee no NaN in the output. The exact drop count depends on
         the filter's FIR length, so we assert range + cleanliness instead of
-        an exact number.
         """
         raw = self._load()
-        # Inject a NaN into the last 2-second window.
         data = raw.get_data().copy()
         data[0, -10] = np.nan
         bad_raw = mne.io.RawArray(data, raw.info, verbose="ERROR")
-        df = extract_features_from_recording(
-            bad_raw, epoch_duration_s=2.0, eog_ch_name="EOG061",
-            n_components=4, random_state=97,
-        )
         # At least one epoch dropped (vs the clean 5-row baseline).
         assert len(df) < 5
         # No NaN/inf must survive into the feature table.
         feat_cols = [c for c in df.columns if c.startswith("feat_")]
         assert df[feat_cols].notna().all().all()
         assert np.isfinite(df[feat_cols].to_numpy()).all()

         derived_names = tuple(name for name, _ in _STATS_FUNCS)
         assert derived_names == STATS
+    def test_constant_channel_yields_finite_features(self) -> None:
+        """A flat-line channel must not produce NaN features (skew/kurtosis are undefined for zero-variance)."""
+        epoch = np.zeros((4, 512), dtype=np.float64)
+        out = compute_features_from_epoch(epoch, sfreq=256.0)
+        assert np.all(np.isfinite(out))
 class TestExtractFeaturesFromRecording:
     def _load(self) -> mne.io.BaseRaw:
         assert np.isfinite(df[feat_cols].to_numpy()).all()
     def test_drops_invalid_epochs_with_warning(self) -> None:
+        """A NaN in the recording: at least one epoch dropped, no NaN survives, WARNING is logged.
         The bandpass filter is a long FIR convolution, so a single NaN sample
         spreads across many samples. The principled behavior is therefore:
         (a) drop every contaminated epoch, not just the source epoch, and
         (b) guarantee no NaN in the output. The exact drop count depends on
         the filter's FIR length, so we assert range + cleanliness instead of
+        an exact number. The WARNING line is part of the AGENTS.md §4
+        traceability contract and must always fire when drops happen.
         """
+        import io
+        import logging
+        from src.core.logger import get_logger
+        from src.pipelines import eeg_pipeline as mod
         raw = self._load()
         data = raw.get_data().copy()
         data[0, -10] = np.nan
         bad_raw = mne.io.RawArray(data, raw.info, verbose="ERROR")
+        logger = get_logger(mod.__name__, level=logging.INFO)
+        handler = logger.handlers[0]
+        buf = io.StringIO()
+        original_stream = handler.stream
+        handler.stream = buf
+        try:
+            df = extract_features_from_recording(
+                bad_raw, epoch_duration_s=2.0, eog_ch_name="EOG061",
+                n_components=4, random_state=97,
+            )
+        finally:
+            handler.stream = original_stream
         # At least one epoch dropped (vs the clean 5-row baseline).
         assert len(df) < 5
         # No NaN/inf must survive into the feature table.
         feat_cols = [c for c in df.columns if c.startswith("feat_")]
         assert df[feat_cols].notna().all().all()
         assert np.isfinite(df[feat_cols].to_numpy()).all()
+        # AGENTS.md §4: the WARNING line was actually emitted.
+        log_output = buf.getvalue()
+        assert "Dropping" in log_output and "epochs with invalid samples" in log_output
+    def test_raises_when_epoch_duration_too_small(self) -> None:
+        raw = self._load()
+        with pytest.raises(ValueError, match="must be >= 1"):
+            extract_features_from_recording(
+                raw, epoch_duration_s=1e-6, eog_ch_name="EOG061",
+                n_components=4, random_state=97,
+            )