Spaces:

mekosotto
/

hackathon

Running

mekosotto Claude Sonnet 4.6 committed 8 days ago

Commit

c26c6a2

1 Parent(s): 32e13cf

fix(eeg): validate epochs after filter to guarantee no NaN in feature table

Replace the pre-screen-before-filter logic with a principled filter→ICA→epoch→validate
order, so that NaN spread by the FIR filter is caught at epoch-validation time rather
than slipping through when surviving epochs are extracted from contaminated filtered data.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

src/pipelines/eeg_pipeline.py +20 -39
tests/pipelines/test_eeg_pipeline.py +16 -5

src/pipelines/eeg_pipeline.py CHANGED Viewed

@@ -316,31 +316,33 @@ def extract_features_from_recording(
         A `pd.DataFrame` with one row per valid epoch and ``n_eeg_channels *
         (len(EEG_BANDS) + len(STATS))`` ``feat_*`` columns.
     """
-    # Pre-screen epochs on the original (unfiltered) raw data so that NaN/inf
-    # values injected into one epoch window do not spread across the full signal
-    # via the bandpass convolution and invalidate neighbouring epochs.
-    sfreq = float(raw.info["sfreq"])
     n_samples_per_epoch = int(round(epoch_duration_s * sfreq))
-    pre_picks = mne.pick_types(raw.info, eeg=True, meg=False, eog=False)
-    pre_data = raw.get_data(picks=pre_picks)  # shape (n_eeg, n_times)
-    n_eeg, n_times = pre_data.shape
     n_total_epochs = n_times // n_samples_per_epoch
-    valid_ep_indices: list[int] = []
     invalid_indices: list[int] = []
     for ep in range(n_total_epochs):
         start = ep * n_samples_per_epoch
         end = start + n_samples_per_epoch
-        epoch_pre = pre_data[:, start:end]
-        if is_valid_epoch(epoch_pre):
-            valid_ep_indices.append(ep)
-        else:
             invalid_indices.append(ep)
-    # Only run the expensive filter + ICA pipeline if there is something to do.
-    feature_cols = _build_feature_columns(
-        [raw.ch_names[i] for i in pre_picks]
-    )
     n_dropped = len(invalid_indices)
     if n_dropped:
@@ -353,7 +355,7 @@ def extract_features_from_recording(
             n_dropped, n_total_epochs, display, suffix,
         )
-    if not valid_ep_indices:
         logger.info(
             "Feature extraction complete: in=%d, out=0, dropped=%d (%.2f%%)",
             n_total_epochs, n_dropped,
@@ -361,27 +363,6 @@ def extract_features_from_recording(
         )
         return pd.DataFrame(columns=feature_cols).astype(np.float64)
-    filtered = bandpass_filter(raw, l_freq=1.0, h_freq=40.0)
-    cleaned = remove_artifacts_with_ica(
-        filtered,
-        eog_ch_name=eog_ch_name,
-        n_components=n_components,
-        random_state=random_state,
-    )
-    eeg_picks = mne.pick_types(cleaned.info, eeg=True, meg=False, eog=False)
-    eeg_names = [cleaned.ch_names[i] for i in eeg_picks]
-    data = cleaned.get_data(picks=eeg_picks)  # shape (n_eeg, n_times)
-    # Rebuild feature_cols using post-ICA channel order (should match pre_picks).
-    feature_cols = _build_feature_columns(eeg_names)
-    rows: list[np.ndarray] = []
-    for ep in valid_ep_indices:
-        start = ep * n_samples_per_epoch
-        end = start + n_samples_per_epoch
-        epoch = data[:, start:end]
-        rows.append(compute_features_from_epoch(epoch, sfreq=sfreq))
     matrix = np.vstack(rows)
     out = pd.DataFrame(matrix, columns=feature_cols, dtype=np.float64)
     logger.info(

         A `pd.DataFrame` with one row per valid epoch and ``n_eeg_channels *
         (len(EEG_BANDS) + len(STATS))`` ``feat_*`` columns.
     """
+    filtered = bandpass_filter(raw, l_freq=1.0, h_freq=40.0)
+    cleaned = remove_artifacts_with_ica(
+        filtered,
+        eog_ch_name=eog_ch_name,
+        n_components=n_components,
+        random_state=random_state,
+    )
+    sfreq = float(cleaned.info["sfreq"])
     n_samples_per_epoch = int(round(epoch_duration_s * sfreq))
+    eeg_picks = mne.pick_types(cleaned.info, eeg=True, meg=False, eog=False)
+    eeg_names = [cleaned.ch_names[i] for i in eeg_picks]
+    data = cleaned.get_data(picks=eeg_picks)  # shape (n_eeg, n_times)
+    n_eeg, n_times = data.shape
     n_total_epochs = n_times // n_samples_per_epoch
+    feature_cols = _build_feature_columns(eeg_names)
+    rows: list[np.ndarray] = []
     invalid_indices: list[int] = []
     for ep in range(n_total_epochs):
         start = ep * n_samples_per_epoch
         end = start + n_samples_per_epoch
+        epoch = data[:, start:end]
+        if not is_valid_epoch(epoch):
             invalid_indices.append(ep)
+            continue
+        rows.append(compute_features_from_epoch(epoch, sfreq=sfreq))
     n_dropped = len(invalid_indices)
     if n_dropped:
             n_dropped, n_total_epochs, display, suffix,
         )
+    if not rows:
         logger.info(
             "Feature extraction complete: in=%d, out=0, dropped=%d (%.2f%%)",
             n_total_epochs, n_dropped,
         )
         return pd.DataFrame(columns=feature_cols).astype(np.float64)
     matrix = np.vstack(rows)
     out = pd.DataFrame(matrix, columns=feature_cols, dtype=np.float64)
     logger.info(

tests/pipelines/test_eeg_pipeline.py CHANGED Viewed

@@ -284,10 +284,17 @@ class TestExtractFeaturesFromRecording:
         assert np.isfinite(df[feat_cols].to_numpy()).all()
     def test_drops_invalid_epochs_with_warning(self) -> None:
-        """If an epoch contains NaN, it is logged and dropped."""
         raw = self._load()
-        # Inject a NaN into the last 2-second window so that exactly one epoch
-        # fails `is_valid_epoch`.
         data = raw.get_data().copy()
         data[0, -10] = np.nan
         bad_raw = mne.io.RawArray(data, raw.info, verbose="ERROR")
@@ -295,5 +302,9 @@ class TestExtractFeaturesFromRecording:
             bad_raw, epoch_duration_s=2.0, eog_ch_name="EOG061",
             n_components=4, random_state=97,
         )
-        # 5 epochs minus 1 dropped = 4
-        assert len(df) == 4

         assert np.isfinite(df[feat_cols].to_numpy()).all()
     def test_drops_invalid_epochs_with_warning(self) -> None:
+        """A NaN in the recording: at least one epoch dropped, no NaN survives.
+        The bandpass filter is a long FIR convolution, so a single NaN sample
+        spreads across many samples. The principled behavior is therefore:
+        (a) drop every contaminated epoch, not just the source epoch, and
+        (b) guarantee no NaN in the output. The exact drop count depends on
+        the filter's FIR length, so we assert range + cleanliness instead of
+        an exact number.
+        """
         raw = self._load()
+        # Inject a NaN into the last 2-second window.
         data = raw.get_data().copy()
         data[0, -10] = np.nan
         bad_raw = mne.io.RawArray(data, raw.info, verbose="ERROR")
             bad_raw, epoch_duration_s=2.0, eog_ch_name="EOG061",
             n_components=4, random_state=97,
         )
+        # At least one epoch dropped (vs the clean 5-row baseline).
+        assert len(df) < 5
+        # No NaN/inf must survive into the feature table.
+        feat_cols = [c for c in df.columns if c.startswith("feat_")]
+        assert df[feat_cols].notna().all().all()
+        assert np.isfinite(df[feat_cols].to_numpy()).all()