File size: 8,068 Bytes
ca8e271
45d17fb
ca8e271
45d17fb
ca8e271
 
 
 
 
 
 
 
 
45d17fb
 
32e7d7d
45d17fb
 
 
ca8e271
45d17fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca8e271
 
 
 
 
 
 
45d17fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca8e271
 
 
 
 
 
 
 
 
 
45d17fb
 
 
 
 
 
 
 
 
 
 
 
ca8e271
 
 
 
 
 
 
 
 
 
45d17fb
 
 
 
 
 
 
 
 
 
 
 
 
ca8e271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32e7d7d
ca8e271
 
32e7d7d
ca8e271
 
 
 
 
 
 
 
 
 
45d17fb
 
 
 
 
 
ca8e271
 
 
 
 
 
 
 
 
 
32e7d7d
 
 
 
ca8e271
 
45d17fb
 
 
ca8e271
45d17fb
ca8e271
 
 
 
45d17fb
ca8e271
 
 
 
 
45d17fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca8e271
45d17fb
 
 
 
ca8e271
45d17fb
 
 
 
 
ca8e271
45d17fb
ca8e271
 
 
45d17fb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import os
import gc
from pathlib import Path
from typing import Tuple, List, Optional, Union, Dict, Any

import h5py
import numpy as np
import pandas as pd
import scipy.signal as signal
from joblib import Parallel, delayed
from scipy.signal import iirnotch
from tqdm import tqdm

def sequence_to_seconds(seq_len: int, fs: float) -> float:
    """Return the duration, in seconds, of a sequence of samples.

    Args:
        seq_len (int): Length of the sequence, counted in samples.
        fs (float): Sampling rate in Hz (samples per second).

    Returns:
        float: ``seq_len`` divided by ``fs``.
    """
    duration_s = seq_len / fs
    return duration_s


def notch_filter(data: np.ndarray, notch_freq: float = 50.0, Q: float = 30.0, fs: float = 2000.0) -> np.ndarray:
    """Applies a zero-phase notch filter to every channel of the input data.

    The filter is designed with ``iirnotch`` and applied forward and backward
    (``filtfilt``) along axis 0, so each column is filtered independently.

    Args:
        data (np.ndarray): The input signal array of shape (T, D).
        notch_freq (float, optional): The frequency to be removed in Hz. Defaults to 50.0.
        Q (float, optional): The quality factor. Defaults to 30.0.
        fs (float, optional): The sampling frequency in Hz. Defaults to 2000.0.

    Returns:
        np.ndarray: The filtered signal array with the same shape as ``data``.
            Floating-point (and complex) inputs keep their dtype; any other
            dtype (e.g. integers) is returned as float64 so the filtered
            values are not truncated.

    Raises:
        ValueError: Propagated from ``scipy.signal.filtfilt`` when the signal
            is too short for the filter's edge padding.
    """
    data = np.asarray(data)
    # Writing the result back into an integer buffer would silently truncate
    # it, so only inexact dtypes are preserved on output.
    out_dtype = data.dtype if np.issubdtype(data.dtype, np.inexact) else np.float64

    b, a = iirnotch(notch_freq, Q, fs)
    # axis=0 filters along time for all channels at once (no Python loop).
    filtered = signal.filtfilt(b, a, data, axis=0)
    return filtered.astype(out_dtype, copy=False)


def bandpass_filter_emg(
    emg: np.ndarray,
    lowcut: float = 20.0,
    highcut: float = 90.0,
    fs: float = 2000.0,
    order: int = 4
) -> np.ndarray:
    """Applies a zero-phase Butterworth bandpass filter to the EMG signal.

    The filter is applied forward and backward (``filtfilt``) along axis 0,
    so each column (channel) is filtered independently.

    Args:
        emg (np.ndarray): The input signal array of shape (T, D).
        lowcut (float, optional): Lower bound of the passband in Hz. Defaults to 20.0.
        highcut (float, optional): Upper bound of the passband in Hz. Defaults to 90.0.
        fs (float, optional): The sampling frequency in Hz. Defaults to 2000.0.
        order (int, optional): The order of the filter. Defaults to 4.

    Returns:
        np.ndarray: The filtered signal array with the same shape as ``emg``.
            Floating-point (and complex) inputs keep their dtype; any other
            dtype (e.g. integers) is returned as float64 so the filtered
            values are not truncated.

    Raises:
        ValueError: Propagated from SciPy when the cutoffs are not valid for
            ``fs`` or the signal is too short for the filter's edge padding.
    """
    emg = np.asarray(emg)
    # Writing the result back into an integer buffer would silently truncate
    # it, so only inexact dtypes are preserved on output.
    out_dtype = emg.dtype if np.issubdtype(emg.dtype, np.inexact) else np.float64

    # butter() expects cutoffs normalized by the Nyquist frequency.
    nyq = 0.5 * fs
    b, a = signal.butter(order, [lowcut / nyq, highcut / nyq], btype="bandpass")
    # axis=0 filters along time for all channels at once (no Python loop).
    filtered = signal.filtfilt(b, a, emg, axis=0)
    return filtered.astype(out_dtype, copy=False)


def process_emg_features(emg: np.ndarray, window_size: int = 1000, stride: int = 500) -> np.ndarray:
    """Segments raw EMG signals into overlapping windows.

    Windows start at 0, ``stride``, 2 * ``stride``, ... and only complete
    windows are kept; a trailing partial window is dropped.

    Args:
        emg (np.ndarray): Raw EMG data of shape (T, n_ch).
        window_size (int, optional): Number of samples per window. Defaults to 1000.
        stride (int, optional): Number of samples to shift between windows. Defaults to 500.

    Returns:
        np.ndarray: Segmented data of shape (N, window_size, n_ch) with the
            dtype of ``emg``. When the recording is shorter than one window,
            N is 0 and the array still has shape (0, window_size, n_ch) so it
            can be concatenated with the output of other recordings.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    emg = np.asarray(emg)
    n_samples = emg.shape[0]

    # Last valid start is n_samples - window_size; the range is empty when
    # the signal is shorter than a single window.
    starts = range(0, n_samples - window_size + 1, stride)
    if len(starts) == 0:
        # Keep the documented rank/dtype instead of returning a bare (0,) array.
        return np.empty((0, window_size) + emg.shape[1:], dtype=emg.dtype)

    return np.stack([emg[start:start + window_size] for start in starts])


def process_one_recording(file_path: str, fs: float = 2000.0, window_size: int = 1000, stride: int = 500) -> np.ndarray:
    """Loads one recording and turns it into normalized, windowed EMG segments.

    Pipeline: read the ``emg`` entry under ``emg2pose/timeseries`` as float32,
    bandpass filter (20-450 Hz), notch filter (50 Hz, Q=30), z-score along
    axis 0 using the sample standard deviation (``ddof=1``), then cut into
    windows with ``process_emg_features``.

    Args:
        file_path (str): Path to the HDF5 recording file.
        fs (float, optional): Sampling frequency in Hz. Defaults to 2000.0.
        window_size (int, optional): Temporal window size in samples. Defaults to 1000.
        stride (int, optional): Stride between windows in samples. Defaults to 500.

    Returns:
        np.ndarray: Array of processed segments (N, window_size, n_ch).
    """
    # Read everything needed while the file is open; the array is copied out.
    with h5py.File(file_path, "r") as recording:
        raw = recording["emg2pose"]["timeseries"]["emg"][:].astype(np.float32)

    # Filtering: bandpass first, then remove the 50 Hz component.
    band_limited = bandpass_filter_emg(raw, 20, 450, fs=fs)
    filtered = notch_filter(band_limited, 50, 30, fs=fs)

    # Z-score along axis 0; a zero standard deviation is replaced by 1 so the
    # division below never divides by zero.
    channel_mean = filtered.mean(axis=0)
    channel_std = filtered.std(axis=0, ddof=1)
    channel_std[channel_std == 0] = 1.0
    normalized = (filtered - channel_mean) / channel_std

    return process_emg_features(normalized, window_size, stride)


def main() -> None:
    """Command-line entry point: preprocess recordings and write one HDF5 file per split.

    Reads ``metadata.csv`` from ``--data_dir`` (uses its ``split`` and
    ``filename`` columns), optionally subsamples rows per split, processes the
    corresponding ``<filename>.hdf5`` recordings in parallel batches of
    ``--group_size`` files, and writes each batch to ``<save_dir>/<split>.h5``
    as a group ``data_group_<k>`` holding a float32 dataset ``X`` of shape
    (N, ch, window_size).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Process EMG data from emg2pose recordings.")
    # These four have no usable default; requiring them gives a clear usage
    # error instead of a TypeError deep inside the pipeline.
    parser.add_argument(
        "--data_dir", type=str, required=True,
        help="Directory containing metadata.csv and the .hdf5 recordings.",
    )
    parser.add_argument(
        "--save_dir", type=str, required=True,
        help="Directory where the per-split .h5 files are written.",
    )
    parser.add_argument(
        "--seq_len", type=int, required=True,
        help="Size of the window in samples for segmentation.",
    )
    parser.add_argument(
        "--stride", type=int, required=True,
        help="Step size between windows in samples for segmentation.",
    )
    parser.add_argument(
        "--subsample", type=float, default=1.0,
        help="Fraction of recordings to keep per split (values below 1.0 subsample).",
    )
    parser.add_argument(
        "--n_jobs",
        type=int,
        default=-1,
        help="Number of parallel jobs to run. -1 means using all available cores.",
    )
    parser.add_argument(
        "--group_size",
        type=int,
        default=1000,
        help="Number of recording files processed per group in the output HDF5 file.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for reproducibility."
    )
    args = parser.parse_args()

    # Expand "~" once so the metadata file and the recordings resolve identically.
    data_root = Path(args.data_dir).expanduser()
    save_dir = args.save_dir
    os.makedirs(save_dir, exist_ok=True)

    fs = 2000.0  # original sampling rate
    window_size, stride = args.seq_len, args.stride

    window_seconds = sequence_to_seconds(window_size, fs)
    print(f"Window size: {window_size} samples ({window_seconds:.2f} seconds)")

    df = pd.read_csv(data_root / "metadata.csv")
    if args.subsample < 1.0:
        df = df.groupby("split", group_keys=False).sample(
            frac=args.subsample, random_state=args.seed
        )
    df = df.reset_index(drop=True)

    # Map each split name to the list of recording paths it contains.
    splits = {
        split: [data_root.joinpath(f"{session}.hdf5") for session in df_.filename]
        for split, df_ in df.groupby("split")
    }

    for split, files in splits.items():
        out_file = os.path.join(save_dir, f"{split}.h5")

        # Remove existing file if it exists so we don't accidentally append to old runs
        if os.path.exists(out_file):
            os.remove(out_file)

        print(f"Processing {split} split ({len(files)} files)...")

        with h5py.File(out_file, "w") as h5f:
            group_idx = 0
            with Parallel(n_jobs=args.n_jobs) as parallel:
                with tqdm(total=len(files), desc=f"Processing & Saving {split}") as pbar:

                    # Iterate files in batches
                    for i in range(0, len(files), args.group_size):
                        batch_files = files[i : i + args.group_size]

                        # Process current batch
                        results = parallel(
                            delayed(process_one_recording)(file_path, fs, window_size, stride)
                            for file_path in batch_files
                        )

                        # Recordings shorter than one window yield no segments;
                        # drop them so they cannot break the concatenation.
                        segments = [r for r in results if r.shape[0] > 0]
                        del results

                        if segments:
                            X_chunk = np.concatenate(segments, axis=0)  # [N, window_size, ch]
                            X_chunk = X_chunk.transpose(0, 2, 1)        # [N, ch, window_size]
                            X_chunk = X_chunk.astype(np.float32)

                            # Write each processed batch as a group compatible with HDF5Loader
                            grp = h5f.create_group(f"data_group_{group_idx}")
                            grp.create_dataset("X", data=X_chunk)
                            group_idx += 1
                            del X_chunk

                        # Explicitly release the large arrays before the next batch
                        del segments
                        gc.collect()

                        pbar.update(len(batch_files))


if __name__ == "__main__":
    main()