MatteoFasulo committed on
Commit
f7b4d24
·
1 Parent(s): 45d17fb

refactor: update dataset processing commands and improve script structure

Browse files
scripts/README.md CHANGED
@@ -27,12 +27,13 @@ This guide provides commands to process raw EMG data into HDF5 format using slid
27
 
28
  | Dataset | Metric | Seq Len | Stride | Command |
29
  | :--- | :--- | :--- | :--- | :--- |
30
- | **NinaPro DB5** | Gesture | 200 (1s) | 50 | `python scripts/db5.py --data_dir $DATA_PATH/ninapro/DB5/ --save_dir $DATA_PATH/ninapro/DB5/h5/ --seq_len 200 --stride 50` |
31
- | **NinaPro DB5** | Gesture | 1000 (5s) | 250 | `python scripts/db5.py --data_dir $DATA_PATH/ninapro/DB5/ --save_dir $DATA_PATH/ninapro/DB5/h5/ --seq_len 1000 --stride 250` |
32
- | **EMG-EPN612** | Gesture | 200 (1s) | N/A | `python scripts/epn.py --data_dir $DATA_PATH/EPN612/ --source_training $DATA_PATH/EPN612/trainingJSON/ --source_testing $DATA_PATH/EPN612/testingJSON/ --dest_dir $DATA_PATH/EPN612/h5/ --seq_len 200` |
33
- | **EMG-EPN612** | Gesture | 1000 (5s) | N/A | `python scripts/epn.py --data_dir $DATA_PATH/EPN612/ --source_training $DATA_PATH/EPN612/trainingJSON/ --source_testing $DATA_PATH/EPN612/testingJSON/ --dest_dir $DATA_PATH/EPN612/h5/ --seq_len 1000` |
34
- | **UCI EMG** | Gesture | 200 (1s) | 50 | `python scripts/uci.py --data_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/ --save_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/h5/ --seq_len 200 --stride 50` |
35
- | **UCI EMG** | Gesture | 1000 (5s) | 250 | `python scripts/uci.py --data_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/ --save_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/h5/ --seq_len 1000 --stride 250` |
36
- | **NinaPro DB8** | Regression | 200 (0.1s) | 200 | `python scripts/db8.py --data_dir $DATA_PATH/ninapro/DB8/ --save_dir $DATA_PATH/ninapro/DB8/h5/ --seq_len 200 --stride 200` |
37
- | **NinaPro DB8** | Regression | 1000 (0.5s) | 1000 | `python scripts/db8.py --data_dir $DATA_PATH/ninapro/DB8/ --save_dir $DATA_PATH/ninapro/DB8/h5/ --seq_len 1000 --stride 1000` |
 
38
 
 
27
 
28
  | Dataset | Metric | Seq Len | Stride | Command |
29
  | :--- | :--- | :--- | :--- | :--- |
30
+ | **NinaPro DB5** | Gesture | 200 (1s) | 50 | `python scripts/db5.py --data_dir $DATA_PATH/ninapro/DB5/ --save_dir $DATA_PATH/ninapro/DB5/h5_1sec/ --seq_len 200 --stride 50` |
31
+ | **NinaPro DB5** | Gesture | 1000 (5s) | 250 | `python scripts/db5.py --data_dir $DATA_PATH/ninapro/DB5/ --save_dir $DATA_PATH/ninapro/DB5/h5_5sec/ --seq_len 1000 --stride 250` |
32
+ | **EMG-EPN612** | Gesture | 200 (1s) | N/A | `python scripts/epn.py --data_dir $DATA_PATH/EPN612/ --source_training $DATA_PATH/EPN612/trainingJSON/ --source_testing $DATA_PATH/EPN612/testingJSON/ --dest_dir $DATA_PATH/EPN612/h5_1sec/ --seq_len 200` |
33
+ | **EMG-EPN612** | Gesture | 1000 (5s) | N/A | `python scripts/epn.py --data_dir $DATA_PATH/EPN612/ --source_training $DATA_PATH/EPN612/trainingJSON/ --source_testing $DATA_PATH/EPN612/testingJSON/ --dest_dir $DATA_PATH/EPN612/h5_5sec/ --seq_len 1000` |
34
+ | **UCI EMG** | Gesture | 200 (1s) | 50 | `python scripts/uci.py --data_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/ --save_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/h5_1sec/ --seq_len 200 --stride 50` |
35
+ | **UCI EMG** | Gesture | 1000 (5s) | 250 | `python scripts/uci.py --data_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/ --save_dir $DATA_PATH/UCI_EMG/EMG_data_for_gestures-master/h5_5sec/ --seq_len 1000 --stride 250` |
36
+ | **NinaPro DB8** | Regression | 200 (0.1s) | 200 | `python scripts/db8.py --data_dir $DATA_PATH/ninapro/DB8/ --save_dir $DATA_PATH/ninapro/DB8/h5_100/ --seq_len 200 --stride 200` |
37
+ | **NinaPro DB8** | Regression | 1000 (0.5s) | 1000 | `python scripts/db8.py --data_dir $DATA_PATH/ninapro/DB8/ --save_dir $DATA_PATH/ninapro/DB8/h5_500/ --seq_len 1000 --stride 1000` |
38
+ | **AVE-Speech** | Speech | 2000 (2s) | N/A | `python scripts/avespeech.py --data_dir $DATA_PATH/AVE-Speech/ --save_dir $DATA_PATH/AVE-Speech/h5/` |
39
 
scripts/avespeech.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import librosa
4
+ import h5py
5
+ import numpy as np
6
+ import scipy.io as sio
7
+ import scipy.signal as signal
8
+ from pathlib import Path
9
+ from typing import Tuple, List
10
+ import re
11
+ import argparse
12
+ from huggingface_hub import snapshot_download
13
+ from joblib import Parallel, delayed
14
+ from tqdm import tqdm
15
+
16
+
17
def download_emg_only(save_dir: str):
    """Fetch only the EMG portion of the AVE-Speech dataset from the Hub.

    Downloads the Train/Val/Test EMG folders plus the phonetic
    transcription sheet into ``save_dir``; all other modalities in the
    repository (audio/video) are skipped.
    """
    # Restrict the snapshot to the EMG modality to keep the download small.
    wanted = [
        "Train/EMG/**",
        "Val/EMG/**",
        "Test/EMG/**",
        "phonetic_transcription.xlsx",
    ]

    snapshot_download(
        repo_id="MML-Group/AVE-Speech",
        repo_type="dataset",
        local_dir=save_dir,
        allow_patterns=wanted,
    )
33
+
34
+
35
def unzip_file(zip_path: str, extract_to: str) -> None:
    """Extract every member of the archive at ``zip_path`` into ``extract_to``."""
    import zipfile

    archive = zipfile.ZipFile(zip_path, "r")
    try:
        archive.extractall(extract_to)
    finally:
        archive.close()
39
+
40
+
41
def unzip_all_subjects(base_dir: str):
    """Extract every ``subject_<N>.zip`` found anywhere under ``base_dir``.

    Each matching archive is expanded into a sibling ``subject_<N>/``
    directory and the zip file is deleted afterwards, so reruns skip
    already-extracted subjects. Zip files with other names are ignored.
    """
    import zipfile

    subject_re = re.compile(r"subject_(\d+)\.zip")

    for archive in Path(base_dir).rglob("*.zip"):
        hit = subject_re.search(archive.name)
        if hit is None:
            continue

        target = archive.parent / f"subject_{hit.group(1)}"
        target.mkdir(exist_ok=True)

        print(f"Unzipping {archive} -> {target}")
        # Inlined extraction (formerly a one-line helper): same behavior.
        with zipfile.ZipFile(str(archive), "r") as zf:
            zf.extractall(str(target))

        archive.unlink()
58
+
59
+
60
def filter(raw_data, fs: float = 1000):
    """Denoise raw EMG with power-line notch filters plus a band-pass.

    Applies zero-phase (``filtfilt``) notch filters at 50/150/250/350 Hz
    (mains fundamental and odd harmonics, Q=30), then a 4th-order
    10-400 Hz Butterworth band-pass, all along axis 1 (time).

    NOTE: the name shadows the ``filter`` builtin; kept unchanged for
    backward compatibility with existing callers.

    Args:
        raw_data: array shaped (trials, time, channels); filtered along
            axis 1.
        fs: sampling rate in Hz. Defaults to 1000 (the rate previously
            hard-coded here); must exceed 800 so the 400 Hz band edge is
            below Nyquist.

    Returns:
        Filtered array with the same shape and layout as ``raw_data``.
    """
    nyq = fs / 2.0

    # Notch out the mains fundamental and its odd harmonics.
    x = raw_data
    for freq in (50, 150, 250, 350):
        b, a = signal.iirnotch(freq, 30, fs)
        x = signal.filtfilt(b, a, x, axis=1)

    # Keep the physiologically relevant 10-400 Hz EMG band.
    b, a = signal.butter(4, [10 / nyq, 400 / nyq], 'bandpass')
    return signal.filtfilt(b, a, x, axis=1)
74
+
75
+
76
def zscore(x: np.ndarray) -> np.ndarray:
    """Standardize ``x`` along axis 1 (time): zero mean, unit variance.

    A small epsilon in the denominator keeps the division safe for
    constant (zero-variance) channels.
    """
    centered = x - x.mean(axis=1, keepdims=True)
    return centered / (x.std(axis=1, keepdims=True) + 1e-8)
80
+
81
def EMG_MFSC(x):
    """Convert EMG trials into log-mel spectrogram (MFSC) features.

    Args:
        x: array shaped (trials, time, channels); the first 250 samples
           of every trial are dropped as a startup transient. Sampling
           rate is assumed to be 1 kHz (``sr`` below).

    Returns:
        Globally z-normalized features shaped
        (trials, channels, frames, n_mels).
    """
    x = x[:, 250:, :]
    n_mels = 36
    sr = 1000
    channel_list = []
    for j in range(x.shape[-1]):
        per_trial = []
        for i in range(x.shape[0]):
            norm_x = np.asfortranarray(x[i, :, j])
            tmp = librosa.feature.melspectrogram(
                y=norm_x, sr=sr, n_mels=n_mels, n_fft=200, hop_length=50
            )
            per_trial.append(librosa.power_to_db(tmp).T)

        # Stack instead of writing into a preallocated (N, 36, n_mels)
        # buffer: the original hard-coded 36 frames, which only holds for
        # 2000-sample trials. Deriving the frame count from librosa's
        # output supports any trial length.
        mfsc_x = np.expand_dims(np.stack(per_trial, axis=0), axis=-1)
        channel_list.append(mfsc_x)

    data_x = np.concatenate(channel_list, axis=-1)
    # Single global normalization over all trials/channels/bins.
    data_x = (data_x - np.mean(data_x)) / np.std(data_x)
    data_x = data_x.transpose(0, 3, 1, 2)  # (trials, channels, frames, n_mels)
    return data_x
103
+
104
+
105
def process_subject(subject_path: Path, use_mfsc: bool) -> Tuple[List[np.ndarray], List[int]]:
    """Load and preprocess every ``.mat`` trial recorded for one subject.

    Each file stores one trial under the ``data`` key (time x channels)
    and is named after its integer class label (e.g. ``17.mat``).

    Returns:
        (features, labels): features are MFSC maps when ``use_mfsc`` is
        set, otherwise z-scored raw EMG shaped (channels, time).
    """
    X_list: List[np.ndarray] = []
    y_list: List[int] = []

    for mat_file in subject_path.rglob("*.mat"):
        # Add a leading batch axis of 1 so filter()/EMG_MFSC() see
        # (trials, time, channels).
        trial = np.expand_dims(sio.loadmat(mat_file)["data"], axis=0)
        trial = filter(trial)

        if use_mfsc:
            # NOTE(review): the MFSC path keeps the leading batch dim of
            # size 1 (matching the original code), so np.array() downstream
            # will carry it as an extra axis — confirm this is intended.
            trial = EMG_MFSC(trial)
        else:
            trial = zscore(trial)
            trial = trial.squeeze(0).transpose(1, 0)  # (channels, time)

        X_list.append(trial)
        y_list.append(int(mat_file.stem))

    return X_list, y_list
126
+
127
+
128
def process_dataset(
    data_dir: str,
    save_dir: str,
    use_mfsc: bool,
    n_jobs: int,
):
    """Preprocess each AVE-Speech split and write it to one HDF5 file.

    For every split directory ``<data_dir>/<split>/EMG`` that exists,
    subjects are processed in parallel and the pooled samples are stored
    as ``<save_dir>/<split>.h5`` with ``data`` and ``label`` datasets.
    Splits that yield no samples are skipped.
    """
    os.makedirs(save_dir, exist_ok=True)

    for split in ("Train", "Val", "Test"):
        emg_root = Path(data_dir) / split / "EMG"
        if not emg_root.exists():
            continue

        print(f"\nProcessing {split}...")

        subject_dirs = [d for d in emg_root.iterdir() if d.is_dir()]

        # One joblib task per subject; the loky backend uses separate
        # processes for the CPU-heavy filtering.
        per_subject = Parallel(n_jobs=n_jobs, backend="loky")(
            delayed(process_subject)(d, use_mfsc) for d in tqdm(subject_dirs)
        )

        X_all: List[np.ndarray] = []
        y_all: List[int] = []
        for X_list, y_list in per_subject:
            if X_list is None:
                continue
            X_all.extend(X_list)
            y_all.extend(y_list)

        if not X_all:
            continue

        X = np.array(X_all, dtype=np.float32)
        y = np.array(y_all, dtype=np.int64)

        # One file per split: datasets "data" and "label".
        out_path = os.path.join(save_dir, f"{split.lower()}.h5")
        with h5py.File(out_path, "w") as f:
            f.create_dataset("data", data=X)
            f.create_dataset("label", data=y)

        print(f"{split}: Processed {len(X)} samples.")
        print(f"Saved shapes -> X: {X.shape}, y: {y.shape}")
171
+
172
+
173
if __name__ == "__main__":
    # CLI entry point: optionally fetch and unpack the raw dataset, then
    # convert it to per-split HDF5 files.
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    parser.add_argument("--download", action="store_true")
    parser.add_argument("--use_mfsc", action="store_true")
    parser.add_argument("--n_jobs", type=int, default=-1)
    args = parser.parse_args()

    for directory in (args.data_dir, args.save_dir):
        os.makedirs(directory, exist_ok=True)

    if args.download:
        print("Downloading dataset...")
        download_emg_only(args.data_dir)

        # NOTE(review): unzipping is tied to a fresh download here; it is a
        # no-op when no subject_*.zip archives remain on disk.
        print("Unzipping dataset...")
        unzip_all_subjects(args.data_dir)

    print("Processing dataset...")
    process_dataset(
        data_dir=args.data_dir,
        save_dir=args.save_dir,
        use_mfsc=args.use_mfsc,
        n_jobs=args.n_jobs,
    )
scripts/db5.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
- import sys
3
- from typing import Tuple, List, Optional, Union, Dict, Any, Callable
4
 
5
  import h5py
6
  import numpy as np
@@ -237,7 +236,6 @@ def main():
237
  os.system(f"unzip -o {data_dir}/s{i}.zip -d {data_dir}")
238
  os.system(f"rm {data_dir}/s{i}.zip")
239
  print(f"Downloaded and unzipped subject {i}\n{data_dir}/s{i}.zip")
240
- sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
241
 
242
  fs = 200.0 # original sampling rate
243
  window_size, stride = args.seq_len, args.stride
 
1
  import os
2
+ from typing import Tuple
 
3
 
4
  import h5py
5
  import numpy as np
 
236
  os.system(f"unzip -o {data_dir}/s{i}.zip -d {data_dir}")
237
  os.system(f"rm {data_dir}/s{i}.zip")
238
  print(f"Downloaded and unzipped subject {i}\n{data_dir}/s{i}.zip")
 
239
 
240
  fs = 200.0 # original sampling rate
241
  window_size, stride = args.seq_len, args.stride
scripts/db6.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- import sys
3
 
4
  import h5py
5
  import numpy as np
@@ -156,7 +155,6 @@ def main():
156
  print(
157
  f"Downloaded and unzipped subject {i}\n{data_dir}/DB6_s{i}_a.zip and {data_dir}/DB6_s{i}_b.zip"
158
  )
159
- sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
160
 
161
  fs = 2000.0
162
  window_size, stride = args.seq_len, args.stride
 
1
  import os
 
2
 
3
  import h5py
4
  import numpy as np
 
155
  print(
156
  f"Downloaded and unzipped subject {i}\n{data_dir}/DB6_s{i}_a.zip and {data_dir}/DB6_s{i}_b.zip"
157
  )
 
158
 
159
  fs = 2000.0
160
  window_size, stride = args.seq_len, args.stride
scripts/db7.py CHANGED
@@ -1,6 +1,4 @@
1
  import os
2
- import sys
3
-
4
  import h5py
5
  import numpy as np
6
  import scipy.io
@@ -150,7 +148,6 @@ def main():
150
  os.system(f"unzip -o {data_dir}/Subject_{i}.zip -d {data_dir}/Subject_{i}")
151
  os.system(f"rm {data_dir}/Subject_{i}.zip")
152
  print(f"Downloaded and unzipped subject {i}\n{data_dir}/Subject_{i}.zip")
153
- sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
154
 
155
  fs = 2000.0
156
  window_size, stride = args.seq_len, args.stride
 
1
  import os
 
 
2
  import h5py
3
  import numpy as np
4
  import scipy.io
 
148
  os.system(f"unzip -o {data_dir}/Subject_{i}.zip -d {data_dir}/Subject_{i}")
149
  os.system(f"rm {data_dir}/Subject_{i}.zip")
150
  print(f"Downloaded and unzipped subject {i}\n{data_dir}/Subject_{i}.zip")
 
151
 
152
  fs = 2000.0
153
  window_size, stride = args.seq_len, args.stride
scripts/db8.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
- import sys
3
- from typing import Tuple, List, Optional, Union, Dict, Any
4
 
5
  import h5py
6
  import numpy as np
@@ -226,7 +225,6 @@ def main():
226
  print(
227
  f"Downloaded subject {i}\n{data_dir}/S{i}_E1_A1.mat and {data_dir}/S{i}_E1_A2.mat and {data_dir}/S{i}_E1_A3.mat"
228
  )
229
- sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
230
 
231
  fs = 2000.0 # Hz
232
  window_size, stride = args.seq_len, args.stride
 
1
  import os
2
+ from typing import Tuple, Optional
 
3
 
4
  import h5py
5
  import numpy as np
 
225
  print(
226
  f"Downloaded subject {i}\n{data_dir}/S{i}_E1_A1.mat and {data_dir}/S{i}_E1_A2.mat and {data_dir}/S{i}_E1_A3.mat"
227
  )
 
228
 
229
  fs = 2000.0 # Hz
230
  window_size, stride = args.seq_len, args.stride
scripts/epn.py CHANGED
@@ -1,8 +1,7 @@
1
  import glob
2
  import json
3
  import os
4
- import sys
5
- from typing import Tuple, List, Optional, Union, Dict, Any
6
 
7
  import h5py
8
  import numpy as np
@@ -246,7 +245,6 @@ def main():
246
  # clean up zip file
247
  os.system(f"rm {data_dir}/EMG-EPN612_Dataset.zip")
248
  print(f"Downloaded and unzipped dataset\n{data_dir}/EMG-EPN612_Dataset.zip")
249
- sys.exit("Data downloaded and unzipped. Rerun without --download_data.")
250
 
251
  seq_len = args.seq_len
252
 
 
1
  import glob
2
  import json
3
  import os
4
+ from typing import Tuple, List, Dict, Any
 
5
 
6
  import h5py
7
  import numpy as np
 
245
  # clean up zip file
246
  os.system(f"rm {data_dir}/EMG-EPN612_Dataset.zip")
247
  print(f"Downloaded and unzipped dataset\n{data_dir}/EMG-EPN612_Dataset.zip")
 
248
 
249
  seq_len = args.seq_len
250
 
scripts/uci.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
- import sys
3
  from pathlib import Path
4
- from typing import Tuple, List, Optional, Union, Dict, Any
5
 
6
  import h5py
7
  import numpy as np
@@ -278,7 +277,6 @@ if __name__ == "__main__":
278
  os.system(f"unzip -o {data_root}/emg_gestures.zip -d {Path(data_root).parent}")
279
  os.system(f"rm {data_root}/emg_gestures.zip")
280
  print("Dataset downloaded and cleaned up.")
281
- sys.exit("Rerun without --download_data.")
282
 
283
  fs = 200.0 # sampling rate of MYO bracelet
284
  window_size, stride = args.seq_len, args.stride
 
1
  import os
 
2
  from pathlib import Path
3
+ from typing import Tuple, List, Union, Dict
4
 
5
  import h5py
6
  import numpy as np
 
277
  os.system(f"unzip -o {data_root}/emg_gestures.zip -d {Path(data_root).parent}")
278
  os.system(f"rm {data_root}/emg_gestures.zip")
279
  print("Dataset downloaded and cleaned up.")
 
280
 
281
  fs = 200.0 # sampling rate of MYO bracelet
282
  window_size, stride = args.seq_len, args.stride