TinyMyo / scripts /uci.py
MatteoFasulo's picture
refactor: update dataset processing commands and improve script structure
f7b4d24
import os
from pathlib import Path
from typing import Tuple, List, Union, Dict
import h5py
import numpy as np
import scipy.signal as signal
from scipy.signal import iirnotch
def sequence_to_seconds(seq_len: int, fs: float) -> float:
    """Express a sequence length given in samples as a duration in seconds.

    Args:
        seq_len (int): Length of the sequence, in samples.
        fs (float): Sampling frequency in Hz.

    Returns:
        float: ``seq_len / fs``, i.e. the duration in seconds.
    """
    duration_s = seq_len / fs
    return duration_s
def bandpass_filter_emg(
    emg: np.ndarray,
    lowcut: float = 20.0,
    highcut: float = 90.0,
    fs: float = 200.0,
    order: int = 4
) -> np.ndarray:
    """Band-limit an EMG recording with a Butterworth bandpass filter.

    The filter is run forward and backward (``filtfilt``) along axis 0.

    Args:
        emg (np.ndarray): Input signal of shape (T, D); time runs along axis 0.
        lowcut (float, optional): Lower passband edge in Hz. Defaults to 20.0.
        highcut (float, optional): Upper passband edge in Hz. Defaults to 90.0.
        fs (float, optional): Sampling frequency in Hz. Defaults to 200.0.
        order (int, optional): Order passed to ``signal.butter``. Defaults to 4.

    Returns:
        np.ndarray: Filtered signal with the same shape as ``emg``.
    """
    # Band edges are expressed as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    band = [lowcut / nyquist, highcut / nyquist]
    numer, denom = signal.butter(order, band, btype="bandpass")
    filtered = signal.filtfilt(numer, denom, emg, axis=0)
    return filtered
def notch_filter_emg(
    emg: np.ndarray,
    notch_freq: float = 50.0,
    Q: float = 30.0,
    fs: float = 200.0
) -> np.ndarray:
    """Suppress a single narrow frequency band (e.g. mains hum) in an EMG signal.

    The notch is designed with ``iirnotch`` and run forward and backward
    (``filtfilt``) along axis 0.

    Args:
        emg (np.ndarray): Input signal of shape (T, D); time runs along axis 0.
        notch_freq (float, optional): Frequency to remove, in Hz. Defaults to 50.0.
        Q (float, optional): Quality factor of the notch. Defaults to 30.0.
        fs (float, optional): Sampling frequency in Hz. Defaults to 200.0.

    Returns:
        np.ndarray: Filtered signal with the same shape as ``emg``.
    """
    # iirnotch is given the notch position as a fraction of the Nyquist frequency.
    normalized_freq = notch_freq / (0.5 * fs)
    numer, denom = iirnotch(normalized_freq, Q)
    return signal.filtfilt(numer, denom, emg, axis=0)
def read_emg_txt(txt_path: str) -> np.ndarray:
    """Reads a UCI EMG text file into a numpy array.

    The file is expected to have a one-line header followed by
    whitespace-separated rows with columns: [time, ch1, ..., ch8, class].
    Rows that do not have exactly 10 fields are skipped.

    Args:
        txt_path (str): Path to the .txt file.

    Returns:
        np.ndarray: A float32 array of shape (N, 10). N is 0 when the file
        holds no complete row; the result is still 2-D so callers can index
        columns safely.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a 10-field row contains a non-numeric value.
    """
    n_cols = 10  # time + 8 EMG channels + class label
    rows = []
    with open(txt_path, "r") as f:
        next(f, None)  # skip header (no-op on an empty file)
        for line in f:
            cols = line.split()
            if len(cols) == n_cols:
                rows.append([float(c) for c in cols])
    # reshape keeps the documented (N, 10) layout even when no row was parsed
    return np.asarray(rows, dtype=np.float32).reshape(-1, n_cols)
def preprocess_emg(arr: np.ndarray, fs: float = 200.0, remove_class0: bool = True) -> np.ndarray:
    """Applies a standard preprocessing pipeline to the EMG data.

    Pipeline includes:
    1. Optional removal of rest (class 0): rows whose last column is < 1 are dropped.
    2. Bandpass filtering (20-90 Hz).
    3. Notch filtering (50 Hz, Q=30).
    4. Z-score normalization per channel (sample standard deviation, ddof=1).

    Only columns 1..8 (the EMG channels) are transformed; the first and last
    columns are passed through unchanged. The input array is never modified.

    Note:
        Filtering happens after the rest rows are removed, so the filters run
        over the concatenation of the remaining rows.

    Args:
        arr (np.ndarray): Raw data array of shape (N, 10).
        fs (float, optional): Sampling frequency in Hz. Defaults to 200.0.
        remove_class0 (bool, optional): Whether to remove the "rest" class. Defaults to True.

    Returns:
        np.ndarray: The preprocessed data array. An empty input, or an input
        that is empty after class-0 removal, is returned without filtering.

    Raises:
        ValueError: Propagated from ``scipy.signal.filtfilt`` when too few rows
            remain for its edge padding.
    """
    # Guard first: an empty array may be 1-D, in which case column indexing fails.
    if arr.size == 0:
        return arr

    if remove_class0:
        arr = arr[arr[:, -1] >= 1]  # boolean indexing already yields a copy
        if arr.size == 0:
            return arr
    else:
        # Work on a copy so the write-back below never touches the caller's data.
        arr = arr.copy()

    emg = arr[:, 1:9]  # (N, 8)
    emg = bandpass_filter_emg(emg, 20, 90, fs)
    emg = notch_filter_emg(emg, 50, 30, fs)

    mu = emg.mean(axis=0)
    sd = emg.std(axis=0, ddof=1)
    sd[sd == 0] = 1.0  # constant channels: avoid division by zero

    arr[:, 1:9] = (emg - mu) / sd
    return arr
def find_label_runs(arr: np.ndarray) -> List[Tuple[int, np.ndarray]]:
    """Split an array into maximal runs of consecutive rows sharing one label.

    The label of each row is the last column, truncated to ``int``.

    Args:
        arr (np.ndarray): Data array where the last column is the class label.

    Returns:
        List[Tuple[int, np.ndarray]]: ``(label, rows)`` pairs in original row
        order, where ``rows`` is a slice of ``arr``. Empty input gives ``[]``.
    """
    if arr.size == 0:
        return []

    labels = [int(value) for value in arr[:, -1]]
    n_rows = len(labels)

    # Row indices at which the label differs from the previous row.
    cut_points = [i for i in range(1, n_rows) if labels[i] != labels[i - 1]]
    edges = [0] + cut_points + [n_rows]

    return [(labels[lo], arr[lo:hi]) for lo, hi in zip(edges[:-1], edges[1:])]
def sliding_window_majority(
    seg_arr: np.ndarray,
    window_size: int = 1000,
    stride: int = 500
) -> Tuple[np.ndarray, np.ndarray]:
    """Segments an array using a sliding window and majority voting.

    Only full windows are emitted; trailing samples that do not fill a window
    are dropped. Each window is labelled with the most frequent value of the
    last column (ties resolve to the smallest label, as with ``np.argmax``).

    Args:
        seg_arr (np.ndarray): Data array of shape (T, 10).
        window_size (int, optional): Number of samples per window. Defaults to 1000.
        stride (int, optional): Number of samples to shift between windows. Defaults to 500.

    Returns:
        Tuple[np.ndarray, np.ndarray]: A tuple containing:
            - Windowed EMG segments (N, window_size, 8), float32.
            - Majority vote labels (N,), int32.
        When no full window fits, N is 0 and the arrays keep these ranks.
    """
    segs, labs = [], []
    for start in range(0, len(seg_arr) - window_size + 1, stride):
        win = seg_arr[start : start + window_size]
        maj = np.argmax(np.bincount(win[:, -1].astype(int)))
        segs.append(win[:, 1:9])  # keep 8-channel EMG
        labs.append(maj)

    if not segs:
        # np.asarray([]) would collapse to shape (0,); keep the documented rank
        # so downstream concatenate/transpose calls work on empty results.
        n_channels = seg_arr[:, 1:9].shape[1] if seg_arr.ndim == 2 else 8
        return (
            np.empty((0, max(window_size, 0), n_channels), dtype=np.float32),
            np.empty((0,), dtype=np.int32),
        )

    return np.asarray(segs, dtype=np.float32), np.asarray(labs, dtype=np.int32)
def users_with_gesture(
    data_root: str,
    gesture_id: int,
    subj_range: range = range(1, 37),
    return_counts: bool = False
) -> Union[List[int], Dict[int, int]]:
    """Identifies which subjects performed a specific gesture.

    Each subject is expected in ``<data_root>/<subject id, zero-padded to two
    digits>``; every ``.txt`` file in that directory is scanned. Missing
    subject directories and files that cannot be read or parsed are skipped.

    Args:
        data_root (str): Root directory of the dataset.
        gesture_id (int): The ID of the gesture to search for.
        subj_range (range, optional): Range of subject IDs to check. Defaults to range(1, 37).
        return_counts (bool, optional): If True, returns a dictionary with sample counts.
            Defaults to False.

    Returns:
        Union[List[int], Dict[int, int]]: Either a sorted list of subject IDs or a
        dictionary mapping subject ID to the number of rows labelled with the gesture.
    """
    target = int(gesture_id)
    found = {}

    for subj in subj_range:
        subj_dir = os.path.join(data_root, f"{subj:02d}")
        if not os.path.isdir(subj_dir):
            continue

        count = 0
        for fname in os.listdir(subj_dir):
            if not fname.endswith(".txt"):
                continue
            txt_path = os.path.join(subj_dir, fname)
            try:
                arr = read_emg_txt(txt_path)
            except (OSError, ValueError):
                # Best effort: skip unreadable or unparsable files, but let
                # genuine programming errors surface instead of hiding them.
                continue
            if arr.size == 0:
                continue
            # Last column is the class label (stored as float); compare as int.
            count += int(np.count_nonzero(arr[:, -1].astype(int) == target))

        if count > 0:
            found[subj] = count

    if return_counts:
        return found  # dict subj -> count
    return sorted(found)
def concat_data(
    lst: List[np.ndarray],
    window_size: int = 1000,
    n_channels: int = 8
) -> np.ndarray:
    """Concatenates a list of data arrays along the first axis.

    Args:
        lst (List[np.ndarray]): List of arrays to concatenate.
        window_size (int, optional): Window length used for the shape of the
            empty fallback. Defaults to 1000.
        n_channels (int, optional): Channel count used for the shape of the
            empty fallback. Defaults to 8.

    Returns:
        np.ndarray: Concatenated array, or an empty float32 array of shape
        (0, window_size, n_channels) if the list is empty.
    """
    if not lst:
        return np.empty((0, window_size, n_channels), np.float32)
    return np.concatenate(lst, axis=0)
def concat_label(lst: List[np.ndarray]) -> np.ndarray:
    """Join per-chunk label arrays into one array along the first axis.

    Args:
        lst (List[np.ndarray]): Label arrays to join.

    Returns:
        np.ndarray: The joined labels, or an empty int32 array of shape (0,)
        when ``lst`` is empty.
    """
    if not lst:
        return np.empty((0,), np.int32)
    return np.concatenate(lst, axis=0)
def main() -> None:
    """Convert the UCI "EMG data for gestures" dataset into train/val/test HDF5 files.

    Steps: parse the CLI, optionally download and unpack the archive, drop every
    subject that performed gesture 7, preprocess each recording, cut it into
    fixed-size windows per label run, and write one ``<split>.h5`` per split
    with datasets ``data`` (N, 8, window_size) and ``label`` (N,).
    """
    import argparse
    import subprocess

    arg = argparse.ArgumentParser(description="Convert UCI EMG dataset to h5 format.")
    arg.add_argument("--download_data", action="store_true")
    arg.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Root directory of the UCI EMG dataset",
    )
    arg.add_argument(
        "--save_dir",
        type=str,
        required=True,
        help="Directory to save the output h5 files",
    )
    # Defaults mirror sliding_window_majority(); without them an omitted flag
    # would be None and crash the first arithmetic on the window size.
    arg.add_argument(
        "--seq_len",
        type=int,
        default=1000,
        help="Size of the window in samples for segmentation.",
    )
    arg.add_argument(
        "--stride",
        type=int,
        default=500,
        help="Step size between windows in samples for segmentation.",
    )
    args = arg.parse_args()
    if args.seq_len <= 0 or args.stride <= 0:
        arg.error("--seq_len and --stride must be positive integers")

    data_root = args.data_dir
    save_root = args.save_dir
    os.makedirs(save_root, exist_ok=True)

    # download data if requested
    if args.download_data:
        # https://archive.ics.uci.edu/dataset/481/emg+data+for+gestures
        base_url = (
            "https://archive.ics.uci.edu/static/public/481/emg+data+for+gestures.zip"
        )
        os.makedirs(data_root, exist_ok=True)  # wget -O needs the target directory
        zip_path = os.path.join(data_root, "emg_gestures.zip")
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters safe; check=True aborts instead of reporting a
        # download that did not happen.
        subprocess.run(["wget", "-O", zip_path, base_url], check=True)
        subprocess.run(
            ["unzip", "-o", zip_path, "-d", str(Path(data_root).parent)],
            check=True,
        )
        os.remove(zip_path)
        print("Dataset downloaded and cleaned up.")

    fs = 200.0  # sampling rate of MYO bracelet
    window_size, stride = args.seq_len, args.stride
    window_seconds = sequence_to_seconds(window_size, fs)
    print(f"Window size: {window_size} samples ({window_seconds:.2f} seconds)")

    split_map = {
        "train": list(range(1, 25)),  # 1–24
        "val": list(range(25, 31)),  # 25–30
        "test": list(range(31, 37)),  # 31–36
    }

    # remove users that performed gesture 7
    gesture_id = 7
    gesture7_users = users_with_gesture(data_root, gesture_id)
    print(f"Users that performed gesture {gesture_id}:", gesture7_users)
    excluded = set(gesture7_users)
    keep_subjs = []
    for k in split_map:
        split_map[k] = [u for u in split_map[k] if u not in excluded]
        keep_subjs.extend(split_map[k])
    print("Updated split map after removing gesture-7 users:", keep_subjs)

    datasets = {k: {"data": [], "label": []} for k in split_map}
    for split_key, subjects in split_map.items():
        for subj in subjects:
            subj_dir = os.path.join(data_root, f"{subj:02d}")
            if not os.path.isdir(subj_dir):
                continue
            for fname in sorted(os.listdir(subj_dir)):
                if not fname.endswith(".txt"):
                    continue
                arr = read_emg_txt(os.path.join(subj_dir, fname))
                arr = preprocess_emg(arr, fs)
                for _lbl, seg_arr in find_label_runs(arr):
                    segs, labs = sliding_window_majority(seg_arr, window_size, stride)
                    if segs.size:
                        datasets[split_key]["data"].append(segs)
                        # class 0 was removed in preprocessing, so shift the
                        # remaining labels down to start at 0
                        datasets[split_key]["label"].append(labs - 1)

    # concatenate, transpose & save
    for split in ["train", "val", "test"]:
        chunks = datasets[split]["data"]
        if chunks:
            X = concat_data(chunks)  # (N, window_size, 8)
        else:
            # keep the empty split consistent with the requested window size
            X = np.empty((0, window_size, 8), np.float32)
        y = concat_label(datasets[split]["label"])
        X = X.transpose(0, 2, 1)  # (N, 8, window_size)
        with h5py.File(os.path.join(save_root, f"{split}.h5"), "w") as f:
            f.create_dataset("data", data=X.astype(np.float32))
            f.create_dataset("label", data=y.astype(np.int32))
        uniq, cnt = np.unique(y, return_counts=True)
        print(
            f"{split.upper():5} → X={X.shape}, label dist:",
            dict(zip(uniq.tolist(), cnt.tolist())),
        )
    print("\nAll splits saved to:", save_root)


if __name__ == "__main__":
    main()