refactor: update dataset processing commands and improve script structure

f7b4d24 12 days ago

11.8 kB

	import os
	from pathlib import Path
	from typing import Tuple, List, Union, Dict

	import h5py
	import numpy as np
	import scipy.signal as signal
	from scipy.signal import iirnotch

	def sequence_to_seconds(seq_len: int, fs: float) -> float:
	"""Converts a sequence length in samples to time in seconds.

	Args:
	seq_len (int): The number of samples in the sequence.
	fs (float): The sampling frequency in Hz.

	Returns:
	float: The duration of the sequence in seconds.
	"""
	return seq_len / fs

	def bandpass_filter_emg(
	emg: np.ndarray,
	lowcut: float = 20.0,
	highcut: float = 90.0,
	fs: float = 200.0,
	order: int = 4
	) -> np.ndarray:
	"""Applies a Butterworth bandpass filter to the EMG signal.

	Args:
	emg (np.ndarray): The input signal array of shape (T, D).
	lowcut (float, optional): Lower bound of the passband in Hz. Defaults to 20.0.
	highcut (float, optional): Upper bound of the passband in Hz. Defaults to 90.0.
	fs (float, optional): The sampling frequency in Hz. Defaults to 200.0.
	order (int, optional): The order of the filter. Defaults to 4.

	Returns:
	np.ndarray: The filtered signal array.
	"""
	nyq = 0.5 * fs
	b, a = signal.butter(order, [lowcut / nyq, highcut / nyq], btype="bandpass")
	return signal.filtfilt(b, a, emg, axis=0)

	def notch_filter_emg(
	emg: np.ndarray,
	notch_freq: float = 50.0,
	Q: float = 30.0,
	fs: float = 200.0
	) -> np.ndarray:
	"""Applies a notch filter to remove power line interference.

	Args:
	emg (np.ndarray): The input signal array of shape (T, D).
	notch_freq (float, optional): The frequency to be removed in Hz. Defaults to 50.0.
	Q (float, optional): The quality factor. Defaults to 30.0.
	fs (float, optional): The sampling frequency in Hz. Defaults to 200.0.

	Returns:
	np.ndarray: The filtered signal array.
	"""
	b, a = iirnotch(notch_freq / (0.5 * fs), Q)
	return signal.filtfilt(b, a, emg, axis=0)

	def read_emg_txt(txt_path: str) -> np.ndarray:
	"""Reads a UCI EMG text file into a numpy array.

	The file is expected to have columns: [time, ch1, ..., ch8, class].

	Args:
	txt_path (str): Path to the .txt file.

	Returns:
	np.ndarray: A float32 array of shape (N, 10).
	"""
	data = []
	with open(txt_path, "r") as f:
	for line in f.readlines()[1:]: # skip header
	cols = line.strip().split()
	if len(cols) == 10:
	data.append(list(map(float, cols)))
	return np.asarray(data, dtype=np.float32)


	def preprocess_emg(arr: np.ndarray, fs: float = 200.0, remove_class0: bool = True) -> np.ndarray:
	"""Applies a standard preprocessing pipeline to the EMG data.

	Pipeline includes:
	1. Optional removal of rest (class 0).
	2. Bandpass filtering (20-90 Hz).
	3. Notch filtering (50 Hz).
	4. Z-score normalization per channel.

	Args:
	arr (np.ndarray): Raw data array of shape (N, 10).
	fs (float, optional): Sampling frequency in Hz. Defaults to 200.0.
	remove_class0 (bool, optional): Whether to remove the "rest" class. Defaults to True.

	Returns:
	np.ndarray: The preprocessed data array.
	"""
	if remove_class0:
	arr = arr[arr[:, -1] >= 1]
	if arr.size == 0:
	return arr

	emg = arr[:, 1:9] # (N, 8)
	emg = bandpass_filter_emg(emg, 20, 90, fs)
	emg = notch_filter_emg(emg, 50, 30, fs)

	mu = emg.mean(axis=0)
	sd = emg.std(axis=0, ddof=1)
	sd[sd == 0] = 1.0
	emg = (emg - mu) / sd

	arr[:, 1:9] = emg
	return arr


	def find_label_runs(arr: np.ndarray) -> List[Tuple[int, np.ndarray]]:
	"""Groups consecutive rows with identical class labels.

	Args:
	arr (np.ndarray): Data array where the last column is the class label.

	Returns:
	List[Tuple[int, np.ndarray]]: A list of tuples (label, sub-array).
	"""
	runs = []
	if arr.size == 0:
	return runs
	curr_lbl = int(arr[0, -1])
	start = 0
	for i in range(1, len(arr)):
	lbl = int(arr[i, -1])
	if lbl != curr_lbl:
	runs.append((curr_lbl, arr[start:i]))
	curr_lbl, start = lbl, i
	runs.append((curr_lbl, arr[start:]))
	return runs


	def sliding_window_majority(
	seg_arr: np.ndarray,
	window_size: int = 1000,
	stride: int = 500
	) -> Tuple[np.ndarray, np.ndarray]:
	"""Segments a label-consistent array using a sliding window and majority voting.

	Args:
	seg_arr (np.ndarray): Data array of shape (T, 10).
	window_size (int, optional): Number of samples per window. Defaults to 1000.
	stride (int, optional): Number of samples to shift between windows. Defaults to 500.

	Returns:
	Tuple[np.ndarray, np.ndarray]: A tuple containing:
	- Windowed EMG segments (N, window_size, 8).
	- Majority vote labels (N,).
	"""
	segs, labs = [], []
	for start in range(0, len(seg_arr) - window_size + 1, stride):
	win = seg_arr[start : start + window_size]
	maj = np.argmax(np.bincount(win[:, -1].astype(int)))
	segs.append(win[:, 1:9]) # keep 8-channel EMG
	labs.append(maj)
	return np.asarray(segs, dtype=np.float32), np.asarray(labs, dtype=np.int32)


	def users_with_gesture(
	data_root: str,
	gesture_id: int,
	subj_range: range = range(1, 37),
	return_counts: bool = False
	) -> Union[List[int], Dict[int, int]]:
	"""Identifies which subjects performed a specific gesture.

	Args:
	data_root (str): Root directory of the dataset.
	gesture_id (int): The ID of the gesture to search for.
	subj_range (range, optional): Range of subject IDs to check. Defaults to range(1, 37).
	return_counts (bool, optional): If True, returns a dictionary with sample counts.
	Defaults to False.

	Returns:
	Union[List[int], Dict[int, int]]: Either a list of subject IDs or a dictionary
	mapping subject ID to occurrence count.
	"""
	found = {}
	for subj in subj_range:
	subj_dir = os.path.join(data_root, f"{subj:02d}")
	if not os.path.isdir(subj_dir):
	continue
	count = 0
	for fname in os.listdir(subj_dir):
	if not fname.endswith(".txt"):
	continue
	txt_path = os.path.join(subj_dir, fname)
	try:
	arr = read_emg_txt(txt_path)
	except Exception:
	# skip files we can't parse
	continue
	if arr.size == 0:
	continue
	# last column is class label (as float). Compare as int.
	if np.any(arr[:, -1].astype(int) == int(gesture_id)):
	# count occurrences (rows) of that gesture in this file
	count += int((arr[:, -1].astype(int) == int(gesture_id)).sum())
	if count > 0:
	found[subj] = count

	if return_counts:
	return found # dict subj -> count
	else:
	return sorted(found.keys())

	def concat_data(lst: List[np.ndarray]) -> np.ndarray:
	"""Concatenates a list of data arrays.

	Args:
	lst (List[np.ndarray]): List of arrays to concatenate.

	Returns:
	np.ndarray: Concatenated array or empty array if list is empty.
	"""
	return np.concatenate(lst, axis=0) if lst else np.empty((0, 1000, 8), np.float32)

	def concat_label(lst: List[np.ndarray]) -> np.ndarray:
	"""Concatenates a list of label arrays.

	Args:
	lst (List[np.ndarray]): List of label arrays.

	Returns:
	np.ndarray: Concatenated array or empty array if list is empty.
	"""
	return np.concatenate(lst, axis=0) if lst else np.empty((0,), np.int32)

	if __name__ == "__main__":
	import argparse

	arg = argparse.ArgumentParser(description="Convert UCI EMG dataset to h5 format.")
	arg.add_argument("--download_data", action="store_true")
	arg.add_argument(
	"--data_dir",
	type=str,
	required=True,
	help="Root directory of the UCI EMG dataset",
	)
	arg.add_argument(
	"--save_dir",
	type=str,
	required=True,
	help="Directory to save the output h5 files",
	)
	arg.add_argument(
	"--seq_len", type=int, help="Size of the window in samples for segmentation."
	)
	arg.add_argument(
	"--stride",
	type=int,
	help="Step size between windows in samples for segmentation.",
	)
	args = arg.parse_args()

	data_root = args.data_dir
	save_root = args.save_dir
	os.makedirs(save_root, exist_ok=True)

	# download data if requested
	if args.download_data:
	# https://archive.ics.uci.edu/dataset/481/emg+data+for+gestures
	base_url = (
	"https://archive.ics.uci.edu/static/public/481/emg+data+for+gestures.zip"
	)
	os.system(f"wget -O {data_root}/emg_gestures.zip '{base_url}'")
	os.system(f"unzip -o {data_root}/emg_gestures.zip -d {Path(data_root).parent}")
	os.system(f"rm {data_root}/emg_gestures.zip")
	print("Dataset downloaded and cleaned up.")

	fs = 200.0 # sampling rate of MYO bracelet
	window_size, stride = args.seq_len, args.stride

	window_seconds = sequence_to_seconds(window_size, fs)
	print(f"Window size: {window_size} samples ({window_seconds:.2f} seconds)")

	split_map = {
	"train": list(range(1, 25)), # 1–24
	"val": list(range(25, 31)), # 25–30
	"test": list(range(31, 37)), # 31–36
	}
	# remove users that performed gesture 7
	gesture_id = 7
	gesture7_users = users_with_gesture(data_root, gesture_id)
	print(f"Users that performed gesture {gesture_id}:", gesture7_users)

	keep_subjs = []
	for k in split_map:
	split_map[k] = [u for u in split_map[k] if u not in gesture7_users]
	keep_subjs.extend(split_map[k])
	print("Updated split map after removing gesture-7 users:", keep_subjs)

	datasets = {k: {"data": [], "label": []} for k in split_map}

	for subj in keep_subjs:
	subj_dir = os.path.join(data_root, f"{subj:02d}")
	if not os.path.isdir(subj_dir):
	continue
	split_key = next(k for k, v in split_map.items() if subj in v)

	for fname in sorted(os.listdir(subj_dir)):
	if not fname.endswith(".txt"):
	continue
	arr = read_emg_txt(os.path.join(subj_dir, fname))
	arr = preprocess_emg(arr, fs)

	for lbl, seg_arr in find_label_runs(arr):
	segs, labs = sliding_window_majority(seg_arr, window_size, stride)
	if segs.size:
	datasets[split_key]["data"].append(segs)
	datasets[split_key]["label"].append(labs - 1)

	# concatenate, transpose & save
	for split in ["train", "val", "test"]:
	X = concat_data(datasets[split]["data"]) # (N,256,8)
	y = concat_label(datasets[split]["label"])
	X = X.transpose(0, 2, 1) # (N,8,256)

	with h5py.File(os.path.join(save_root, f"{split}.h5"), "w") as f:
	f.create_dataset("data", data=X.astype(np.float32))
	f.create_dataset("label", data=y.astype(np.int32))
	uniq, cnt = np.unique(y, return_counts=True)
	print(
	f"{split.upper():5} → X={X.shape}, label dist:",
	dict(zip(uniq.tolist(), cnt.tolist())),
	)

	print("\nAll splits saved to:", save_root)