| import numpy as np
|
| import librosa
|
| import textgrids
|
| import os
|
| import python_speech_features
|
| from tqdm import tqdm
|
|
|
|
|
def readLabels(path, sample_rate):
    """
    Build a per-sample 0/1 label array from a Praat TextGrid annotation.

    Reads the 'silences' tier of the TextGrid: intervals whose text is
    "-" or " " are treated as silence (label 0), everything else as
    speech (label 1). Each interval's label is repeated once per audio
    sample covered by the interval's duration.

    Args:
        path (str): Path to the TextGrid annotation file.
        sample_rate (int): Sample rate used to convert interval durations
            (seconds) into sample counts.

    Returns:
        np.ndarray: 1D array of 0/1 labels, one entry per audio sample.
    """
    labeled_list = []
    grid = textgrids.TextGrid(path)

    for interval in grid['silences']:
        # "-" and " " are the silence markers used by the annotation files.
        label = 0 if interval.text in ("-", " ") else 1

        # Convert the interval duration (seconds) into a sample count.
        dur_samples = int(np.round(interval.dur * sample_rate))

        # One label entry per audio sample covered by this interval.
        labeled_list.extend([label] * dur_samples)

    return np.array(labeled_list)
|
|
|
def load_files(audio_path, audio_extension=".wav"):
    """
    Recursively loads audio files from a specified directory.

    Args:
        audio_path (str): The root directory to search for audio files.
        audio_extension (str, optional): The audio file extension to filter
            for (default is ".wav").

    Returns:
        list: A sorted list of full paths to the found audio files.

    Raises:
        FileNotFoundError: If the specified audio_path does not exist.
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio path '{audio_path}' not found.")

    # Walk the tree and collect every file with the requested extension.
    audio_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(audio_path)
        for file in files
        if file.endswith(audio_extension)
    ]

    return sorted(audio_files)
|
|
|
def max_signal_length(audio_files):
    """
    Determines the maximum signal length among a list of audio files.

    Args:
        audio_files (list): A list of paths to audio files.

    Returns:
        int: The maximum signal length (in samples) found among the
        audio files.

    Raises:
        ValueError: If the input list is empty.
        IOError: If any audio file cannot be loaded.
    """
    if not audio_files:
        raise ValueError("Audio file list cannot be empty")

    max_length = 0
    for audio_file in audio_files:
        try:
            # librosa.load resamples to its default sample rate here,
            # matching how the files are loaded during feature extraction.
            signal, _ = librosa.load(audio_file)
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise IOError(f"Error loading audio file '{audio_file}': {e}") from e
        max_length = max(max_length, len(signal))

    return max_length
|
|
|
def object_padding(object, length):
    """
    Pad or truncate a 1D time series to exactly `length` samples.

    Args:
        object (np.ndarray): Time series to adjust. (The parameter name
            shadows the `object` builtin; it is kept for backward
            compatibility with existing keyword callers.)
        length (int): Desired output length in samples.

    Returns:
        np.ndarray: Array of exactly `length` entries, zero-padded at the
        end when the input is shorter, truncated when it is longer.
    """
    if len(object) < length:
        # Append zeros at the end to reach the target length.
        padding_length = length - len(object)
        object = np.pad(object, (0, padding_length), mode="constant")
    else:
        # Truncate anything beyond the target length.
        object = object[:length]

    # np.array guarantees an ndarray return even for list-like inputs.
    return np.array(object)
|
|
|
def fbank_features_extraction(audio_files, max_length, preemphasis_coef=0.97, window_length=0.025, window_step=0.01, window_function=np.hamming, num_nfft=551, num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features from a list of audio files.

    Args:
        audio_files (list): List of paths to audio files.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        np.ndarray: 3D array of shape (num_files, num_frames, num_features + 1),
            where the extra first column of each frame is the log energy.
    """
    fbank_features = []
    for audio_file in tqdm(audio_files):
        signal, sample_rate = librosa.load(audio_file)

        # Uniform length so every file yields the same number of frames.
        signal = object_padding(signal, max_length)

        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)

        # NOTE(review): np.log yields -inf for zero-energy bins; this matches
        # python_speech_features.logfbank, but consider flooring if it bites.
        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)

        # Prepend the log energy as an extra feature column per frame.
        features = np.hstack((feature_logenergy.reshape(feature_logenergy.shape[0], 1), features_logfbank))

        fbank_features.append(features)

    return np.array(fbank_features)
|
|
|
def supervised_features_extraction(audio_files, annotation_files, max_length, preemphasis_coef=0.97, window_length=0.025, window_step=0.01, window_function=np.hamming, num_nfft=551, num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features and per-frame labels.

    Args:
        audio_files (list): List of paths to audio files.
        annotation_files (list): List of paths to TextGrid annotation files,
            aligned index-for-index with `audio_files`.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        tuple:
            np.ndarray: features of shape (num_files, num_frames, num_features + 1),
                where the extra first column of each frame is the log energy.
            np.ndarray: 0/1 labels of shape (num_files, num_frames), one
                majority-vote label per frame.
    """
    fbank_features = []
    labels = []
    for i in tqdm(range(len(audio_files))):
        signal, sample_rate = librosa.load(audio_files[i])
        signal = object_padding(signal, max_length)

        # Per-sample ground-truth labels, padded to match the signal.
        truth_labels = readLabels(path=annotation_files[i], sample_rate=sample_rate)
        truth_labels = object_padding(truth_labels, max_length)

        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)

        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)

        # Prepend the log energy as an extra feature column per frame.
        features = np.hstack((feature_logenergy.reshape(feature_logenergy.shape[0], 1), features_logfbank))

        # Frame the per-sample labels with the same windowing as the
        # features, so each feature frame gets one label.
        temp_label = python_speech_features.sigproc.framesig(sig=truth_labels,
                                                             frame_len=window_length * sample_rate,
                                                             frame_step=window_step * sample_rate,
                                                             winfunc=np.ones)

        # Majority vote per frame: a frame is speech (1) when more than
        # half of its samples are labeled 1. BUGFIX: the threshold must be
        # half the frame length (shape[1]), not half the number of frames
        # (shape[0]) as before.
        frame_len_samples = temp_label.shape[1]
        label = np.array([1 if np.sum(temp_label[j]) > frame_len_samples / 2 else 0
                          for j in range(temp_label.shape[0])])

        fbank_features.append(features)
        labels.append(label)

    return np.array(fbank_features), np.array(labels)
|
|
|