| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks |
| and remove unnecessary dependencies. |
| """ |
|
|
| import warnings |
| from typing import List, Optional, Tuple, Union |
|
|
| import numpy as np |
|
|
|
|
| def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: |
| """ |
| Convert frequency from hertz to mels. |
| |
| Args: |
| freq (`float` or `np.ndarray`): |
| The frequency, or multiple frequencies, in hertz (Hz). |
| mel_scale (`str`, *optional*, defaults to `"htk"`): |
| The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. |
| |
| Returns: |
| `float` or `np.ndarray`: The frequencies on the mel scale. |
| """ |
|
|
| if mel_scale not in ["slaney", "htk", "kaldi"]: |
| raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".') |
|
|
| if mel_scale == "htk": |
| return 2595.0 * np.log10(1.0 + (freq / 700.0)) |
| elif mel_scale == "kaldi": |
| return 1127.0 * np.log(1.0 + (freq / 700.0)) |
|
|
| min_log_hertz = 1000.0 |
| min_log_mel = 15.0 |
| logstep = 27.0 / np.log(6.4) |
| mels = 3.0 * freq / 200.0 |
|
|
| if isinstance(freq, np.ndarray): |
| log_region = freq >= min_log_hertz |
| mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep |
| elif freq >= min_log_hertz: |
| mels = min_log_mel + np.log(freq / min_log_hertz) * logstep |
|
|
| return mels |
|
|
|
|
| def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: |
| """ |
| Convert frequency from mels to hertz. |
| |
| Args: |
| mels (`float` or `np.ndarray`): |
| The frequency, or multiple frequencies, in mels. |
| mel_scale (`str`, *optional*, `"htk"`): |
| The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. |
| |
| Returns: |
| `float` or `np.ndarray`: The frequencies in hertz. |
| """ |
|
|
| if mel_scale not in ["slaney", "htk", "kaldi"]: |
| raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".') |
|
|
| if mel_scale == "htk": |
| return 700.0 * (np.power(10, mels / 2595.0) - 1.0) |
| elif mel_scale == "kaldi": |
| return 700.0 * (np.exp(mels / 1127.0) - 1.0) |
|
|
| min_log_hertz = 1000.0 |
| min_log_mel = 15.0 |
| logstep = np.log(6.4) / 27.0 |
| freq = 200.0 * mels / 3.0 |
|
|
| if isinstance(mels, np.ndarray): |
| log_region = mels >= min_log_mel |
| freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel)) |
| elif mels >= min_log_mel: |
| freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel)) |
|
|
| return freq |
|
|
|
|
| def hertz_to_octave( |
| freq: Union[float, np.ndarray], tuning: Optional[float] = 0.0, bins_per_octave: Optional[int] = 12 |
| ): |
| """ |
| Convert frequency from hertz to fractional octave numbers. |
| Adapted from *librosa*. |
| |
| Args: |
| freq (`float` or `np.ndarray`): |
| The frequency, or multiple frequencies, in hertz (Hz). |
| tuning (`float`, defaults to `0.`): |
| Tuning deviation from the Stuttgart pitch (A440) in (fractional) bins per octave. |
| bins_per_octave (`int`, defaults to `12`): |
| Number of bins per octave. |
| |
| Returns: |
| `float` or `np.ndarray`: The frequencies on the octave scale. |
| """ |
| stuttgart_pitch = 440.0 * 2.0 ** (tuning / bins_per_octave) |
| octave = np.log2(freq / (float(stuttgart_pitch) / 16)) |
| return octave |
|
|
|
|
| def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray: |
| """ |
| Creates a triangular filter bank. |
| |
| Adapted from *torchaudio* and *librosa*. |
| |
| Args: |
| fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`): |
| Discrete frequencies of the FFT bins in Hz. |
| filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`): |
| Center frequencies of the triangular filters to create, in Hz. |
| |
| Returns: |
| `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)` |
| """ |
| filter_diff = np.diff(filter_freqs) |
| slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) |
| down_slopes = -slopes[:, :-2] / filter_diff[:-1] |
| up_slopes = slopes[:, 2:] / filter_diff[1:] |
| return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes)) |
|
|
|
|
| def chroma_filter_bank( |
| num_frequency_bins: int, |
| num_chroma: int, |
| sampling_rate: int, |
| tuning: float = 0.0, |
| power: Optional[float] = 2.0, |
| weighting_parameters: Optional[Tuple[float]] = (5.0, 2), |
| start_at_c_chroma: Optional[bool] = True, |
| ): |
| """ |
| Creates a chroma filter bank, i.e a linear transformation to project spectrogram bins onto chroma bins. |
| |
| Adapted from *librosa*. |
| |
| Args: |
| num_frequency_bins (`int`): |
| Number of frequencies used to compute the spectrogram (should be the same as in `stft`). |
| num_chroma (`int`): |
| Number of chroma bins (i.e pitch classes). |
| sampling_rate (`float`): |
| Sample rate of the audio waveform. |
| tuning (`float`): |
| Tuning deviation from A440 in fractions of a chroma bin. |
| power (`float`, *optional*, defaults to 2.0): |
| If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm. |
| weighting_parameters (`Tuple[float]`, *optional*, defaults to `(5., 2.)`): |
| If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and |
| the second element being the Gaussian half-width. |
| start_at_c_chroma (`float`, *optional*, defaults to `True`): |
| If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'. |
| Returns: |
| `np.ndarray` of shape `(num_frequency_bins, num_chroma)` |
| """ |
| |
| frequencies = np.linspace(0, sampling_rate, num_frequency_bins, endpoint=False)[1:] |
|
|
| freq_bins = num_chroma * hertz_to_octave(frequencies, tuning=tuning, bins_per_octave=num_chroma) |
|
|
| |
| |
| freq_bins = np.concatenate(([freq_bins[0] - 1.5 * num_chroma], freq_bins)) |
|
|
| bins_width = np.concatenate((np.maximum(freq_bins[1:] - freq_bins[:-1], 1.0), [1])) |
|
|
| chroma_filters = np.subtract.outer(freq_bins, np.arange(0, num_chroma, dtype="d")).T |
|
|
| num_chroma2 = np.round(float(num_chroma) / 2) |
|
|
| |
| |
| |
| chroma_filters = np.remainder(chroma_filters + num_chroma2 + 10 * num_chroma, num_chroma) - num_chroma2 |
|
|
| |
| chroma_filters = np.exp(-0.5 * (2 * chroma_filters / np.tile(bins_width, (num_chroma, 1))) ** 2) |
|
|
| |
| if power is not None: |
| chroma_filters = chroma_filters / np.sum(chroma_filters**power, axis=0, keepdims=True) ** (1.0 / power) |
|
|
| |
| if weighting_parameters is not None: |
| center, half_width = weighting_parameters |
| chroma_filters *= np.tile( |
| np.exp(-0.5 * (((freq_bins / num_chroma - center) / half_width) ** 2)), |
| (num_chroma, 1), |
| ) |
|
|
| if start_at_c_chroma: |
| chroma_filters = np.roll(chroma_filters, -3 * (num_chroma // 12), axis=0) |
|
|
| |
| return np.ascontiguousarray(chroma_filters[:, : int(1 + num_frequency_bins / 2)]) |
|
|
|
|
| def mel_filter_bank( |
| num_frequency_bins: int, |
| num_mel_filters: int, |
| min_frequency: float, |
| max_frequency: float, |
| sampling_rate: int, |
| norm: Optional[str] = None, |
| mel_scale: str = "htk", |
| triangularize_in_mel_space: bool = False, |
| ) -> np.ndarray: |
| """ |
| Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and |
| various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters |
| are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these |
| features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. |
| |
| Different banks of mel filters were introduced in the literature. The following variations are supported: |
| |
| - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech |
| bandwidth of `[0, 4600]` Hz. |
| - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech |
| bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz. |
| - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and |
| speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization. |
| - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of |
| 12.5 kHz and speech bandwidth of `[0, 6250]` Hz. |
| |
| This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's |
| `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation. |
| |
| Args: |
| num_frequency_bins (`int`): |
| Number of frequencies used to compute the spectrogram (should be the same as in `stft`). |
| num_mel_filters (`int`): |
| Number of mel filters to generate. |
| min_frequency (`float`): |
| Lowest frequency of interest in Hz. |
| max_frequency (`float`): |
| Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`. |
| sampling_rate (`int`): |
| Sample rate of the audio waveform. |
| norm (`str`, *optional*): |
| If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization). |
| mel_scale (`str`, *optional*, defaults to `"htk"`): |
| The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. |
| triangularize_in_mel_space (`bool`, *optional*, defaults to `False`): |
| If this option is enabled, the triangular filter is applied in mel space rather than frequency space. This |
| should be set to `true` in order to get the same results as `torchaudio` when computing mel filters. |
| |
| Returns: |
| `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a |
| projection matrix to go from a spectrogram to a mel spectrogram. |
| """ |
| if norm is not None and norm != "slaney": |
| raise ValueError('norm must be one of None or "slaney"') |
|
|
| |
| mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale) |
| mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale) |
| mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2) |
| filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale) |
|
|
| if triangularize_in_mel_space: |
| |
| fft_bin_width = sampling_rate / (num_frequency_bins * 2) |
| fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale) |
| filter_freqs = mel_freqs |
| else: |
| |
| fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins) |
|
|
| mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs) |
|
|
| if norm is not None and norm == "slaney": |
| |
| enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]) |
| mel_filters *= np.expand_dims(enorm, 0) |
|
|
| if (mel_filters.max(axis=0) == 0.0).any(): |
| warnings.warn( |
| "At least one mel filter has all zero values. " |
| f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. " |
| f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low." |
| ) |
|
|
| return mel_filters |
|
|
|
|
| def optimal_fft_length(window_length: int) -> int: |
| """ |
| Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not |
| already a power of two, rounds it up to the next power or two. |
| |
| The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size |
| of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples |
| is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies, |
| it simply gives a higher frequency resolution (i.e. the frequency bins are smaller). |
| """ |
| return 2 ** int(np.ceil(np.log2(window_length))) |
|
|
|
|
| def window_function( |
| window_length: int, |
| name: str = "hann", |
| periodic: bool = True, |
| frame_length: Optional[int] = None, |
| center: bool = True, |
| ) -> np.ndarray: |
| """ |
| Returns an array containing the specified window. This window is intended to be used with `stft`. |
| |
| The following window types are supported: |
| |
| - `"boxcar"`: a rectangular window |
| - `"hamming"`: the Hamming window |
| - `"hann"`: the Hann window |
| - `"povey"`: the Povey window |
| |
| Args: |
| window_length (`int`): |
| The length of the window in samples. |
| name (`str`, *optional*, defaults to `"hann"`): |
| The name of the window function. |
| periodic (`bool`, *optional*, defaults to `True`): |
| Whether the window is periodic or symmetric. |
| frame_length (`int`, *optional*): |
| The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller |
| than the frame length, so that it will be zero-padded. |
| center (`bool`, *optional*, defaults to `True`): |
| Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. |
| |
| Returns: |
| `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window. |
| """ |
| length = window_length + 1 if periodic else window_length |
|
|
| if name == "boxcar": |
| window = np.ones(length) |
| elif name in ["hamming", "hamming_window"]: |
| window = np.hamming(length) |
| elif name in ["hann", "hann_window"]: |
| window = np.hanning(length) |
| elif name in ["povey"]: |
| window = np.power(np.hanning(length), 0.85) |
| else: |
| raise ValueError(f"Unknown window function '{name}'") |
|
|
| if periodic: |
| window = window[:-1] |
|
|
| if frame_length is None: |
| return window |
|
|
| if window_length > frame_length: |
| raise ValueError( |
| f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})" |
| ) |
|
|
| padded_window = np.zeros(frame_length) |
| offset = (frame_length - window_length) // 2 if center else 0 |
| padded_window[offset : offset + window_length] = window |
| return padded_window |
|
|
|
|
| |
| def spectrogram( |
| waveform: np.ndarray, |
| window: np.ndarray, |
| frame_length: int, |
| hop_length: int, |
| fft_length: Optional[int] = None, |
| power: Optional[float] = 1.0, |
| center: bool = True, |
| pad_mode: str = "reflect", |
| onesided: bool = True, |
| preemphasis: Optional[float] = None, |
| mel_filters: Optional[np.ndarray] = None, |
| mel_floor: float = 1e-10, |
| log_mel: Optional[str] = None, |
| reference: float = 1.0, |
| min_value: float = 1e-10, |
| db_range: Optional[float] = None, |
| remove_dc_offset: Optional[bool] = None, |
| dtype: np.dtype = np.float32, |
| ) -> np.ndarray: |
| """ |
| Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. |
| |
| This function can create the following kinds of spectrograms: |
| |
| - amplitude spectrogram (`power = 1.0`) |
| - power spectrogram (`power = 2.0`) |
| - complex-valued spectrogram (`power = None`) |
| - log spectrogram (use `log_mel` argument) |
| - mel spectrogram (provide `mel_filters`) |
| - log-mel spectrogram (provide `mel_filters` and `log_mel`) |
| |
| How this works: |
| |
| 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length |
| - hop_length` samples. |
| 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. |
| 3. The DFT is taken of each windowed frame. |
| 4. The results are stacked into a spectrogram. |
| |
| We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: |
| |
| - The analysis frame. This is the size of the time slices that the input waveform is split into. |
| - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. |
| - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. |
| |
| In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A |
| padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, |
| typically the next power of two. |
| |
| Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and |
| `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms |
| can be constructed. |
| |
| Args: |
| waveform (`np.ndarray` of shape `(length,)`): |
| The input waveform. This must be a single real-valued, mono waveform. |
| window (`np.ndarray` of shape `(frame_length,)`): |
| The windowing function to apply, including zero-padding if necessary. The actual window length may be |
| shorter than `frame_length`, but we're assuming the array has already been zero-padded. |
| frame_length (`int`): |
| The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also |
| allow smaller sizes. |
| hop_length (`int`): |
| The stride between successive analysis frames in samples. |
| fft_length (`int`, *optional*): |
| The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have. |
| For optimal speed, this should be a power of two. If `None`, uses `frame_length`. |
| power (`float`, *optional*, defaults to 1.0): |
| If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns |
| complex numbers. |
| center (`bool`, *optional*, defaults to `True`): |
| Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame |
| `t` will start at time `t * hop_length`. |
| pad_mode (`str`, *optional*, defaults to `"reflect"`): |
| Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"` |
| (pad with edge values), `"reflect"` (pads with mirrored values). |
| onesided (`bool`, *optional*, defaults to `True`): |
| If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1` |
| frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins. |
| preemphasis (`float`, *optional*) |
| Coefficient for a low-pass filter that applies pre-emphasis before the DFT. |
| mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*): |
| The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram. |
| mel_floor (`float`, *optional*, defaults to 1e-10): |
| Minimum value of mel frequency banks. |
| log_mel (`str`, *optional*): |
| How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take |
| the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be |
| used when `power` is not `None`. |
| reference (`float`, *optional*, defaults to 1.0): |
| Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
| the loudest part to 0 dB. Must be greater than zero. |
| min_value (`float`, *optional*, defaults to `1e-10`): |
| The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
| `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an |
| amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero. |
| db_range (`float`, *optional*): |
| Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
| peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
| remove_dc_offset (`bool`, *optional*): |
| Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in |
| order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters. |
| dtype (`np.dtype`, *optional*, defaults to `np.float32`): |
| Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be |
| `np.complex64`. |
| |
| Returns: |
| `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape |
| `(num_mel_filters, length)` for a mel spectrogram. |
| """ |
| window_length = len(window) |
|
|
| if fft_length is None: |
| fft_length = frame_length |
|
|
| if frame_length > fft_length: |
| raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") |
|
|
| if window_length != frame_length: |
| raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") |
|
|
| if hop_length <= 0: |
| raise ValueError("hop_length must be greater than zero") |
|
|
| if waveform.ndim != 1: |
| raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") |
|
|
| if np.iscomplexobj(waveform): |
| raise ValueError("Complex-valued input waveforms are not currently supported") |
|
|
| if power is None and mel_filters is not None: |
| raise ValueError( |
| "You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram." |
| "Specify `power` to fix this issue." |
| ) |
|
|
| |
| if center: |
| padding = [(int(frame_length // 2), int(frame_length // 2))] |
| waveform = np.pad(waveform, padding, mode=pad_mode) |
|
|
| |
| waveform = waveform.astype(np.float64) |
| window = window.astype(np.float64) |
|
|
| |
| num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length)) |
|
|
| num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length |
| spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64) |
|
|
| |
| fft_func = np.fft.rfft if onesided else np.fft.fft |
| buffer = np.zeros(fft_length) |
|
|
| timestep = 0 |
| for frame_idx in range(num_frames): |
| buffer[:frame_length] = waveform[timestep : timestep + frame_length] |
|
|
| if remove_dc_offset: |
| buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean() |
|
|
| if preemphasis is not None: |
| buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1] |
| buffer[0] *= 1 - preemphasis |
|
|
| buffer[:frame_length] *= window |
|
|
| spectrogram[frame_idx] = fft_func(buffer) |
| timestep += hop_length |
|
|
| |
| if power is not None: |
| spectrogram = np.abs(spectrogram, dtype=np.float64) ** power |
|
|
| spectrogram = spectrogram.T |
|
|
| if mel_filters is not None: |
| spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram)) |
|
|
| if power is not None and log_mel is not None: |
| if log_mel == "log": |
| spectrogram = np.log(spectrogram) |
| elif log_mel == "log10": |
| spectrogram = np.log10(spectrogram) |
| elif log_mel == "dB": |
| if power == 1.0: |
| spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range) |
| elif power == 2.0: |
| spectrogram = power_to_db(spectrogram, reference, min_value, db_range) |
| else: |
| raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") |
| else: |
| raise ValueError(f"Unknown log_mel option: {log_mel}") |
|
|
| spectrogram = np.asarray(spectrogram, dtype) |
|
|
| return spectrogram |
|
|
|
|
| def spectrogram_batch( |
| waveform_list: List[np.ndarray], |
| window: np.ndarray, |
| frame_length: int, |
| hop_length: int, |
| fft_length: Optional[int] = None, |
| power: Optional[float] = 1.0, |
| center: bool = True, |
| pad_mode: str = "reflect", |
| onesided: bool = True, |
| preemphasis: Optional[float] = None, |
| mel_filters: Optional[np.ndarray] = None, |
| mel_floor: float = 1e-10, |
| log_mel: Optional[str] = None, |
| reference: float = 1.0, |
| min_value: float = 1e-10, |
| db_range: Optional[float] = None, |
| remove_dc_offset: Optional[bool] = None, |
| dtype: np.dtype = np.float32, |
| ) -> List[np.ndarray]: |
| """ |
| Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing. |
| This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting. |
| |
| It supports generating various types of spectrograms: |
| |
| - amplitude spectrogram (`power = 1.0`) |
| - power spectrogram (`power = 2.0`) |
| - complex-valued spectrogram (`power = None`) |
| - log spectrogram (use `log_mel` argument) |
| - mel spectrogram (provide `mel_filters`) |
| - log-mel spectrogram (provide `mel_filters` and `log_mel`) |
| |
| How this works: |
| |
| 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length |
| - hop_length` samples. |
| 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. |
| 3. The DFT is taken of each windowed frame. |
| 4. The results are stacked into a spectrogram. |
| |
| We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: |
| |
| - The analysis frame. This is the size of the time slices that the input waveform is split into. |
| - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. |
| - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. |
| |
| In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A |
| padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, |
| typically the next power of two. |
| |
| Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`. |
| |
| Args: |
| waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`): |
| The list of input waveforms, each a single-channel (mono) signal. |
| window (`np.ndarray` of shape `(frame_length,)`): |
| The windowing function to apply, including zero-padding if necessary. |
| frame_length (`int`): |
| The length of each frame for analysis. |
| hop_length (`int`): |
| The step size between successive frames. |
| fft_length (`int`, *optional*): |
| The size of the FFT buffer, defining frequency bin resolution. |
| power (`float`, *optional*, defaults to 1.0): |
| Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex. |
| center (`bool`, *optional*, defaults to `True`): |
| Whether to center-pad the waveform frames. |
| pad_mode (`str`, *optional*, defaults to `"reflect"`): |
| The padding strategy when `center` is `True`. |
| onesided (`bool`, *optional*, defaults to `True`): |
| If True, returns a one-sided spectrogram for real input signals. |
| preemphasis (`float`, *optional*): |
| Applies a pre-emphasis filter to each frame. |
| mel_filters (`np.ndarray`, *optional*): |
| Mel filter bank for converting to mel spectrogram. |
| mel_floor (`float`, *optional*, defaults to 1e-10): |
| Floor value for mel spectrogram to avoid log(0). |
| log_mel (`str`, *optional*): |
| Specifies log scaling strategy; options are None, "log", "log10", "dB". |
| reference (`float`, *optional*, defaults to 1.0): |
| Reference value for dB conversion in log_mel. |
| min_value (`float`, *optional*, defaults to 1e-10): |
| Minimum floor value for log scale conversions. |
| db_range (`float`, *optional*): |
| Dynamic range for dB scale spectrograms. |
| remove_dc_offset (`bool`, *optional*): |
| Whether to remove the DC offset from each frame. |
| dtype (`np.dtype`, *optional*, defaults to `np.float32`): |
| Data type of the output spectrogram. |
| |
| Returns: |
| List[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform. |
| """ |
| window_length = len(window) |
|
|
| if fft_length is None: |
| fft_length = frame_length |
|
|
| if frame_length > fft_length: |
| raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") |
|
|
| if window_length != frame_length: |
| raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") |
|
|
| if hop_length <= 0: |
| raise ValueError("hop_length must be greater than zero") |
|
|
| |
| for waveform in waveform_list: |
| if waveform.ndim != 1: |
| raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") |
| if np.iscomplexobj(waveform): |
| raise ValueError("Complex-valued input waveforms are not currently supported") |
| |
| if center: |
| padding = [(int(frame_length // 2), int(frame_length // 2))] |
| waveform_list = [ |
| np.pad( |
| waveform, |
| padding, |
| mode=pad_mode, |
| ) |
| for waveform in waveform_list |
| ] |
| original_waveform_lengths = [ |
| len(waveform) for waveform in waveform_list |
| ] |
|
|
| |
| max_length = max(original_waveform_lengths) |
| padded_waveform_batch = np.array( |
| [ |
| np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0) |
| for waveform in waveform_list |
| ], |
| dtype=dtype, |
| ) |
|
|
| |
| padded_waveform_batch = padded_waveform_batch.astype(np.float64) |
| window = window.astype(np.float64) |
|
|
| |
| num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length)) |
| |
| true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths] |
| num_batches = padded_waveform_batch.shape[0] |
|
|
| num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length |
| spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64) |
|
|
| |
| fft_func = np.fft.rfft if onesided else np.fft.fft |
| buffer = np.zeros((num_batches, fft_length)) |
|
|
| for frame_idx in range(num_frames): |
| timestep = frame_idx * hop_length |
| buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length] |
|
|
| if remove_dc_offset: |
| buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True) |
|
|
| if preemphasis is not None: |
| buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1] |
| buffer[:, 0] *= 1 - preemphasis |
|
|
| buffer[:, :frame_length] *= window |
|
|
| spectrogram[:, frame_idx] = fft_func(buffer) |
|
|
| |
| if power is not None: |
| spectrogram = np.abs(spectrogram, dtype=np.float64) ** power |
|
|
| |
| if mel_filters is not None: |
| result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1])) |
| spectrogram = np.maximum(mel_floor, result) |
|
|
| |
| if power is not None and log_mel is not None: |
| if log_mel == "log": |
| spectrogram = np.log(spectrogram) |
| elif log_mel == "log10": |
| spectrogram = np.log10(spectrogram) |
| elif log_mel == "dB": |
| if power == 1.0: |
| spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range) |
| elif power == 2.0: |
| spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range) |
| else: |
| raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") |
| else: |
| raise ValueError(f"Unknown log_mel option: {log_mel}") |
|
|
| spectrogram = np.asarray(spectrogram, dtype) |
|
|
| spectrogram_list = [spectrogram[i, : true_num_frames[i], :].T for i in range(len(true_num_frames))] |
|
|
| return spectrogram_list |
|
|
|
|
| def power_to_db( |
| spectrogram: np.ndarray, |
| reference: float = 1.0, |
| min_value: float = 1e-10, |
| db_range: Optional[float] = None, |
| ) -> np.ndarray: |
| """ |
| Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic |
| logarithm properties for numerical stability. |
| |
| The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a |
| linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. |
| This means that large variations in energy may not sound all that different if the sound is loud to begin with. |
| This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. |
| |
| Based on the implementation of `librosa.power_to_db`. |
| |
| Args: |
| spectrogram (`np.ndarray`): |
| The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared! |
| reference (`float`, *optional*, defaults to 1.0): |
| Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
| the loudest part to 0 dB. Must be greater than zero. |
| min_value (`float`, *optional*, defaults to `1e-10`): |
| The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
| `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. |
| db_range (`float`, *optional*): |
| Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
| peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
| |
| Returns: |
| `np.ndarray`: the spectrogram in decibels |
| """ |
| if reference <= 0.0: |
| raise ValueError("reference must be greater than zero") |
| if min_value <= 0.0: |
| raise ValueError("min_value must be greater than zero") |
|
|
| reference = max(min_value, reference) |
|
|
| spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
| spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
| if db_range is not None: |
| if db_range <= 0.0: |
| raise ValueError("db_range must be greater than zero") |
| spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) |
|
|
| return spectrogram |
|
|
|
|
| def power_to_db_batch( |
| spectrogram: np.ndarray, |
| reference: float = 1.0, |
| min_value: float = 1e-10, |
| db_range: Optional[float] = None, |
| ) -> np.ndarray: |
| """ |
| Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`, |
| using basic logarithm properties for numerical stability. |
| |
| This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram. |
| |
| Args: |
| spectrogram (`np.ndarray`): |
| The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). |
| Note that a power spectrogram has the amplitudes squared! |
| reference (`float`, *optional*, defaults to 1.0): |
| Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
| the loudest part to 0 dB. Must be greater than zero. |
| min_value (`float`, *optional*, defaults to `1e-10`): |
| The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
| `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. |
| db_range (`float`, *optional*): |
| Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
| peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
| |
| Returns: |
| `np.ndarray`: the batch of spectrograms in decibels |
| """ |
| if reference <= 0.0: |
| raise ValueError("reference must be greater than zero") |
| if min_value <= 0.0: |
| raise ValueError("min_value must be greater than zero") |
|
|
| reference = max(min_value, reference) |
|
|
| spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
| spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
| if db_range is not None: |
| if db_range <= 0.0: |
| raise ValueError("db_range must be greater than zero") |
| |
| max_values = spectrogram.max(axis=(1, 2), keepdims=True) |
| spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) |
|
|
| return spectrogram |
|
|
|
|
| def amplitude_to_db( |
| spectrogram: np.ndarray, |
| reference: float = 1.0, |
| min_value: float = 1e-5, |
| db_range: Optional[float] = None, |
| ) -> np.ndarray: |
| """ |
| Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using |
| basic logarithm properties for numerical stability. |
| |
| The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a |
| linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. |
| This means that large variations in energy may not sound all that different if the sound is loud to begin with. |
| This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. |
| |
| Args: |
| spectrogram (`np.ndarray`): |
| The input amplitude (mel) spectrogram. |
| reference (`float`, *optional*, defaults to 1.0): |
| Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
| the loudest part to 0 dB. Must be greater than zero. |
| min_value (`float`, *optional*, defaults to `1e-5`): |
| The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
| `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. |
| db_range (`float`, *optional*): |
| Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
| peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
| |
| Returns: |
| `np.ndarray`: the spectrogram in decibels |
| """ |
| if reference <= 0.0: |
| raise ValueError("reference must be greater than zero") |
| if min_value <= 0.0: |
| raise ValueError("min_value must be greater than zero") |
|
|
| reference = max(min_value, reference) |
|
|
| spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
| spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
| if db_range is not None: |
| if db_range <= 0.0: |
| raise ValueError("db_range must be greater than zero") |
| spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) |
|
|
| return spectrogram |
|
|
|
|
| def amplitude_to_db_batch( |
| spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: Optional[float] = None |
| ) -> np.ndarray: |
| """ |
| Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`, |
| using basic logarithm properties for numerical stability. |
| |
| The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram. |
| |
| Args: |
| spectrogram (`np.ndarray`): |
| The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). |
| reference (`float`, *optional*, defaults to 1.0): |
| Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
| the loudest part to 0 dB. Must be greater than zero. |
| min_value (`float`, *optional*, defaults to `1e-5`): |
| The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
| `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. |
| db_range (`float`, *optional*): |
| Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
| peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
| |
| Returns: |
| `np.ndarray`: the batch of spectrograms in decibels |
| """ |
| if reference <= 0.0: |
| raise ValueError("reference must be greater than zero") |
| if min_value <= 0.0: |
| raise ValueError("min_value must be greater than zero") |
|
|
| reference = max(min_value, reference) |
|
|
| spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
| spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
| if db_range is not None: |
| if db_range <= 0.0: |
| raise ValueError("db_range must be greater than zero") |
| |
| max_values = spectrogram.max(axis=(1, 2), keepdims=True) |
| spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) |
|
|
| return spectrogram |
|
|
|
|
| |
|
|
|
|
| def get_mel_filter_banks( |
| nb_frequency_bins: int, |
| nb_mel_filters: int, |
| frequency_min: float, |
| frequency_max: float, |
| sample_rate: int, |
| norm: Optional[str] = None, |
| mel_scale: str = "htk", |
| ) -> np.array: |
| warnings.warn( |
| "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers", |
| FutureWarning, |
| ) |
| return mel_filter_bank( |
| num_frequency_bins=nb_frequency_bins, |
| num_mel_filters=nb_mel_filters, |
| min_frequency=frequency_min, |
| max_frequency=frequency_max, |
| sampling_rate=sample_rate, |
| norm=norm, |
| mel_scale=mel_scale, |
| ) |
|
|
|
|
| def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): |
| """ |
| In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed |
| segments called `frames`. |
| |
| The window length (window_length) defines how much of the signal is contained in each frame, while the hop length |
| defines the step between the beginning of each new frame. |
| |
| |
| Args: |
| waveform (`np.array` of shape `(sample_length,)`): |
| The raw waveform which will be split into smaller chunks. |
| hop_length (`int`, *optional*, defaults to 160): |
| Step between each window of the waveform. |
| fft_window_size (`int`, *optional*, defaults to 400): |
| Defines the size of the window. |
| center (`bool`, defaults to `True`): |
| Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the |
| waveform on the left and on the right. |
| |
| Return: |
| framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`): |
| The framed waveforms that can be fed to `np.fft`. |
| """ |
| warnings.warn( |
| "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers", |
| FutureWarning, |
| ) |
| frames = [] |
| for i in range(0, waveform.shape[0] + 1, hop_length): |
| if center: |
| half_window = (fft_window_size - 1) // 2 + 1 |
| start = i - half_window if i > half_window else 0 |
| end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] |
| frame = waveform[start:end] |
| if start == 0: |
| padd_width = (-i + half_window, 0) |
| frame = np.pad(frame, pad_width=padd_width, mode="reflect") |
|
|
| elif end == waveform.shape[0]: |
| padd_width = (0, (i - waveform.shape[0] + half_window)) |
| frame = np.pad(frame, pad_width=padd_width, mode="reflect") |
|
|
| else: |
| frame = waveform[i : i + fft_window_size] |
| frame_width = frame.shape[0] |
| if frame_width < waveform.shape[0]: |
| frame = np.lib.pad( |
| frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0 |
| ) |
| frames.append(frame) |
|
|
| frames = np.stack(frames, 0) |
| return frames |
|
|
|
|
| def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None): |
| """ |
| Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results |
| as `torch.stft`. |
| |
| Args: |
| frames (`np.array` of dimension `(num_frames, fft_window_size)`): |
| A framed audio signal obtained using `audio_utils.fram_wav`. |
| windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`: |
| A array representing the function that will be used to reduces the amplitude of the discontinuities at the |
| boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function. |
| For more information on the discontinuities, called *Spectral leakage*, refer to [this |
| tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf |
| fft_window_size (`int`, *optional*): |
| Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the |
| spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of |
| frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to |
| `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally. |
| |
| Example: |
| |
| ```python |
| >>> from transformers.audio_utils import stft, fram_wave |
| >>> import numpy as np |
| |
| >>> audio = np.random.rand(50) |
| >>> fft_window_size = 10 |
| >>> hop_length = 2 |
| >>> framed_audio = fram_wave(audio, hop_length, fft_window_size) |
| >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1)) |
| ``` |
| |
| Returns: |
| spectrogram (`np.ndarray`): |
| A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm |
| """ |
| warnings.warn( |
| "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers", |
| FutureWarning, |
| ) |
| frame_size = frames.shape[1] |
|
|
| if fft_window_size is None: |
| fft_window_size = frame_size |
|
|
| if fft_window_size < frame_size: |
| raise ValueError("FFT size must greater or equal the frame size") |
| |
| nb_frequency_bins = (fft_window_size >> 1) + 1 |
|
|
| spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64) |
| fft_signal = np.zeros(fft_window_size) |
|
|
| for f, frame in enumerate(frames): |
| if windowing_function is not None: |
| np.multiply(frame, windowing_function, out=fft_signal[:frame_size]) |
| else: |
| fft_signal[:frame_size] = frame |
| spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins] |
| return spectrogram.T |
|
|