Spaces:
Running on Zero
Running on Zero
| import librosa | |
| import numpy as np | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import torchaudio | |
| from torchaudio.transforms import MelSpectrogram | |
| from einops import rearrange | |
| from typing import List | |
def stft(x, fft_size, hop_size, win_length, window, use_complex=False):
    """Run an STFT and return either magnitudes or stacked real/imag parts.

    Args:
        x (Tensor): Input signal tensor (B, T).
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length.
        window (Tensor): Window tensor; it is moved to ``x.device`` before use.
        use_complex (bool): When False (default) return the clamped magnitude;
            when True return the real and imaginary parts as two channels.

    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1) when
            ``use_complex`` is False, otherwise a [B, 2, T, F] tensor whose
            channel 0 is the real part and channel 1 the imaginary part.
    """
    spec = torch.stft(
        x,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window.to(x.device),
        return_complex=True,
    )
    real_part = spec.real
    imag_part = spec.imag

    if use_complex:
        # Stack real/imag on a new channel axis, then swap to time-major.
        stacked = torch.cat([real_part.unsqueeze(1), imag_part.unsqueeze(1)], dim=1)
        return stacked.transpose(2, 3)  # [B, 2, T, F]

    # Clamping the power keeps sqrt (and its gradient) away from nan / inf.
    power = real_part**2 + imag_part**2
    power = torch.clamp(power, min=1e-7, max=1e3)
    return torch.sqrt(power).transpose(2, 1)
def compute_mag_scale(n_fft, sampling_rate):
    """Build a per-frequency-bin magnitude scaling tensor.

    The FFT bin centre frequencies are passed through
    ``librosa.frequency_weighting`` (with its default settings), the resulting
    dB weights are converted to power, and the square root is taken so the
    result can multiply magnitude spectrograms.

    Args:
        n_fft (int): FFT size used to derive the bin frequencies.
        sampling_rate (int): Sampling rate of the audio.

    Returns:
        Tensor: float32 tensor reshaped to (1, 1, #freq_bins).
    """
    bin_freqs = librosa.fft_frequencies(sr=sampling_rate, n_fft=n_fft)
    # Bins at (or numerically at) 0 Hz are replaced by -10 before weighting.
    bin_freqs = np.where(bin_freqs > 1e-10, bin_freqs, -10)
    weights_db = librosa.frequency_weighting(bin_freqs)
    weights_db = weights_db.reshape(1, 1, -1)
    amplitude = np.sqrt(librosa.db_to_power(weights_db))
    return torch.from_numpy(amplitude.astype(np.float32))
class SpectralConvergenceLoss(torch.nn.Module):
    """Spectral convergence loss: norm of the error relative to the target norm."""

    def __init__(self):
        """Set up the (parameter-free) module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Compute the spectral convergence between two magnitude spectrograms.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of ground-truth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: Scalar ``||y_mag - x_mag|| / ||y_mag||`` taken over all elements.
        """
        error_norm = torch.norm(y_mag - x_mag)
        target_norm = torch.norm(y_mag)
        return error_norm / target_norm
class LogSTFTMagnitudeLoss(torch.nn.Module):
    """Log STFT magnitude loss: L1 distance between log-magnitudes."""

    def __init__(self):
        """Set up the (parameter-free) log STFT magnitude loss module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Compute the mean absolute difference of the log-magnitudes.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of ground-truth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: Log STFT magnitude loss value.
        """
        log_target = torch.log(y_mag)
        log_estimate = torch.log(x_mag)
        return F.l1_loss(log_target, log_estimate)
class STFTLoss(torch.nn.Module):
    """Single-resolution STFT loss on frequency-weighted magnitude spectrograms.

    Both signals are converted to magnitude spectrograms with ``stft``, scaled
    per frequency bin by the ``mag_scale`` buffer, and compared with the
    spectral convergence and log STFT magnitude losses.
    """

    def __init__(
        self,
        fft_size=1024,
        hop_length=120,
        win_length=600,
        sampling_rate=16000,
        window="hann_window",
        cfg=None,
    ):
        """Initialize STFT loss module.

        Args:
            fft_size (int): FFT size.
            hop_length (int): Hop size between frames.
            win_length (int): Window length.
            sampling_rate (int): Sampling rate used to build the per-bin
                magnitude scale.
            window (str): Name of a ``torch`` window factory, e.g.
                ``"hann_window"``; it is called with ``win_length``.
            cfg: Optional config object. Any attribute named like one of the
                arguments above overrides that argument.
        """
        super(STFTLoss, self).__init__()

        def from_cfg(name, fallback):
            # An attribute present on cfg wins over the keyword argument.
            if cfg is not None and hasattr(cfg, name):
                return getattr(cfg, name)
            return fallback

        fft_size = from_cfg("fft_size", fft_size)
        hop_length = from_cfg("hop_length", hop_length)
        win_length = from_cfg("win_length", win_length)
        window = from_cfg("window", window)
        # Previously the only option that could not be set through cfg, so a
        # config-provided rate was silently ignored for the magnitude scale.
        sampling_rate = from_cfg("sampling_rate", sampling_rate)

        self.fft_size = fft_size
        self.hop_length = hop_length
        self.win_length = win_length
        # Plain attribute (not a buffer); stft() moves it to the input's
        # device on every call.
        self.window = getattr(torch, window)(win_length)
        self.spectral_convergence_loss = SpectralConvergenceLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
        self.register_buffer("mag_scale", compute_mag_scale(fft_size, sampling_rate))

    def _weighted_magnitude(self, signal):
        """Return the magnitude spectrogram of ``signal`` scaled by ``mag_scale``."""
        magnitude = stft(
            signal, self.fft_size, self.hop_length, self.win_length, self.window
        )
        return magnitude * self.mag_scale

    def forward(self, x, y):
        """Calculate forward propagation.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): Ground truth signal (B, T).

        Returns:
            Tensor: Spectral convergence loss value.
            Tensor: Log STFT magnitude loss value.
        """
        x_mag = self._weighted_magnitude(x)
        y_mag = self._weighted_magnitude(y)
        sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
        log_mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
        return sc_loss, log_mag_loss
class MultiResolutionSTFTLoss(torch.nn.Module):
    """Average of several ``STFTLoss`` modules run at different resolutions."""

    def __init__(
        self,
        fft_sizes=(1024, 2048, 512),
        hop_sizes=(120, 240, 50),
        win_lengths=(600, 1200, 240),
        window="hann_window",
        sampling_rate=16000,
        cfg=None,
    ):
        """Build one ``STFTLoss`` per (fft size, hop size, window length) triple.

        Args:
            fft_sizes (list): List of FFT sizes.
            hop_sizes (list): List of hop sizes.
            win_lengths (list): List of window lengths.
            window (str): Window function type.
            sampling_rate (int): Sampling rate forwarded to every ``STFTLoss``.
            cfg: Optional config object; attributes named like the arguments
                above take precedence over those arguments.
        """
        super().__init__()

        if cfg is not None:
            if hasattr(cfg, "fft_sizes"):
                fft_sizes = cfg.fft_sizes
            if hasattr(cfg, "hop_sizes"):
                hop_sizes = cfg.hop_sizes
            if hasattr(cfg, "win_lengths"):
                win_lengths = cfg.win_lengths
            if hasattr(cfg, "window"):
                window = cfg.window
            if hasattr(cfg, "sampling_rate"):
                sampling_rate = cfg.sampling_rate

        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)

        per_resolution = [
            STFTLoss(n_fft, hop, win, window=window, sampling_rate=sampling_rate)
            for n_fft, hop, win in zip(fft_sizes, hop_sizes, win_lengths)
        ]
        self.stft_losses = torch.nn.ModuleList()
        self.stft_losses.extend(per_resolution)

    def forward(self, x, y):
        """Evaluate every resolution and average the two loss terms.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): GroundTruth signal (B, T).

        Returns:
            Tensor: Multi resolution spectral convergence loss value.
            Tensor: Multi resolution log STFT magnitude loss value.
        """
        num_resolutions = len(self.stft_losses)
        sc_total = 0.0
        mag_total = 0.0
        for resolution_loss in self.stft_losses:
            sc_term, mag_term = resolution_loss(x, y)
            sc_total = sc_total + sc_term
            mag_total = mag_total + mag_term
        return sc_total / num_resolutions, mag_total / num_resolutions
class MultiResolutionMelSpectrogramLoss(nn.Module):
    """Compute distance between mel spectrograms. Can be used
    in a multi-scale way.

    One ``MelSpectrogram`` transform is built per (n_mels, window_length)
    pair; the loss is the sum over all scales of a weighted L1 distance on
    log10 mel magnitudes plus a weighted L1 distance on raw mel magnitudes.

    Parameters
    ----------
    sample_rate : int, optional
        Sample rate of the input audio, by default 16000
    n_mels : List[int], optional
        Number of mel bins per scale,
        by default [5, 10, 20, 40, 80, 160, 320]
    window_lengths : List[int], optional
        FFT size per scale (hop is a quarter of it),
        by default [32, 64, 128, 256, 512, 1024, 2048]
    clamp_eps : float, optional
        Lower clamp applied to the mel magnitude before the log, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 0.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 1.0
    mel_fmin : List[float], optional
        Lowest mel filterbank frequency per scale, by default 0.0 everywhere
    mel_fmax : List[float], optional
        Highest mel filterbank frequency per scale; None leaves the choice
        to ``MelSpectrogram``, by default None everywhere
    cfg : optional
        Config object; attributes named like the parameters above take
        precedence over those parameters.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
    """

    def __init__(
        self,
        sample_rate=16000,
        n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320],
        window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048],
        clamp_eps: float = 1e-5,
        mag_weight: float = 0.0,
        log_weight: float = 1.0,
        pow: float = 1.0,
        mel_fmin: List[float] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        mel_fmax: List[float] = [None, None, None, None, None, None, None],
        cfg=None,
    ):
        super().__init__()

        def from_cfg(name, fallback):
            # An attribute present on cfg wins over the keyword argument.
            if cfg is not None and hasattr(cfg, name):
                return getattr(cfg, name)
            return fallback

        def band_edge(edges, index, fallback):
            # A missing, too-short or None entry falls back to the value
            # MelSpectrogram would use by itself, so the number of scales is
            # still decided only by n_mels / window_lengths.
            if edges is None or index >= len(edges):
                return fallback
            value = edges[index]
            return fallback if value is None else value

        sample_rate = from_cfg("sample_rate", sample_rate)
        n_mels = from_cfg("n_mels", n_mels)
        window_lengths = from_cfg("window_lengths", window_lengths)
        clamp_eps = from_cfg("clamp_eps", clamp_eps)
        mag_weight = from_cfg("mag_weight", mag_weight)
        log_weight = from_cfg("log_weight", log_weight)
        pow = from_cfg("pow", pow)
        mel_fmin = from_cfg("mel_fmin", mel_fmin)
        mel_fmax = from_cfg("mel_fmax", mel_fmax)

        transforms = []
        for index, (n_mel, window_length) in enumerate(zip(n_mels, window_lengths)):
            transforms.append(
                MelSpectrogram(
                    sample_rate=sample_rate,
                    n_fft=window_length,
                    hop_length=window_length // 4,
                    # mel_fmin / mel_fmax used to be stored but never applied.
                    f_min=band_edge(mel_fmin, index, 0.0),
                    f_max=band_edge(mel_fmax, index, None),
                    n_mels=n_mel,
                    power=1.0,
                    center=True,
                    norm="slaney",
                    mel_scale="slaney",
                )
            )
        self.mel_transforms = nn.ModuleList(transforms)

        self.n_mels = n_mels
        self.loss_fn = nn.L1Loss()
        self.clamp_eps = clamp_eps
        self.log_weight = log_weight
        self.mag_weight = mag_weight
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.pow = pow

    def delta(self, x, k):
        """Return the difference between ``x`` and itself shifted by ``k`` along dim 1.

        The result is ``x[:, :l - k] - x[:, k:]`` where ``l = x.shape[1]``.
        Not used by ``forward``.
        """
        l = x.shape[1]
        return x[:, 0 : l - k] - x[:, k:l]

    def forward(self, x, y, mask=None):
        """Computes mel loss between an estimate and a reference
        signal.

        Parameters
        ----------
        x : torch.Tensor
            Estimate signal, passed directly to each ``MelSpectrogram``.
        y : torch.Tensor
            Reference signal, passed directly to each ``MelSpectrogram``.
        mask : optional
            Accepted for interface compatibility; currently unused.

        Returns
        -------
        torch.Tensor
            Mel loss, summed (not averaged) over all scales.
        """
        loss = 0.0
        for mel_transform in self.mel_transforms:
            x_mel = mel_transform(x)
            y_mel = mel_transform(y)
            # Clamp from below so log10 never sees zero.
            log_x_mel = x_mel.clamp(self.clamp_eps).pow(self.pow).log10()
            log_y_mel = y_mel.clamp(self.clamp_eps).pow(self.pow).log10()
            loss += self.log_weight * self.loss_fn(log_x_mel, log_y_mel)
            loss += self.mag_weight * self.loss_fn(x_mel, y_mel)
        return loss
class GANLoss(nn.Module):
    """Adversarial losses for discriminator and generator.

    Supported modes are ``"lsgan"`` (mean squared error against 1 / 0
    targets), ``"lsgan_std"`` (mean plus standard deviation of the squared
    error) and ``"hinge"``.
    """

    def __init__(self, mode="lsgan"):
        """Store the loss mode; it must be one of the supported names."""
        super().__init__()
        assert mode in ["lsgan", "lsgan_std", "hinge"]
        self.mode = mode

    def disc_loss(self, real, fake):
        """Return ``(real_loss, fake_loss)`` for the discriminator.

        Args:
            real (Tensor): Discriminator output on real samples.
            fake (Tensor): Discriminator output on generated samples.

        Raises:
            ValueError: If ``self.mode`` is not a supported mode.
        """
        mode = self.mode
        if mode == "hinge":
            return torch.relu(1.0 - real).mean(), torch.relu(1.0 + fake).mean()
        if mode == "lsgan_std":
            real_sq = (real - 1.0).pow(2)
            fake_sq = (fake - 0.0).pow(2)
            return real_sq.mean() + real_sq.std(), fake_sq.mean() + fake_sq.std()
        if mode == "lsgan":
            real_term = F.mse_loss(real, torch.ones_like(real))
            fake_term = F.mse_loss(fake, torch.zeros_like(fake))
            return real_term, fake_term
        raise ValueError(f"no such mode {self.mode}")

    def disc_loss2(self, fake):
        """Return only the discriminator loss term for generated samples.

        Args:
            fake (Tensor): Discriminator output on generated samples.

        Raises:
            ValueError: If ``self.mode`` is not a supported mode.
        """
        mode = self.mode
        if mode == "hinge":
            return torch.relu(1.0 + fake).mean()
        if mode == "lsgan_std":
            fake_sq = (fake - 0.0).pow(2)
            return fake_sq.mean() + fake_sq.std()
        if mode == "lsgan":
            return F.mse_loss(fake, torch.zeros_like(fake))
        raise ValueError(f"no such mode {self.mode}")

    def gen_loss(self, fake):
        """Return the generator loss for discriminator output on generated samples.

        Args:
            fake (Tensor): Discriminator output on generated samples.

        Raises:
            ValueError: If ``self.mode`` is not a supported mode.
        """
        mode = self.mode
        if mode == "hinge":
            return -fake.mean()
        if mode == "lsgan_std":
            miss_sq = (fake - 1.0).pow(2)
            return miss_sq.mean() + miss_sq.std()
        if mode == "lsgan":
            return F.mse_loss(fake, torch.ones_like(fake))
        raise ValueError(f"no such mode {self.mode}")