"""Spectral (STFT / mel) reconstruction losses and GAN objectives for audio models."""

import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torchaudio.transforms import MelSpectrogram
from einops import rearrange
from typing import List


def _cfg_value(cfg, name, default):
    """Return ``cfg.<name>`` when ``cfg`` is given and has it, else ``default``."""
    if cfg is not None and hasattr(cfg, name):
        return getattr(cfg, name)
    return default


def _per_scale(values, count, fill):
    """Expand ``values`` to a list of exactly ``count`` per-scale entries.

    ``None`` yields ``count`` copies of ``fill``; a bare number is broadcast to
    every scale; a shorter sequence is padded with ``fill`` and a longer one is
    truncated.
    """
    if values is None:
        return [fill] * count
    if isinstance(values, (int, float)):
        return [values] * count
    return (list(values) + [fill] * count)[:count]


def stft(x, fft_size, hop_size, win_length, window, use_complex=False):
    """Perform STFT and convert to a magnitude (or real/imag) spectrogram.

    Args:
        x (Tensor): Input signal tensor (B, T).
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length.
        window (Tensor): Window tensor; it is moved to ``x.device`` before use.
        use_complex (bool): If True, return stacked real/imaginary parts
            instead of the magnitude.

    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1), or, when
            ``use_complex`` is True, real/imag parts as (B, 2, #frames, #freq_bins).
    """
    x_stft = torch.stft(
        x, fft_size, hop_size, win_length, window.to(x.device), return_complex=True
    )
    if use_complex:
        # Real and imaginary parts become two channels: [B, 2, F, T] -> [B, 2, T, F].
        return torch.stack([x_stft.real, x_stft.imag], dim=1).transpose(2, 3)

    # The clamp is needed to avoid nan or inf in the sqrt (and in later logs).
    power = torch.clamp(x_stft.real**2 + x_stft.imag**2, min=1e-7, max=1e3)
    return torch.sqrt(power).transpose(2, 1)


def compute_mag_scale(n_fft, sampling_rate):
    """Build a per-frequency magnitude weighting from librosa's frequency weighting.

    Args:
        n_fft (int): FFT size whose bin centre frequencies are weighted.
        sampling_rate (int): Sampling rate of the analysed signal.

    Returns:
        Tensor: float32 scale of shape (1, 1, #freq_bins), ready to broadcast
            over a (B, #frames, #freq_bins) magnitude spectrogram.
    """
    frequencies = librosa.fft_frequencies(sr=sampling_rate, n_fft=n_fft)
    # Bins at (numerically) 0 Hz are replaced by a non-zero placeholder before
    # the weighting curve is evaluated.
    frequencies = np.where(frequencies > 1e-10, frequencies, -10)
    db_scale = librosa.frequency_weighting(frequencies).reshape(1, 1, -1)
    # dB -> power -> amplitude, so the scale multiplies magnitudes directly.
    mag_scale = np.sqrt(librosa.db_to_power(db_scale)).astype(np.float32)
    return torch.from_numpy(mag_scale)


class SpectralConvergenceLoss(torch.nn.Module):
    """Spectral convergence loss module."""

    def __init__(self):
        """Initialize spectral convergence loss module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal
                (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of ground-truth signal
                (B, #frames, #freq_bins).

        Returns:
            Tensor: Spectral convergence loss value, i.e. the norm of the
                difference relative to the norm of the ground truth.
        """
        error_norm = torch.norm(y_mag - x_mag)
        reference_norm = torch.norm(y_mag)
        return error_norm / reference_norm


class LogSTFTMagnitudeLoss(torch.nn.Module):
    """Log STFT magnitude loss module."""

    def __init__(self):
        """Initialize log STFT magnitude loss module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal
                (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of ground-truth signal
                (B, #frames, #freq_bins).

        Returns:
            Tensor: Log STFT magnitude loss value (L1 distance between logs).
        """
        return F.l1_loss(torch.log(y_mag), torch.log(x_mag))


class STFTLoss(torch.nn.Module):
    """STFT loss module (single resolution, frequency-weighted magnitudes)."""

    def __init__(
        self,
        fft_size=1024,
        hop_length=120,
        win_length=600,
        sampling_rate=16000,
        window="hann_window",
        cfg=None,
    ):
        """Initialize STFT loss module.

        Args:
            fft_size (int): FFT size.
            hop_length (int): Hop size.
            win_length (int): Window length.
            sampling_rate (int): Sampling rate used for the frequency weighting.
            window (str): Name of a ``torch`` window function, e.g. "hann_window".
            cfg: Optional config object; any attribute named like one of the
                arguments above overrides that argument.
        """
        super().__init__()
        fft_size = _cfg_value(cfg, "fft_size", fft_size)
        hop_length = _cfg_value(cfg, "hop_length", hop_length)
        win_length = _cfg_value(cfg, "win_length", win_length)
        # sampling_rate is overridable from cfg like every other option.
        sampling_rate = _cfg_value(cfg, "sampling_rate", sampling_rate)
        window = _cfg_value(cfg, "window", window)

        self.fft_size = fft_size
        self.hop_length = hop_length
        self.win_length = win_length
        # A buffer follows the module across ``.to(device)`` so the window is not
        # re-copied on every forward; non-persistent keeps state_dict keys unchanged.
        self.register_buffer(
            "window", getattr(torch, window)(win_length), persistent=False
        )
        self.spectral_convergence_loss = SpectralConvergenceLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
        self.register_buffer("mag_scale", compute_mag_scale(fft_size, sampling_rate))

    def _weighted_magnitude(self, signal):
        """Return the frequency-weighted magnitude spectrogram of ``signal``."""
        magnitude = stft(
            signal, self.fft_size, self.hop_length, self.win_length, self.window
        )
        return magnitude * self.mag_scale

    def forward(self, x, y):
        """Calculate forward propagation.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): Ground truth signal (B, T).

        Returns:
            Tensor: Spectral convergence loss value.
            Tensor: Log STFT magnitude loss value.
        """
        x_mag = self._weighted_magnitude(x)
        y_mag = self._weighted_magnitude(y)
        sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
        log_mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
        return sc_loss, log_mag_loss


class MultiResolutionSTFTLoss(torch.nn.Module):
    """Multi resolution STFT loss module."""

    def __init__(
        self,
        fft_sizes=(1024, 2048, 512),
        hop_sizes=(120, 240, 50),
        win_lengths=(600, 1200, 240),
        window="hann_window",
        sampling_rate=16000,
        cfg=None,
    ):
        """Initialize Multi resolution STFT loss module.

        Args:
            fft_sizes (list): List of FFT sizes.
            hop_sizes (list): List of hop sizes.
            win_lengths (list): List of window lengths.
            window (str): Window function type.
            sampling_rate (int): Sampling rate used for the frequency weighting.
            cfg: Optional config object; any attribute named like one of the
                arguments above overrides that argument.
        """
        super().__init__()
        fft_sizes = _cfg_value(cfg, "fft_sizes", fft_sizes)
        hop_sizes = _cfg_value(cfg, "hop_sizes", hop_sizes)
        win_lengths = _cfg_value(cfg, "win_lengths", win_lengths)
        window = _cfg_value(cfg, "window", window)
        sampling_rate = _cfg_value(cfg, "sampling_rate", sampling_rate)

        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
        # One single-resolution loss per (fft, hop, window-length) triple.
        self.stft_losses = torch.nn.ModuleList(
            [
                STFTLoss(fs, ss, wl, window=window, sampling_rate=sampling_rate)
                for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths)
            ]
        )

    def forward(self, x, y):
        """Calculate forward propagation.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): GroundTruth signal (B, T).

        Returns:
            Tensor: Multi resolution spectral convergence loss value.
            Tensor: Multi resolution log STFT magnitude loss value.
        """
        sc_total = 0.0
        mag_total = 0.0
        for stft_loss in self.stft_losses:
            sc_l, mag_l = stft_loss(x, y)
            sc_total = sc_total + sc_l
            mag_total = mag_total + mag_l

        # Average over resolutions.
        num_resolutions = len(self.stft_losses)
        return sc_total / num_resolutions, mag_total / num_resolutions


class MultiResolutionMelSpectrogramLoss(nn.Module):
    """Compute distance between mel spectrograms. Can be used
    in a multi-scale way.

    Parameters
    ----------
    sample_rate : int, optional
        Sample rate of the compared signals, by default 16000
    n_mels : List[int]
        Number of mels per STFT, by default [5, 10, 20, 40, 80, 160, 320]
    window_lengths : List[int], optional
        FFT size of each STFT (hop is a quarter of it),
        by default [32, 64, 128, 256, 512, 1024, 2048]
    clamp_eps : float, optional
        Clamp on the log magnitude, below, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 0.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 1.0
    mel_fmin : List[float], optional
        Lowest mel filterbank frequency per STFT, by default 0.0 for each
    mel_fmax : List[float], optional
        Highest mel filterbank frequency per STFT, by default None for each
    cfg : optional
        Config object; any attribute named like one of the parameters above
        overrides that parameter.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
    """

    def __init__(
        self,
        sample_rate=16000,
        n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320],
        window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048],
        clamp_eps: float = 1e-5,
        mag_weight: float = 0.0,
        log_weight: float = 1.0,
        pow: float = 1.0,
        mel_fmin: List[float] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        mel_fmax: List[float] = [None, None, None, None, None, None, None],
        cfg=None,
    ):
        super().__init__()
        sample_rate = _cfg_value(cfg, "sample_rate", sample_rate)
        n_mels = _cfg_value(cfg, "n_mels", n_mels)
        window_lengths = _cfg_value(cfg, "window_lengths", window_lengths)
        clamp_eps = _cfg_value(cfg, "clamp_eps", clamp_eps)
        mag_weight = _cfg_value(cfg, "mag_weight", mag_weight)
        log_weight = _cfg_value(cfg, "log_weight", log_weight)
        pow = _cfg_value(cfg, "pow", pow)
        mel_fmin = _cfg_value(cfg, "mel_fmin", mel_fmin)
        mel_fmax = _cfg_value(cfg, "mel_fmax", mel_fmax)

        # The number of scales is set by n_mels/window_lengths; the frequency
        # bounds are padded or truncated to match so they never change it.
        num_scales = min(len(n_mels), len(window_lengths))
        f_mins = _per_scale(mel_fmin, num_scales, 0.0)
        f_maxs = _per_scale(mel_fmax, num_scales, None)

        self.mel_transforms = nn.ModuleList(
            [
                MelSpectrogram(
                    sample_rate=sample_rate,
                    n_fft=window_length,
                    hop_length=window_length // 4,
                    f_min=f_min,
                    f_max=f_max,
                    n_mels=n_mel,
                    power=1.0,
                    center=True,
                    norm="slaney",
                    mel_scale="slaney",
                )
                for n_mel, window_length, f_min, f_max in zip(
                    n_mels, window_lengths, f_mins, f_maxs
                )
            ]
        )

        # Copy so the instance never aliases the shared default-argument list.
        self.n_mels = list(n_mels)
        self.loss_fn = nn.L1Loss()
        self.clamp_eps = clamp_eps
        self.log_weight = log_weight
        self.mag_weight = mag_weight
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.pow = pow

    def delta(self, x, k):
        """Return ``x[:, i] - x[:, i + k]`` along dim 1 for every valid ``i``."""
        length = x.shape[1]
        return x[:, 0 : length - k] - x[:, k:length]

    def forward(self, x, y, mask=None):
        """Computes mel loss between an estimate and a reference signal.

        Parameters
        ----------
        x : torch.Tensor
            Estimate waveform, passed directly to the mel transforms
        y : torch.Tensor
            Reference waveform, passed directly to the mel transforms
        mask : optional
            Accepted but not used by this implementation.

        Returns
        -------
        torch.Tensor
            Mel loss, summed (not averaged) over all scales.
        """
        loss = 0.0
        for mel_transform in self.mel_transforms:
            x_mel = mel_transform(x)
            y_mel = mel_transform(y)
            # Clamp from below before the log so silent bins stay finite.
            log_x_mel = x_mel.clamp(self.clamp_eps).pow(self.pow).log10()
            log_y_mel = y_mel.clamp(self.clamp_eps).pow(self.pow).log10()
            loss += self.log_weight * self.loss_fn(log_x_mel, log_y_mel)
            loss += self.mag_weight * self.loss_fn(x_mel, y_mel)
        return loss


class GANLoss(nn.Module):
    """Adversarial losses for discriminator and generator.

    Supported modes: "lsgan" (mean squared error to the target), "lsgan_std"
    (mean plus standard deviation of the squared error) and "hinge".
    """

    def __init__(self, mode="lsgan"):
        super().__init__()
        assert mode in ["lsgan", "lsgan_std", "hinge"]
        self.mode = mode

    def _least_squares(self, scores, target):
        """Least-squares loss pulling ``scores`` towards the scalar ``target``."""
        if self.mode == "lsgan":
            return F.mse_loss(scores, torch.full_like(scores, target))
        if self.mode == "lsgan_std":
            squared_error = (scores - target).pow(2)
            return squared_error.mean() + squared_error.std()
        raise ValueError(f"no such mode {self.mode}")

    def _disc_term(self, scores, is_real):
        """Discriminator loss for one batch of real or fake scores."""
        if self.mode == "hinge":
            margin = 1.0 - scores if is_real else 1.0 + scores
            return torch.relu(margin).mean()
        return self._least_squares(scores, 1.0 if is_real else 0.0)

    def disc_loss(self, real, fake):
        """Return ``(real_loss, fake_loss)`` for the discriminator."""
        real_loss = self._disc_term(real, is_real=True)
        fake_loss = self._disc_term(fake, is_real=False)
        return real_loss, fake_loss

    def disc_loss2(self, fake):
        """Return only the fake-sample part of the discriminator loss."""
        return self._disc_term(fake, is_real=False)

    def gen_loss(self, fake):
        """Return the generator loss for discriminator scores on fake samples."""
        if self.mode == "hinge":
            return -fake.mean()
        return self._least_squares(fake, 1.0)