Spaces:

ydqmkkx
/

GibbsTTS

Running on Zero

App Files Files Community

GibbsTTS / amphion_utils /amphion_codec /loss.py

ydqmkkx

update

0afe769 7 days ago

raw

history blame contribute delete

13.8 kB

	import librosa
	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torchaudio
	from torchaudio.transforms import MelSpectrogram
	from einops import rearrange
	from typing import List


	def stft(x, fft_size, hop_size, win_length, window, use_complex=False):
	    """Perform STFT and return either the magnitude or the real/imag parts.

	    Args:
	        x (Tensor): Input signal tensor (B, T).
	        fft_size (int): FFT size.
	        hop_size (int): Hop size.
	        win_length (int): Window length.
	        window (Tensor): Window tensor; it is moved to ``x.device`` before use.
	        use_complex (bool): If False (default) return the clamped magnitude
	            spectrogram; if True return the real and imaginary parts stacked
	            on a new axis.

	    Returns:
	        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1) when
	            ``use_complex`` is False, otherwise a real-valued tensor
	            (B, 2, #frames, #freq_bins) ordered as [real, imag].
	    """
	    spec = torch.stft(
	        x, fft_size, hop_size, win_length, window.to(x.device), return_complex=True
	    )

	    if use_complex:
	        # Stack real/imag on a new axis 1, then swap the last two axes so the
	        # layout is [B, 2, T, F].
	        stacked = torch.cat([spec.real.unsqueeze(1), spec.imag.unsqueeze(1)], dim=1)
	        return stacked.transpose(2, 3)

	    # clamp is needed to avoid nan or inf: the power is bounded to
	    # [1e-7, 1e3] before the square root is taken.
	    power = spec.real**2 + spec.imag**2
	    return torch.sqrt(torch.clamp(power, min=1e-7, max=1e3)).transpose(2, 1)


	def compute_mag_scale(n_fft, sampling_rate):
	    """Build a per-frequency-bin magnitude scaling tensor for an STFT.

	    The FFT bin centre frequencies are passed through
	    ``librosa.frequency_weighting`` (with its default weighting kind), the
	    resulting dB values are converted to power, and the square root is taken
	    so the factor can be multiplied onto magnitudes.

	    Args:
	        n_fft (int): FFT size used to derive the bin frequencies.
	        sampling_rate (int): Sampling rate of the analysed audio.

	    Returns:
	        Tensor: float32 tensor of shape (1, 1, #freq_bins).
	    """
	    bin_freqs = librosa.fft_frequencies(sr=sampling_rate, n_fft=n_fft)
	    # Bins that are (numerically) at 0 Hz are replaced by -10 before the
	    # weighting curve is evaluated.
	    bin_freqs = np.where(bin_freqs > 1e-10, bin_freqs, -10)
	    weights_db = librosa.frequency_weighting(bin_freqs).reshape(1, 1, -1)
	    weights_power = librosa.db_to_power(weights_db)
	    scale = np.sqrt(weights_power).astype(np.float32)
	    return torch.from_numpy(scale)


	class SpectralConvergenceLoss(torch.nn.Module):
	    """Spectral convergence loss: norm of the error relative to the target norm."""

	    def __init__(self):
	        """Set up the (parameter-free) module."""
	        super().__init__()

	    def forward(self, x_mag, y_mag):
	        """Compute the spectral convergence loss.

	        Args:
	            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
	            y_mag (Tensor): Magnitude spectrogram of ground-truth signal (B, #frames, #freq_bins).

	        Returns:
	            Tensor: Scalar ``||y_mag - x_mag|| / ||y_mag||`` taken over all elements.
	        """
	        residual_norm = torch.norm(y_mag - x_mag)
	        reference_norm = torch.norm(y_mag)
	        return residual_norm / reference_norm


	class LogSTFTMagnitudeLoss(torch.nn.Module):
	    """L1 distance between log-magnitude spectrograms."""

	    def __init__(self):
	        """Set up the (parameter-free) module."""
	        super().__init__()

	    def forward(self, x_mag, y_mag):
	        """Compute the log STFT magnitude loss.

	        Args:
	            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
	            y_mag (Tensor): Magnitude spectrogram of ground-truth signal (B, #frames, #freq_bins).

	        Returns:
	            Tensor: Mean absolute difference between ``log(y_mag)`` and ``log(x_mag)``.
	        """
	        log_target = torch.log(y_mag)
	        log_estimate = torch.log(x_mag)
	        return F.l1_loss(log_target, log_estimate)


	class STFTLoss(torch.nn.Module):
	    """Single-resolution STFT loss (spectral convergence + log magnitude)."""

	    def __init__(
	        self,
	        fft_size=1024,
	        hop_length=120,
	        win_length=600,
	        sampling_rate=16000,
	        window="hann_window",
	        cfg=None,
	    ):
	        """Initialize STFT loss module.

	        Args:
	            fft_size (int): FFT size.
	            hop_length (int): Hop size.
	            win_length (int): Window length.
	            sampling_rate (int): Sampling rate used to build the per-bin
	                magnitude scaling.
	            window (str): Name of a ``torch`` window function, e.g. "hann_window".
	            cfg: Optional config object. Any of the attributes ``fft_size``,
	                ``hop_length``, ``win_length``, ``window`` and ``sampling_rate``
	                it defines override the corresponding argument.
	        """
	        super().__init__()

	        # getattr falls back to the keyword argument both when cfg is None and
	        # when cfg lacks the attribute.
	        fft_size = getattr(cfg, "fft_size", fft_size)
	        hop_length = getattr(cfg, "hop_length", hop_length)
	        win_length = getattr(cfg, "win_length", win_length)
	        window = getattr(cfg, "window", window)
	        # sampling_rate is honoured from cfg as well, matching
	        # MultiResolutionSTFTLoss; previously a cfg-supplied rate was ignored.
	        sampling_rate = getattr(cfg, "sampling_rate", sampling_rate)

	        self.fft_size = fft_size
	        self.hop_length = hop_length
	        self.win_length = win_length
	        # Plain attribute (not a buffer): stft() moves it to the input's device
	        # on every call.
	        self.window = getattr(torch, window)(win_length)
	        self.spectral_convergence_loss = SpectralConvergenceLoss()
	        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()

	        self.register_buffer("mag_scale", compute_mag_scale(fft_size, sampling_rate))

	    def forward(self, x, y):
	        """Calculate forward propagation.

	        Args:
	            x (Tensor): Predicted signal (B, T).
	            y (Tensor): Ground truth signal (B, T).

	        Returns:
	            Tensor: Spectral convergence loss value.
	            Tensor: Log STFT magnitude loss value.
	        """
	        # Both magnitudes are multiplied by the same per-bin scale before
	        # the two losses are evaluated.
	        x_mag = (
	            stft(x, self.fft_size, self.hop_length, self.win_length, self.window)
	            * self.mag_scale
	        )
	        y_mag = (
	            stft(y, self.fft_size, self.hop_length, self.win_length, self.window)
	            * self.mag_scale
	        )
	        sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
	        log_mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)

	        return sc_loss, log_mag_loss


	class MultiResolutionSTFTLoss(torch.nn.Module):
	    """Average of several STFTLoss modules run at different resolutions."""

	    def __init__(
	        self,
	        fft_sizes=(1024, 2048, 512),
	        hop_sizes=(120, 240, 50),
	        win_lengths=(600, 1200, 240),
	        window="hann_window",
	        sampling_rate=16000,
	        cfg=None,
	    ):
	        """Initialize Multi resolution STFT loss module.

	        Args:
	            fft_sizes (list): List of FFT sizes.
	            hop_sizes (list): List of hop sizes.
	            win_lengths (list): List of window lengths.
	            window (str): Window function type.
	            sampling_rate (int): Sampling rate forwarded to each STFTLoss.
	            cfg: Optional config object; attributes with the same names as
	                the arguments above take precedence over them.
	        """
	        super().__init__()

	        # Config attributes win over keyword arguments; a missing attribute
	        # (or cfg=None) falls back to the argument.
	        fft_sizes = getattr(cfg, "fft_sizes", fft_sizes)
	        hop_sizes = getattr(cfg, "hop_sizes", hop_sizes)
	        win_lengths = getattr(cfg, "win_lengths", win_lengths)
	        window = getattr(cfg, "window", window)
	        sampling_rate = getattr(cfg, "sampling_rate", sampling_rate)

	        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
	        resolutions = zip(fft_sizes, hop_sizes, win_lengths)
	        self.stft_losses = torch.nn.ModuleList(
	            [
	                STFTLoss(n_fft, hop, win, window=window, sampling_rate=sampling_rate)
	                for n_fft, hop, win in resolutions
	            ]
	        )

	    def forward(self, x, y):
	        """Calculate forward propagation.

	        Args:
	            x (Tensor): Predicted signal (B, T).
	            y (Tensor): GroundTruth signal (B, T).

	        Returns:
	            Tensor: Multi resolution spectral convergence loss value.
	            Tensor: Multi resolution log STFT magnitude loss value.
	        """
	        per_resolution = [stft_loss(x, y) for stft_loss in self.stft_losses]
	        count = len(self.stft_losses)

	        # Mean over resolutions of each of the two loss terms.
	        sc_loss = sum(sc for sc, _ in per_resolution) / count
	        mag_loss = sum(mag for _, mag in per_resolution) / count

	        return sc_loss, mag_loss


	class MultiResolutionMelSpectrogramLoss(nn.Module):
	    """Compute distance between mel spectrograms. Can be used
	    in a multi-scale way.

	    One ``MelSpectrogram`` transform is built per (n_mels, window_length)
	    pair; the per-scale L1 losses are summed (not averaged).

	    Parameters
	    ----------
	    sample_rate : int, optional
	        Sample rate passed to every mel transform, by default 16000
	    n_mels : List[int]
	        Number of mels per STFT, by default [5, 10, 20, 40, 80, 160, 320]
	    window_lengths : List[int], optional
	        FFT size of each STFT (the hop is a quarter of it),
	        by default [32, 64, 128, 256, 512, 1024, 2048]
	    clamp_eps : float, optional
	        Clamp on the log magnitude, below, by default 1e-5
	    mag_weight : float, optional
	        Weight of raw magnitude portion of loss, by default 0.0
	    log_weight : float, optional
	        Weight of log magnitude portion of loss, by default 1.0
	    pow : float, optional
	        Power to raise magnitude to before taking log, by default 1.0
	    mel_fmin : List[float], optional
	        Lowest mel filterbank frequency per scale, by default 0.0 everywhere
	    mel_fmax : List[float], optional
	        Highest mel filterbank frequency per scale; None leaves the choice
	        to ``MelSpectrogram``, by default None everywhere
	    cfg : optional
	        Config object; attributes named like the parameters above override them.

	    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
	    """

	    def __init__(
	        self,
	        sample_rate=16000,
	        n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320],
	        window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048],
	        clamp_eps: float = 1e-5,
	        mag_weight: float = 0.0,
	        log_weight: float = 1.0,
	        pow: float = 1.0,
	        mel_fmin: List[float] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
	        mel_fmax: List[float] = [None, None, None, None, None, None, None],
	        cfg=None,
	    ):
	        super().__init__()

	        # Config attributes win over keyword arguments; getattr falls back to
	        # the argument when cfg is None or lacks the attribute.
	        sample_rate = getattr(cfg, "sample_rate", sample_rate)
	        n_mels = getattr(cfg, "n_mels", n_mels)
	        window_lengths = getattr(cfg, "window_lengths", window_lengths)
	        clamp_eps = getattr(cfg, "clamp_eps", clamp_eps)
	        mag_weight = getattr(cfg, "mag_weight", mag_weight)
	        log_weight = getattr(cfg, "log_weight", log_weight)
	        pow = getattr(cfg, "pow", pow)
	        mel_fmin = getattr(cfg, "mel_fmin", mel_fmin)
	        mel_fmax = getattr(cfg, "mel_fmax", mel_fmax)

	        # The number of scales is set by n_mels/window_lengths alone; the
	        # frequency limits are padded or truncated to match so that a short
	        # mel_fmin/mel_fmax can never drop a scale.
	        scales = list(zip(n_mels, window_lengths))
	        mel_fmin = self._per_scale(mel_fmin, len(scales), 0.0)
	        mel_fmax = self._per_scale(mel_fmax, len(scales), None)

	        self.mel_transforms = nn.ModuleList(
	            [
	                MelSpectrogram(
	                    sample_rate=sample_rate,
	                    n_fft=window_length,
	                    hop_length=window_length // 4,
	                    # Previously stored but never applied to the filterbank.
	                    f_min=fmin,
	                    f_max=fmax,
	                    n_mels=n_mel,
	                    power=1.0,
	                    center=True,
	                    norm="slaney",
	                    mel_scale="slaney",
	                )
	                for (n_mel, window_length), fmin, fmax in zip(
	                    scales, mel_fmin, mel_fmax
	                )
	            ]
	        )
	        self.n_mels = n_mels
	        self.loss_fn = nn.L1Loss()
	        self.clamp_eps = clamp_eps
	        self.log_weight = log_weight
	        self.mag_weight = mag_weight
	        self.mel_fmin = mel_fmin
	        self.mel_fmax = mel_fmax
	        self.pow = pow

	    @staticmethod
	    def _per_scale(values, count, fill):
	        """Return exactly ``count`` per-scale values, padding with ``fill``.

	        A scalar (or None) is repeated for every scale; a sequence is
	        truncated or right-padded with ``fill``.
	        """
	        if values is None or isinstance(values, (int, float)):
	            return [fill if values is None else values] * count
	        values = list(values)
	        return values[:count] + [fill] * (count - len(values))

	    def delta(self, x, k):
	        """Return ``x[:, :L-k] - x[:, k:L]`` where ``L = x.shape[1]``."""
	        length = x.shape[1]
	        return x[:, 0 : length - k] - x[:, k:length]

	    def forward(self, x, y, mask=None):
	        """Computes mel loss between an estimate and a reference
	        signal.

	        Parameters
	        ----------
	        x : torch.Tensor
	            Estimate waveform, fed directly to each mel transform
	        y : torch.Tensor
	            Reference waveform, same layout as ``x``
	        mask : optional
	            Accepted for interface compatibility; currently unused

	        Returns
	        -------
	        torch.Tensor
	            Mel loss, summed over all scales.
	        """
	        loss = 0.0
	        for mel_transform in self.mel_transforms:
	            x_mel = mel_transform(x)
	            y_mel = mel_transform(y)
	            # Clamp from below before the log so silent bins stay finite.
	            log_x_mel = x_mel.clamp(self.clamp_eps).pow(self.pow).log10()
	            log_y_mel = y_mel.clamp(self.clamp_eps).pow(self.pow).log10()
	            loss += self.log_weight * self.loss_fn(log_x_mel, log_y_mel)
	            loss += self.mag_weight * self.loss_fn(x_mel, y_mel)
	        return loss


	class GANLoss(nn.Module):
	    """Adversarial losses for the "lsgan", "lsgan_std" and "hinge" modes."""

	    def __init__(self, mode="lsgan"):
	        """Store the loss mode; it must be "lsgan", "lsgan_std" or "hinge"."""
	        super().__init__()
	        assert mode in ["lsgan", "lsgan_std", "hinge"]
	        self.mode = mode

	    def disc_loss(self, real, fake):
	        """Return the discriminator losses ``(real_loss, fake_loss)``.

	        Args:
	            real (Tensor): Discriminator outputs for real samples.
	            fake (Tensor): Discriminator outputs for generated samples.

	        Raises:
	            ValueError: If ``self.mode`` is not a supported mode.
	        """
	        if self.mode == "lsgan":
	            # Least squares: real outputs are pulled to 1, fake outputs to 0.
	            on_real = F.mse_loss(real, torch.ones_like(real))
	            on_fake = F.mse_loss(fake, torch.zeros_like(fake))
	            return on_real, on_fake

	        if self.mode == "lsgan_std":
	            # Squared errors, penalised by their mean plus their std.
	            real_sq = (real - 1.0).pow(2)
	            fake_sq = (fake - 0.0).pow(2)
	            return real_sq.mean() + real_sq.std(), fake_sq.mean() + fake_sq.std()

	        if self.mode == "hinge":
	            on_real = torch.relu(1.0 - real).mean()
	            on_fake = torch.relu(1.0 + fake).mean()
	            return on_real, on_fake

	        raise ValueError(f"no such mode {self.mode}")

	    def disc_loss2(self, fake):
	        """Return only the fake-sample part of the discriminator loss.

	        Args:
	            fake (Tensor): Discriminator outputs for generated samples.

	        Raises:
	            ValueError: If ``self.mode`` is not a supported mode.
	        """
	        if self.mode == "lsgan":
	            return F.mse_loss(fake, torch.zeros_like(fake))

	        if self.mode == "lsgan_std":
	            fake_sq = (fake - 0.0).pow(2)
	            return fake_sq.mean() + fake_sq.std()

	        if self.mode == "hinge":
	            return torch.relu(1.0 + fake).mean()

	        raise ValueError(f"no such mode {self.mode}")

	    def gen_loss(self, fake):
	        """Return the generator loss for discriminator outputs on fake samples.

	        Args:
	            fake (Tensor): Discriminator outputs for generated samples.

	        Raises:
	            ValueError: If ``self.mode`` is not a supported mode.
	        """
	        if self.mode == "lsgan":
	            # The generator wants fake outputs to be scored as 1.
	            return F.mse_loss(fake, torch.ones_like(fake))

	        if self.mode == "lsgan_std":
	            fake_sq = (fake - 1.0).pow(2)
	            return fake_sq.mean() + fake_sq.std()

	        if self.mode == "hinge":
	            return -fake.mean()

	        raise ValueError(f"no such mode {self.mode}")