Spaces:

owenisas
/

stable-audio-3-lab

Running on Zero

App Files Files Community

stable-audio-3-lab / stable_audio_3 /inference /sampling.py

owenisas

Vendor stable-audio-3 for ZeroGPU

6215e7d verified 2 days ago

raw

history blame contribute delete

22.4 kB

	import torch
	import typing as tp
	from tqdm import trange, tqdm
	import torch.distributions as dist

	from ..data.utils import create_padding_mask_from_lengths, compute_effective_seq_len_from_conditioning


	def build_schedule(
	    steps: int,
	    sigma_max: float = 1.0,
	    dist_shift = None,
	    effective_seq_len: tp.Union[int, torch.Tensor, None] = None,
	    fallback_seq_len: tp.Optional[int] = None,
	    include_endpoint: bool = True,
	    device: tp.Union[str, torch.device] = "cpu",
	) -> torch.Tensor:
	    """Build a timestep schedule for diffusion sampling.

	    The base schedule is a linear ramp from ``sigma_max`` towards 0 with step
	    size ``sigma_max / steps``. When ``dist_shift`` is given, the ramp is warped
	    with ``dist_shift.shift(t, seq_len)`` and the first entry along the last
	    dimension is then reset to ``sigma_max`` so the schedule starts at the same
	    noise level the caller used to mix ``init_data`` with noise.

	    Returns a 1D tensor of shape (N,) where N = steps+1 (if include_endpoint)
	    or steps (if not), OR a 2D tensor of shape (batch_size, N) when
	    effective_seq_len is a tensor and dist_shift produces per-element schedules.

	    Args:
	        steps: Number of sampling steps.
	        sigma_max: Starting noise level (1.0 for full generation, <1.0 for variations).
	        dist_shift: Optional distribution shift object exposing
	            ``shift(t, seq_len)``. Applied to warp the linear schedule.
	        effective_seq_len: Sequence length for dist_shift. Scalar int or
	            tensor of shape (batch_size,) for per-element schedules.
	        fallback_seq_len: Fallback when effective_seq_len is None (typically x.shape[-1]).
	        include_endpoint: If True, schedule includes 0 as final value (RF samplers).
	            If False, excludes 0 (v-diffusion DDIM).
	        device: Device for the output tensor.
	    """
	    # Both variants share the same steps+1 grid; excluding the endpoint just
	    # drops the trailing 0.
	    t = torch.linspace(sigma_max, 0, steps + 1, device=device)
	    if not include_endpoint:
	        t = t[:-1]

	    if dist_shift is None:
	        # The linear ramp already starts exactly at sigma_max.
	        return t

	    seq_len = effective_seq_len if effective_seq_len is not None else fallback_seq_len
	    if isinstance(seq_len, torch.Tensor):
	        # Clamp per-element sequence lengths to avoid zeros causing log/NaN issues
	        seq_len = torch.clamp(seq_len, min=1)
	    elif seq_len is not None:
	        # Clamp scalar sequence length to at least 1
	        seq_len = max(int(seq_len), 1)
	    t = dist_shift.shift(t, seq_len)

	    # Re-align the first timestep with sigma_max after shifting. The shifted
	    # tensor is cloned first: shift() may return an expanded (stride-0) view,
	    # which cannot be written in place, or a buffer it still owns.
	    # Empty schedules have no first entry to align.
	    if isinstance(t, torch.Tensor) and t.ndim > 0 and t.shape[-1] > 0:
	        t = t.clone()
	        t[..., 0] = sigma_max

	    return t


	def sample_timesteps_logsnr(batch_size, mean_logsnr=-1.2, std_logsnr=2.0):
	    """
	    Draw training timesteps whose logSNR follows a Gaussian distribution.

	    Each logSNR value is mapped to a timestep with t = sigmoid(-logSNR), the
	    inverse of logSNR = ln((1 - t) / t).

	    Args:
	        batch_size (int): Number of timesteps to sample
	        mean_logsnr (float): Mean of the logSNR Gaussian distribution
	        std_logsnr (float): Standard deviation of the logSNR Gaussian distribution

	    Returns:
	        torch.Tensor: Tensor of shape (batch_size,) with timesteps clamped to
	        [1e-4, 1 - 1e-4]
	    """
	    eps = 1e-4

	    # Scale and shift standard-normal draws into logSNR space.
	    gaussian_logsnr = mean_logsnr + std_logsnr * torch.randn(batch_size)

	    # t = 1 / (1 + exp(logsnr)), kept away from 0 and 1 for numerical stability.
	    timesteps = torch.sigmoid(-gaussian_logsnr)
	    return timesteps.clamp(eps, 1 - eps)

	def sample_timesteps_logsnr_uniform(batch_size, min_logsnr=-6, max_logsnr=5.0):
	    """
	    Draw training timesteps whose logSNR is uniform on [min_logsnr, max_logsnr).

	    Each logSNR value is mapped to a timestep with t = sigmoid(-logSNR), the
	    inverse of logSNR = ln((1 - t) / t).

	    Args:
	        batch_size (int): Number of timesteps to sample
	        min_logsnr (float): Minimum logSNR value
	        max_logsnr (float): Maximum logSNR value

	    Returns:
	        torch.Tensor: Tensor of shape (batch_size,) with timesteps clamped to
	        [1e-4, 1 - 1e-4]
	    """
	    eps = 1e-4
	    span = max_logsnr - min_logsnr

	    # Stretch unit-interval draws over the requested logSNR range.
	    uniform_logsnr = min_logsnr + span * torch.rand(batch_size)

	    # t = 1 / (1 + exp(logsnr)), kept away from 0 and 1 for numerical stability.
	    timesteps = torch.sigmoid(-uniform_logsnr)
	    return timesteps.clamp(eps, 1 - eps)

	def truncated_logistic_normal_rescaled(shape, left_trunc=0.075, right_trunc=1):
	    """
	    Sample from a truncated logistic-normal distribution, rescaled to [0, 1].

	    shape: shape of the output tensor
	    left_trunc: left truncation point, fraction of probability to be discarded
	    right_trunc: right truncation boundary, should be 1 (never seen at test time)
	    """
	    std_normal = dist.Normal(0, 1)

	    def _cdf_of_logit(p):
	        # Normal CDF evaluated at the logit of a truncation point.
	        return std_normal.cdf(torch.logit(torch.tensor(p)))

	    # Standard-normal draws pushed through their own CDF.
	    unit_draws = std_normal.cdf(torch.randn(shape))

	    # Truncation bounds expressed on the CDF axis.
	    cdf_lo = _cdf_of_logit(left_trunc)
	    cdf_hi = _cdf_of_logit(right_trunc)

	    # Squeeze the draws into [cdf_lo, cdf_hi], then map back through the
	    # inverse CDF and the sigmoid to land in logistic-normal space.
	    squeezed = cdf_lo + (cdf_hi - cdf_lo) * unit_draws
	    samples = torch.sigmoid(std_normal.icdf(squeezed))

	    # Rescale so the truncated range [left_trunc, right_trunc] covers [0, 1].
	    return (samples - left_trunc) / (right_trunc - left_trunc)

	def sample_discrete_euler(model, x, sigmas, callback=None, disable_tqdm=False, **extra_args):
	    """Draws samples from a model given starting noise, using Euler steps.

	    Each step evaluates ``model(x, t, **extra_args)`` once and moves ``x`` along
	    the returned velocity by the (negative) gap to the next timestep.

	    Args:
	        model: Callable invoked as ``model(x, t, **extra_args)`` with ``t`` of
	            shape (batch_size,).
	        x: Starting tensor. The per-element path and the callback broadcast
	            timesteps as (batch_size, 1, 1), so a 3D ``x`` is expected there.
	        sigmas: Pre-computed schedule tensor. Shape (steps+1,) for global schedule
	            or (batch_size, steps+1) for per-element schedules.
	        callback: Optional callable receiving a dict with keys ``x``, ``t``,
	            ``sigma``, ``i`` and ``denoised`` before each update.
	        disable_tqdm: If True, hide the progress bar.
	        **extra_args: Forwarded unchanged to ``model``.

	    Returns:
	        ``x`` after the final step.
	    """
	    schedule = sigmas.to(x.device)
	    # (batch_size, steps+1) means one schedule per batch element.
	    batched = schedule.dim() == 2
	    n_steps = schedule.shape[-1] - 1

	    for step in tqdm(range(n_steps), disable=disable_tqdm):
	        if batched:
	            t_batch = schedule[:, step].to(x.dtype)
	            t_after = schedule[:, step + 1].to(x.dtype)
	            # (batch_size,) -> (batch_size, 1, 1) so it broadcasts against x.
	            step_size = (t_after - t_batch).view(-1, 1, 1)
	        else:
	            # One scalar timestep shared by the batch, tiled for the model call.
	            t_batch = schedule[step] * x.new_ones((x.shape[0],))
	            step_size = schedule[step + 1] - schedule[step]

	        velocity = model(x, t_batch, **extra_args)

	        if callback is not None:
	            callback({
	                'x': x,
	                't': t_batch,
	                'sigma': t_batch,
	                'i': step,
	                'denoised': x - t_batch[:, None, None] * velocity,
	            })

	        x = x + step_size * velocity

	    return x

	def sample_rk4(model, x, sigmas, callback=None, disable_tqdm=False, **extra_args):
	    """Draws samples from a model given starting noise. 4th-order Runge-Kutta

	    Each step evaluates the model four times: at the current timestep, twice
	    at the midpoint, and at the next timestep.

	    Args:
	        model: Callable invoked as ``model(x, t, **extra_args)`` with ``t`` of
	            shape (batch_size,).
	        x: Starting tensor; its first dimension is the batch size.
	        sigmas: Pre-computed schedule tensor of shape (steps+1,).
	            Per-element schedules not supported for RK4.
	        callback: Optional callable receiving a dict with keys ``x``, ``t``,
	            ``sigma``, ``i`` and ``denoised`` after the first model evaluation
	            of each step. ``t``/``sigma`` are the scalar current timestep.
	        disable_tqdm: If True, hide the progress bar.
	        **extra_args: Forwarded unchanged to ``model``.

	    Returns:
	        ``x`` after the final step.

	    Raises:
	        ValueError: If ``sigmas`` is not 1D. A (batch_size, steps+1) schedule
	            would otherwise be iterated row by row as if rows were timesteps.
	    """
	    if sigmas.dim() != 1:
	        raise ValueError(
	            "sample_rk4 requires a 1D schedule of shape (steps+1,); "
	            f"got shape {tuple(sigmas.shape)}"
	        )

	    t = sigmas.to(x.device)
	    num_steps = t.shape[0] - 1

	    # Make tensor of ones to broadcast the single t values
	    ts = x.new_ones([x.shape[0]])

	    # Iterating a range (rather than a zip) lets tqdm know the total.
	    for i in tqdm(range(num_steps), disable=disable_tqdm):
	        t_curr = t[i]
	        t_prev = t[i + 1]

	        # Broadcast the current timestep to the correct shape
	        t_curr_tensor = t_curr * ts
	        dt = t_prev - t_curr  # we solve backwards in our formulation

	        k1 = model(x, t_curr_tensor, **extra_args)

	        if callback is not None:
	            denoised = x - t_curr * k1
	            callback({'x': x, 't': t_curr, 'sigma': t_curr, 'i': i, 'denoised': denoised})

	        t_mid_tensor = (t_curr + dt / 2) * ts
	        k2 = model(x + dt / 2 * k1, t_mid_tensor, **extra_args)
	        k3 = model(x + dt / 2 * k2, t_mid_tensor, **extra_args)

	        # Clamp t_prev to avoid evaluating model at exactly t=0
	        # (models aren't trained at t=0 and may return garbage/NaN)
	        t_prev_eval = t_prev.clamp(min=1e-5)
	        k4 = model(x + dt * k3, t_prev_eval * ts, **extra_args)

	        x = x + dt / 6 * (k1 + 2 * k2 + 2 * k3 + k4)

	    return x

	def sample_flow_dpmpp(model, x, sigmas, callback=None, disable_tqdm=False, **extra_args):
	    """Draws samples from a model given starting noise. DPM-Solver++ for RF models

	    The first step, and the step that lands on t == 0, use a first-order
	    update. All other steps apply a second-order correction built from the
	    previous step's denoised estimate.

	    Args:
	        model: Callable invoked as ``model(x, t, **extra_args)`` with ``t`` of
	            shape (batch_size,).
	        x: Starting tensor. The per-element path broadcasts timesteps as
	            (batch_size, 1, 1), so a 3D ``x`` is expected there.
	        sigmas: Pre-computed schedule tensor. Shape (steps+1,) for global schedule
	            or (batch_size, steps+1) for per-element schedules.
	        callback: Optional callable receiving a dict with keys ``x``, ``i``,
	            ``t``, ``sigma`` and ``denoised`` before each update.
	        disable_tqdm: If True, hide the progress bar.
	        **extra_args: Forwarded unchanged to ``model``.

	    Returns:
	        ``x`` after the final step, in the dtype ``x`` had on entry.
	    """
	    t = sigmas

	    # Check if we have per-element schedules (batch_size, steps+1) or global schedule (steps+1,)
	    per_element_schedule = t.dim() == 2

	    t = t.to(x.device)
	    num_steps = t.shape[-1] - 1

	    # Per-element coefficients are dimensioned tensors in the schedule's dtype.
	    # Multiplying them with a lower-precision x promotes the result, so the
	    # state is cast back after every update to keep the model input dtype stable.
	    x_dtype = x.dtype

	    old_denoised = None

	    def log_snr(t_val):
	        # Clamp to avoid log(0) and division by zero, which would produce
	        # inf/-inf values and propagate NaNs.
	        return ((1 - t_val).clamp(min=1e-10) / t_val.clamp(min=1e-10)).log()

	    for i in trange(num_steps, disable=disable_tqdm):
	        if per_element_schedule:
	            # Per-element schedules: t has shape (batch_size, steps+1)
	            t_curr = t[:, i]  # (batch_size,)
	            t_next = t[:, i + 1]  # (batch_size,)
	            t_prev = t[:, i - 1] if i > 0 else None
	            # Reshape for broadcasting with x: (batch_size,) -> (batch_size, 1, 1)
	            t_curr_broadcast = t_curr.view(-1, 1, 1)
	            t_next_broadcast = t_next.view(-1, 1, 1)
	            t_curr_tensor = t_curr  # already (batch_size,)
	        else:
	            # Global schedule: t has shape (steps+1,)
	            t_curr = t[i]
	            t_next = t[i + 1]
	            t_prev = t[i - 1] if i > 0 else None
	            t_curr_broadcast = t_curr
	            t_next_broadcast = t_next
	            t_curr_tensor = t_curr.expand(x.shape[0])

	        model_output = model(x, t_curr_tensor, **extra_args)
	        denoised = x - t_curr_broadcast * model_output

	        if callback is not None:
	            callback({'x': x, 'i': i, 't': t_curr, 'sigma': t_curr, 'denoised': denoised})

	        alpha_t = 1 - t_next_broadcast

	        # For rectified flow, compute the DPM++ coefficient directly without log_snr
	        # to avoid numerical issues at t=0 or t=1.
	        # The formula is: (-h).expm1() = (t_next - t_curr) / [(1 - t_next) * t_curr]
	        # Note: t_next < t_curr, so this is negative.
	        dt = t_next_broadcast - t_curr_broadcast
	        # Clamp to avoid division by zero when t_curr or t_next are at boundaries
	        dpmpp_coeff = dt / ((1 - t_next_broadcast).clamp(min=1e-10) * t_curr_broadcast.clamp(min=1e-10))

	        # Check if this is the first step or the last step (t_next == 0)
	        is_first_step = old_denoised is None
	        is_last_step = (t_next_broadcast == 0).all() if per_element_schedule else (t_next == 0)

	        if is_first_step or is_last_step:
	            # First-order update using the directly computed coefficient
	            x = (t_next_broadcast / t_curr_broadcast.clamp(min=1e-10)) * x - alpha_t * dpmpp_coeff * denoised
	        else:
	            # Second-order update using the previous denoised estimate
	            if per_element_schedule:
	                t_prev_broadcast = t_prev.view(-1, 1, 1)
	            else:
	                t_prev_broadcast = t_prev
	            # Compute r = h_last / h in log-SNR space for second-order correction
	            # h = log_snr(t_next) - log_snr(t_curr), h_last = log_snr(t_curr) - log_snr(t_prev)
	            h = log_snr(t_next_broadcast) - log_snr(t_curr_broadcast)
	            h_last = log_snr(t_curr_broadcast) - log_snr(t_prev_broadcast)
	            r = h_last / h
	            denoised_d = (1 + 1 / (2 * r)) * denoised - (1 / (2 * r)) * old_denoised
	            x = (t_next_broadcast / t_curr_broadcast.clamp(min=1e-10)) * x - alpha_t * dpmpp_coeff * denoised_d

	        # No-op when no promotion happened (global schedule, or matching dtypes).
	        x = x.to(x_dtype)

	        old_denoised = denoised
	    return x

	def sample_flow_pingpong(model, x, sigmas, callback=None, disable_tqdm=False, **extra_args):
	    """Draws samples from a model given starting noise. Ping-pong sampling for distilled models

	    Every step forms a denoised estimate from the model output, then blends it
	    with freshly drawn Gaussian noise at the next timestep's level.

	    Args:
	        model: Callable invoked as ``model(x, t, **extra_args)`` with ``t`` of
	            shape (batch_size,).
	        x: Starting tensor. The per-element path broadcasts timesteps as
	            (batch_size, 1, 1), so a 3D ``x`` is expected there.
	        sigmas: Pre-computed schedule tensor. Shape (steps+1,) for global schedule
	            or (batch_size, steps+1) for per-element schedules.
	        callback: Optional callable receiving a dict with keys ``x``, ``i``,
	            ``t``, ``sigma``, ``sigma_hat`` and ``denoised`` before each update.
	        disable_tqdm: If True, hide the progress bar.
	        **extra_args: Forwarded unchanged to ``model``.

	    Returns:
	        ``x`` after the final step.
	    """
	    schedule = sigmas.to(x.device)
	    # (batch_size, steps+1) means one schedule per batch element.
	    batched = schedule.dim() == 2
	    n_steps = schedule.shape[-1] - 1

	    for step in trange(n_steps, disable=disable_tqdm):
	        if batched:
	            t_now = schedule[:, step].to(x.dtype)
	            t_after = schedule[:, step + 1].to(x.dtype)
	            # (batch_size,) -> (batch_size, 1, 1) so they broadcast against x.
	            now_bc = t_now.view(-1, 1, 1)
	            after_bc = t_after.view(-1, 1, 1)
	            model_t = t_now
	        else:
	            t_now = schedule[step].to(x.dtype)
	            t_after = schedule[step + 1].to(x.dtype)
	            now_bc, after_bc = t_now, t_after
	            # Tile the shared scalar timestep across the batch for the model call.
	            model_t = t_now * x.new_ones((x.shape[0],))

	        velocity = model(x, model_t, **extra_args)
	        denoised = x - now_bc * velocity

	        if callback is not None:
	            callback({'x': x, 'i': step, 't': t_now, 'sigma': t_now, 'sigma_hat': t_now, 'denoised': denoised})

	        # Re-noise the denoised estimate to the next timestep's level.
	        x = (1 - after_bc) * denoised + after_bc * torch.randn_like(x)

	    return x



	@torch.no_grad()
	def sample_diffusion(
	    model,
	    noise: torch.Tensor,
	    cond_inputs: dict,
	    diffusion_objective: str,
	    steps: int,
	    cfg_scale: float = 1.0,
	    # Varlen support
	    conditioning: tp.Optional[tp.List[dict]] = None,
	    sample_rate: int = 44100,
	    pretransform = None,
	    mask_padding_attention: bool = False,
	    use_effective_length_for_schedule: bool = False,
	    headroom_seconds: float = 5.0,
	    padding_mask: tp.Optional[torch.Tensor] = None,
	    # Timestep schedule
	    dist_shift = None,
	    # Sampler options
	    sampler_type: tp.Optional[str] = None,
	    batch_cfg: bool = True,
	    rescale_cfg: bool = False,
	    # CFG options
	    apg_scale: float = 1.0,
	    # Init data (variation / img2img)
	    init_data: tp.Optional[torch.Tensor] = None,
	    init_noise_level: float = 1.0,
	    # Other
	    callback = None,
	    disable_tqdm: bool = False,
	    decode: bool = True,
	    chunked_decode: tp.Optional[bool] = None,
	    **sampler_kwargs
	) -> torch.Tensor:
	    """
	    Unified sampling function for diffusion models. Handles varlen support
	    (padding_mask + effective_seq_len), timestep scheduling, init_data for
	    variation/img2img, sampler dispatch and optional decoding.

	    Args:
	        model: The diffusion model backbone (model.model, not the wrapper)
	        noise: Initial noise tensor of shape (B, C, T)
	        cond_inputs: Pre-processed conditioning inputs dict (merged positive + negative)
	        diffusion_objective: One of "rectified_flow", "rf_denoiser"
	        steps: Number of sampling steps
	        cfg_scale: Classifier-free guidance scale
	        conditioning: List of conditioning dicts (for computing varlen from seconds_total)
	        sample_rate: Audio sample rate
	        pretransform: Optional pretransform for decoding latents and computing downsampling_ratio
	        mask_padding_attention: Whether to create padding_mask for attention
	        use_effective_length_for_schedule: Whether to use effective_seq_len for dist_shift
	        headroom_seconds: Extra seconds beyond seconds_total for valid region
	        padding_mask: Optional pre-computed padding mask (B, T). If provided, skips
	            internal mask computation. Use this to ensure consistency with training masks.
	        dist_shift: Distribution shift object for warping the timestep schedule, or None
	        sampler_type: One of "euler", "rk4", "dpmpp", "pingpong". Defaults to
	            "pingpong" for "rf_denoiser" and "euler" otherwise.
	        batch_cfg: Whether to use batched CFG
	        rescale_cfg: Whether to use rescaled CFG
	        apg_scale: APG (Adaptive Projected Guidance) scale. 1.0 = full APG, 0.0 = vanilla CFG
	        init_data: Optional pre-encoded latent tensor for variation/img2img (shape: B, C, T)
	        init_noise_level: Noise level (sigma_max) when using init_data. 1.0 = full noise (no variation).
	        callback: Optional callback for progress reporting
	        disable_tqdm: Whether to disable progress bar
	        decode: Whether to decode latents using pretransform
	        chunked_decode: Forwarded as ``chunked=`` to ``pretransform.decode``
	        **sampler_kwargs: Additional kwargs passed to sampler

	    Returns:
	        Generated samples (decoded audio if decode=True and a pretransform is
	        given, else latents)

	    Raises:
	        ValueError: If diffusion_objective or sampler_type is not recognised.
	    """
	    # Validate the request before doing any tensor work.
	    if diffusion_objective not in ("rectified_flow", "rf_denoiser"):
	        raise ValueError(f"Unknown diffusion_objective: {diffusion_objective}")

	    # Default sampler_type per objective
	    if sampler_type is None:
	        sampler_type = "pingpong" if diffusion_objective == "rf_denoiser" else "euler"

	    rf_samplers = {
	        "euler": sample_discrete_euler,
	        "rk4": sample_rk4,
	        "dpmpp": sample_flow_dpmpp,
	        "pingpong": sample_flow_pingpong,
	    }
	    sampler_fn = rf_samplers.get(sampler_type)
	    if sampler_fn is None:
	        raise ValueError(f"Unknown sampler_type for {diffusion_objective}: {sampler_type}")

	    device = noise.device
	    latent_seq_len = noise.shape[-1]

	    # Compute downsampling ratio
	    downsampling_ratio = pretransform.downsampling_ratio if pretransform is not None else 1

	    # The conditioning-derived length feeds both the schedule shift and the
	    # attention mask; compute it once and only when one of them needs it.
	    needs_mask = padding_mask is None and mask_padding_attention
	    conditioning_len = None
	    if conditioning is not None and (use_effective_length_for_schedule or needs_mask):
	        conditioning_len = compute_effective_seq_len_from_conditioning(
	            conditioning, sample_rate, downsampling_ratio, device
	        )

	    # effective_seq_len for dist_shift, if enabled
	    effective_seq_len = conditioning_len if use_effective_length_for_schedule else None

	    # Create padding_mask for attention if enabled (skip if pre-computed mask provided)
	    if needs_mask and conditioning_len is not None:
	        headroom_tokens = int(headroom_seconds * sample_rate / downsampling_ratio)
	        valid_lengths = (conditioning_len + headroom_tokens).clamp(max=latent_seq_len).long()
	        padding_mask = create_padding_mask_from_lengths(valid_lengths, latent_seq_len)

	    # Determine sigma_max for schedule
	    sigma_max = init_noise_level if init_data is not None else 1.0

	    # Mix init_data with noise for variation/img2img
	    if init_data is not None:
	        noise = init_data * (1 - sigma_max) + noise * sigma_max

	    # Build common sampler kwargs (conditioning + model-level params only).
	    # disable_tqdm and callback are passed explicitly to samplers that use them,
	    # not included here, to avoid leaking into model forward() calls.
	    common_kwargs = {
	        **cond_inputs,
	        "cfg_scale": cfg_scale,
	        "batch_cfg": batch_cfg,
	        "rescale_cfg": rescale_cfg,
	        "padding_mask": padding_mask,
	        "apg_scale": apg_scale,
	        **sampler_kwargs
	    }

	    # Remove v-diffusion-specific kwargs that don't apply to RF
	    for v_only_key in ("sigma_min", "sigma_max", "rho"):
	        common_kwargs.pop(v_only_key, None)

	    # Build schedule
	    sigmas = build_schedule(
	        steps=steps, sigma_max=sigma_max,
	        dist_shift=dist_shift, effective_seq_len=effective_seq_len,
	        fallback_seq_len=latent_seq_len, include_endpoint=True, device=device
	    )

	    # Route to sampler
	    sampled = sampler_fn(
	        model, noise, sigmas=sigmas, callback=callback, disable_tqdm=disable_tqdm, **common_kwargs
	    )

	    # Decode if requested
	    if decode and pretransform is not None:
	        sampled = sampled.to(next(pretransform.parameters()).dtype)
	        sampled = pretransform.decode(sampled, chunked=chunked_decode)

	        # Zero out audio beyond valid region (padding positions decode to garbage)
	        if padding_mask is not None:
	            # Stretch the latent-rate mask to the decoded sample rate.
	            audio_mask = padding_mask.unsqueeze(1).repeat_interleave(downsampling_ratio, dim=-1)
	            # Trim or pad to match sampled length
	            if audio_mask.shape[-1] > sampled.shape[-1]:
	                audio_mask = audio_mask[..., :sampled.shape[-1]]
	            elif audio_mask.shape[-1] < sampled.shape[-1]:
	                audio_mask = torch.nn.functional.pad(audio_mask, (0, sampled.shape[-1] - audio_mask.shape[-1]), value=False)
	            sampled = sampled * audio_mask.to(sampled.dtype)

	    return sampled