import typing as tp

import torch
from einops import rearrange
from torch import nn
from torch.nn import functional as F
from x_transformers import ContinuousTransformerWrapper, Encoder

from .blocks import FourierFeatures
from .transformer import ContinuousTransformer

class DiffusionTransformer(nn.Module):
    def __init__(self,
                 io_channels=32,
                 patch_size=1,
                 embed_dim=768,
                 cond_token_dim=0,
                 project_cond_tokens=True,
                 global_cond_dim=0,
                 project_global_cond=True,
                 input_concat_dim=0,
                 prepend_cond_dim=0,
                 depth=12,
                 num_heads=8,
                 transformer_type: tp.Literal["x-transformers", "continuous_transformer"] = "x-transformers",
                 global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend",
                 **kwargs):

        super().__init__()

        self.cond_token_dim = cond_token_dim

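        # Timestep embedding: Fourier features of t mapped through a small MLP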
        timestep_features_dim = 256

        self.timestep_features = FourierFeatures(1, timestep_features_dim)

        self.to_timestep_embed = nn.Sequential(
            nn.Linear(timestep_features_dim, embed_dim, bias=True),
            nn.SiLU(),
            nn.Linear(embed_dim, embed_dim, bias=True),
        )

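        # Projection for cross-attention conditioning tokens (optional)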
        if cond_token_dim > 0:
            cond_embed_dim = cond_token_dim if not project_cond_tokens else embed_dim
            self.to_cond_embed = nn.Sequential(
                nn.Linear(cond_token_dim, cond_embed_dim, bias=False),
                nn.SiLU(),
                nn.Linear(cond_embed_dim, cond_embed_dim, bias=False)
            )
        else:
            cond_embed_dim = 0

        if global_cond_dim > 0:
            # Projection for the global conditioning vector
            global_embed_dim = global_cond_dim if not project_global_cond else embed_dim
            self.to_global_embed = nn.Sequential(
                nn.Linear(global_cond_dim, global_embed_dim, bias=False),
                nn.SiLU(),
                nn.Linear(global_embed_dim, global_embed_dim, bias=False)
            )

        if prepend_cond_dim > 0:
            # Projection for conditioning tokens prepended to the input sequence
            self.to_prepend_embed = nn.Sequential(
                nn.Linear(prepend_cond_dim, embed_dim, bias=False),
                nn.SiLU(),
                nn.Linear(embed_dim, embed_dim, bias=False)
            )

        self.input_concat_dim = input_concat_dim

        dim_in = io_channels + self.input_concat_dim

        self.patch_size = patch_size

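        # Transformer backbone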
        self.transformer_type = transformer_type
        self.global_cond_type = global_cond_type

        if self.transformer_type == "x-transformers":
            self.transformer = ContinuousTransformerWrapper(
                dim_in=dim_in * patch_size,
                dim_out=io_channels * patch_size,
                max_seq_len=0,  # Not relevant without absolute positional embeddings
                attn_layers=Encoder(
                    dim=embed_dim,
                    depth=depth,
                    heads=num_heads,
                    attn_flash=True,
                    cross_attend=cond_token_dim > 0,
                    dim_context=None if cond_embed_dim == 0 else cond_embed_dim,
                    zero_init_branch_output=True,
                    use_abs_pos_emb=False,
                    rotary_pos_emb=True,
                    ff_swish=True,
                    ff_glu=True,
                    **kwargs
                )
            )
        elif self.transformer_type == "continuous_transformer":
            global_dim = None

            if self.global_cond_type == "adaLN":
                # The global conditioning has already been projected to embed_dim at this point
                global_dim = embed_dim

            self.transformer = ContinuousTransformer(
                dim=embed_dim,
                depth=depth,
                dim_heads=embed_dim // num_heads,
                dim_in=dim_in * patch_size,
                dim_out=io_channels * patch_size,
                cross_attend=cond_token_dim > 0,
                cond_token_dim=cond_embed_dim,
                global_cond_dim=global_dim,
                **kwargs
            )
        else:
            raise ValueError(f"Unknown transformer type: {self.transformer_type}")

        # Zero-initialized 1x1 residual convolutions, so both start as the identity
        self.preprocess_conv = nn.Conv1d(dim_in, dim_in, 1, bias=False)
        nn.init.zeros_(self.preprocess_conv.weight)
        self.postprocess_conv = nn.Conv1d(io_channels, io_channels, 1, bias=False)
        nn.init.zeros_(self.postprocess_conv.weight)

    def _forward(
            self,
            x,
            t,
            mask=None,
            cross_attn_cond=None,
            cross_attn_cond_mask=None,
            input_concat_cond=None,
            global_embed=None,
            prepend_cond=None,
            prepend_cond_mask=None,
            return_info=False,
            **kwargs):
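        """Run the model once on a (possibly conditioned) batch.

        x: (batch, io_channels, seq_len) input sequence.
        t: (batch,) diffusion timesteps.
        Conditioning can enter via cross-attention tokens, channel-wise
        concatenation, a global embedding, and/or prepended tokens.
        """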

        if cross_attn_cond is not None:
            # Project the cross-attention conditioning tokens to the embedding dimension
            cross_attn_cond = self.to_cond_embed(cross_attn_cond)

        if global_embed is not None:
            # Project the global conditioning to the embedding dimension
            global_embed = self.to_global_embed(global_embed)

        prepend_inputs = None
        prepend_mask = None
        prepend_length = 0
        if prepend_cond is not None:
            # Project the prepend conditioning to the embedding dimension
            prepend_cond = self.to_prepend_embed(prepend_cond)

            prepend_inputs = prepend_cond
            if prepend_cond_mask is not None:
                prepend_mask = prepend_cond_mask

        if input_concat_cond is not None:
            # Interpolate input_concat_cond to the same length as x
            if input_concat_cond.shape[2] != x.shape[2]:
                input_concat_cond = F.interpolate(input_concat_cond, (x.shape[2],), mode='nearest')

            x = torch.cat([x, input_concat_cond], dim=1)

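        # Compute the batch of timestep embeddings, shape (batch, embed_dim)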
        timestep_embed = self.to_timestep_embed(self.timestep_features(t[:, None]))

        # The timestep embedding is treated as a global embedding; add it to any existing global conditioning
        if global_embed is not None:
            global_embed = global_embed + timestep_embed
        else:
            global_embed = timestep_embed

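        # With "prepend" global conditioning, the global embed is fed to the transformer as an extra prepended token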
        if self.global_cond_type == "prepend":
            if prepend_inputs is None:
                # Prepend inputs are just the global embed, with an all-ones mask
                prepend_inputs = global_embed.unsqueeze(1)
                prepend_mask = torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool)
            else:
                # Prepend inputs are the prepend conditioning plus the global embed, with a one appended to the mask
                prepend_inputs = torch.cat([prepend_inputs, global_embed.unsqueeze(1)], dim=1)
                prepend_mask = torch.cat([prepend_mask, torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool)], dim=1)

            prepend_length = prepend_inputs.shape[1]

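        # Zero-initialized residual convolution: acts as the identity at the start of training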
        x = self.preprocess_conv(x) + x

        x = rearrange(x, "b c t -> b t c")

        extra_args = {}

        if self.global_cond_type == "adaLN":
            extra_args["global_cond"] = global_embed

        if self.patch_size > 1:
            # Patching: fold groups of p timesteps into the channel dimension
            x = rearrange(x, "b (t p) c -> b t (c p)", p=self.patch_size)

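        # Run the transformer backbone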
        if self.transformer_type == "x-transformers":
            output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, **extra_args, **kwargs)
        elif self.transformer_type == "continuous_transformer":
            output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, **extra_args, **kwargs)

            if return_info:
                output, info = output

        # Strip the prepended conditioning tokens from the output
        output = rearrange(output, "b t c -> b c t")[:, :, prepend_length:]

        if self.patch_size > 1:
            # Un-patch: unfold the channel groups back into the time dimension
            output = rearrange(output, "b (c p) t -> b c (t p)", p=self.patch_size)

        output = self.postprocess_conv(output) + output

        if return_info:
            return output, info

        return output

    def forward(
            self,
            x,
            t,
            cross_attn_cond=None,
            cross_attn_cond_mask=None,
            negative_cross_attn_cond=None,
            negative_cross_attn_mask=None,
            input_concat_cond=None,
            global_embed=None,
            negative_global_embed=None,  # accepted but currently unused
            prepend_cond=None,
            prepend_cond_mask=None,
            cfg_scale=1.0,
            cfg_dropout_prob=0.0,
            causal=False,
            scale_phi=0.0,
            mask=None,
            return_info=False,
            **kwargs):
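        """Forward pass with optional classifier-free guidance (CFG).

        When cfg_scale != 1.0 and conditioning is present, the conditioned and
        unconditioned passes run as one doubled batch and are combined as
        uncond + (cond - uncond) * cfg_scale. cfg_dropout_prob randomly nulls
        conditioning during training; scale_phi applies CFG rescaling.
        """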

        assert not causal, "Causal mode is not supported for DiffusionTransformer"

        if cross_attn_cond_mask is not None:
            cross_attn_cond_mask = cross_attn_cond_mask.bool()

            # Temporarily disabling conditioning masks (flash-attention kernel issue)
            cross_attn_cond_mask = None

        if prepend_cond_mask is not None:
            prepend_cond_mask = prepend_cond_mask.bool()

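        # CFG dropout: randomly replace conditioning with a null (zero) embedding per batch element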
        if cfg_dropout_prob > 0.0:
            if cross_attn_cond is not None:
                null_embed = torch.zeros_like(cross_attn_cond)
                dropout_mask = torch.bernoulli(torch.full((cross_attn_cond.shape[0], 1, 1), cfg_dropout_prob, device=cross_attn_cond.device)).to(torch.bool)
                cross_attn_cond = torch.where(dropout_mask, null_embed, cross_attn_cond)

            if prepend_cond is not None:
                null_embed = torch.zeros_like(prepend_cond)
                dropout_mask = torch.bernoulli(torch.full((prepend_cond.shape[0], 1, 1), cfg_dropout_prob, device=prepend_cond.device)).to(torch.bool)
                prepend_cond = torch.where(dropout_mask, null_embed, prepend_cond)

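        # Classifier-free guidance: run conditioned and unconditioned passes in a single doubled batch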
        if cfg_scale != 1.0 and (cross_attn_cond is not None or prepend_cond is not None):
            # Concatenate conditioned and unconditioned inputs on the batch dimension
            batch_inputs = torch.cat([x, x], dim=0)
            batch_timestep = torch.cat([t, t], dim=0)

            if global_embed is not None:
                batch_global_cond = torch.cat([global_embed, global_embed], dim=0)
            else:
                batch_global_cond = None

            if input_concat_cond is not None:
                batch_input_concat_cond = torch.cat([input_concat_cond, input_concat_cond], dim=0)
            else:
                batch_input_concat_cond = None

            batch_cond = None
            batch_cond_masks = None

            # Handle CFG for the cross-attention conditioning
            if cross_attn_cond is not None:
                null_embed = torch.zeros_like(cross_attn_cond)

                # If negative cross-attention conditioning is provided, use it in place of the null embed
                if negative_cross_attn_cond is not None:
                    # If there's a negative cross-attention mask, set the masked-out tokens to the null embed
                    if negative_cross_attn_mask is not None:
                        negative_cross_attn_mask = negative_cross_attn_mask.to(torch.bool).unsqueeze(2)
                        negative_cross_attn_cond = torch.where(negative_cross_attn_mask, negative_cross_attn_cond, null_embed)

                    batch_cond = torch.cat([cross_attn_cond, negative_cross_attn_cond], dim=0)
                else:
                    batch_cond = torch.cat([cross_attn_cond, null_embed], dim=0)

                if cross_attn_cond_mask is not None:
                    batch_cond_masks = torch.cat([cross_attn_cond_mask, cross_attn_cond_mask], dim=0)

            batch_prepend_cond = None
            batch_prepend_cond_mask = None

            # Handle CFG for the prepend conditioning
            if prepend_cond is not None:
                null_embed = torch.zeros_like(prepend_cond)

                batch_prepend_cond = torch.cat([prepend_cond, null_embed], dim=0)

                if prepend_cond_mask is not None:
                    batch_prepend_cond_mask = torch.cat([prepend_cond_mask, prepend_cond_mask], dim=0)

            if mask is not None:
                batch_masks = torch.cat([mask, mask], dim=0)
            else:
                batch_masks = None

            batch_output = self._forward(
                batch_inputs,
                batch_timestep,
                cross_attn_cond=batch_cond,
                cross_attn_cond_mask=batch_cond_masks,
                mask=batch_masks,
                input_concat_cond=batch_input_concat_cond,
                global_embed=batch_global_cond,
                prepend_cond=batch_prepend_cond,
                prepend_cond_mask=batch_prepend_cond_mask,
                return_info=return_info,
                **kwargs)

            if return_info:
                batch_output, info = batch_output

            # Split the doubled batch back into conditioned and unconditioned outputs and apply guidance
            cond_output, uncond_output = torch.chunk(batch_output, 2, dim=0)
            cfg_output = uncond_output + (cond_output - uncond_output) * cfg_scale

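            # CFG rescale: match the guided output's std over channels to the conditioned output's, blended by scale_phi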
            if scale_phi != 0.0:
                cond_out_std = cond_output.std(dim=1, keepdim=True)
                out_cfg_std = cfg_output.std(dim=1, keepdim=True)
                output = scale_phi * (cfg_output * (cond_out_std / out_cfg_std)) + (1 - scale_phi) * cfg_output
            else:
                output = cfg_output

            if return_info:
                return output, info

            return output
        else:
            return self._forward(
                x,
                t,
                cross_attn_cond=cross_attn_cond,
                cross_attn_cond_mask=cross_attn_cond_mask,
                input_concat_cond=input_concat_cond,
                global_embed=global_embed,
                prepend_cond=prepend_cond,
                prepend_cond_mask=prepend_cond_mask,
                mask=mask,
                return_info=return_info,
                **kwargs
            )