Upload 3 files

9442c34 verified about 2 years ago

17.1 kB

	from functools import partial
	import torch
	import torch.nn as nn
	import numpy as np
	import torch.utils.checkpoint
	from timm.models.swin_transformer import SwinTransformerBlock
	from timm.models.vision_transformer import Block
	from timm.models.layers import to_2tuple


	class PatchEmbed(nn.Module):
	""" Image to Patch Embedding
	"""

	def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
	super().__init__()
	img_size = to_2tuple(img_size)
	patch_size = to_2tuple(patch_size)
	num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
	self.patch_hw = (img_size[1] // patch_size[1], img_size[0] // patch_size[0])
	self.img_size = img_size
	self.patch_size = patch_size
	self.num_patches = num_patches

	self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

	def forward(self, x):
	B, C, H, W = x.shape
	x = self.proj(x).flatten(2).transpose(1, 2)
	return x


	def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
	"""
	embed_dim: output dimension for each position
	pos: a list of positions to be encoded: size (M,)
	out: (M, D)
	"""
	assert embed_dim % 2 == 0
	omega = np.arange(embed_dim // 2, dtype=float)
	omega /= embed_dim / 2.
	omega = 1. / 10000 ** omega # (D/2,)

	pos = pos.reshape(-1) # (M,)
	out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product

	emb_sin = np.sin(out) # (M, D/2)
	emb_cos = np.cos(out) # (M, D/2)

	emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
	return emb


	def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
	assert embed_dim % 2 == 0
	# use half of dimensions to encode grid_h
	emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
	emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
	emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
	return emb


	def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
	"""
	grid_size: int of the grid height and width
	return:
	pos_embed: [grid_sizegrid_size, embed_dim] or [1+grid_sizegrid_size, embed_dim] (w/ or w/o cls_token)
	"""
	grid_h = np.arange(grid_size, dtype=np.float32)
	grid_w = np.arange(grid_size, dtype=np.float32)
	grid = np.meshgrid(grid_w, grid_h) # here w goes first
	grid = np.stack(grid, axis=0)

	grid = grid.reshape([2, 1, grid_size, grid_size])
	pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
	if cls_token:
	pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
	return pos_embed


	def get_2d_sincos_pos_embed_flexible(embed_dim, grid_size, cls_token=False):
	"""
	grid_size: int of the grid height and width
	return:
	pos_embed: [grid_sizegrid_size, embed_dim] or [1+grid_sizegrid_size, embed_dim] (w/ or w/o cls_token)
	"""
	grid_h = np.arange(grid_size[0], dtype=np.float32)
	grid_w = np.arange(grid_size[1], dtype=np.float32)
	grid = np.meshgrid(grid_w, grid_h) # here w goes first
	grid = np.stack(grid, axis=0)

	grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
	pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
	if cls_token:
	pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
	return pos_embed


	class SwinTransformerBlockWrapper(torch.nn.Module):
	"""
	Wrap SwinTransformerBlock to fit the input shape of [B, N, C] like TransformerBlock.

	The SwinTransformerBlock takes the input shape of [B, H, W, C], and TransformerBlock
	takes the input shape of [B, N, C].
	"""

	def __init__(self, block: SwinTransformerBlock):
	super().__init__()
	self.block = block
	self.input_resolution = block.input_resolution

	def forward(self, x):
	"""
	:param x: [B, N, C]
	:return: [B, N, C]
	"""
	B, N, C = x.shape
	x = x.reshape(B, *self.input_resolution, C)
	x = self.block(x)
	x = x.reshape(B, N, C)
	return x


	class MaskedAutoencoderViT(nn.Module):
	""" Masked Autoencoder with VisionTransformer backbone
	"""

	def __init__(
	self,
	img_size=224,
	patch_size=16,
	in_chans=3, # input channels. 1 for audio, 3 for image
	embed_dim=1024,
	depth=24, # transformer depth
	num_heads=16,
	decoder_mode=0, # 0: transformer (global attn), 1: swin-transformer (swined local attn)
	no_shift=False, # invalid when decoder_mode=0. swin-transformer. shift patch or not
	decoder_embed_dim=512,
	decoder_depth=8, # invalid when decoder_mode=1. It will be fixed to 16 when decoder_mode=1.
	decoder_num_heads=16, # invalid when decoder_mode=1. It will be fixed to 16 when decoder_mode=1.
	mlp_ratio=4., # hidden dimension / embed dimension in feedforward layer of transformer
	norm_layer=nn.LayerNorm,
	norm_pix_loss=False, # use (per-patch) normalized pixels as targets for computing loss
	pos_trainable=False,
	):
	super().__init__()

	self.img_size = to_2tuple(img_size)

	self.embed_dim = embed_dim
	self.decoder_embed_dim = decoder_embed_dim
	# MAE encoder specifics
	self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
	num_patches = self.patch_embed.num_patches

	self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

	self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim),
	requires_grad=pos_trainable) # fixed sin-cos embedding

	self.encoder_depth = depth
	self.blocks = nn.ModuleList([
	Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer) for _ in range(depth)])
	self.norm = norm_layer(embed_dim)

	# --------------------------------------------------------------------------
	# MAE decoder specifics
	self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)

	self.mask_token = nn.Parameter(torch.zeros(1, 1, decoder_embed_dim))
	self.decoder_pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, decoder_embed_dim),
	requires_grad=pos_trainable) # fixed sin-cos embedding

	self.no_shift = no_shift

	self.decoder_mode = decoder_mode

	window_size = (4, 4)
	feat_size = (self.img_size[0] // patch_size, 8)

	if self.decoder_mode == 1:
	decoder_modules = []
	for index in range(16):
	if self.no_shift:
	shift_size = (0, 0)
	else:
	if (index % 2) == 0:
	shift_size = (0, 0)
	else:
	shift_size = (2, 0)
	# shift_size = tuple([0 if ((index % 2) == 0) else w // 2 for w in window_size])
	decoder_modules.append(
	SwinTransformerBlockWrapper(
	SwinTransformerBlock(
	dim=decoder_embed_dim,
	input_resolution=feat_size,
	num_heads=16,
	window_size=window_size,
	shift_size=shift_size,
	mlp_ratio=mlp_ratio,
	proj_drop=0.0,
	attn_drop=0.0,
	drop_path=0.0,
	norm_layer=norm_layer,
	)
	)
	)
	self.decoder_blocks = nn.ModuleList(decoder_modules)
	else:
	# Transformer
	self.decoder_blocks = nn.ModuleList([
	Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer)
	for _ in range(decoder_depth)])

	self.decoder_norm = norm_layer(decoder_embed_dim)
	self.decoder_pred = nn.Linear(decoder_embed_dim, patch_size ** 2 * in_chans, bias=True) # decoder to patch

	self.norm_pix_loss = norm_pix_loss

	self.patch_size = patch_size

	self.initialize_weights()

	def initialize_weights(self):
	# initialize (and freeze) pos_embed by sin-cos embedding
	pos_embed = get_2d_sincos_pos_embed_flexible(self.pos_embed.shape[-1], self.patch_embed.patch_hw,
	cls_token=True)
	self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

	decoder_pos_embed = get_2d_sincos_pos_embed_flexible(self.decoder_pos_embed.shape[-1],
	self.patch_embed.patch_hw, cls_token=True)
	self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))

	# initialize patch_embed like nn.Linear (instead of nn.Conv2d)
	w = self.patch_embed.proj.weight.data
	torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

	# timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
	torch.nn.init.normal_(self.cls_token, std=.02)
	torch.nn.init.normal_(self.mask_token, std=.02)

	# initialize nn.Linear and nn.LayerNorm
	self.apply(self._init_weights)

	def _init_weights(self, m):
	if isinstance(m, nn.Linear):
	# we use xavier_uniform following official JAX ViT:
	torch.nn.init.xavier_uniform_(m.weight)
	if isinstance(m, nn.Linear) and m.bias is not None:
	nn.init.constant_(m.bias, 0)
	elif isinstance(m, nn.LayerNorm):
	nn.init.constant_(m.bias, 0)
	nn.init.constant_(m.weight, 1.0)

	def patchify(self, imgs):
	"""
	imgs: (N, 3, H, W)
	x: (N, L, patch_size*2 3)
	L = (H/p)*(W/p)
	"""
	p = self.patch_embed.patch_size[0]

	h = imgs.shape[2] // p
	w = imgs.shape[3] // p
	# h,w = self.patch_embed.patch_hw
	x = imgs.reshape(shape=(imgs.shape[0], 1, h, p, w, p))
	x = torch.einsum('nchpwq->nhwpqc', x)
	x = x.reshape(imgs.shape[0], h * w, p ** 2 * 1)

	return x

	def unpatchify(self, x):
	"""
	x: (N, L, patch_size*2 3)
	specs: (N, 1, H, W)
	"""
	p = self.patch_embed.patch_size[0]
	h = self.img_size[0] // p
	w = 128 // p
	x = x.reshape(shape=(x.shape[0], h, w, p, p, 1))
	x = torch.einsum('nhwpqc->nchpwq', x)
	specs = x.reshape(x.shape[0], 1, h * p, w * p)
	return specs

	def random_masking(self, x, mask_ratio):
	"""
	Perform per-sample random masking by per-sample shuffling.
	Per-sample shuffling is done by argsort random noise.
	x: [N, L, D], sequence
	"""
	N, L, D = x.shape # batch, length, dim
	len_keep = int(L * (1 - mask_ratio))

	noise = torch.rand(N, L, device=x.device) # noise in [0, 1]

	# sort noise for each sample
	ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove
	ids_restore = torch.argsort(ids_shuffle, dim=1)

	# keep the first subset
	ids_keep = ids_shuffle[:, :len_keep]
	x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

	# generate the binary mask: 0 is keep, 1 is remove
	mask = torch.ones([N, L], device=x.device)
	mask[:, :len_keep] = 0
	# unshuffle to get the binary mask
	mask = torch.gather(mask, dim=1, index=ids_restore)

	return x_masked, mask, ids_restore

	def forward_encoder(self, x, mask_ratio):
	"""
	:param x: [N, C, H, W]
	:param mask_ratio: float. ratio of masked patches
	:return: tuple. x: [N, L', D], mask: [N, L], ids_restore: [N, L], None
	"""
	# embed patches
	x = self.patch_embed(x)

	B, L, D = x.shape

	# add pos embed w/o cls token
	x = x + self.pos_embed[:, 1:L + 1, :]

	# masking: length -> length * mask_ratio
	x, mask, ids_restore = self.random_masking(x, mask_ratio)

	# append cls token
	cls_token = self.cls_token + self.pos_embed[:, :1, :]
	cls_tokens = cls_token.expand(x.shape[0], -1, -1)
	x = torch.cat((cls_tokens, x), dim=1)

	# apply Transformer blocks
	for blk in self.blocks:
	x = blk(x)
	x = self.norm(x)

	return x, mask, ids_restore

	def forward_encoder_no_mask(
	self,
	x,
	header='mean'
	):
	"""
	:param x: [N, C, H, W]
	:param header: str. 'cls' or 'mean'
	:param key_padding_mask: [N, L], 0 is keep, 1 is remove
	:return: contextual_emb: [N, L, D]
	"""
	# embed patches
	x = self.patch_embed(x)

	B, L, D = x.shape

	# add pos embed w/o cls token
	x = x + self.pos_embed[:, 1:L + 1, :]

	# append cls token
	cls_token = self.cls_token + self.pos_embed[:, :1, :]
	cls_tokens = cls_token.expand(x.shape[0], -1, -1)
	x = torch.cat((cls_tokens, x), dim=1)

	# apply Transformer blocks
	for n, blk in enumerate(self.blocks):
	x = blk(x)

	x = self.norm(x)

	if header == 'cls':
	emb = x[:, 0, :]
	elif header == 'mean':
	emb = x[:, 1:, :].mean(dim=1)
	else:
	raise NotImplementedError

	return emb

	def forward_decoder(self, x, ids_restore):
	"""
	:param x: [N, L, D]
	:param ids_restore: [N, L]
	:return: pred: [N, L, pp3], None, None
	"""
	# embed tokens
	x = self.decoder_embed(x) # [N, L, D] -> [N, L, D']

	# append mask tokens to sequence
	mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
	x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1) # no cls token
	x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])) # unshuffle
	x = torch.cat([x[:, :1, :], x_], dim=1) # append cls token

	B, L, D = x.shape

	# add pos embed
	x = x + self.decoder_pos_embed[:, :L, :]

	if self.decoder_mode != 0:
	B, L, D = x.shape
	x = x[:, 1:, :]

	if self.decoder_mode > 3: # mvit
	x = self.decoder_blocks(x)
	else:
	# apply Transformer blocks
	for blk in self.decoder_blocks:
	x = blk(x)

	x = self.decoder_norm(x)

	# predictor projection
	pred = self.decoder_pred(x)

	# remove cls token
	if self.decoder_mode == 0:
	pred = pred[:, 1:, :]

	return pred

	def forward_loss(self, imgs, pred, mask, norm_pix_loss=False):
	"""
	imgs: [N, 3, H, W]
	pred: [N, L, pp3]
	mask: [N, L], 0 is keep, 1 is remove,
	"""
	target = self.patchify(imgs)
	if norm_pix_loss:
	mean = target.mean(dim=-1, keepdim=True)
	var = target.var(dim=-1, keepdim=True)
	target = (target - mean) / (var + 1.e-6) ** .5

	loss = (pred - target) ** 2
	loss = loss.mean(dim=-1) # [N, L], mean loss per patch

	loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
	return loss

	def forward(self, imgs, mask_ratio=0.8):
	"""

	:param imgs: [N, C, H, W]
	:param mask_ratio: float. ratio of masked patches
	:return: tuple. loss_recon: float, pred: [N, L, pp3], mask: [N, L], None
	"""
	emb_enc, mask, ids_restore = self.forward_encoder(imgs, mask_ratio)
	pred = self.forward_decoder(emb_enc, ids_restore) # [N, L, pp3]
	loss_recon = self.forward_loss(imgs, pred, mask, norm_pix_loss=self.norm_pix_loss)
	return loss_recon, pred, mask


	if __name__ == '__main__':
	device = 'cpu'
	# device = 'cuda'

	# Model
	audio_mae = MaskedAutoencoderViT(
	img_size=(2048, 128),
	patch_size=16,
	in_chans=1,
	embed_dim=768,
	depth=12,
	num_heads=12,
	decoder_mode=1,
	no_shift=False,
	decoder_embed_dim=512,
	norm_layer=partial(nn.LayerNorm, eps=1e-6),
	norm_pix_loss=False,
	pos_trainable=False,
	)

	# Load pre-trained weights
	ckpt_path = 'music-mae-32kHz.pth'
	audio_mae.load_state_dict(torch.load(ckpt_path, map_location='cpu'))
	audio_mae.to(device)

	# Generate a batch of random inputs: (N, C, H, W), N=4 (batch size), C=1 (channel), H=2048, W=128
	# Each input is a mel spectrogram with shape (2048, 128)
	x = torch.randn(4, 1, 2048, 128).to(device)

	# Compute the representation of the input batch
	emb = audio_mae.forward_encoder_no_mask(x, header='mean') # torch.Size([4, 768])