| """ |
| U-Net architecture for conditional diffusion on spatiotemporal PDE data. |
| Supports non-square inputs, time conditioning, and skip connections. |
| """ |
| import math |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
|
|
class SinusoidalPosEmb(nn.Module):
    """Embed scalar diffusion timesteps as `dim`-dimensional vectors.

    Standard transformer-style scheme: the first half of the channels are
    sines and the second half cosines, over log-spaced frequencies from 1
    down to 1/10000.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        half_dim = self.dim // 2
        # Geometric frequency ladder: freqs[i] = 10000^(-i / (half_dim - 1)).
        scale = -math.log(10000) / (half_dim - 1)
        freqs = torch.exp(scale * torch.arange(half_dim, device=t.device))
        # Outer product of timesteps [B] with frequencies [half_dim] -> [B, half_dim].
        args = t.float().unsqueeze(-1) * freqs.unsqueeze(0)
        return torch.cat((args.sin(), args.cos()), dim=-1)
|
|
|
|
class ResBlock(nn.Module):
    """Pre-activation residual block (GN -> SiLU -> conv, twice).

    The diffusion timestep embedding is projected to a per-channel bias and
    added between the two convolutions. A 1x1 projection on the identity
    path keeps the residual add shape-compatible when channel counts differ.
    """

    def __init__(self, in_ch, out_ch, time_dim, dropout=0.1):
        super().__init__()
        # NOTE: submodule registration order is kept stable so seeded
        # parameter initialization is reproducible.
        self.norm1 = nn.GroupNorm(min(32, in_ch), in_ch)
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1)
        self.time_mlp = nn.Sequential(nn.SiLU(), nn.Linear(time_dim, out_ch))
        self.norm2 = nn.GroupNorm(min(32, out_ch), out_ch)
        self.dropout = nn.Dropout(dropout)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
        if in_ch != out_ch:
            self.skip = nn.Conv2d(in_ch, out_ch, 1)
        else:
            self.skip = nn.Identity()

    def forward(self, x, t_emb):
        out = self.conv1(F.silu(self.norm1(x)))
        # Broadcast the [B, out_ch] time projection over the spatial dims.
        out = out + self.time_mlp(t_emb).unsqueeze(-1).unsqueeze(-1)
        out = self.conv2(self.dropout(F.silu(self.norm2(out))))
        return self.skip(x) + out
|
|
|
|
class SelfAttention(nn.Module):
    """Multi-head self-attention over spatial positions, with residual add."""

    def __init__(self, channels, num_heads=4):
        super().__init__()
        self.norm = nn.GroupNorm(min(32, channels), channels)
        self.attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)

    def forward(self, x):
        batch, ch, height, width = x.shape
        # Flatten the spatial grid into a token sequence: [B, H*W, C].
        tokens = self.norm(x).flatten(2).transpose(1, 2)
        attended, _ = self.attn(tokens, tokens, tokens)
        # Restore channels-first layout [B, C, H, W] and apply the residual.
        attended = attended.transpose(1, 2).reshape(batch, ch, height, width)
        return x + attended
|
|
|
|
class Downsample(nn.Module):
    """Halve spatial resolution with a stride-2 3x3 convolution."""

    def __init__(self, ch):
        super().__init__()
        # Stride-2 conv: output size is ceil(H/2) x ceil(W/2).
        self.conv = nn.Conv2d(ch, ch, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        return self.conv(x)
|
|
|
|
class Upsample(nn.Module):
    """Double spatial resolution: nearest-neighbor resize then a 3x3 conv."""

    def __init__(self, ch):
        super().__init__()
        self.conv = nn.Conv2d(ch, ch, kernel_size=3, padding=1)

    def forward(self, x):
        upsampled = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(upsampled)
|
|
|
|
class UNet(nn.Module):
    """U-Net noise predictor for conditional diffusion.

    The condition (e.g. the previous frame) is concatenated to the noisy
    input along the channel dimension — either by the caller before calling
    ``forward`` or via the ``cond`` argument — so set
    ``in_channels = out_channels + condition_channels``.

    Non-square and odd spatial sizes are supported: decoder features are
    cropped to the matching skip connection's spatial size before
    concatenation, since nearest-neighbor upsampling of an odd dimension
    overshoots by one pixel (``2 * ceil(H/2) == H + 1``).

    Args:
        in_channels: noisy-target channels + condition channels.
        out_channels: channels to predict (same as target).
        base_ch: base channel width.
        ch_mults: per-level channel multipliers.
        n_res: residual blocks per level.
        attn_levels: which levels get self-attention (0-indexed).
        dropout: dropout rate.
        time_dim: timestep embedding dimension.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        base_ch=64,
        ch_mults=(1, 2, 4, 8),
        n_res=2,
        attn_levels=(3,),
        dropout=0.1,
        time_dim=256,
    ):
        super().__init__()
        self.n_res = n_res
        self.ch_mults = ch_mults

        # Timestep embedding: sinusoidal features refined by a small MLP.
        self.time_embed = nn.Sequential(
            SinusoidalPosEmb(time_dim),
            nn.Linear(time_dim, time_dim * 4),
            nn.SiLU(),
            nn.Linear(time_dim * 4, time_dim),
        )

        self.input_conv = nn.Conv2d(in_channels, base_ch, 3, padding=1)

        # --- Encoder ---
        self.downs = nn.ModuleList()
        ch = base_ch
        # Channel widths of every feature pushed onto the skip stack; the
        # decoder pops them in reverse to size its concatenated inputs.
        skip_chs = [ch]

        for lvl, mult in enumerate(ch_mults):
            out_ch = base_ch * mult
            for _ in range(n_res):
                self.downs.append(
                    nn.ModuleDict(
                        {
                            "res": ResBlock(ch, out_ch, time_dim, dropout),
                            **(
                                {"attn": SelfAttention(out_ch)}
                                if lvl in attn_levels
                                else {}
                            ),
                        }
                    )
                )
                ch = out_ch
                skip_chs.append(ch)
            if lvl < len(ch_mults) - 1:  # no downsample after the deepest level
                self.downs.append(nn.ModuleDict({"down": Downsample(ch)}))
                skip_chs.append(ch)

        # --- Bottleneck ---
        self.mid_res1 = ResBlock(ch, ch, time_dim, dropout)
        self.mid_attn = SelfAttention(ch)
        self.mid_res2 = ResBlock(ch, ch, time_dim, dropout)

        # --- Decoder (mirrors the encoder, consuming skips in reverse) ---
        self.ups = nn.ModuleList()
        for lvl in reversed(range(len(ch_mults))):
            out_ch = base_ch * ch_mults[lvl]
            # n_res + 1 blocks per level: one extra to consume the skip saved
            # at the downsample (or input-conv) boundary.
            for _ in range(n_res + 1):
                skip_ch = skip_chs.pop()
                self.ups.append(
                    nn.ModuleDict(
                        {
                            "res": ResBlock(ch + skip_ch, out_ch, time_dim, dropout),
                            **(
                                {"attn": SelfAttention(out_ch)}
                                if lvl in attn_levels
                                else {}
                            ),
                        }
                    )
                )
                ch = out_ch
            if lvl > 0:
                self.ups.append(nn.ModuleDict({"up": Upsample(ch)}))

        self.out_norm = nn.GroupNorm(min(32, ch), ch)
        self.out_conv = nn.Conv2d(ch, out_channels, 3, padding=1)

    def forward(self, x, t, cond=None):
        """Predict the noise added to ``x`` at diffusion step ``t``.

        Args:
            x: noisy target [B, C_out, H, W]
            t: diffusion timestep [B] (int or float)
            cond: condition [B, C_cond, H, W] (optional; concatenated to x)
        Returns:
            predicted noise [B, C_out, H, W]
        """
        if cond is not None:
            x = torch.cat([x, cond], dim=1)

        t_emb = self.time_embed(t)
        h = self.input_conv(x)

        # Encoder: record a skip after every res block and every downsample.
        skips = [h]
        for block in self.downs:
            if "down" in block:
                h = block["down"](h)
                skips.append(h)
            else:
                h = block["res"](h, t_emb)
                if "attn" in block:
                    h = block["attn"](h)
                skips.append(h)

        h = self.mid_res1(h, t_emb)
        h = self.mid_attn(h)
        h = self.mid_res2(h, t_emb)

        # Decoder: pop skips in reverse and fuse by channel concatenation.
        for block in self.ups:
            if "up" in block:
                h = block["up"](h)
            else:
                s = skips.pop()
                # BUG FIX: for odd spatial sizes the stride-2 downsample gives
                # ceil(H/2), so the subsequent 2x upsample yields H + 1 and the
                # concat below would fail. The upsampled h is never smaller
                # than the skip, so cropping to the skip's size is exact (and
                # a no-op for even dimensions).
                if h.shape[-2:] != s.shape[-2:]:
                    h = h[..., : s.shape[-2], : s.shape[-1]]
                h = torch.cat([h, s], dim=1)
                h = block["res"](h, t_emb)
                if "attn" in block:
                    h = block["attn"](h)

        h = F.silu(self.out_norm(h))
        return self.out_conv(h)
|
|