asdf98
/

LuminaRS

Model card Files Files and versions

xet

Community

asdf98 committed 11 days ago

Commit

c6e7340

verified ·

1 Parent(s): bc114d4

Upload luminars/model.py

Browse files

Files changed (1) hide show

luminars/model.py +265 -0

luminars/model.py ADDED Viewed

	@@ -0,0 +1,265 @@

+"""
+LuminaRS -- Lightweight Latent Recursive Diffusion.
+A small UNet+iterative-refinement model (~110M params) for art/illustration generation.
+Uses: pretrained VAE, pretrained CLIP text encoder (both frozen), custom lightweight UNet.
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+# ---------------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------------
def timestep_embedding(t, dim, max_period=10000):
    """Build sinusoidal embeddings for a batch of timesteps.

    Args:
        t: 1-D tensor of timesteps (it is indexed as ``t[:, None]``).
        dim: Width of the returned embedding.
        max_period: Base of the geometric frequency schedule; larger values
            give lower minimum frequencies.

    Returns:
        Tensor of shape ``(len(t), dim)``. The first ``dim // 2`` columns are
        cosines, the next ``dim // 2`` are sines, and one trailing zero column
        is appended when ``dim`` is odd.
    """
    n_freqs = dim // 2
    exponents = torch.arange(0, n_freqs, dtype=torch.float32, device=t.device)
    freqs = torch.exp(-math.log(max_period) * exponents / n_freqs)
    phase = t[:, None] * freqs[None]
    pieces = [torch.cos(phase), torch.sin(phase)]
    if dim % 2:
        # Pad odd widths with a single zero column.
        pieces.append(torch.zeros_like(phase[:, :1]))
    return torch.cat(pieces, dim=-1)
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    The input is divided by ``sqrt(mean(x**2) + eps)`` computed along the last
    axis and multiplied by a learned per-feature gain ``g`` (initialised to 1).
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.g = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_sq + self.eps)
        return self.g * x * inv_rms
+# ---------------------------------------------------------------------------
+# Multi-Query Attention (MQA) -- faster than MHA on mobile
+# ---------------------------------------------------------------------------
class MQAttention(nn.Module):
    """Scaled dot-product attention over a token sequence.

    With ``context=None`` this is self-attention; otherwise queries come from
    ``x`` and keys/values from ``context``.

    NOTE(review): despite the class name, keys and values are projected to the
    full ``dim`` and split into ``n_heads`` heads exactly like the queries, so
    every head has its own K/V (standard multi-head attention).
    """

    def __init__(self, dim, n_heads=8):
        super().__init__()
        assert dim % n_heads == 0
        self.n_heads = n_heads
        self.dh = dim // n_heads
        self.scale = self.dh ** -0.5
        self.q_proj = nn.Linear(dim, dim)
        self.k_proj = nn.Linear(dim, dim)
        self.v_proj = nn.Linear(dim, dim)
        self.out_proj = nn.Linear(dim, dim)

    def _split_heads(self, tokens, batch):
        """Reshape (B, N, dim) -> (B, n_heads, N, dh)."""
        return tokens.view(batch, -1, self.n_heads, self.dh).transpose(1, 2)

    def forward(self, x, context=None):
        """Attend from ``x`` (B, L, dim) to ``context`` (B, S, dim); returns (B, L, dim)."""
        B, L, C = x.shape
        source = x if context is None else context

        q = self._split_heads(self.q_proj(x), B)
        k = self._split_heads(self.k_proj(source), B)
        v = self._split_heads(self.v_proj(source), B)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        weights = scores.softmax(dim=-1)
        merged = torch.matmul(weights, v).transpose(1, 2).reshape(B, L, C)
        return self.out_proj(merged)
+# ---------------------------------------------------------------------------
+# ConvNeXt-like Block (depthwise + pointwise + GELU)
+# ---------------------------------------------------------------------------
class ConvNeXtBlock(nn.Module):
    """Residual ConvNeXt-style block with optional text cross-attention.

    Residual branch: 7x7 depthwise conv -> GroupNorm (single group) ->
    Linear (x4 expansion) -> GELU -> Linear (back to ``dim``). The two Linear
    layers are applied in channels-last layout, i.e. they act per pixel.

    Args:
        dim: Channel count; the block preserves it.
        drop_path: Stochastic-depth probability. When it is exactly ``0.0`` a
            zero-initialised per-channel scale ``gamma`` is created and
            ``drop_path`` is an identity; otherwise ``gamma`` is ``None`` and a
            ``DropPath`` module is used.
        text_dim: Width of the text embeddings. When given, a cross-attention
            stage (RMSNorm -> attention over projected text tokens) is added
            after the convolutional residual.
    """

    def __init__(self, dim, drop_path=0.0, text_dim=None):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = nn.GroupNorm(1, dim)
        self.pwconv1 = nn.Linear(dim, dim * 4)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(dim * 4, dim)

        if drop_path == 0.0:
            self.gamma = nn.Parameter(torch.zeros(1, dim, 1, 1))
        else:
            self.gamma = None

        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Cross-attention sub-modules exist only when text conditioning is requested.
        if text_dim is None:
            self.text_attn = None
        else:
            self.text_norm = RMSNorm(dim)
            self.text_attn = MQAttention(dim)
            self.text_proj = nn.Linear(text_dim, dim)

    def forward(self, x, text_emb=None):
        """Apply the block to ``x`` (B, C, H, W); ``text_emb`` is (B, L, text_dim) or None."""
        residual = x

        branch = self.norm(self.dwconv(x))
        branch = branch.permute(0, 2, 3, 1)  # channels-last for the Linear layers
        branch = self.pwconv2(self.act(self.pwconv1(branch)))
        branch = branch.permute(0, 3, 1, 2)  # back to (B, C, H, W)
        if self.gamma is not None:
            branch = branch * self.gamma
        out = residual + self.drop_path(branch)

        # Skip cross-attention unless both the module and the text are present.
        if self.text_attn is None or text_emb is None:
            return out

        B, C, H, W = out.shape
        tokens = out.view(B, C, H * W).transpose(1, 2)  # (B, HW, C)
        attended = self.text_attn(self.text_norm(tokens), self.text_proj(text_emb))
        tokens = tokens + attended
        return tokens.transpose(1, 2).view(B, C, H, W)
class DropPath(nn.Module):
    """Stochastic depth (drop path).

    During training each sample in the batch has its whole input zeroed with
    probability ``drop_prob``; surviving samples are scaled by
    ``1 / (1 - drop_prob)`` so the expected value of the output equals the
    input. In eval mode, or when ``drop_prob == 0``, the input is returned
    unchanged.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        if keep_prob <= 0.0:
            # Everything is dropped; avoid dividing by zero below.
            return torch.zeros_like(x)
        # One Bernoulli draw per sample, broadcast over all remaining dims.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        keep = torch.rand(shape, device=x.device) < keep_prob
        # Build the mask in x's dtype so half/bfloat16 inputs are not upcast.
        mask = keep.to(x.dtype)
        return x * (mask / keep_prob)
+# ---------------------------------------------------------------------------
+# Down/Up blocks
+# ---------------------------------------------------------------------------
class DownBlock(nn.Module):
    """Encoder stage: ``n_blocks`` ConvNeXt blocks followed by a 2x downsample.

    ConvNeXtBlock preserves its channel count, so every block runs at
    ``in_ch`` and the stride-2 convolution is the single place where the
    width changes from ``in_ch`` to ``out_ch``. (Building later blocks and the
    downsample with ``out_ch`` inputs, as before, fails whenever
    ``in_ch != out_ch``.)

    Args:
        in_ch: Channels of the incoming feature map.
        out_ch: Channels of the downsampled output.
        n_blocks: Number of ConvNeXt blocks applied before downsampling.
        text_dim: Forwarded to each ConvNeXtBlock to enable cross-attention.
        drop_path: Forwarded to each ConvNeXtBlock.
    """

    def __init__(self, in_ch, out_ch, n_blocks=2, text_dim=None, drop_path=0.0):
        super().__init__()
        self.blocks = nn.ModuleList([
            ConvNeXtBlock(in_ch, drop_path=drop_path, text_dim=text_dim)
            for _ in range(n_blocks)
        ])
        # 3x3 / stride 2 / pad 1: halves H and W (rounding up) and maps in_ch -> out_ch.
        self.down = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=2, padding=1)

    def forward(self, x, text_emb=None):
        """Map (B, in_ch, H, W) to (B, out_ch, ceil(H/2), ceil(W/2))."""
        for blk in self.blocks:
            x = blk(x, text_emb)
        return self.down(x)
class UpBlock(nn.Module):
    """Decoder stage: 2x transposed-conv upsample, additive skip, ConvNeXt blocks.

    Args:
        in_ch: Channels of the incoming (low-resolution) feature map.
        out_ch: Channels after upsampling; ``skip`` must have this many
            channels and the upsampled spatial size, since it is added directly.
        n_blocks: Number of ConvNeXt blocks applied after the skip addition.
        text_dim: Forwarded to each ConvNeXtBlock to enable cross-attention.
        drop_path: Forwarded to each ConvNeXtBlock.
    """

    def __init__(self, in_ch, out_ch, n_blocks=2, text_dim=None, drop_path=0.0):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2)
        stack = []
        for _ in range(n_blocks):
            stack.append(ConvNeXtBlock(out_ch, drop_path=drop_path, text_dim=text_dim))
        self.blocks = nn.ModuleList(stack)

    def forward(self, x, skip, text_emb=None):
        """Upsample ``x``, add ``skip`` element-wise, then refine with the blocks."""
        h = self.up(x) + skip
        for block in self.blocks:
            h = block(h, text_emb)
        return h
+# ---------------------------------------------------------------------------
+# Time Embedder
+# ---------------------------------------------------------------------------
class TimeEmbed(nn.Module):
    """Turn scalar timesteps into a learned conditioning vector.

    A sinusoidal embedding of width ``t_dim`` is passed through a two-layer
    MLP (Linear -> SiLU -> Linear) producing ``out_dim`` features.
    """

    def __init__(self, t_dim=256, out_dim=256):
        super().__init__()
        expand = nn.Linear(t_dim, out_dim)
        refine = nn.Linear(out_dim, out_dim)
        self.mlp = nn.Sequential(expand, nn.SiLU(), refine)

    def forward(self, t):
        # The sinusoid width is read back from the first Linear layer.
        first_layer = self.mlp[0]
        sinusoid = timestep_embedding(t, first_layer.in_features)
        return self.mlp(sinusoid)
+# ---------------------------------------------------------------------------
+# MAIN MODEL: LuminaRS
+# ---------------------------------------------------------------------------
class LuminaRS(nn.Module):
    """Lightweight latent diffusion UNet with shared-weight iterative refinement.

    Layout (all sizes come from ``cfg``):
    - ``in_conv`` lifts the latent to ``cfg.channels[0]`` channels.
    - Encoder: one ``DownBlock`` per adjacent pair in ``cfg.channels``.
    - Bottleneck: ``cfg.n_bottleneck`` ConvNeXt blocks at ``cfg.channels[-1]``.
    - Decoder: one ``UpBlock`` per encoder stage, in reverse, with additive skips.
    - Text cross-attention is enabled in every block.
    - The whole encoder/bottleneck/decoder is applied ``cfg.n_recurse`` times
      with the same weights, each pass adding a residual update.

    The original author's note targets 1024x1024 images with a 32x32x16 latent
    and channels 16 -> 32 -> 64 -> 128 -> 256.

    ``cfg`` must provide: ``channels`` (sequence of ints), ``t_embed_dim``,
    ``text_embed_dim``, ``latent_dim``, ``drop_path``, ``n_bottleneck`` and
    ``n_recurse``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        chs = cfg.channels
        base = chs[0]

        self.time_embed = TimeEmbed(cfg.t_embed_dim, base * 4)
        # One projection of the time embedding per resolution level.
        self.time_projs = nn.ModuleList([nn.Linear(base * 4, c) for c in chs])
        # Text embeddings come from an external (frozen) text encoder.
        self.text_proj = nn.Linear(cfg.text_embed_dim, base)

        # --- Encoder ---
        self.in_conv = nn.Conv2d(cfg.latent_dim, base, kernel_size=3, padding=1)
        self.enc_blocks = nn.ModuleList()
        for i in range(len(chs) - 1):
            self.enc_blocks.append(
                DownBlock(chs[i], chs[i + 1], n_blocks=2,
                          text_dim=base, drop_path=cfg.drop_path)
            )

        # --- Bottleneck ---
        self.bottleneck = nn.ModuleList([
            ConvNeXtBlock(chs[-1], drop_path=cfg.drop_path, text_dim=base)
            for _ in range(cfg.n_bottleneck)
        ])

        # --- Decoder ---
        self.dec_blocks = nn.ModuleList()
        for i in range(len(chs) - 1, 0, -1):
            self.dec_blocks.append(
                UpBlock(chs[i], chs[i - 1], n_blocks=2,
                        text_dim=base, drop_path=cfg.drop_path)
            )
        self.out_conv = nn.Conv2d(base, cfg.latent_dim, kernel_size=1)

        # Number of shared-weight refinement passes per forward call.
        self.n_recurse = cfg.n_recurse

    def forward(self, z, text_emb, t):
        """Predict the denoising target for a noisy latent.

        Args:
            z: (B, latent_dim, H, W) noisy latent. H and W must be divisible
                by ``2 ** len(enc_blocks)`` so every upsample matches its skip.
            text_emb: (B, L, text_embed_dim) text embeddings.
            t: (B,) timesteps (0 = noise, 1 = clean for flow matching).

        Returns:
            (B, latent_dim, H, W) predicted velocity / noise.

        Raises:
            ValueError: if H or W is not divisible by ``2 ** len(enc_blocks)``.
        """
        n_levels = len(self.enc_blocks)
        factor = 2 ** n_levels
        height, width = z.shape[-2], z.shape[-1]
        if height % factor or width % factor:
            raise ValueError(
                f"latent spatial size {height}x{width} must be divisible by {factor}"
            )

        t_emb = self.time_embed(t)            # (B, C0*4)
        text_cond = self.text_proj(text_emb)  # (B, L, C0)

        x = self.in_conv(z)
        for _ in range(self.n_recurse):
            skips = []
            h = x
            # Encoder: record each skip BEFORE downsampling, so it has the
            # resolution and channel count the matching UpBlock produces.
            for level, down in enumerate(self.enc_blocks):
                h = h + self.time_projs[level](t_emb)[:, :, None, None]
                skips.append(h)
                h = down(h, text_cond)

            for blk in self.bottleneck:
                h = blk(h, text_cond)

            # Decoder: walk the levels from coarsest to finest.
            for i, (up, skip) in enumerate(zip(self.dec_blocks, reversed(skips))):
                h = h + self.time_projs[n_levels - i](t_emb)[:, :, None, None]
                h = up(h, skip, text_cond)

            x = x + h  # residual update between refinement passes
        return self.out_conv(x)

    def count_params(self):
        """Return ``(total, trainable)`` parameter counts."""
        total = 0
        trainable = 0
        for p in self.parameters():
            n = p.numel()
            total += n
            if p.requires_grad:
                trainable += n
        return total, trainable