Add microforge/planner.py
microforge/planner.py (ADDED) +270 -0
@@ -0,0 +1,270 @@
"""
Recurrent Latent Planner (RLP)
===============================

The "reasoning core" of MicroForge. Inspired by:
- RIN (Recurrent Interface Networks, Jabri et al. 2022): decoupled latent tokens
  that iteratively refine via cross-attention to image tokens
- DiMSUM shared attention: lightweight global context
- HRM/TRM recursive reasoning: iterative refinement of a compact state

The RLP maintains a fixed set of K latent tokens (the "plan") that:
1. READ from the noised image latent to understand the current state
2. REASON internally via self-attention over plan tokens
3. WRITE back to the image latent to guide denoising

This is applied BEFORE each denoising step, creating a planning loop:
    plan_0 = init(text_emb)
    for s in diffusion_steps:
        plan_{s+1} = RLP.read_reason_write(z_s, plan_s, text_emb)
        z_{s+1} = backbone(z_s, t_s, text_emb, plan_{s+1})

Key insight (from RIN): the plan tokens are much fewer than the image tokens
(K=32 vs N=256+), so self-attention over the plan is cheap, and cross-attention
(K queries, N keys) is O(K*N), which is small when K << N.

This gives the model a "thinking" mechanism: it can reason about the
image at a higher level before committing to pixel-level changes.

For editing: the planner can compare source and target latents and
plan what needs to change (like a diff operation in latent space).
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple


class PlannerReadWrite(nn.Module):
    """
    Cross-attention interface between plan tokens and image tokens.
    READ: plan attends to image -> updates plan
    WRITE: image attends to plan -> plan guides image
    """
    def __init__(self, dim: int, num_heads: int = 4):
        super().__init__()
        self.head_dim = dim // num_heads
        self.num_heads = num_heads

        # Read: plan tokens query, image tokens are keys/values
        self.read_q = nn.Linear(dim, dim, bias=False)
        self.read_kv = nn.Linear(dim, dim * 2, bias=False)
        self.read_out = nn.Linear(dim, dim, bias=False)
        self.read_norm_plan = nn.LayerNorm(dim)
        self.read_norm_img = nn.LayerNorm(dim)

        # Write: image tokens query, plan tokens are keys/values
        self.write_q = nn.Linear(dim, dim, bias=False)
        self.write_kv = nn.Linear(dim, dim * 2, bias=False)
        self.write_out = nn.Linear(dim, dim, bias=False)
        self.write_norm_img = nn.LayerNorm(dim)
        self.write_norm_plan = nn.LayerNorm(dim)

    def _attention(self, q, k, v):
        B, H, N, D = q.shape
        scale = D ** -0.5
        attn = (q @ k.transpose(-2, -1)) * scale
        attn = attn.softmax(dim=-1)
        return attn @ v

    def read(self, plan: torch.Tensor, image: torch.Tensor) -> torch.Tensor:
        """Plan reads from image. plan: [B,K,D], image: [B,N,D] -> updated plan [B,K,D]"""
        B, K, D = plan.shape
        N = image.shape[1]

        q = self.read_q(self.read_norm_plan(plan)).reshape(B, K, self.num_heads, self.head_dim).transpose(1, 2)
        kv = self.read_kv(self.read_norm_img(image)).reshape(B, N, 2, self.num_heads, self.head_dim)
        k, v = kv[:, :, 0].transpose(1, 2), kv[:, :, 1].transpose(1, 2)

        out = self._attention(q, k, v)
        out = out.transpose(1, 2).reshape(B, K, D)
        return plan + self.read_out(out)

    def write(self, image: torch.Tensor, plan: torch.Tensor) -> torch.Tensor:
        """Plan writes to image. image: [B,N,D], plan: [B,K,D] -> updated image [B,N,D]"""
        B, N, D = image.shape
        K = plan.shape[1]

        q = self.write_q(self.write_norm_img(image)).reshape(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        kv = self.write_kv(self.write_norm_plan(plan)).reshape(B, K, 2, self.num_heads, self.head_dim)
        k, v = kv[:, :, 0].transpose(1, 2), kv[:, :, 1].transpose(1, 2)

        out = self._attention(q, k, v)
        out = out.transpose(1, 2).reshape(B, N, D)
        return image + self.write_out(out)

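# Shape reference (illustrative values, assuming dim=384, num_heads=4, B=2, K=32, N=256):
#   read(plan [2, 32, 384], image [2, 256, 384])  -> plan  [2, 32, 384]
#   write(image [2, 256, 384], plan [2, 32, 384]) -> image [2, 256, 384]
# Both are residual updates around a single cross-attention.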

class PlannerReasoning(nn.Module):
    """
    Self-attention + FFN over plan tokens.
    This is where the "thinking" happens - plan tokens reason about
    what the image should look like.
    """
    def __init__(self, dim: int, num_heads: int = 4, ffn_expansion: int = 3):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(
            nn.Linear(dim, dim * ffn_expansion),
            nn.GELU(),
            nn.Linear(dim * ffn_expansion, dim),
        )
        # Condition integration
        self.cond_proj = nn.Linear(dim, dim * 2)  # scale and shift

    def forward(self, plan: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        """
        plan: [B, K, D]
        cond: [B, D] (timestep + text condition)
        """
        # Self-attention over plan tokens
        h = self.norm1(plan)
        h, _ = self.attn(h, h, h)
        plan = plan + h

        # Conditioned FFN
        params = self.cond_proj(cond).unsqueeze(1)  # [B, 1, 2D]
        scale, shift = params.chunk(2, dim=-1)
        h = self.norm2(plan)
        h = h * (1 + scale) + shift
        plan = plan + self.ffn(h)

        return plan


class RecurrentLatentPlanner(nn.Module):
    """
    Recurrent Latent Planner (RLP).

    Maintains K latent plan tokens that iteratively refine across
    denoising steps. Each refinement involves:
    1. READ: plan attends to the current noised image
    2. REASON: plan tokens self-attend and process with an FFN
    3. WRITE: plan injects guidance back into image tokens

    The plan carries forward across denoising steps via latent self-conditioning
    (from RIN). At step s, the plan from step s-1 is used as initialization,
    creating a persistent "memory" of the generation process.

    Parameters:
    - num_plan_tokens: K, number of plan tokens (default 32)
    - dim: token dimension
    - num_layers: depth of reasoning (default 2)
    - text_dim: dimension of text embeddings for initialization

    Memory: K * D * 4 bytes per plan = 32 * 384 * 4 = 49KB (negligible)
    Compute: O(K^2 + K*N) per layer (K=32, N=256 -> ~9K attention scores, vs N^2 = 65K for full self-attention)
    """
    def __init__(
        self,
        num_plan_tokens: int = 32,
        dim: int = 384,
        text_dim: int = 768,
        latent_channels: int = 32,
        num_layers: int = 2,
        num_heads: int = 4,
    ):
        super().__init__()
        self.num_plan_tokens = num_plan_tokens
        self.dim = dim

        # Input projection: map raw latent channels to planner dim
        self.image_proj = nn.Linear(latent_channels, dim)

        # Learnable initial plan tokens
        self.init_tokens = nn.Parameter(torch.randn(1, num_plan_tokens, dim) * 0.02)

        # Text-to-plan projection (initialize plan from text)
        self.text_to_plan = nn.Sequential(
            nn.Linear(text_dim, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
        )

        # Timestep projection
        self.time_proj = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.SiLU(),
            nn.Linear(dim * 4, dim),
        )

        # Stacked read-reason-write layers
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(nn.ModuleDict({
                'read_write': PlannerReadWrite(dim, num_heads),
                'reason': PlannerReasoning(dim, num_heads),
            }))

        # Final projection to backbone-compatible tokens (must match text_dim)
        self.output_proj = nn.Linear(dim, text_dim)
        self.output_norm = nn.LayerNorm(dim)

        # Self-conditioning weight (learnable, from RIN)
        self.self_cond_weight = nn.Parameter(torch.tensor(0.5))

    def initialize_plan(
        self,
        text_pooled: torch.Tensor,
        batch_size: int,
        prev_plan: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Initialize plan tokens from text and (optionally) previous plan.

        text_pooled: [B, text_dim]
        prev_plan: [B, K, D] from previous denoising step (latent self-conditioning)
        """
        # Learnable base + text-guided initialization
        plan = self.init_tokens.expand(batch_size, -1, -1)
        text_cond = self.text_to_plan(text_pooled).unsqueeze(1)  # [B, 1, D]
        plan = plan + text_cond

        # Latent self-conditioning from previous step
        if prev_plan is not None:
            w = torch.sigmoid(self.self_cond_weight)
            plan = w * prev_plan + (1 - w) * plan

        return plan

    def forward(
        self,
        image_tokens: torch.Tensor,
        plan: torch.Tensor,
        t_emb: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Full read-reason-write cycle.

        Args:
            image_tokens: [B, N, latent_channels] - patchified noised image latent
                (projected to the planner dim internally via image_proj)
            plan: [B, K, D] - current plan tokens
            t_emb: [B, D] - timestep embedding

        Returns:
            updated_plan: [B, K, D] - refined plan
            planner_output: [B, K, text_dim] - tokens to inject into the backbone
        """
        cond = t_emb  # Could add more conditioning here

        # Project image tokens to planner dimension
        image_tokens = self.image_proj(image_tokens)

        for layer in self.layers:
            # READ: plan learns from image
            plan = layer['read_write'].read(plan, image_tokens)
            # REASON: plan self-refines
            plan = layer['reason'](plan, cond)
            # WRITE: plan guides image (optional, only in advanced mode)
            # image_tokens = layer['read_write'].write(image_tokens, plan)

        # Project plan tokens for backbone injection
        output = self.output_proj(self.output_norm(plan))
        return plan, output

    def get_plan_size_bytes(self) -> int:
        """Return size of plan state in bytes (for memory budgeting)."""
        return self.num_plan_tokens * self.dim * 4  # float32
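For reference, a minimal sketch of the planning loop the module docstring describes, using only the class added in this diff. The step count, timestep embedding, pooled text embedding, and patchified latent are random stand-ins (a real pipeline would supply them from its scheduler, text encoder, and VAE/patchifier), and injecting plan_tokens into a denoising backbone is outside this file; this is one plausible wiring, not the repository's training or sampling code.

import torch
from microforge.planner import RecurrentLatentPlanner

B, N, K, dim, text_dim, latent_channels = 2, 256, 32, 384, 768, 32

planner = RecurrentLatentPlanner(
    num_plan_tokens=K, dim=dim, text_dim=text_dim, latent_channels=latent_channels
)

text_pooled = torch.randn(B, text_dim)   # stand-in for a pooled text embedding
z = torch.randn(B, N, latent_channels)   # stand-in for the patchified noised latent

prev_plan = None
for step in range(4):                    # stand-in for the diffusion schedule
    t_emb = torch.randn(B, dim)          # stand-in for a timestep embedding
    # Re-initialize each step, blending in the previous plan (latent self-conditioning).
    plan = planner.initialize_plan(text_pooled, B, prev_plan=prev_plan)
    # READ -> REASON (WRITE is commented out in this version of forward()).
    plan, plan_tokens = planner(z, plan, t_emb)
    prev_plan = plan.detach()
    # plan_tokens [B, K, text_dim] would be handed to the denoising backbone here,
    # e.g. alongside the text tokens it already cross-attends to.

print(plan.shape)                     # torch.Size([2, 32, 384])
print(plan_tokens.shape)              # torch.Size([2, 32, 768])
print(planner.get_plan_size_bytes())  # 32 * 384 * 4 = 49152 bytes per sample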