Fix: register context_proj as proper nn.Module (was lazy, not saved in checkpoints)
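Why this matters, as a minimal sketch (illustrative class and dimensions, not the actual IRIS code): a submodule created lazily inside forward() does not exist on a freshly constructed model, so a strict load_state_dict() on a checkpoint from a trained model fails on the unexpected key (or, with strict=False, silently drops the trained projection weights), and an optimizer built right after construction never sees the lazily created parameters.

import torch
import torch.nn as nn

class LazyProjExample(nn.Module):
    # Illustrative stand-in for the old lazy behaviour, not the IRIS code itself.
    def __init__(self):
        super().__init__()
        self.body = nn.Linear(512, 512)

    def forward(self, context):
        # Projection only comes into existence on the first forward pass.
        if not hasattr(self, "context_proj"):
            self.context_proj = nn.Linear(context.shape[-1], 512, bias=False)
        return self.body(self.context_proj(context))

trained = LazyProjExample()
trained(torch.randn(2, 384))      # lazy projection now exists
ckpt = trained.state_dict()       # includes "context_proj.weight"

fresh = LazyProjExample()         # no forward yet -> no context_proj submodule
fresh.load_state_dict(ckpt)       # RuntimeError: unexpected key(s) in state_dict

Registering the projection in __init__, as this commit does, makes the key set deterministic, so checkpoints round-trip cleanly and model.parameters() already contains the projection when the optimizer is built.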
iris/model.py  +31 -6
@@ -65,8 +65,16 @@ class IRIS(nn.Module):
     """
     IRIS: Iterative Refinement Image Synthesizer.
     Predicts velocity v_theta(z_t, t, c) for flow matching.
+
+    Args:
+        text_dim: dimension of text encoder output. If different from dim,
+            a learned linear projection is applied. Set to 384 for
+            all-MiniLM-L6-v2, 512 for CLIP, etc. Set to None or
+            equal to dim to skip projection.
     """
-    def __init__(self, latent_channels=32, dim=512, patch_size=4, num_blocks=6,
+    def __init__(self, latent_channels=32, dim=512, patch_size=4, num_blocks=6,
+                 num_heads=8, max_iterations=8, ffn_expansion=2,
+                 gradient_checkpointing=True, text_dim=None):
         super().__init__()
         self.latent_channels = latent_channels
         self.dim = dim
@@ -75,8 +83,17 @@ class IRIS(nn.Module):
         self.patchify = Patchify(latent_channels, dim, patch_size)
         self.unpatchify = Unpatchify(latent_channels, dim, patch_size)
         spatial_size = 4  # default for 16x16 latent with ps=4
-        self.core = RefinementCore(dim=dim, num_blocks=num_blocks, num_heads=num_heads,
+        self.core = RefinementCore(dim=dim, num_blocks=num_blocks, num_heads=num_heads,
+                                   spatial_size=spatial_size, max_iterations=max_iterations,
+                                   ffn_expansion=ffn_expansion, gradient_checkpointing=gradient_checkpointing)
         self.tiny_decoder = TinyDecoder(latent_channels, out_channels=3)
+
+        # Text projection: maps text encoder dim to model dim if they differ
+        if text_dim is not None and text_dim != dim:
+            self.context_proj = nn.Linear(text_dim, dim, bias=False)
+        else:
+            self.context_proj = None
+
         self._init_weights()

     def _init_weights(self):
@@ -95,10 +112,18 @@ class IRIS(nn.Module):

     def forward(self, z_t, t, context, num_iterations=4):
         tokens, H_tok, W_tok = self.patchify(z_t)
-
-
-
-        context = self.
+
+        # Project text embeddings to model dim if needed
+        if self.context_proj is not None:
+            context = self.context_proj(context)
+        elif context.shape[-1] != self.dim:
+            # Fallback: lazy projection for backwards compat
+            if not hasattr(self, '_lazy_context_proj'):
+                self._lazy_context_proj = nn.Linear(
+                    context.shape[-1], self.dim, bias=False
+                ).to(context.device, context.dtype)
+            context = self._lazy_context_proj(context)
+
         refined = self.core(tokens, context, t, H_tok, W_tok, num_iterations=num_iterations)
         return self.unpatchify(refined, H_tok, W_tok)

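Usage after this change, as a hedged sketch based only on the signature shown in the diff (the import path and checkpoint file name are assumptions):

import torch
from iris.model import IRIS

model = IRIS(dim=512, text_dim=384)                  # e.g. all-MiniLM-L6-v2 text embeddings
print("context_proj.weight" in model.state_dict())   # True even before any forward pass

torch.save(model.state_dict(), "iris_demo.pt")
fresh = IRIS(dim=512, text_dim=384)
fresh.load_state_dict(torch.load("iris_demo.pt"))    # strict load succeeds

Passing text_dim=None (or a value equal to dim) keeps context_proj as None, and the old lazy path in forward() remains only as a backwards-compatibility fallback for contexts whose width differs from dim.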