v2: Use pre-trained SD-VAE, fix all bugs, pre-cache everything, massive speedup
Browse files- iris_model.py +72 -59
iris_model.py
CHANGED
|
@@ -989,71 +989,105 @@ class IRISGenerator(nn.Module):
|
|
| 989 |
# ============================================================================
|
| 990 |
|
| 991 |
class IRIS(nn.Module):
|
| 992 |
-
"""Complete IRIS system:
|
| 993 |
|
| 994 |
-
For training: use
|
| 995 |
-
For
|
|
|
|
| 996 |
"""
|
| 997 |
-
def __init__(self, config: IRISConfig):
|
| 998 |
super().__init__()
|
| 999 |
self.config = config
|
| 1000 |
-
self.vae = WaveletVAE(config)
|
| 1001 |
self.generator = IRISGenerator(config)
|
|
|
|
|
|
|
|
|
|
| 1002 |
|
| 1003 |
def encode(self, images: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 1004 |
-
"""Encode images
|
|
|
|
| 1005 |
return self.vae.encode(images)
|
| 1006 |
|
| 1007 |
def decode(self, z: torch.Tensor) -> torch.Tensor:
|
| 1008 |
-
"""Decode latent
|
|
|
|
| 1009 |
return self.vae.decode(z)
|
| 1010 |
|
| 1011 |
-
|
|
|
|
| 1012 |
"""Rectified flow velocity target: v = noise - z_0."""
|
| 1013 |
return noise - z_0
|
| 1014 |
|
| 1015 |
-
|
|
|
|
| 1016 |
"""Rectified flow forward process: z_t = (1-t)*z_0 + t*noise."""
|
| 1017 |
t_expand = t[:, None, None, None]
|
| 1018 |
return (1 - t_expand) * z_0 + t_expand * noise
|
| 1019 |
|
| 1020 |
-
|
| 1021 |
-
|
| 1022 |
-
|
| 1023 |
-
"""
|
| 1024 |
u = torch.randn(batch_size, device=device)
|
| 1025 |
-
t = torch.sigmoid(u)
|
| 1026 |
-
# Clamp to avoid t=0 and t=1
|
| 1027 |
t = t.clamp(1e-5, 1 - 1e-5)
|
| 1028 |
return t
|
| 1029 |
|
| 1030 |
-
def
|
| 1031 |
self,
|
| 1032 |
-
|
| 1033 |
text_tokens: torch.Tensor,
|
| 1034 |
num_iterations: Optional[int] = None,
|
| 1035 |
) -> dict:
|
| 1036 |
-
"""
|
| 1037 |
|
| 1038 |
-
|
|
|
|
|
|
|
| 1039 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1040 |
B = images.shape[0]
|
| 1041 |
device = images.device
|
| 1042 |
|
| 1043 |
-
# Encode to latent
|
| 1044 |
z_0, mean, logvar = self.encode(images)
|
| 1045 |
-
|
| 1046 |
-
# Sample noise and timesteps
|
| 1047 |
noise = torch.randn_like(z_0)
|
| 1048 |
t = self.sample_timesteps(B, device)
|
| 1049 |
-
|
| 1050 |
-
# Create noisy latent
|
| 1051 |
z_t = self.add_noise(z_0, noise, t)
|
| 1052 |
|
| 1053 |
-
# Predict velocity
|
| 1054 |
-
# Randomly sample iteration count for training robustness
|
| 1055 |
if num_iterations is None:
|
| 1056 |
-
r_choices = [
|
| 1057 |
r = r_choices[torch.randint(0, len(r_choices), (1,)).item()]
|
| 1058 |
else:
|
| 1059 |
r = num_iterations
|
|
@@ -1061,15 +1095,9 @@ class IRIS(nn.Module):
|
|
| 1061 |
v_pred = self.generator(z_t, t, text_tokens, num_iterations=r)
|
| 1062 |
v_target = self.get_velocity_target(z_0, noise)
|
| 1063 |
|
| 1064 |
-
# SNR-weighted loss (from Rectified Flow paper)
|
| 1065 |
-
# w(t) = t / (1 - t) — emphasizes high-noise timesteps
|
| 1066 |
w = t / (1 - t + 1e-8)
|
| 1067 |
w = w[:, None, None, None]
|
| 1068 |
-
|
| 1069 |
-
# Velocity matching loss
|
| 1070 |
velocity_loss = (w * (v_pred - v_target).pow(2)).mean()
|
| 1071 |
-
|
| 1072 |
-
# VAE KL loss
|
| 1073 |
kl_loss = -0.5 * (1 + logvar - mean.pow(2) - logvar.exp()).mean()
|
| 1074 |
|
| 1075 |
return {
|
|
@@ -1080,7 +1108,7 @@ class IRIS(nn.Module):
|
|
| 1080 |
}
|
| 1081 |
|
| 1082 |
@torch.no_grad()
|
| 1083 |
-
def
|
| 1084 |
self,
|
| 1085 |
text_tokens: torch.Tensor,
|
| 1086 |
num_steps: int = 4,
|
|
@@ -1088,14 +1116,9 @@ class IRIS(nn.Module):
|
|
| 1088 |
cfg_scale: float = 4.0,
|
| 1089 |
seed: Optional[int] = None,
|
| 1090 |
) -> torch.Tensor:
|
| 1091 |
-
"""Generate
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
text_tokens: [B, S, text_dim] CLIP text embeddings
|
| 1095 |
-
num_steps: Number of ODE solver steps (1-50)
|
| 1096 |
-
num_iterations: Core iterations per step (quality budget)
|
| 1097 |
-
cfg_scale: Classifier-free guidance scale
|
| 1098 |
-
seed: Random seed for reproducibility
|
| 1099 |
"""
|
| 1100 |
B, S, _ = text_tokens.shape
|
| 1101 |
device = text_tokens.device
|
|
@@ -1103,35 +1126,25 @@ class IRIS(nn.Module):
|
|
| 1103 |
if seed is not None:
|
| 1104 |
torch.manual_seed(seed)
|
| 1105 |
|
| 1106 |
-
# Start from pure noise
|
| 1107 |
z = torch.randn(B, self.config.latent_channels,
|
| 1108 |
self.config.latent_spatial, self.config.latent_spatial,
|
| 1109 |
device=device)
|
| 1110 |
|
| 1111 |
-
# Euler solver for rectified flow ODE: dz/dt = -v(z, t)
|
| 1112 |
-
# Integrate from t=1 (noise) to t=0 (data)
|
| 1113 |
dt = 1.0 / num_steps
|
| 1114 |
-
|
| 1115 |
for step in range(num_steps):
|
| 1116 |
t_val = 1.0 - step * dt
|
| 1117 |
t = torch.full((B,), t_val, device=device)
|
| 1118 |
|
| 1119 |
-
# Predict velocity
|
| 1120 |
v = self.generator(z, t, text_tokens, num_iterations=num_iterations)
|
| 1121 |
|
| 1122 |
-
# Classifier-free guidance (if cfg_scale > 1)
|
| 1123 |
if cfg_scale > 1.0:
|
| 1124 |
null_tokens = torch.zeros_like(text_tokens)
|
| 1125 |
v_uncond = self.generator(z, t, null_tokens, num_iterations=num_iterations)
|
| 1126 |
v = v_uncond + cfg_scale * (v - v_uncond)
|
| 1127 |
|
| 1128 |
-
# Euler step: z = z - dt * v
|
| 1129 |
z = z - dt * v
|
| 1130 |
|
| 1131 |
-
|
| 1132 |
-
images = self.decode(z)
|
| 1133 |
-
images = images.clamp(-1, 1)
|
| 1134 |
-
return images
|
| 1135 |
|
| 1136 |
|
| 1137 |
# ============================================================================
|
|
@@ -1162,9 +1175,9 @@ def estimate_memory_mb(model: nn.Module, dtype=torch.float16) -> float:
|
|
| 1162 |
|
| 1163 |
|
| 1164 |
def create_iris_small(latent_spatial: int = 32) -> IRIS:
|
| 1165 |
-
"""Create IRIS-Small
|
| 1166 |
config = IRISConfig(
|
| 1167 |
-
latent_channels=
|
| 1168 |
latent_spatial=latent_spatial,
|
| 1169 |
hidden_dim=512,
|
| 1170 |
num_heads=8,
|
|
@@ -1187,9 +1200,9 @@ def create_iris_small(latent_spatial: int = 32) -> IRIS:
|
|
| 1187 |
|
| 1188 |
|
| 1189 |
def create_iris_tiny(latent_spatial: int = 32) -> IRIS:
|
| 1190 |
-
"""Create IRIS-Tiny
|
| 1191 |
config = IRISConfig(
|
| 1192 |
-
latent_channels=
|
| 1193 |
latent_spatial=latent_spatial,
|
| 1194 |
hidden_dim=384,
|
| 1195 |
num_heads=6,
|
|
@@ -1212,9 +1225,9 @@ def create_iris_tiny(latent_spatial: int = 32) -> IRIS:
|
|
| 1212 |
|
| 1213 |
|
| 1214 |
def create_iris_base(latent_spatial: int = 32) -> IRIS:
|
| 1215 |
-
"""Create IRIS-Base
|
| 1216 |
config = IRISConfig(
|
| 1217 |
-
latent_channels=
|
| 1218 |
latent_spatial=latent_spatial,
|
| 1219 |
hidden_dim=768,
|
| 1220 |
num_heads=12,
|
|
|
|
| 989 |
# ============================================================================
|
| 990 |
|
| 991 |
class IRIS(nn.Module):
|
| 992 |
+
"""Complete IRIS system: Generator + optional built-in VAE.
|
| 993 |
|
| 994 |
+
For training with external VAE (recommended): use train_step_latent() with pre-encoded latents.
|
| 995 |
+
For training with built-in Wavelet VAE: use train_step() with raw images.
|
| 996 |
+
For inference: use generate_latent() to get latent, then decode externally.
|
| 997 |
"""
|
| 998 |
+
def __init__(self, config: IRISConfig, use_builtin_vae: bool = False):
    """Set up the generator and, optionally, the built-in Wavelet VAE.

    Args:
        config: Architecture/model configuration object.
        use_builtin_vae: When True, instantiate the built-in WaveletVAE;
            otherwise ``self.vae`` is None and an external (pre-trained)
            VAE must be used for encode/decode.
    """
    super().__init__()
    self.config = config
    self.generator = IRISGenerator(config)

    # The built-in Wavelet VAE is opt-in; a pre-trained external VAE is preferred.
    if use_builtin_vae:
        self.vae = WaveletVAE(config)
    else:
        self.vae = None
|
| 1005 |
|
| 1006 |
def encode(self, images: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Encode images via the built-in VAE (requires ``use_builtin_vae=True``).

    Args:
        images: Image batch accepted by the built-in WaveletVAE encoder.

    Returns:
        The result of ``self.vae.encode`` — per the annotation, a
        (z, mean, logvar) triple.

    Raises:
        RuntimeError: If the model was constructed without the built-in VAE.
    """
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, which would turn this guard into a confusing
    # AttributeError on `self.vae.encode` instead.
    if self.vae is None:
        raise RuntimeError("No built-in VAE. Use an external VAE to encode images.")
    return self.vae.encode(images)
|
| 1010 |
|
| 1011 |
def decode(self, z: torch.Tensor) -> torch.Tensor:
    """Decode a latent via the built-in VAE (requires ``use_builtin_vae=True``).

    Args:
        z: Latent tensor accepted by the built-in WaveletVAE decoder.

    Returns:
        The decoded output of ``self.vae.decode``.

    Raises:
        RuntimeError: If the model was constructed without the built-in VAE.
    """
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, so the guard must not rely on assertion semantics.
    if self.vae is None:
        raise RuntimeError("No built-in VAE. Use an external VAE to decode latents.")
    return self.vae.decode(z)
|
| 1015 |
|
| 1016 |
+
@staticmethod
|
| 1017 |
+
def get_velocity_target(z_0: torch.Tensor, noise: torch.Tensor) -> torch.Tensor:
|
| 1018 |
"""Rectified flow velocity target: v = noise - z_0."""
|
| 1019 |
return noise - z_0
|
| 1020 |
|
| 1021 |
+
@staticmethod
|
| 1022 |
+
def add_noise(z_0: torch.Tensor, noise: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
|
| 1023 |
"""Rectified flow forward process: z_t = (1-t)*z_0 + t*noise."""
|
| 1024 |
t_expand = t[:, None, None, None]
|
| 1025 |
return (1 - t_expand) * z_0 + t_expand * noise
|
| 1026 |
|
| 1027 |
+
@staticmethod
|
| 1028 |
+
def sample_timesteps(batch_size: int, device: torch.device) -> torch.Tensor:
|
| 1029 |
+
"""Sample timesteps from logit-normal distribution (from SD3/RF)."""
|
|
|
|
| 1030 |
u = torch.randn(batch_size, device=device)
|
| 1031 |
+
t = torch.sigmoid(u)
|
|
|
|
| 1032 |
t = t.clamp(1e-5, 1 - 1e-5)
|
| 1033 |
return t
|
| 1034 |
|
| 1035 |
+
def train_step_latent(
    self,
    z_0: torch.Tensor,
    text_tokens: torch.Tensor,
    num_iterations: Optional[int] = None,
) -> dict:
    """Training step on PRE-ENCODED latents (recommended path).

    Use this with an external pre-trained VAE:
        z_0 = external_vae.encode(images)  # done outside
        result = iris.train_step_latent(z_0, text_tokens)
    """
    batch = z_0.shape[0]
    device = z_0.device

    # Rectified-flow sample: interpolate the latent toward Gaussian noise at time t.
    noise = torch.randn_like(z_0)
    t = self.sample_timesteps(batch, device)
    z_t = self.add_noise(z_0, noise, t)

    # Core-iteration budget: fixed when given, otherwise drawn at random
    # so the generator trains robustly across iteration counts.
    if num_iterations is not None:
        iters = num_iterations
    else:
        choices = [3, 4, 5, 6]
        iters = choices[torch.randint(0, len(choices), (1,)).item()]

    v_pred = self.generator(z_t, t, text_tokens, num_iterations=iters)
    v_target = self.get_velocity_target(z_0, noise)

    # SNR-style weight t / (1 - t), broadcast over (C, H, W).
    weight = (t / (1 - t + 1e-8))[:, None, None, None]
    velocity_loss = (weight * (v_pred - v_target).pow(2)).mean()

    return {
        'loss': velocity_loss,
        'velocity_loss': velocity_loss.item(),
        'mean_t': t.mean().item(),
    }
|
| 1072 |
+
|
| 1073 |
+
def train_step(
|
| 1074 |
+
self,
|
| 1075 |
+
images: torch.Tensor,
|
| 1076 |
+
text_tokens: torch.Tensor,
|
| 1077 |
+
num_iterations: Optional[int] = None,
|
| 1078 |
+
) -> dict:
|
| 1079 |
+
"""Training step with built-in Wavelet VAE (legacy path)."""
|
| 1080 |
+
assert self.vae is not None, "No built-in VAE. Use train_step_latent() instead."
|
| 1081 |
B = images.shape[0]
|
| 1082 |
device = images.device
|
| 1083 |
|
|
|
|
| 1084 |
z_0, mean, logvar = self.encode(images)
|
|
|
|
|
|
|
| 1085 |
noise = torch.randn_like(z_0)
|
| 1086 |
t = self.sample_timesteps(B, device)
|
|
|
|
|
|
|
| 1087 |
z_t = self.add_noise(z_0, noise, t)
|
| 1088 |
|
|
|
|
|
|
|
| 1089 |
if num_iterations is None:
|
| 1090 |
+
r_choices = [3, 4, 5, 6]
|
| 1091 |
r = r_choices[torch.randint(0, len(r_choices), (1,)).item()]
|
| 1092 |
else:
|
| 1093 |
r = num_iterations
|
|
|
|
| 1095 |
v_pred = self.generator(z_t, t, text_tokens, num_iterations=r)
|
| 1096 |
v_target = self.get_velocity_target(z_0, noise)
|
| 1097 |
|
|
|
|
|
|
|
| 1098 |
w = t / (1 - t + 1e-8)
|
| 1099 |
w = w[:, None, None, None]
|
|
|
|
|
|
|
| 1100 |
velocity_loss = (w * (v_pred - v_target).pow(2)).mean()
|
|
|
|
|
|
|
| 1101 |
kl_loss = -0.5 * (1 + logvar - mean.pow(2) - logvar.exp()).mean()
|
| 1102 |
|
| 1103 |
return {
|
|
|
|
| 1108 |
}
|
| 1109 |
|
| 1110 |
@torch.no_grad()
|
| 1111 |
+
def generate_latent(
|
| 1112 |
self,
|
| 1113 |
text_tokens: torch.Tensor,
|
| 1114 |
num_steps: int = 4,
|
|
|
|
| 1116 |
cfg_scale: float = 4.0,
|
| 1117 |
seed: Optional[int] = None,
|
| 1118 |
) -> torch.Tensor:
|
| 1119 |
+
"""Generate latent (decode externally with your VAE).
|
| 1120 |
+
|
| 1121 |
+
Returns z_0 latent tensor, NOT decoded image.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1122 |
"""
|
| 1123 |
B, S, _ = text_tokens.shape
|
| 1124 |
device = text_tokens.device
|
|
|
|
| 1126 |
if seed is not None:
|
| 1127 |
torch.manual_seed(seed)
|
| 1128 |
|
|
|
|
| 1129 |
z = torch.randn(B, self.config.latent_channels,
|
| 1130 |
self.config.latent_spatial, self.config.latent_spatial,
|
| 1131 |
device=device)
|
| 1132 |
|
|
|
|
|
|
|
| 1133 |
dt = 1.0 / num_steps
|
|
|
|
| 1134 |
for step in range(num_steps):
|
| 1135 |
t_val = 1.0 - step * dt
|
| 1136 |
t = torch.full((B,), t_val, device=device)
|
| 1137 |
|
|
|
|
| 1138 |
v = self.generator(z, t, text_tokens, num_iterations=num_iterations)
|
| 1139 |
|
|
|
|
| 1140 |
if cfg_scale > 1.0:
|
| 1141 |
null_tokens = torch.zeros_like(text_tokens)
|
| 1142 |
v_uncond = self.generator(z, t, null_tokens, num_iterations=num_iterations)
|
| 1143 |
v = v_uncond + cfg_scale * (v - v_uncond)
|
| 1144 |
|
|
|
|
| 1145 |
z = z - dt * v
|
| 1146 |
|
| 1147 |
+
return z
|
|
|
|
|
|
|
|
|
|
| 1148 |
|
| 1149 |
|
| 1150 |
# ============================================================================
|
|
|
|
| 1175 |
|
| 1176 |
|
| 1177 |
def create_iris_small(latent_spatial: int = 32) -> IRIS:
|
| 1178 |
+
"""Create IRIS-Small for SD-VAE latent space (4ch, 8× downsample)."""
|
| 1179 |
config = IRISConfig(
|
| 1180 |
+
latent_channels=4,
|
| 1181 |
latent_spatial=latent_spatial,
|
| 1182 |
hidden_dim=512,
|
| 1183 |
num_heads=8,
|
|
|
|
| 1200 |
|
| 1201 |
|
| 1202 |
def create_iris_tiny(latent_spatial: int = 32) -> IRIS:
|
| 1203 |
+
"""Create IRIS-Tiny for SD-VAE latent space (4ch, 8× downsample)."""
|
| 1204 |
config = IRISConfig(
|
| 1205 |
+
latent_channels=4,
|
| 1206 |
latent_spatial=latent_spatial,
|
| 1207 |
hidden_dim=384,
|
| 1208 |
num_heads=6,
|
|
|
|
| 1225 |
|
| 1226 |
|
| 1227 |
def create_iris_base(latent_spatial: int = 32) -> IRIS:
|
| 1228 |
+
"""Create IRIS-Base for SD-VAE latent space (4ch, 8× downsample)."""
|
| 1229 |
config = IRISConfig(
|
| 1230 |
+
latent_channels=4,
|
| 1231 |
latent_spatial=latent_spatial,
|
| 1232 |
hidden_dim=768,
|
| 1233 |
num_heads=12,
|