asdf98 committed
Commit ef19514 · verified · 1 Parent(s): e80ddea

Add microforge/training.py

Files changed (1)
  1. microforge/training.py +385 -0
microforge/training.py ADDED
@@ -0,0 +1,385 @@
+ """
+ MicroForge Training: Rectified Flow + Consistency Distillation
+ ===============================================================
+ 
+ Training objectives:
+ 1. Rectified Flow (primary): learn the velocity v(z_t, t) = epsilon - z_0
+ 2. Consistency Distillation (secondary): for few-step inference
+ 3. VAE losses: L1 recon + KL + perceptual (LPIPS placeholder)
+ 
+ Rectified Flow formulation:
+     z_t = (1 - t) * z_0 + t * epsilon    (linear interpolation)
+     v_target = epsilon - z_0             (velocity)
+     L_flow = ||v_theta(z_t, t) - v_target||^2
+ 
+ Logit-normal timestep sampling (from SnapGen/SD3):
+     t ~ sigmoid(Normal(mean, std)) with mean = 0, std = 1,
+     which puts more weight on intermediate timesteps.
+ 
+ Staged curriculum (from DreamLite + SnapGen):
+     Stage 1: Low-res composition (128-256 px)
+     Stage 2: Texture refinement (256-512 px)
+     Stage 3: High-res detail (512-1024 px)
+     Stage 4: Editing tasks (with spatial concat)
+     Stage 5: Step distillation (LADD or consistency)
+ """
+ 
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from typing import Optional, Dict, Tuple
+ 
+ 
+ class FlowMatchingScheduler:
+     """
+     Rectified Flow / Flow Matching schedule.
+ 
+     Forward process: z_t = (1 - t) * z_0 + t * epsilon
+     Velocity: v = epsilon - z_0
+     At t=0: z_t = z_0 (clean)
+     At t=1: z_t = epsilon (noise)
+ 
+     Timestep sampling: logit-normal distribution.
+     """
+ 
+     def __init__(
+         self,
+         logit_mean: float = 0.0,
+         logit_std: float = 1.0,
+         time_shift: float = 3.0,
+     ):
+         self.logit_mean = logit_mean
+         self.logit_std = logit_std
+         self.time_shift = time_shift
+ 
+     def sample_timesteps(self, batch_size: int, device: torch.device) -> torch.Tensor:
+         """
+         Sample timesteps from a logit-normal distribution.
+         Returns t in [0, 1].
+         """
+         u = torch.randn(batch_size, device=device) * self.logit_std + self.logit_mean
+         t = torch.sigmoid(u)
+ 
+         # Dynamic time shifting (from FLUX/DreamLite)
+         if self.time_shift != 1.0:
+             t = self.time_shift * t / (1 + (self.time_shift - 1) * t)
+ 
+         return t
+ 
+     def add_noise(
+         self,
+         z_0: torch.Tensor,
+         noise: torch.Tensor,
+         t: torch.Tensor,
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         """
+         Create noised sample and target velocity.
+ 
+         z_t = (1 - t) * z_0 + t * epsilon
+         v_target = epsilon - z_0
+ 
+         Args:
+             z_0: [B, C, H, W] clean latent
+             noise: [B, C, H, W] standard normal noise
+             t: [B] timesteps
+ 
+         Returns:
+             z_t: [B, C, H, W] noised latent
+             v_target: [B, C, H, W] target velocity
+         """
+         t_expanded = t[:, None, None, None]  # [B, 1, 1, 1]
+         z_t = (1 - t_expanded) * z_0 + t_expanded * noise
+         v_target = noise - z_0
+         return z_t, v_target
+ 
+     @torch.no_grad()
+     def euler_step(
+         self,
+         z_t: torch.Tensor,
+         v_pred: torch.Tensor,
+         t: float,
+         t_next: float,
+     ) -> torch.Tensor:
+         """
+         Single Euler step for ODE sampling.
+         z_{t_next} = z_t + (t_next - t) * v_pred
+         """
+         dt = t_next - t
+         return z_t + dt * v_pred
+ 
+     @torch.no_grad()
+     def sample(
+         self,
+         model,
+         noise: torch.Tensor,
+         text_emb: torch.Tensor,
+         text_pooled: torch.Tensor,
+         num_steps: int = 20,
+         cfg_scale: float = 7.5,
+         planner=None,
+     ) -> torch.Tensor:
+         """
+         Full sampling loop using the Euler ODE solver.
+ 
+         Args:
+             model: MicroForgeBackbone
+             noise: [B, C, H, W] initial noise
+             text_emb: [B, M, D] text embeddings
+             text_pooled: [B, D] pooled text
+             num_steps: number of denoising steps
+             cfg_scale: classifier-free guidance scale
+             planner: optional RecurrentLatentPlanner
+ 
+         Returns:
+             z_0: [B, C, H, W] generated clean latent
+         """
+         timesteps = torch.linspace(1, 0, num_steps + 1, device=noise.device)
+         z_t = noise
+         plan = None
+ 
+         for i in range(num_steps):
+             t = timesteps[i]
+             t_next = timesteps[i + 1]
+             t_batch = torch.full((noise.shape[0],), t.item(), device=noise.device)
+ 
+             planner_tokens = None
+             if planner is not None:
+                 # Initialize or update the plan; the latent is flattened
+                 # into a token sequence for the planner input.
+                 B, C, H, W = z_t.shape
+                 img_tokens = z_t.reshape(B, C, -1).permute(0, 2, 1)
+ 
+                 plan = planner.initialize_plan(text_pooled, B, plan)
+                 t_emb = model.time_embed(t_batch)
+                 plan, planner_tokens = planner(img_tokens, plan, t_emb)
+ 
+             # Classifier-free guidance
+             if cfg_scale > 1.0:
+                 # Conditional prediction
+                 v_cond = model(z_t, t_batch, text_emb, text_pooled, planner_tokens)
+                 # Unconditional prediction (empty text)
+                 null_text = torch.zeros_like(text_emb)
+                 null_pooled = torch.zeros_like(text_pooled)
+                 v_uncond = model(z_t, t_batch, null_text, null_pooled, None)
+                 # CFG: extrapolate from unconditional toward conditional
+                 v_pred = v_uncond + cfg_scale * (v_cond - v_uncond)
+             else:
+                 v_pred = model(z_t, t_batch, text_emb, text_pooled, planner_tokens)
+ 
+             z_t = self.euler_step(z_t, v_pred, t.item(), t_next.item())
+ 
+         return z_t
+ 
+ 
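+ def _flow_matching_sanity_check():
+     """Minimal sketch (illustration only, not used in training) of the
+     rectified-flow identities documented above: z_t interpolates linearly
+     between data and noise, and because the true velocity is constant
+     along the straight path, one exact step from any t recovers z_0."""
+     sched = FlowMatchingScheduler(time_shift=1.0)  # plain logit-normal, no shift
+     z_0 = torch.randn(4, 4, 8, 8)
+     noise = torch.randn_like(z_0)
+     t = sched.sample_timesteps(4, z_0.device)
+     z_t, v_target = sched.add_noise(z_0, noise, t)
+     # z_t - t * (epsilon - z_0) = (1 - t) * z_0 + t * epsilon - t * (epsilon - z_0) = z_0
+     z_rec = z_t - t[:, None, None, None] * v_target
+     assert torch.allclose(z_rec, z_0, atol=1e-5)
+ 
+ 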
+ class MicroForgeLoss(nn.Module):
+     """
+     Combined loss function for MicroForge training.
+ 
+     L_total = L_flow + lambda_kl * L_kl + lambda_recon * L_recon
+ 
+     For distillation stages, additional losses are added.
+     """
+ 
+     def __init__(
+         self,
+         lambda_kl: float = 1e-6,
+         lambda_recon: float = 1.0,
+     ):
+         super().__init__()
+         self.lambda_kl = lambda_kl
+         self.lambda_recon = lambda_recon
+ 
+     def flow_matching_loss(
+         self,
+         v_pred: torch.Tensor,
+         v_target: torch.Tensor,
+         t: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         """
+         Flow matching loss with optional timestep weighting.
+         L = ||v_pred - v_target||^2
+ 
+         Optional: t-scaling (from SnapGen) to prioritize perceptually
+         important timesteps.
+         """
+         loss = F.mse_loss(v_pred, v_target, reduction='none')
+ 
+         if t is not None:
+             # T-scaling: weight intermediate timesteps more. The weight
+             # peaks at 1.0 for t=0.5 and falls to 0.5 at t=0 and t=1.
+             weight = 1.0 / (1.0 + torch.abs(2 * t - 1))
+             weight = weight[:, None, None, None]
+             loss = loss * weight
+ 
+         return loss.mean()
+ 
+     def vae_loss(
+         self,
+         x_recon: torch.Tensor,
+         x: torch.Tensor,
+         mu: torch.Tensor,
+         logvar: torch.Tensor,
+     ) -> Dict[str, torch.Tensor]:
+         """VAE training loss: L1 reconstruction + KL."""
+         l_recon = F.l1_loss(x_recon, x)
+         l_kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
+ 
+         total = self.lambda_recon * l_recon + self.lambda_kl * l_kl
+         return {
+             'total': total,
+             'recon': l_recon,
+             'kl': l_kl,
+         }
+ 
+     def forward(
+         self,
+         v_pred: torch.Tensor,
+         v_target: torch.Tensor,
+         t: Optional[torch.Tensor] = None,
+     ) -> Dict[str, torch.Tensor]:
+         """Compute the flow matching loss (main training objective)."""
+         l_flow = self.flow_matching_loss(v_pred, v_target, t)
+         return {'total': l_flow, 'flow': l_flow}
+ 
+ 
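+ def _loss_weighting_example():
+     """Minimal sketch (illustration only) of the optional t-weighting in
+     flow_matching_loss: the weight 1 / (1 + |2t - 1|) is 1.0 at t = 0.5
+     and 0.5 at the endpoints, and since every weight is <= 1 the weighted
+     loss never exceeds the unweighted one."""
+     loss_fn = MicroForgeLoss()
+     v_pred = torch.randn(3, 4, 8, 8)
+     v_target = torch.zeros_like(v_pred)
+     t = torch.tensor([0.0, 0.5, 1.0])
+     unweighted = loss_fn.flow_matching_loss(v_pred, v_target)
+     weighted = loss_fn.flow_matching_loss(v_pred, v_target, t)
+     assert weighted <= unweighted
+ 
+ 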
+ class MicroForgeTrainer:
+     """
+     Training orchestrator for MicroForge.
+ 
+     Implements the staged curriculum:
+         Stage 1: VAE training (or use pretrained DC-AE)
+         Stage 2: Backbone training with flow matching at low-res
+         Stage 3: Progressive resolution increase
+         Stage 4: Editing task joint training
+         Stage 5: Step distillation (consistency or LADD)
+ 
+     Memory optimization for a 16 GB GPU:
+         - Gradient checkpointing
+         - Mixed precision (fp16/bf16)
+         - Small batch + gradient accumulation
+         - Freeze VAE during backbone training
+     """
+ 
+     def __init__(
+         self,
+         vae,
+         backbone,
+         planner=None,
+         lr: float = 1e-4,
+         weight_decay: float = 0.01,
+         grad_clip: float = 2.0,
+         use_ema: bool = True,
+         ema_decay: float = 0.9999,
+     ):
+         self.vae = vae
+         self.backbone = backbone
+         self.planner = planner
+         self.scheduler = FlowMatchingScheduler()
+         self.loss_fn = MicroForgeLoss()
+         self.grad_clip = grad_clip
+ 
+         # Setup optimizer over the backbone (and planner, if present)
+         params = list(backbone.parameters())
+         if planner is not None:
+             params += list(planner.parameters())
+         self.params = params
+ 
+         self.optimizer = torch.optim.AdamW(
+             params, lr=lr, weight_decay=weight_decay,
+             betas=(0.9, 0.999),
+         )
+ 
+         # EMA
+         self.use_ema = use_ema
+         self.ema_decay = ema_decay
+         if use_ema:
+             self.ema_backbone = self._create_ema(backbone)
+ 
+     def _create_ema(self, model):
+         """Create a frozen EMA copy of the model."""
+         import copy
+         ema = copy.deepcopy(model)
+         for p in ema.parameters():
+             p.data = p.data.clone()
+             p.requires_grad_(False)
+         return ema
+ 
+     @torch.no_grad()
+     def _update_ema(self):
+         """Update EMA weights: p_ema <- decay * p_ema + (1 - decay) * p."""
+         if not self.use_ema:
+             return
+         for p_ema, p_model in zip(self.ema_backbone.parameters(), self.backbone.parameters()):
+             p_ema.data.mul_(self.ema_decay).add_(p_model.data, alpha=1 - self.ema_decay)
+ 
+     def train_step(
+         self,
+         images: torch.Tensor,
+         text_emb: torch.Tensor,
+         text_pooled: torch.Tensor,
+     ) -> Dict[str, float]:
+         """
+         Single training step.
+ 
+         Args:
+             images: [B, 3, H, W] input images
+             text_emb: [B, M, text_dim] text embeddings
+             text_pooled: [B, text_dim] pooled text
+ 
+         Returns:
+             dict of loss values
+         """
+         device = images.device
+ 
+         # Encode to latent (VAE frozen)
+         with torch.no_grad():
+             z_0 = self.vae.get_latent(images)
+ 
+         # Sample timesteps and noise
+         B = z_0.shape[0]
+         t = self.scheduler.sample_timesteps(B, device)
+         noise = torch.randn_like(z_0)
+ 
+         # Create noised latent and target velocity
+         z_t, v_target = self.scheduler.add_noise(z_0, noise, t)
+ 
+         # Optional: planner
+         planner_tokens = None
+         if self.planner is not None:
+             img_tokens = z_t.reshape(B, z_t.shape[1], -1).permute(0, 2, 1)
+             plan = self.planner.initialize_plan(text_pooled, B)
+             t_emb = self.backbone.time_embed(t)
+             _, planner_tokens = self.planner(img_tokens, plan, t_emb)
+ 
+         # Predict velocity
+         v_pred = self.backbone(z_t, t, text_emb, text_pooled, planner_tokens)
+ 
+         # Compute loss
+         losses = self.loss_fn(v_pred, v_target, t)
+ 
+         # Backward + optimize (clip all trained params, incl. planner)
+         self.optimizer.zero_grad()
+         losses['total'].backward()
+         torch.nn.utils.clip_grad_norm_(self.params, self.grad_clip)
+         self.optimizer.step()
+ 
+         # Update EMA
+         self._update_ema()
+ 
+         return {k: v.item() for k, v in losses.items()}
+ 
+     def train_vae_step(
+         self,
+         images: torch.Tensor,
+         vae_optimizer: torch.optim.Optimizer,
+     ) -> Dict[str, float]:
+         """Training step for the VAE (Stage 1)."""
+         x_recon, mu, logvar = self.vae(images)
+         losses = self.loss_fn.vae_loss(x_recon, images, mu, logvar)
+ 
+         vae_optimizer.zero_grad()
+         losses['total'].backward()
+         torch.nn.utils.clip_grad_norm_(self.vae.parameters(), self.grad_clip)
+         vae_optimizer.step()
+ 
+         return {k: v.item() for k, v in losses.items()}
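+ 
+ 
+ if __name__ == "__main__":
+     # Minimal smoke test, a sketch only: _ToyVAE and _ToyBackbone are
+     # hypothetical stand-ins that implement just the interfaces
+     # train_step() touches (vae.get_latent and backbone.__call__); the
+     # real modules live elsewhere in the package.
+     class _ToyVAE(nn.Module):
+         def __init__(self):
+             super().__init__()
+             self.proj = nn.Conv2d(3, 4, kernel_size=1)
+ 
+         def get_latent(self, x):
+             return self.proj(x)
+ 
+     class _ToyBackbone(nn.Module):
+         def __init__(self):
+             super().__init__()
+             self.net = nn.Conv2d(4, 4, kernel_size=3, padding=1)
+ 
+         def forward(self, z_t, t, text_emb, text_pooled, planner_tokens=None):
+             return self.net(z_t)  # ignores conditioning; shape-only check
+ 
+     _flow_matching_sanity_check()
+     _loss_weighting_example()
+ 
+     trainer = MicroForgeTrainer(_ToyVAE(), _ToyBackbone())
+     images = torch.randn(2, 3, 32, 32)
+     text_emb = torch.randn(2, 8, 16)
+     text_pooled = torch.randn(2, 16)
+     print(trainer.train_step(images, text_emb, text_pooled))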