Add gradient checkpointing + zigzag index caching
model.py
CHANGED
@@ -45,6 +45,7 @@ References:
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
 import math
 from typing import Optional, Tuple
 
@@ -168,26 +169,26 @@ class ZigzagScan1D(nn.Module):
                                padding=kernel_size // 2, groups=channels, bias=False)
         self.pw = nn.Conv1d(channels, channels, 1, bias=True)
         self.act = nn.GELU()
+        self._idx_cache = {}
 
-    def _zigzag_indices(self, H: int, W: int, device: torch.device):
-        indices = []
-        for i in range(H):
-            row = list(range(i * W, (i + 1) * W))
-            if i % 2 == 1:
-                row = row[::-1]
-            indices.extend(row)
-        return torch.tensor(indices, device=device, dtype=torch.long)
-
-    def _inverse_zigzag_indices(self, H: int, W: int, device: torch.device):
-        fwd = self._zigzag_indices(H, W, device)
-        inv = torch.empty_like(fwd)
-        inv[fwd] = torch.arange(H * W, device=device)
-        return inv
+    def _get_indices(self, H: int, W: int, device: torch.device):
+        key = (H, W, device)
+        if key not in self._idx_cache:
+            indices = []
+            for i in range(H):
+                row = list(range(i * W, (i + 1) * W))
+                if i % 2 == 1:
+                    row = row[::-1]
+                indices.extend(row)
+            fwd = torch.tensor(indices, device=device, dtype=torch.long)
+            inv = torch.empty_like(fwd)
+            inv[fwd] = torch.arange(H * W, device=device)
+            self._idx_cache[key] = (fwd, inv)
+        return self._idx_cache[key]
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, C, H, W = x.shape
-        zz_idx = self._zigzag_indices(H, W, x.device)
-        inv_idx = self._inverse_zigzag_indices(H, W, x.device)
+        zz_idx, inv_idx = self._get_indices(H, W, x.device)
         x_flat = x.reshape(B, C, H * W)
         x_zz = x_flat[:, :, zz_idx]
         x_mixed = self.pw(self.act(self.conv1d(x_zz)))
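The cached `_get_indices` builds a serpentine ("zigzag") permutation over the `H * W` positions, reversing every odd row, plus its inverse permutation, and memoizes both per `(H, W, device)` key. Caching matters because the same key recurs on every forward pass, so the Python loop and index tensors are built only once. A standalone sanity check of that ordering (a sketch: the `zigzag_indices` helper and the 2x4 toy tensor are illustrative, not part of the commit):

```python
import torch

# Re-derivation of the index logic from ZigzagScan1D._get_indices, without the cache.
def zigzag_indices(H, W, device="cpu"):
    indices = []
    for i in range(H):
        row = list(range(i * W, (i + 1) * W))
        if i % 2 == 1:                      # odd rows are traversed right-to-left
            row = row[::-1]
        indices.extend(row)
    fwd = torch.tensor(indices, device=device, dtype=torch.long)
    inv = torch.empty_like(fwd)
    inv[fwd] = torch.arange(H * W, device=device)   # inverse permutation
    return fwd, inv

fwd, inv = zigzag_indices(2, 4)
print(fwd.tolist())                          # [0, 1, 2, 3, 7, 6, 5, 4]

x = torch.arange(8).float().view(1, 1, 8)    # (B, C, H*W) toy input
assert torch.equal(x[:, :, fwd][:, :, inv], x)   # gather then inverse-gather is the identity
```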
@@ -361,6 +362,16 @@ class LiquidGen(nn.Module):
         nn.init.zeros_(self.unpatch.bias)
 
         self.apply(self._init_weights)
+        self._gradient_checkpointing = False
+
+    def enable_gradient_checkpointing(self):
+        """Enable gradient checkpointing to reduce VRAM by ~40-60%.
+        Recomputes block activations during backward instead of storing them.
+        Slower training (~30%) but allows much larger batch sizes or models."""
+        self._gradient_checkpointing = True
+
+    def disable_gradient_checkpointing(self):
+        self._gradient_checkpointing = False
 
     def _init_weights(self, m):
         if isinstance(m, nn.Conv2d):
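The new flag only changes behavior when both `self._gradient_checkpointing` and `self.training` are true, so evaluation passes are unaffected. Intended call pattern, as a hedged sketch (`model` is assumed to be an already constructed `LiquidGen` instance; the training loop is elided):

```python
# Hypothetical usage; assumes `model` is an existing LiquidGen instance.
model.enable_gradient_checkpointing()   # ~40-60% less activation VRAM, ~30% slower steps
model.train()                           # the checkpoint branch only runs in training mode

# ... usual training loop: forward, loss, loss.backward(), optimizer.step() ...

model.disable_gradient_checkpointing()  # e.g. before inference or latency benchmarking
```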
@@ -410,7 +421,10 @@
             elif i >= mid and len(skip_connections) > 0:
                 skip = skip_connections.pop()
                 h = h + skip
-            h = block(h, cond)
+            if self._gradient_checkpointing and self.training:
+                h = checkpoint(block, h, cond, use_reentrant=False)
+            else:
+                h = block(h, cond)
 
         h = self.final_norm(h)
         h = self.final_proj(h)