asdf98 committed
Commit 0b46772 · verified · 1 Parent(s): 2403335

Fix dataset: drop broken keremberke, use pure-parquet datasets only (cartoon default, Artificio/WikiArt for styles)

Files changed (1)
  1. train.py  +58 -65
train.py CHANGED
@@ -3,14 +3,13 @@ LiquidGen Training Pipeline v2
 
 Optimized for Colab free tier:
 - Latent pre-caching: encode images with VAE once, save to disk, train on pure tensors
-- No VAE needed during training loop saves ~1GB VRAM + faster iterations
-- Streaming support for large datasets
-- Multiple small dataset presets
+- No VAE needed during training loop -> saves ~1GB VRAM + faster iterations
+- All datasets are pure parquet — no legacy loading scripts
 - Uses madebyollin/sdxl-vae-fp16-fix (fully open, no login, fp16 stable)
 
 Flow Matching training objective (velocity prediction):
-- Forward: x_t = (1 - t) * x_0 + t * ε
-- Target: v = ε - x_0
+- Forward: x_t = (1 - t) * x_0 + t * eps
+- Target: v = eps - x_0
 - Loss: MSE(model(x_t, t), v)
 """
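The objective listed in the docstring is only a few lines of PyTorch in practice. A minimal sketch of the velocity-prediction loss, assuming a model(x_t, t) call that returns a tensor shaped like x_t (the names below are illustrative, not this repo's exact API):

    import torch
    import torch.nn.functional as F

    def flow_matching_loss(model, x_0):
        # x_0: clean VAE latents [B, C, H, W]
        b = x_0.shape[0]
        t = torch.rand(b, device=x_0.device).view(b, 1, 1, 1)  # t ~ U(0, 1)
        eps = torch.randn_like(x_0)
        x_t = (1 - t) * x_0 + t * eps     # straight path between data and noise
        v_target = eps - x_0              # velocity the model should predict
        v_pred = model(x_t, t.flatten())  # assumed signature: model(latents, timesteps)
        return F.mse_loss(v_pred, v_target)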
 
@@ -28,35 +27,17 @@ from dataclasses import dataclass, asdict
 
 
 # =============================================================================
-# Dataset Presets (all verified, fast to download, no auth needed)
+# Dataset Presets - ALL pure parquet, no loading scripts, no auth
 # =============================================================================
 
 DATASET_PRESETS = {
-    "paintings_mini": {
-        "name": "keremberke/painting-style-classification",
-        "config": "mini",
-        "image_column": "image",
-        "label_column": "labels",
-        "num_classes": 27,
-        "trust_remote_code": True,
-        "description": "~200 painting samples, 27 styles, 1.7MB — instant smoke test",
-    },
-    "paintings": {
-        "name": "keremberke/painting-style-classification",
-        "config": "full",
-        "image_column": "image",
-        "label_column": "labels",
-        "num_classes": 27,
-        "trust_remote_code": True,
-        "description": "~8K paintings, 27 styles, 204MB — best for style-conditional training",
-    },
     "cartoon": {
         "name": "Norod78/cartoon-blip-captions",
         "config": "",
         "image_column": "image",
         "label_column": "",
         "num_classes": 0,
-        "description": "~2.5K cartoon/anime, unconditional, 181MB",
+        "description": "~2.5K cartoon/anime images, unconditional, 181MB",
     },
     "flowers": {
         "name": "huggan/flowers-102-categories",
@@ -66,14 +47,21 @@ DATASET_PRESETS = {
         "num_classes": 0,
         "description": "~8K flower photos, unconditional, 331MB",
     },
-    "wikiart_stream": {
-        "name": "huggan/wikiart",
+    "wikiart": {
+        "name": "Artificio/WikiArt",
         "config": "",
         "image_column": "image",
         "label_column": "style",
-        "num_classes": 27,
-        "streaming": True,
-        "description": "~80K paintings, 27 styles, STREAMING (0 disk) — use max_images to limit",
+        "num_classes": 0,  # string labels, mapped to ints automatically
+        "description": "~105K paintings with style labels, 1.6GB (use max_images to limit)",
+    },
+    "art_painting": {
+        "name": "huggan/few-shot-art-painting",
+        "config": "",
+        "image_column": "image",
+        "label_column": "",
+        "num_classes": 0,
+        "description": "~6K art paintings, unconditional, 511MB",
     },
 }
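Because every preset now points at a plain parquet repo, each entry can be smoke-tested with a tiny split slice before committing to a full download. A quick check one might run in a Colab cell, assuming DATASET_PRESETS from this file is in scope:

    from datasets import load_dataset

    for key, preset in DATASET_PRESETS.items():
        kwargs = {"name": preset["config"]} if preset["config"] else {}
        # a 4-row slice is enough to confirm the repo resolves without scripts or auth
        ds = load_dataset(preset["name"], split="train[:4]", **kwargs)
        print(key, ds.features)

Split slicing (train[:4]) and the name= config kwarg are standard datasets behaviour, so no trust_remote_code flag is needed anywhere.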
 
@@ -83,13 +71,13 @@ class TrainConfig:
     """Training configuration optimized for Colab free tier (T4 16GB)."""
     # Model
     model_size: str = "small"  # small (~55M), base (~140M), large (~280M)
-    num_classes: int = 27
+    num_classes: int = 0  # 0 = unconditional
     class_drop_prob: float = 0.1
 
     # Data
-    dataset_preset: str = "paintings"  # key from DATASET_PRESETS
+    dataset_preset: str = "cartoon"  # key from DATASET_PRESETS
     image_size: int = 256  # 256 or 512
-    max_images: int = 0  # 0 = use all, >0 = limit (for streaming/testing)
+    max_images: int = 0  # 0 = use all, >0 = limit
 
     # VAE — fully open, no login needed
     vae_id: str = "madebyollin/sdxl-vae-fp16-fix"
@@ -161,8 +149,8 @@ class CachedLatentDataset(Dataset):
         self.labels = data.get("labels", None)
         print(f"Loaded {len(self.latents)} cached latents from {cache_path}")
         print(f"  Shape: {self.latents.shape}, dtype: {self.latents.dtype}")
-        if self.labels is not None:
-            print(f"  Labels: unique={self.labels.unique().shape[0]}")
+        if self.labels is not None and (self.labels >= 0).any():
+            print(f"  Labels: unique={self.labels[self.labels >= 0].unique().shape[0]}")
 
     def __len__(self):
         return len(self.latents)
@@ -176,46 +164,39 @@
 def precache_latents(config, cache_path=None):
     """
     Encode all images to VAE latents once, save to disk.
-    Uses madebyollin/sdxl-vae-fp16-fix (no auth needed).
     """
     if cache_path is None:
         cache_path = os.path.join(config.output_dir, "cached_latents.pt")
 
     if os.path.exists(cache_path):
-        print(f"Cache exists: {cache_path}")
+        print(f"Cache exists: {cache_path}")
         data = torch.load(cache_path, map_location="cpu", weights_only=True)
-        print(f"  {data['latents'].shape[0]} latents, shape {data['latents'].shape[1:]}")
+        print(f"  {data['latents'].shape[0]} latents, shape {data['latents'].shape[1:]}")
         return cache_path
 
     os.makedirs(os.path.dirname(cache_path) if os.path.dirname(cache_path) else ".", exist_ok=True)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     # Load VAE
-    print(f"Loading VAE: {config.vae_id} (open, no login needed)...")
+    print(f"Loading VAE: {config.vae_id}...")
     from diffusers import AutoencoderKL
     vae = AutoencoderKL.from_pretrained(
         config.vae_id, torch_dtype=torch.float16
     ).to(device).eval()
     for p in vae.parameters():
         p.requires_grad_(False)
-    print(f"  VAE loaded: {sum(p.numel() for p in vae.parameters())/1e6:.0f}M params")
+    print(f"  VAE: {sum(p.numel() for p in vae.parameters())/1e6:.0f}M params")
 
     # Load dataset
     preset = DATASET_PRESETS[config.dataset_preset]
-    print(f"Loading dataset: {preset['name']} ({preset['description']})")
+    print(f"Loading: {preset['name']} ({preset['description']})")
 
     from datasets import load_dataset
     from torchvision import transforms
 
-    is_streaming = preset.get("streaming", False)
     ds_kwargs = {"split": "train"}
     if preset["config"]:
         ds_kwargs["name"] = preset["config"]
-    if is_streaming:
-        ds_kwargs["streaming"] = True
-    # Some datasets have legacy loading scripts that need this flag
-    if preset.get("trust_remote_code", False):
-        ds_kwargs["trust_remote_code"] = True
 
     dataset = load_dataset(preset["name"], **ds_kwargs)
 
@@ -225,6 +206,11 @@ def precache_latents(config, cache_path=None):
         transforms.ToTensor(),
     ])
 
+    # For Artificio/WikiArt: style is a string, map to int
+    img_col = preset["image_column"]
+    lbl_col = preset["label_column"]
+    style_to_id = {}
+
     all_latents = []
     all_labels = []
     batch_pixels = []
@@ -232,10 +218,8 @@ def precache_latents(config, cache_path=None):
     encode_bs = 16
     count = 0
     max_imgs = config.max_images if config.max_images > 0 else float("inf")
-    img_col = preset["image_column"]
-    lbl_col = preset["label_column"]
 
-    print(f"Encoding images to VAE latents...")
+    print(f"Encoding to VAE latents...")
     t0 = time.time()
 
     for item in dataset:
@@ -245,8 +229,18 @@ def precache_latents(config, cache_path=None):
         if img.mode != "RGB":
             img = img.convert("RGB")
         batch_pixels.append(transform(img))
+
+        # Handle labels: int or string
         if lbl_col and lbl_col in item:
-            batch_labels.append(item[lbl_col])
+            raw_label = item[lbl_col]
+            if isinstance(raw_label, str):
+                if raw_label not in style_to_id:
+                    style_to_id[raw_label] = len(style_to_id)
+                batch_labels.append(style_to_id[raw_label])
+            elif isinstance(raw_label, int):
+                batch_labels.append(raw_label)
+            else:
+                batch_labels.append(-1)
         else:
             batch_labels.append(-1)
         count += 1
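The string branch above assigns class IDs in encounter order, so the resulting mapping follows dataset order; for illustration, with made-up style strings:

    style_to_id = {}
    for raw_label in ["Baroque", "Cubism", "Baroque", "Realism"]:
        if raw_label not in style_to_id:
            style_to_id[raw_label] = len(style_to_id)
    # style_to_id == {"Baroque": 0, "Cubism": 1, "Realism": 2}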
@@ -260,7 +254,7 @@ def precache_latents(config, cache_path=None):
             all_labels.extend(batch_labels)
             batch_pixels, batch_labels = [], []
             if count % 500 == 0:
-                print(f"  {count} images encoded ({time.time()-t0:.0f}s)")
+                print(f"  {count} images ({time.time()-t0:.0f}s)")
 
     if batch_pixels:
         with torch.no_grad():
@@ -272,17 +266,22 @@ def precache_latents(config, cache_path=None):
 
     all_latents = torch.cat(all_latents, dim=0)
     all_labels = torch.tensor(all_labels, dtype=torch.long)
-    torch.save({"latents": all_latents, "labels": all_labels}, cache_path)
+
+    save_data = {"latents": all_latents, "labels": all_labels}
+    if style_to_id:
+        save_data["style_to_id"] = style_to_id
+        print(f"  Mapped {len(style_to_id)} style labels to class IDs")
+    torch.save(save_data, cache_path)
 
     elapsed = time.time() - t0
     mb = os.path.getsize(cache_path) / 1024**2
-    print(f"\n✅ Cached {count} latents -> {cache_path}")
-    print(f"  Shape: {all_latents.shape}, Size: {mb:.1f}MB, Time: {elapsed:.0f}s")
+    print(f"\nCached {count} latents -> {cache_path}")
+    print(f"  Shape: {all_latents.shape}, {mb:.1f}MB, {elapsed:.0f}s")
 
     del vae
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-    print("  VAE unloaded, VRAM freed\n")
+    print("  VAE unloaded\n")
     return cache_path
 
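Since style_to_id is stored next to the tensors, the cache stays self-describing. A sketch of inspecting it afterwards (the file is written to config.output_dir as cached_latents.pt; weights_only=True is fine because the payload is only tensors, ints and strings):

    import torch

    # adjust the path to your run's output_dir
    data = torch.load("cached_latents.pt", map_location="cpu", weights_only=True)
    print(data["latents"].shape, data["labels"].shape)
    if "style_to_id" in data:  # only present for string-labelled presets such as wikiart
        id_to_style = {v: k for k, v in data["style_to_id"].items()}
        print(len(id_to_style), "styles; class 0 ->", id_to_style[0])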
 
@@ -371,15 +370,12 @@ def train(config):
     with open(f"{config.output_dir}/config.json", "w") as f:
         json.dump(asdict(config), f, indent=2)
 
-    # Step 1: Pre-cache latents
     cache_path = precache_latents(config)
 
-    # Step 2: Dataset from cache
     train_ds = CachedLatentDataset(cache_path)
     train_dl = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True,
                           num_workers=config.num_workers, pin_memory=True, drop_last=True)
 
-    # Step 3: Model
     mcfg = get_model_config(config.model_size, config.num_classes, config.class_drop_prob)
     mcfg["in_channels"] = config.latent_channels
     model = LiquidGen(**mcfg).to(device)
@@ -388,7 +384,6 @@ def train(config):
     if config.compile_model and hasattr(torch, "compile"):
         model = torch.compile(model)
 
-    # Step 4: Training setup
     opt = torch.optim.AdamW(model.parameters(), lr=config.learning_rate,
                             weight_decay=config.weight_decay, betas=(0.9, 0.999))
     total_steps = len(train_dl) * config.num_epochs // config.gradient_accumulation_steps
@@ -398,16 +393,14 @@ def train(config):
     fm = FlowMatchingScheduler(config.min_timestep, config.max_timestep)
     lat_size = config.image_size // 8
 
-    print(f"\nTotal steps: {total_steps}, Batch: {config.batch_size}x{config.gradient_accumulation_steps}")
+    print(f"\nSteps: {total_steps}, Batch: {config.batch_size}x{config.gradient_accumulation_steps}")
     print(f"Latent: [{config.batch_size}, {config.latent_channels}, {lat_size}, {lat_size}]")
-    print(f"No VAE during training -> max VRAM for model")
     if torch.cuda.is_available():
         print(f"VRAM: {torch.cuda.memory_allocated()/1024**3:.1f} / "
               f"{torch.cuda.get_device_properties(0).total_mem/1024**3:.1f} GB")
 
-    # Step 5: Train!
     gs = 0; la = 0.0; vae = None; vae_loaded = False
-    print(f"\n{'='*60}\n🚀 Training!\n{'='*60}\n")
+    print(f"\n{'='*60}\nTraining!\n{'='*60}\n")
     t_start = time.time()
 
     for epoch in range(config.num_epochs):
@@ -483,7 +476,7 @@
 
 if __name__ == "__main__":
     config = TrainConfig(
-        model_size="small", dataset_preset="paintings_mini",
+        model_size="small", dataset_preset="cartoon",
        image_size=256, batch_size=8, num_epochs=5,
        log_every_n_steps=5, sample_every_n_steps=99999,
    )