asdf98 committed on
Commit
99a5f54
·
verified ·
1 Parent(s): 3e17403

Perf fix: cache Manhattan dist, optimize scan, pre-cache CLIP, fix deprecated AMP API

Browse files
Files changed (1) hide show
  1. IRIS_Training_Notebook.ipynb +59 -37
IRIS_Training_Notebook.ipynb CHANGED
@@ -359,7 +359,7 @@
359
  "source": [
360
  "# \u2500\u2500\u2500 VAE Training Loop \u2500\u2500\u2500\n",
361
  "import time\n",
362
- "from torch.cuda.amp import autocast, GradScaler\n",
363
  "\n",
364
  "VAE_EPOCHS = 80 # Enough to get good reconstructions\n",
365
  "VAE_LR = 1e-4\n",
@@ -368,25 +368,23 @@
368
  "\n",
369
  "optimizer_vae = torch.optim.AdamW(vae.parameters(), lr=VAE_LR, weight_decay=0.01)\n",
370
  "scheduler_vae = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_vae, T_max=VAE_EPOCHS)\n",
371
- "scaler = GradScaler()\n",
372
  "dwt = HaarDWT2D()\n",
373
  "\n",
374
  "# Logging\n",
375
  "vae_losses = {\"total\": [], \"recon\": [], \"kl\": [], \"freq\": []}\n",
376
  "\n",
377
  "print(f\"Training VAE for {VAE_EPOCHS} epochs on {len(train_loader)} batches...\")\n",
378
- "print(f\"{'Epoch':>6} {'Loss':>10} {'Recon':>10} {'KL':>10} {'Freq':>10} {'LR':>10} {'Time':>8}\")\n",
379
- "print(\"\u2500\" * 70)\n",
380
  "\n",
381
  "vae.train()\n",
382
- "for epoch in range(VAE_EPOCHS):\n",
 
383
  " epoch_losses = {\"total\": 0, \"recon\": 0, \"kl\": 0, \"freq\": 0}\n",
384
- " t0 = time.time()\n",
385
  "\n",
386
  " for images, _ in train_loader:\n",
387
- " images = images.to(device)\n",
388
  "\n",
389
- " with autocast(dtype=torch.float16):\n",
390
  " x_recon, mean, logvar = vae(images)\n",
391
  "\n",
392
  " # Reconstruction loss\n",
@@ -422,12 +420,7 @@
422
  " vae_losses[k].append(epoch_losses[k])\n",
423
  "\n",
424
  " scheduler_vae.step()\n",
425
- " dt = time.time() - t0\n",
426
- "\n",
427
- " if (epoch + 1) % 10 == 0 or epoch == 0:\n",
428
- " lr = optimizer_vae.param_groups[0][\"lr\"]\n",
429
- " print(f\"{epoch+1:>6} {epoch_losses['total']:>10.4f} {epoch_losses['recon']:>10.4f} \"\n",
430
- " f\"{epoch_losses['kl']:>10.4f} {epoch_losses['freq']:>10.4f} {lr:>10.2e} {dt:>7.1f}s\")\n",
431
  "\n",
432
  "print(\"\\n\u2705 VAE training complete!\")"
433
  ],
@@ -539,6 +532,40 @@
539
  "print(f\" Effective at r=6: ~{gen_params + 5*core_params:,} effective params\")\n",
540
  "print(f\" Memory fp16: {gen_params*2/1024/1024:.1f} MB\")\n",
541
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  "# Free standalone VAE to save memory\n",
543
  "del vae, optimizer_vae, scheduler_vae\n",
544
  "torch.cuda.empty_cache()"
@@ -551,6 +578,8 @@
551
  "metadata": {},
552
  "source": [
553
  "# \u2500\u2500\u2500 Generator Training Loop \u2500\u2500\u2500\n",
 
 
554
  "GEN_EPOCHS = 150 # More epochs for small dataset\n",
555
  "GEN_LR = 2e-4 # Higher LR works well with AdamW + cosine\n",
556
  "GRAD_ACCUM = 2 # Effective batch = BATCH_SIZE \u00d7 GRAD_ACCUM = 8\n",
@@ -563,7 +592,7 @@
563
  " betas=(0.9, 0.95),\n",
564
  ")\n",
565
  "\n",
566
- "total_steps = GEN_EPOCHS * len(train_loader) // GRAD_ACCUM\n",
567
  "\n",
568
  "def lr_lambda(step):\n",
569
  " if step < WARMUP_STEPS:\n",
@@ -572,41 +601,35 @@
572
  " return 0.5 * (1 + __import__('math').cos(__import__('math').pi * progress))\n",
573
  "\n",
574
  "scheduler_gen = torch.optim.lr_scheduler.LambdaLR(optimizer_gen, lr_lambda)\n",
575
- "scaler_gen = GradScaler()\n",
576
  "\n",
577
  "# Logging\n",
578
  "gen_losses = {\"total\": [], \"velocity\": [], \"kl\": []}\n",
579
  "\n",
580
  "print(f\"Training generator for {GEN_EPOCHS} epochs ({total_steps} optimizer steps)\")\n",
581
  "print(f\"Effective batch size: {BATCH_SIZE} \u00d7 {GRAD_ACCUM} = {BATCH_SIZE * GRAD_ACCUM}\")\n",
582
- "print(f\"Warmup: {WARMUP_STEPS} steps, then cosine decay to 0\")\n",
583
- "print()\n",
584
- "print(f\"{'Epoch':>6} {'Loss':>10} {'VelLoss':>10} {'MeanT':>8} {'LR':>10} {'Time':>8}\")\n",
585
- "print(\"\u2500\" * 60)\n",
586
  "\n",
587
  "iris.generator.train()\n",
588
  "global_step = 0\n",
589
  "best_loss = float('inf')\n",
590
  "\n",
591
- "for epoch in range(GEN_EPOCHS):\n",
 
592
  " epoch_vel = 0\n",
593
  " epoch_total = 0\n",
594
  " n_batches = 0\n",
595
- " t0 = time.time()\n",
596
  "\n",
597
  " optimizer_gen.zero_grad(set_to_none=True)\n",
598
  "\n",
599
- " for batch_idx, (images, captions) in enumerate(train_loader):\n",
600
- " images = images.to(device)\n",
601
- "\n",
602
- " # Encode text with CLIP\n",
603
- " with torch.no_grad():\n",
604
- " text_emb = encode_text(list(captions)) # [B, 77, 768]\n",
605
  "\n",
606
  " # Forward pass with mixed precision\n",
607
- " with autocast(dtype=torch.float16):\n",
608
- " # Randomly sample iteration count for robustness\n",
609
- " r = [4, 5, 6, 7, 8][torch.randint(0, 5, (1,)).item()]\n",
610
  " result = iris.train_step(images, text_emb, num_iterations=r)\n",
611
  " loss = result[\"loss\"] / GRAD_ACCUM\n",
612
  "\n",
@@ -630,17 +653,16 @@
630
  " avg_total = epoch_total / n_batches\n",
631
  " gen_losses[\"velocity\"].append(avg_vel)\n",
632
  " gen_losses[\"total\"].append(avg_total)\n",
633
- " dt = time.time() - t0\n",
634
  "\n",
635
  " if avg_vel < best_loss:\n",
636
  " best_loss = avg_vel\n",
637
  "\n",
638
- " if (epoch + 1) % 10 == 0 or epoch == 0:\n",
639
- " lr = optimizer_gen.param_groups[0][\"lr\"]\n",
640
- " print(f\"{epoch+1:>6} {avg_total:>10.4f} {avg_vel:>10.4f} \"\n",
641
- " f\"{result['mean_t']:>8.3f} {lr:>10.2e} {dt:>7.1f}s\")\n",
642
  "\n",
643
- "print(f\"\\n\u2705 Generator training complete! Best velocity loss: {best_loss:.4f}\")"
 
644
  ],
645
  "outputs": [],
646
  "execution_count": null
 
359
  "source": [
360
  "# \u2500\u2500\u2500 VAE Training Loop \u2500\u2500\u2500\n",
361
  "import time\n",
362
+ "from tqdm.auto import tqdm\n",
363
  "\n",
364
  "VAE_EPOCHS = 80 # Enough to get good reconstructions\n",
365
  "VAE_LR = 1e-4\n",
 
368
  "\n",
369
  "optimizer_vae = torch.optim.AdamW(vae.parameters(), lr=VAE_LR, weight_decay=0.01)\n",
370
  "scheduler_vae = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_vae, T_max=VAE_EPOCHS)\n",
371
+ "scaler = torch.amp.GradScaler('cuda')\n",
372
  "dwt = HaarDWT2D()\n",
373
  "\n",
374
  "# Logging\n",
375
  "vae_losses = {\"total\": [], \"recon\": [], \"kl\": [], \"freq\": []}\n",
376
  "\n",
377
  "print(f\"Training VAE for {VAE_EPOCHS} epochs on {len(train_loader)} batches...\")\n",
 
 
378
  "\n",
379
  "vae.train()\n",
380
+ "pbar = tqdm(range(VAE_EPOCHS), desc=\"VAE Training\")\n",
381
+ "for epoch in pbar:\n",
382
  " epoch_losses = {\"total\": 0, \"recon\": 0, \"kl\": 0, \"freq\": 0}\n",
 
383
  "\n",
384
  " for images, _ in train_loader:\n",
385
+ " images = images.to(device, non_blocking=True)\n",
386
  "\n",
387
+ " with torch.amp.autocast('cuda', dtype=torch.float16):\n",
388
  " x_recon, mean, logvar = vae(images)\n",
389
  "\n",
390
  " # Reconstruction loss\n",
 
420
  " vae_losses[k].append(epoch_losses[k])\n",
421
  "\n",
422
  " scheduler_vae.step()\n",
423
+ " pbar.set_postfix(loss=f\"{epoch_losses['total']:.4f}\", recon=f\"{epoch_losses['recon']:.4f}\")\n",
 
 
 
 
 
424
  "\n",
425
  "print(\"\\n\u2705 VAE training complete!\")"
426
  ],
 
532
  "print(f\" Effective at r=6: ~{gen_params + 5*core_params:,} effective params\")\n",
533
  "print(f\" Memory fp16: {gen_params*2/1024/1024:.1f} MB\")\n",
534
  "\n",
535
+ "# \u2500\u2500\u2500 Pre-cache CLIP text embeddings (HUGE speedup) \u2500\u2500\u2500\n",
536
+ "# Instead of encoding text every batch, cache all embeddings upfront\n",
537
+ "print(\"\\nPre-caching CLIP text embeddings...\")\n",
538
+ "all_text_embeddings = []\n",
539
+ "cache_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, num_workers=0)\n",
540
+ "with torch.no_grad():\n",
541
+ " for _, captions in tqdm(cache_loader, desc=\"Encoding text\"):\n",
542
+ " emb = encode_text(list(captions))\n",
543
+ " all_text_embeddings.append(emb.cpu())\n",
544
+ "all_text_embeddings = torch.cat(all_text_embeddings, dim=0) # [N, 77, 768]\n",
545
+ "print(f\"\u2705 Cached {all_text_embeddings.shape[0]} text embeddings: {all_text_embeddings.shape}\")\n",
546
+ "\n",
547
+ "# Free CLIP from GPU (we don't need it during training anymore!)\n",
548
+ "text_encoder.cpu()\n",
549
+ "torch.cuda.empty_cache()\n",
550
+ "print(\"\u2705 CLIP moved to CPU to free ~600MB VRAM\")\n",
551
+ "\n",
552
+ "# Create a new dataset that uses cached embeddings\n",
553
+ "class CachedDataset(Dataset):\n",
554
+ " def __init__(self, image_dataset, cached_text_emb):\n",
555
+ " self.image_dataset = image_dataset\n",
556
+ " self.text_emb = cached_text_emb\n",
557
+ " def __len__(self):\n",
558
+ " return len(self.image_dataset)\n",
559
+ " def __getitem__(self, idx):\n",
560
+ " image, _ = self.image_dataset[idx]\n",
561
+ " return image, self.text_emb[idx]\n",
562
+ "\n",
563
+ "cached_dataset = CachedDataset(train_dataset, all_text_embeddings)\n",
564
+ "cached_loader = DataLoader(\n",
565
+ " cached_dataset, batch_size=BATCH_SIZE, shuffle=True,\n",
566
+ " num_workers=NUM_WORKERS, pin_memory=True, drop_last=True,\n",
567
+ ")\n",
568
+ "\n",
569
  "# Free standalone VAE to save memory\n",
570
  "del vae, optimizer_vae, scheduler_vae\n",
571
  "torch.cuda.empty_cache()"
 
578
  "metadata": {},
579
  "source": [
580
  "# \u2500\u2500\u2500 Generator Training Loop \u2500\u2500\u2500\n",
581
+ "import time\n",
582
+ "\n",
583
  "GEN_EPOCHS = 150 # More epochs for small dataset\n",
584
  "GEN_LR = 2e-4 # Higher LR works well with AdamW + cosine\n",
585
  "GRAD_ACCUM = 2 # Effective batch = BATCH_SIZE \u00d7 GRAD_ACCUM = 8\n",
 
592
  " betas=(0.9, 0.95),\n",
593
  ")\n",
594
  "\n",
595
+ "total_steps = GEN_EPOCHS * len(cached_loader) // GRAD_ACCUM\n",
596
  "\n",
597
  "def lr_lambda(step):\n",
598
  " if step < WARMUP_STEPS:\n",
 
601
  " return 0.5 * (1 + __import__('math').cos(__import__('math').pi * progress))\n",
602
  "\n",
603
  "scheduler_gen = torch.optim.lr_scheduler.LambdaLR(optimizer_gen, lr_lambda)\n",
604
+ "scaler_gen = torch.amp.GradScaler('cuda')\n",
605
  "\n",
606
  "# Logging\n",
607
  "gen_losses = {\"total\": [], \"velocity\": [], \"kl\": []}\n",
608
  "\n",
609
  "print(f\"Training generator for {GEN_EPOCHS} epochs ({total_steps} optimizer steps)\")\n",
610
  "print(f\"Effective batch size: {BATCH_SIZE} \u00d7 {GRAD_ACCUM} = {BATCH_SIZE * GRAD_ACCUM}\")\n",
611
+ "print(f\"Using cached CLIP embeddings (no per-batch encoding overhead)\")\n",
 
 
 
612
  "\n",
613
  "iris.generator.train()\n",
614
  "global_step = 0\n",
615
  "best_loss = float('inf')\n",
616
  "\n",
617
+ "pbar = tqdm(range(GEN_EPOCHS), desc=\"Gen Training\")\n",
618
+ "for epoch in pbar:\n",
619
  " epoch_vel = 0\n",
620
  " epoch_total = 0\n",
621
  " n_batches = 0\n",
 
622
  "\n",
623
  " optimizer_gen.zero_grad(set_to_none=True)\n",
624
  "\n",
625
+ " for batch_idx, (images, text_emb) in enumerate(cached_loader):\n",
626
+ " images = images.to(device, non_blocking=True)\n",
627
+ " text_emb = text_emb.to(device, non_blocking=True)\n",
 
 
 
628
  "\n",
629
  " # Forward pass with mixed precision\n",
630
+ " with torch.amp.autocast('cuda', dtype=torch.float16):\n",
631
+ " # Randomly sample iteration count for robustness (keep low for speed)\n",
632
+ " r = [3, 4, 5][torch.randint(0, 3, (1,)).item()]\n",
633
  " result = iris.train_step(images, text_emb, num_iterations=r)\n",
634
  " loss = result[\"loss\"] / GRAD_ACCUM\n",
635
  "\n",
 
653
  " avg_total = epoch_total / n_batches\n",
654
  " gen_losses[\"velocity\"].append(avg_vel)\n",
655
  " gen_losses[\"total\"].append(avg_total)\n",
 
656
  "\n",
657
  " if avg_vel < best_loss:\n",
658
  " best_loss = avg_vel\n",
659
  "\n",
660
+ " pbar.set_postfix(vel_loss=f\"{avg_vel:.4f}\", best=f\"{best_loss:.4f}\")\n",
661
+ "\n",
662
+ "print(f\"\\n\u2705 Generator training complete! Best velocity loss: {best_loss:.4f}\")\n",
 
663
  "\n",
664
+ "# Reload CLIP for generation\n",
665
+ "text_encoder.to(device)"
666
  ],
667
  "outputs": [],
668
  "execution_count": null