Leacb4
/

gap-clip

@@ -14,6 +14,7 @@ from __future__ import annotations
 import difflib
 import hashlib
 import sys
 from pathlib import Path
 from io import BytesIO
@@ -33,17 +34,13 @@ if str(_PROJECT_ROOT) not in sys.path:
     sys.path.insert(0, str(_PROJECT_ROOT))
 from config import (  # type: ignore
     column_local_image_path,
     fashion_mnist_csv,
     local_dataset_path,
     images_dir,
 )
-_VALID_COLORS = [
-    "beige", "black", "blue", "brown", "green",
-    "orange", "pink", "purple", "red", "white", "yellow",
-]
 # ---------------------------------------------------------------------------
 # Fashion-MNIST helpers
 # ---------------------------------------------------------------------------
@@ -215,7 +212,6 @@ class KaggleDataset(Dataset):
         self.transform = transforms.Compose([
             transforms.Resize((224, 224)),
-            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         ])
@@ -244,17 +240,40 @@ class KaggleDataset(Dataset):
         return image, description, color
 def load_kaggle_marqo_dataset(
     max_samples: int = 5000,
     include_hierarchy: bool = False,
 ) -> KaggleDataset:
-    """Download and prepare the KAGL Marqo HuggingFace dataset."""
-    from datasets import load_dataset  # type: ignore
-    print("Loading KAGL Marqo dataset...")
-    dataset = load_dataset("Marqo/KAGL")
-    df = dataset["data"].to_pandas()
-    print(f"Dataset loaded: {len(df)} samples, columns: {list(df.columns)}")
     df = df.dropna(subset=["text", "image"])
@@ -269,8 +288,8 @@ def load_kaggle_marqo_dataset(
     })
     kaggle_df = kaggle_df.dropna(subset=["color"])
-    kaggle_df = kaggle_df[kaggle_df["color"].isin(_VALID_COLORS)]
-    print(f"After color filtering: {len(kaggle_df)} samples, colors: {sorted(kaggle_df['color'].unique())}")
     return KaggleDataset(kaggle_df, include_hierarchy=include_hierarchy)
@@ -289,7 +308,6 @@ class LocalDataset(Dataset):
         self.transform = transforms.Compose([
             transforms.Resize((224, 224)),
-            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         ])
@@ -301,7 +319,9 @@ class LocalDataset(Dataset):
         row = self.dataframe.iloc[idx]
         try:
             image_path = row.get(column_local_image_path) if hasattr(row, "get") else None
-            if isinstance(image_path, str) and image_path and Path(image_path).exists():
                 image = Image.open(image_path).convert("RGB")
             else:
                 # Fallback: download image from URL (and cache).
@@ -339,10 +359,21 @@ class LocalDataset(Dataset):
 def load_local_validation_dataset(
     max_samples: int = 5000,
     include_hierarchy: bool = False,
 ) -> LocalDataset:
-    """Load and prepare the internal local validation dataset."""
-    print("Loading local validation dataset...")
-    df = pd.read_csv(local_dataset_path)
     print(f"Dataset loaded: {len(df)} samples")
     if column_local_image_path in df.columns:
@@ -352,7 +383,6 @@ def load_local_validation_dataset(
         print(f"Column '{column_local_image_path}' not found; falling back to 'image_url'.")
     if "color" in df.columns:
-        df = df[df["color"].isin(_VALID_COLORS)]
         print(f"After color filtering: {len(df)} samples, colors: {sorted(df['color'].unique())}")
     if len(df) > max_samples:
@@ -376,6 +406,10 @@ def collate_fn_filter_none(batch):
     if not batch:
         print("Empty batch after filtering None values")
         return torch.tensor([]), [], []
     images, texts, colors = zip(*batch)
     return torch.stack(images), list(texts), list(colors)

 import difflib
 import hashlib
+import os
 import sys
 from pathlib import Path
 from io import BytesIO
     sys.path.insert(0, str(_PROJECT_ROOT))
 from config import (  # type: ignore
+    ROOT_DIR,
     column_local_image_path,
     fashion_mnist_csv,
     local_dataset_path,
     images_dir,
 )
 # ---------------------------------------------------------------------------
 # Fashion-MNIST helpers
 # ---------------------------------------------------------------------------
         self.transform = transforms.Compose([
             transforms.Resize((224, 224)),
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         ])
         return image, description, color
+def download_kaggle_raw_df() -> pd.DataFrame:
+    """Download the raw KAGL Marqo DataFrame from HuggingFace.
+    This is the expensive network operation.  Callers can cache the result
+    and pass it to :func:`load_kaggle_marqo_dataset` via *raw_df* to avoid
+    repeated downloads.
+    """
+    from datasets import load_dataset  # type: ignore
+    print("Downloading KAGL Marqo dataset from HuggingFace...")
+    dataset = load_dataset("Marqo/KAGL")
+    df = dataset["data"].to_pandas()
+    print(f"KAGL dataset downloaded: {len(df)} samples, columns: {list(df.columns)}")
+    return df
 def load_kaggle_marqo_dataset(
     max_samples: int = 5000,
     include_hierarchy: bool = False,
+    raw_df: Optional[pd.DataFrame] = None,
 ) -> KaggleDataset:
+    """Download and prepare the KAGL Marqo HuggingFace dataset.
+    Args:
+        max_samples: Maximum number of samples to return.
+        include_hierarchy: If True, dataset tuples include a hierarchy element.
+        raw_df: Pre-downloaded DataFrame (from :func:`download_kaggle_raw_df`).
+            If *None*, the dataset is downloaded from HuggingFace.
+    """
+    if raw_df is not None:
+        df = raw_df.copy()
+        print(f"Using cached KAGL DataFrame: {len(df)} samples")
+    else:
+        df = download_kaggle_raw_df()
     df = df.dropna(subset=["text", "image"])
     })
     kaggle_df = kaggle_df.dropna(subset=["color"])
+    print(f"Colors: {sorted(kaggle_df['color'].unique())}")
     return KaggleDataset(kaggle_df, include_hierarchy=include_hierarchy)
         self.transform = transforms.Compose([
             transforms.Resize((224, 224)),
             transforms.ToTensor(),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         ])
         row = self.dataframe.iloc[idx]
         try:
             image_path = row.get(column_local_image_path) if hasattr(row, "get") else None
+            if isinstance(image_path, str) and image_path:
+                if not os.path.isabs(image_path):
+                    image_path = str(ROOT_DIR / image_path)
                 image = Image.open(image_path).convert("RGB")
             else:
                 # Fallback: download image from URL (and cache).
 def load_local_validation_dataset(
     max_samples: int = 5000,
     include_hierarchy: bool = False,
+    raw_df: Optional[pd.DataFrame] = None,
 ) -> LocalDataset:
+    """Load and prepare the internal local validation dataset.
+    Args:
+        max_samples: Maximum number of samples to return.
+        include_hierarchy: If True, dataset tuples include a hierarchy element.
+        raw_df: Pre-loaded DataFrame. If *None*, the CSV is read from disk.
+    """
+    if raw_df is not None:
+        df = raw_df.copy()
+        print(f"Using cached local DataFrame: {len(df)} samples")
+    else:
+        print("Loading local validation dataset...")
+        df = pd.read_csv(local_dataset_path)
     print(f"Dataset loaded: {len(df)} samples")
     if column_local_image_path in df.columns:
         print(f"Column '{column_local_image_path}' not found; falling back to 'image_url'.")
     if "color" in df.columns:
         print(f"After color filtering: {len(df)} samples, colors: {sorted(df['color'].unique())}")
     if len(df) > max_samples:
     if not batch:
         print("Empty batch after filtering None values")
         return torch.tensor([]), [], []
+    # Support both 3-value (image, text, color) and 4-value (image, text, color, hierarchy) tuples
+    if len(batch[0]) == 4:
+        images, texts, colors, hierarchies = zip(*batch)
+        return torch.stack(images), list(texts), list(colors), list(hierarchies)
     images, texts, colors = zip(*batch)
     return torch.stack(images), list(texts), list(colors)