Leacb4
/

gap-clip

@@ -20,31 +20,16 @@ Paper reference: Section 5.1, Table 1.
 """
 import os
-import json
-import hashlib
-import requests
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import sys
 from pathlib import Path
 import torch
-import pandas as pd
-import numpy as np
 import matplotlib.pyplot as plt
-import seaborn as sns
-import difflib
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
-from collections import defaultdict
-from tqdm import tqdm
-from torch.utils.data import Dataset, DataLoader
-from torchvision import transforms
-from PIL import Image
-from io import BytesIO
 import warnings
 warnings.filterwarnings('ignore')
-from transformers import CLIPProcessor, CLIPModel as CLIPModel_transformers
-from huggingface_hub import hf_hub_download
 # Ensure project root is importable when running this file directly.
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
@@ -54,220 +39,20 @@ if str(PROJECT_ROOT) not in sys.path:
 from config import (
     color_model_path,
     color_emb_dim,
-    local_dataset_path,
-    column_local_image_path,
-    tokeniser_path,
-    images_dir,
 )
-from training.color_model import ColorCLIP, Tokenizer
-class KaggleDataset(Dataset):
-    """Dataset class for KAGL Marqo dataset"""
-    def __init__(self, dataframe, image_size=224):
-        self.dataframe = dataframe
-        self.image_size = image_size
-        # Transforms for validation (no augmentation)
-        self.transform = transforms.Compose([
-        transforms.Resize((224, 224)),
-        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),  # AUGMENTATION
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-    ])
-    def __len__(self):
-        return len(self.dataframe)
-    def __getitem__(self, idx):
-        row = self.dataframe.iloc[idx]
-        # Handle image - it should be in row['image_url'] and contain the image data as bytes
-        image_data = row['image_url']
-        # Check if image_data has 'bytes' key or is already PIL Image
-        if isinstance(image_data, dict) and 'bytes' in image_data:
-            image = Image.open(BytesIO(image_data['bytes'])).convert("RGB")
-        elif hasattr(image_data, 'convert'):  # Already a PIL Image
-            image = image_data.convert("RGB")
-        else:
-            # Assume it's raw bytes
-            image = Image.open(BytesIO(image_data)).convert("RGB")
-        # Apply validation transform
-        image = self.transform(image)
-        # Get text and labels
-        description = row['text']
-        color = row['color']
-        return image, description, color
-def load_kaggle_marqo_dataset(max_samples=5000):
-    """Load and prepare Kaggle KAGL dataset with memory optimization"""
-    from datasets import load_dataset
-    print("📊 Loading Kaggle KAGL dataset...")
-    # Load the dataset
-    dataset = load_dataset("Marqo/KAGL")
-    df = dataset["data"].to_pandas()
-    print(f"✅ Dataset Kaggle loaded")
-    print(f" Before filtering: {len(df)} samples")
-    print(f" Available columns: {list(df.columns)}")
-    # Ensure we have text and image data
-    df = df.dropna(subset=['text', 'image'])
-    print(f" After removing missing text/image: {len(df)} samples")
-    df_test = df.copy()
-    # Limit to max_samples with RANDOM SAMPLING to get diverse colors
-    if len(df_test) > max_samples:
-        df_test = df_test.sample(n=max_samples, random_state=42)
-        print(f"📊 Randomly sampled {max_samples} samples from Kaggle dataset")
-    # Create formatted dataset with proper column names
-    kaggle_formatted = pd.DataFrame({
-        'image_url': df_test['image'],  # This contains image data as bytes
-        'text': df_test['text'],
-        'color': df_test['baseColour'].str.lower().str.replace("grey", "gray")  # Use actual colors
-    })
-    # Filter out rows with None/NaN colors
-    before_color_filter = len(kaggle_formatted)
-    kaggle_formatted = kaggle_formatted.dropna(subset=['color'])
-    if len(kaggle_formatted) < before_color_filter:
-        print(f" After removing missing colors: {len(kaggle_formatted)} samples (removed {before_color_filter - len(kaggle_formatted)} samples)")
-    # Filter for colors that were used during training (11 colors)
-    valid_colors = ['beige', 'black', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'white', 'yellow']
-    before_valid_filter = len(kaggle_formatted)
-    kaggle_formatted = kaggle_formatted[kaggle_formatted['color'].isin(valid_colors)]
-    print(f" After filtering for valid colors: {len(kaggle_formatted)} samples (removed {before_valid_filter - len(kaggle_formatted)} samples)")
-    print(f" Valid colors found: {sorted(kaggle_formatted['color'].unique())}")
-    print(f" Final dataset size: {len(kaggle_formatted)} samples")
-    # Show color distribution in final dataset
-    print(f"🎨 Color distribution in Kaggle dataset:")
-    color_counts = kaggle_formatted['color'].value_counts()
-    for color in color_counts.index:
-        print(f"   {color}: {color_counts[color]} samples")
-    return KaggleDataset(kaggle_formatted)
-class LocalDataset(Dataset):
-    """Dataset class for local validation dataset"""
-    def __init__(self, dataframe, image_size=224):
-        self.dataframe = dataframe
-        self.image_size = image_size
-        # Transforms for validation (no augmentation)
-        self.transform = transforms.Compose([
-        transforms.Resize((224, 224)),
-        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),  # AUGMENTATION
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-    ])
-    def __len__(self):
-        return len(self.dataframe)
-    def __getitem__(self, idx):
-        row = self.dataframe.iloc[idx]
-        try:
-            # Try local path first
-            image_path = row.get(column_local_image_path) if hasattr(row, 'get') else None
-            if isinstance(image_path, str) and image_path and os.path.exists(image_path):
-                image = Image.open(image_path).convert("RGB")
-            else:
-                # Fallback: download from image_url with caching
-                image_url = row.get('image_url') if hasattr(row, 'get') else None
-                if isinstance(image_url, str) and image_url:
-                    cache_dir = Path(images_dir)
-                    cache_dir.mkdir(parents=True, exist_ok=True)
-                    url_hash = hashlib.md5(image_url.encode("utf-8")).hexdigest()
-                    cache_path = cache_dir / f"{url_hash}.jpg"
-                    if cache_path.exists():
-                        image = Image.open(cache_path).convert("RGB")
-                    else:
-                        resp = requests.get(image_url, timeout=10)
-                        resp.raise_for_status()
-                        image = Image.open(BytesIO(resp.content)).convert("RGB")
-                        image.save(cache_path, "JPEG", quality=85, optimize=True)
-                else:
-                    raise ValueError("No valid image_path or image_url")
-        except Exception as e:
-            image = Image.new('RGB', (224, 224), color='gray')
-        # Apply transform
-        image = self.transform(image)
-        # Get text and labels
-        description = row['text']
-        color = row['color']
-        return image, description, color
-def load_local_validation_dataset(max_samples=5000):
-    """Load and prepare local validation dataset"""
-    print("📊 Loading local validation dataset...")
-    df = pd.read_csv(local_dataset_path)
-    print(f"✅ Dataset loaded: {len(df)} samples")
-    # Filter out rows with NaN values in image path (use whichever column exists)
-    img_col = column_local_image_path if column_local_image_path in df.columns else 'image_url'
-    df_clean = df.dropna(subset=[img_col])
-    print(f"📊 After filtering NaN image paths ({img_col}): {len(df_clean)} samples")
-    # Filter for colors that were used during training (11 colors)
-    valid_colors = ['beige', 'black', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'white', 'yellow']
-    if 'color' in df_clean.columns:
-        before_valid_filter = len(df_clean)
-        df_clean = df_clean[df_clean['color'].isin(valid_colors)]
-        print(f"📊 After filtering for valid colors: {len(df_clean)} samples (removed {before_valid_filter - len(df_clean)} samples)")
-        print(f"🎨 Valid colors found: {sorted(df_clean['color'].unique())}")
-    # Limit to max_samples with RANDOM SAMPLING to get diverse colors
-    if len(df_clean) > max_samples:
-        df_clean = df_clean.sample(n=max_samples, random_state=42)
-        print(f"📊 Randomly sampled {max_samples} samples")
-    print(f"📊 Using {len(df_clean)} samples for evaluation")
-    # Show color distribution after sampling
-    if 'color' in df_clean.columns:
-        print(f"🎨 Color distribution in sampled data:")
-        color_counts = df_clean['color'].value_counts()
-        print(f"   Total unique colors: {len(color_counts)}")
-        for color in color_counts.index[:15]:  # Show top 15
-            print(f"   {color}: {color_counts[color]} samples")
-    return LocalDataset(df_clean)
-def collate_fn_filter_none(batch):
-    """Collate function that filters out None values from batch with debug print"""
-    # Filter out None values
-    original_len = len(batch)
-    batch = [item for item in batch if item is not None]
-    if original_len > len(batch):
-        print(f"⚠️ Filtered out {original_len - len(batch)} None values from batch (original: {original_len}, filtered: {len(batch)})")
-    if len(batch) == 0:
-        # Return empty batch with correct structure
-        print("⚠️ Empty batch after filtering None values")
-        return torch.tensor([]), [], []
-    images, texts, colors = zip(*batch)
-    images = torch.stack(images, dim=0)
-    return images, list(texts), list(colors)
 class ColorEvaluator:
@@ -277,325 +62,54 @@ class ColorEvaluator:
         self,
         device='mps',
         directory="figures/confusion_matrices/cm_color",
-        repo_id="Leacb4/gap-clip",
-        cache_dir="./models_cache",
     ):
         self.device = torch.device(device)
         self.directory = directory
         self.color_emb_dim = color_emb_dim
-        self.repo_id = repo_id
-        self.cache_dir = cache_dir
         os.makedirs(self.directory, exist_ok=True)
-        # Load baseline Fashion CLIP model
-        print("📦 Loading baseline Fashion CLIP model...")
-        patrick_model_name = "patrickjohncyh/fashion-clip"
-        self.baseline_processor = CLIPProcessor.from_pretrained(patrick_model_name)
-        self.baseline_model = CLIPModel_transformers.from_pretrained(patrick_model_name).to(self.device)
-        self.baseline_model.eval()
-        print("✅ Baseline Fashion CLIP model loaded successfully")
-        # Load specialized color model (16D)
-        self.color_model = None
-        self.color_tokenizer = None
-        self._load_color_model()
-    def _load_color_model(self):
-        """Load the specialized 16D color model and tokenizer."""
-        if self.color_model is not None and self.color_tokenizer is not None:
-            return
-        local_model_exists = os.path.exists(color_model_path)
-        local_tokenizer_exists = os.path.exists(tokeniser_path)
-        if local_model_exists and local_tokenizer_exists:
-            print("🎨 Loading specialized color model (16D) from local files...")
-            state_dict = torch.load(color_model_path, map_location=self.device)
-            with open(tokeniser_path, "r") as f:
-                vocab = json.load(f)
-        else:
-            print("🎨 Local color model/tokenizer not found. Loading from Hugging Face...")
-            print(f"   Repo: {self.repo_id}")
-            hf_model_path = hf_hub_download(
-                repo_id=self.repo_id,
-                filename="color_model.pt",
-                cache_dir=self.cache_dir,
-            )
-            hf_vocab_path = hf_hub_download(
-                repo_id=self.repo_id,
-                filename="tokenizer_vocab.json",
-                cache_dir=self.cache_dir,
-            )
-            state_dict = torch.load(hf_model_path, map_location=self.device)
-            with open(hf_vocab_path, "r") as f:
-                vocab = json.load(f)
-        # Get vocab size from the embedding weight shape in checkpoint
-        vocab_size = state_dict['text_encoder.embedding.weight'].shape[0]
-        print(f"   Detected vocab size from checkpoint: {vocab_size}")
-        self.color_tokenizer = Tokenizer()
-        self.color_tokenizer.load_vocab(vocab)
-        # Create model with the vocab size from checkpoint (not from tokenizer)
-        self.color_model = ColorCLIP(vocab_size=vocab_size, embedding_dim=self.color_emb_dim)
-        # Load state dict
-        self.color_model.load_state_dict(state_dict)
-        self.color_model.to(self.device)
-        self.color_model.eval()
-        print("✅ Color model loaded successfully")
-    def _tokenize_color_texts(self, texts):
-        """Tokenize texts with the color tokenizer and return padded tensors."""
-        token_lists = [self.color_tokenizer(t) for t in texts]
-        max_len = max((len(toks) for toks in token_lists), default=0)
-        max_len = max_len if max_len > 0 else 1
-        input_ids = torch.zeros(len(texts), max_len, dtype=torch.long, device=self.device)
-        lengths = torch.zeros(len(texts), dtype=torch.long, device=self.device)
-        for i, toks in enumerate(token_lists):
-            if len(toks) > 0:
-                input_ids[i, :len(toks)] = torch.tensor(toks, dtype=torch.long, device=self.device)
-                lengths[i] = len(toks)
-            else:
-                lengths[i] = 1  # avoid zero-length
-        return input_ids, lengths
-    def extract_color_embeddings(self, dataloader, embedding_type='text', max_samples=10000):
-        """Extract 16D color embeddings from specialized color model."""
-        self._load_color_model()
-        all_embeddings = []
-        all_colors = []
-        sample_count = 0
-        with torch.no_grad():
-            for batch in tqdm(dataloader, desc=f"Extracting {embedding_type} color embeddings"):
-                if sample_count >= max_samples:
-                    break
-                images, texts, colors = batch
-                images = images.to(self.device)
-                images = images.expand(-1, 3, -1, -1)
-                if embedding_type == 'text':
-                    input_ids, lengths = self._tokenize_color_texts(texts)
-                    embeddings = self.color_model.text_encoder(input_ids, lengths)
-                elif embedding_type == 'image':
-                    embeddings = self.color_model.image_encoder(images)
-                else:
-                    input_ids, lengths = self._tokenize_color_texts(texts)
-                    embeddings = self.color_model.text_encoder(input_ids, lengths)
-                all_embeddings.append(embeddings.cpu().numpy())
-                normalized_colors = [str(c).lower().strip().replace("grey", "gray") for c in colors]
-                all_colors.extend(normalized_colors)
-                sample_count += len(images)
-                del images, embeddings
-                if embedding_type != 'image':
-                    del input_ids, lengths
-                torch.cuda.empty_cache() if torch.cuda.is_available() else None
-        return np.vstack(all_embeddings), all_colors
-    def extract_baseline_embeddings_batch(self, dataloader, embedding_type='text', max_samples=10000):
-        """Extract embeddings from baseline Fashion CLIP model"""
-        all_embeddings = []
-        all_colors = []
-        sample_count = 0
-        with torch.no_grad():
-            for batch in tqdm(dataloader, desc=f"Extracting baseline {embedding_type} embeddings"):
-                if sample_count >= max_samples:
-                    break
-                images, texts, colors = batch
-                images = images.to(self.device)
-                images = images.expand(-1, 3, -1, -1)  # Ensure 3 channels
-                # Process text inputs with baseline processor
-                text_inputs = self.baseline_processor(text=texts, padding=True, return_tensors="pt")
-                text_inputs = {k: v.to(self.device) for k, v in text_inputs.items()}
-                # Forward pass through baseline model
-                outputs = self.baseline_model(**text_inputs, pixel_values=images)
-                # Extract embeddings based on type
-                if embedding_type == 'text':
-                    embeddings = outputs.text_embeds
-                elif embedding_type == 'image':
-                    embeddings = outputs.image_embeds
-                else:
-                    embeddings = outputs.text_embeds
-                all_embeddings.append(embeddings.cpu().numpy())
-                all_colors.extend(colors)
-                sample_count += len(images)
-                # Clear GPU memory
-                del images, text_inputs, outputs, embeddings
-                torch.cuda.empty_cache() if torch.cuda.is_available() else None
-        return np.vstack(all_embeddings), all_colors
-    def compute_similarity_metrics(self, embeddings, labels):
-        """Compute intra-class and inter-class similarities - optimized version"""
-        max_samples = min(5000, len(embeddings))
-        if len(embeddings) > max_samples:
-            indices = np.random.choice(len(embeddings), max_samples, replace=False)
-            embeddings = embeddings[indices]
-            labels = [labels[i] for i in indices]
-        similarities = cosine_similarity(embeddings)
-        # Create label groups using numpy for faster indexing
-        label_array = np.array(labels)
-        unique_labels = np.unique(label_array)
-        label_groups = {label: np.where(label_array == label)[0] for label in unique_labels}
-        # Compute intra-class similarities using vectorized operations
-        intra_class_similarities = []
-        for label, indices in label_groups.items():
-            if len(indices) > 1:
-                # Extract submatrix for this class
-                class_similarities = similarities[np.ix_(indices, indices)]
-                # Get upper triangle (excluding diagonal)
-                triu_indices = np.triu_indices_from(class_similarities, k=1)
-                intra_class_similarities.extend(class_similarities[triu_indices].tolist())
-        # Compute inter-class similarities using vectorized operations
-        inter_class_similarities = []
-        labels_list = list(label_groups.keys())
-        for i in range(len(labels_list)):
-            for j in range(i + 1, len(labels_list)):
-                label1_indices = label_groups[labels_list[i]]
-                label2_indices = label_groups[labels_list[j]]
-                # Extract submatrix between two classes
-                inter_sims = similarities[np.ix_(label1_indices, label2_indices)]
-                inter_class_similarities.extend(inter_sims.flatten().tolist())
-        nn_accuracy = self.compute_embedding_accuracy(embeddings, labels, similarities)
-        centroid_accuracy = self.compute_centroid_accuracy(embeddings, labels)
-        return {
-            'intra_class_similarities': intra_class_similarities,
-            'inter_class_similarities': inter_class_similarities,
-            'intra_class_mean': float(np.mean(intra_class_similarities)) if intra_class_similarities else 0.0,
-            'inter_class_mean': float(np.mean(inter_class_similarities)) if inter_class_similarities else 0.0,
-            'separation_score': float(np.mean(intra_class_similarities) - np.mean(inter_class_similarities)) if intra_class_similarities and inter_class_similarities else 0.0,
-            'accuracy': nn_accuracy,
-            'centroid_accuracy': centroid_accuracy,
-        }
-    def compute_embedding_accuracy(self, embeddings, labels, similarities):
-        """Compute classification accuracy using nearest neighbor"""
-        correct_predictions = 0
-        total_predictions = len(labels)
-        for i in range(len(embeddings)):
-            true_label = labels[i]
-            similarities_row = similarities[i].copy()
-            similarities_row[i] = -1
-            nearest_neighbor_idx = int(np.argmax(similarities_row))
-            predicted_label = labels[nearest_neighbor_idx]
-            if predicted_label == true_label:
-                correct_predictions += 1
-        return correct_predictions / total_predictions if total_predictions > 0 else 0.0
-    def compute_centroid_accuracy(self, embeddings, labels):
-        """Compute classification accuracy using centroids - optimized vectorized version"""
-        unique_labels = list(set(labels))
-        # Compute centroids efficiently
-        centroids = {}
-        for label in unique_labels:
-            label_mask = np.array(labels) == label
-            centroids[label] = np.mean(embeddings[label_mask], axis=0)
-        # Stack centroids for vectorized similarity computation
-        centroid_matrix = np.vstack([centroids[label] for label in unique_labels])
-        # Compute all similarities at once
-        similarities = cosine_similarity(embeddings, centroid_matrix)
-        # Get predicted labels
-        predicted_indices = np.argmax(similarities, axis=1)
-        predicted_labels = [unique_labels[idx] for idx in predicted_indices]
-        # Compute accuracy
-        correct_predictions = sum(pred == true for pred, true in zip(predicted_labels, labels))
-        return correct_predictions / len(labels) if len(labels) > 0 else 0.0
-    def predict_labels_from_embeddings(self, embeddings, labels):
-        """Predict labels from embeddings using centroid-based classification - optimized vectorized version"""
-        # Filter out None labels when computing centroids
-        unique_labels = [l for l in set(labels) if l is not None]
-        if len(unique_labels) == 0:
-            # If no valid labels, return None for all predictions
-            return [None] * len(embeddings)
-        # Compute centroids efficiently
-        centroids = {}
-        for label in unique_labels:
-            label_mask = np.array(labels) == label
-            if np.any(label_mask):
-                centroids[label] = np.mean(embeddings[label_mask], axis=0)
-        # Stack centroids for vectorized similarity computation
-        centroid_labels = list(centroids.keys())
-        centroid_matrix = np.vstack([centroids[label] for label in centroid_labels])
-        # Compute all similarities at once
-        similarities = cosine_similarity(embeddings, centroid_matrix)
-        # Get predicted labels
-        predicted_indices = np.argmax(similarities, axis=1)
-        predictions = [centroid_labels[idx] for idx in predicted_indices]
-        return predictions
-    def create_confusion_matrix(self, true_labels, predicted_labels, title="Confusion Matrix", label_type="Label"):
-        """Create and plot confusion matrix"""
-        unique_labels = sorted(list(set(true_labels + predicted_labels)))
-        cm = confusion_matrix(true_labels, predicted_labels, labels=unique_labels)
-        accuracy = accuracy_score(true_labels, predicted_labels)
-        plt.figure(figsize=(12, 10))
-        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
-        plt.title(f'{title}\nAccuracy: {accuracy:.3f} ({accuracy*100:.1f}%)')
-        plt.ylabel(f'True {label_type}')
-        plt.xlabel(f'Predicted {label_type}')
-        plt.xticks(rotation=45)
-        plt.yticks(rotation=0)
-        plt.tight_layout()
-        return plt.gcf(), accuracy, cm
     def evaluate_classification_performance(self, embeddings, labels, embedding_type="Embeddings", label_type="Label"):
         """
         Evaluate classification performance and create confusion matrix.
         Args:
             embeddings: Embeddings
             labels: True labels
             embedding_type: Type of embeddings for display
             label_type: Type of labels (Color)
-            full_embeddings: Optional full 512-dim embeddings for ensemble (if None, uses only embeddings)
-            ensemble_weight: Weight for embeddings in ensemble (0.0 = only full, 1.0 = only embeddings)
         """
-        predictions = self.predict_labels_from_embeddings(embeddings, labels)
-        title_suffix = ""
         # Filter out None values from labels and predictions
-        valid_indices = [i for i, (label, pred) in enumerate(zip(labels, predictions))
                         if label is not None and pred is not None]
         if len(valid_indices) == 0:
-            print(f"⚠️ Warning: No valid labels/predictions found (all are None)")
             return {
                 'accuracy': 0.0,
                 'predictions': predictions,
@@ -603,12 +117,12 @@ class ColorEvaluator:
                 'classification_report': None,
                 'figure': None,
             }
         filtered_labels = [labels[i] for i in valid_indices]
         filtered_predictions = [predictions[i] for i in valid_indices]
         accuracy = accuracy_score(filtered_labels, filtered_predictions)
-        fig, acc, cm = self.create_confusion_matrix(
             filtered_labels, filtered_predictions,
             embedding_type,
             label_type
@@ -631,27 +145,31 @@ class ColorEvaluator:
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
-        kaggle_dataset = load_kaggle_marqo_dataset(max_samples)
         if kaggle_dataset is None:
-            print("❌ Failed to load KAGL dataset")
             return None
         dataloader = DataLoader(kaggle_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn_filter_none)
         results = {}
-        # ========== EXTRACT BASELINE EMBEDDINGS ==========
-        print("\n📦 Extracting baseline embeddings...")
-        text_full_embeddings, text_colors_full = self.extract_color_embeddings(dataloader, embedding_type='text', max_samples=max_samples)
-        image_full_embeddings, image_colors_full = self.extract_color_embeddings(dataloader, embedding_type='image', max_samples=max_samples)
-        text_color_metrics = self.compute_similarity_metrics(text_full_embeddings, text_colors_full)
         text_color_class = self.evaluate_classification_performance(
             text_full_embeddings, text_colors_full,
             "KAGL Marqo, text, color confusion matrix", "Color",
         )
         text_color_metrics.update(text_color_class)
         results['text_color'] = text_color_metrics
-        image_color_metrics = self.compute_similarity_metrics(image_full_embeddings, image_colors_full)
         image_color_class = self.evaluate_classification_performance(
             image_full_embeddings, image_colors_full,
             "KAGL Marqo, image, color confusion matrix", "Color",
@@ -681,20 +199,22 @@ class ColorEvaluator:
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
-        local_dataset = load_local_validation_dataset(max_samples)
         dataloader = DataLoader(local_dataset, batch_size=8, shuffle=False, num_workers=0)
         results = {}
         # ========== COLOR EVALUATION  ==========
-        print("\n🎨 COLOR EVALUATION ")
         print("=" * 50)
         # Text color embeddings
-        print("\n📝 Extracting text color embeddings...")
-        text_color_embeddings, text_colors = self.extract_color_embeddings(dataloader, 'text', max_samples)
         print(f"   Text color embeddings shape: {text_color_embeddings.shape}")
-        text_color_metrics = self.compute_similarity_metrics(text_color_embeddings, text_colors)
         text_color_class = self.evaluate_classification_performance(
             text_color_embeddings, text_colors, "Test Dataset, text, color confusion matrix", "Color"
         )
@@ -705,10 +225,12 @@ class ColorEvaluator:
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # Image color embeddings
-        print("\n🖼️ Extracting image color embeddings...")
-        image_color_embeddings, image_colors = self.extract_color_embeddings(dataloader, 'image', max_samples)
         print(f"   Image color embeddings shape: {image_color_embeddings.shape}")
-        image_color_metrics = self.compute_similarity_metrics(image_color_embeddings, image_colors)
         image_color_class = self.evaluate_classification_performance(
             image_color_embeddings, image_colors, "Test Dataset, image, color confusion matrix", "Color"
         )
@@ -736,24 +258,27 @@ class ColorEvaluator:
         print("Evaluating Baseline Fashion CLIP on KAGL Marqo Dataset")
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
         # Load KAGL Marqo dataset
-        kaggle_dataset = load_kaggle_marqo_dataset(max_samples)
         if kaggle_dataset is None:
-            print("❌ Failed to load KAGL dataset")
             return None
         # Create dataloader
         dataloader = DataLoader(kaggle_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn_filter_none)
         results = {}
         # Evaluate text embeddings
-        print("\n📝 Extracting baseline text embeddings from KAGL Marqo...")
-        text_embeddings, text_colors = self.extract_baseline_embeddings_batch(dataloader, 'text', max_samples)
         print(f"   Baseline text embeddings shape: {text_embeddings.shape} (using all {text_embeddings.shape[1]} dimensions)")
-        text_color_metrics = self.compute_similarity_metrics(text_embeddings, text_colors)
         text_color_classification = self.evaluate_classification_performance(
             text_embeddings, text_colors, "KAGL Marqo, text, color confusion matrix", "Color"
         )
@@ -761,17 +286,20 @@ class ColorEvaluator:
         results['text'] = {
             'color': text_color_metrics
         }
         # Clear memory
         del text_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # Evaluate image embeddings
-        print("\n🖼️ Extracting baseline image embeddings from KAGL Marqo...")
-        image_embeddings, image_colors = self.extract_baseline_embeddings_batch(dataloader, 'image', max_samples)
         print(f"   Baseline image embeddings shape: {image_embeddings.shape} (using all {image_embeddings.shape[1]} dimensions)")
-        image_color_metrics = self.compute_similarity_metrics(image_embeddings, image_colors)
         image_color_classification = self.evaluate_classification_performance(
             image_embeddings, image_colors, "KAGL Marqo, image, color confusion matrix", "Color"
         )
@@ -779,11 +307,11 @@ class ColorEvaluator:
         results['image'] = {
             'color': image_color_metrics
         }
         # Clear memory
         del image_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # ========== SAVE VISUALIZATIONS ==========
         os.makedirs(self.directory, exist_ok=True)
         for key in ['text', 'image']:
@@ -795,7 +323,7 @@ class ColorEvaluator:
                     bbox_inches='tight',
                 )
                 plt.close(figure)
         return results
     def evaluate_baseline_local_validation(self, max_samples=5000):
@@ -804,24 +332,27 @@ class ColorEvaluator:
         print("Evaluating Baseline Fashion CLIP on Local Validation Dataset")
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
         # Load local validation dataset
-        local_dataset = load_local_validation_dataset(max_samples)
         if local_dataset is None:
-            print("❌ Failed to load local validation dataset")
             return None
         # Create dataloader
         dataloader = DataLoader(local_dataset, batch_size=8, shuffle=False, num_workers=0)
         results = {}
         # Evaluate text embeddings
-        print("\n📝 Extracting baseline text embeddings from Local Validation...")
-        text_embeddings, text_colors = self.extract_baseline_embeddings_batch(dataloader, 'text', max_samples)
         print(f"   Baseline text embeddings shape: {text_embeddings.shape} (using all {text_embeddings.shape[1]} dimensions)")
-        text_color_metrics = self.compute_similarity_metrics(text_embeddings, text_colors)
         text_color_classification = self.evaluate_classification_performance(
             text_embeddings, text_colors, "Test Dataset, text, color confusion matrix", "Color"
         )
@@ -829,17 +360,20 @@ class ColorEvaluator:
         results['text'] = {
             'color': text_color_metrics
         }
         # Clear memory
         del text_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # Evaluate image embeddings
-        print("\n🖼️ Extracting baseline image embeddings from Local Validation...")
-        image_embeddings, image_colors = self.extract_baseline_embeddings_batch(dataloader, 'image', max_samples)
         print(f"   Baseline image embeddings shape: {image_embeddings.shape} (using all {image_embeddings.shape[1]} dimensions)")
-        image_color_metrics = self.compute_similarity_metrics(image_embeddings, image_colors)
         image_color_classification = self.evaluate_classification_performance(
             image_embeddings, image_colors, "Test Dataset, image, color confusion matrix", "Color"
         )
@@ -847,11 +381,11 @@ class ColorEvaluator:
         results['image'] = {
             'color': image_color_metrics
         }
         # Clear memory
         del image_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # ========== SAVE VISUALIZATIONS ==========
         os.makedirs(self.directory, exist_ok=True)
         for key in ['text', 'image']:
@@ -863,27 +397,17 @@ class ColorEvaluator:
                     bbox_inches='tight',
                 )
                 plt.close(figure)
         return results
     def analyze_baseline_vs_trained_performance(self, results_trained, results_baseline, dataset_name):
-        """
-        Analyse et explique pourquoi la baseline peut performer mieux que le modèle entraîné
-        Raisons possibles:
-        1. Capacité dimensionnelle: Baseline utilise toutes les dimensions (512), modèle entraîné utilise seulement des sous-espaces (17 ou 64 dims)
-        2. Distribution shift: Dataset de validation différent de celui d'entraînement
-        3. Overfitting: Modèle trop spécialisé sur le dataset d'entraînement
-        4. Généralisation: Baseline pré-entraînée sur un dataset plus large et diversifié
-        5. Perte d'information: Spécialisation excessive peut causer perte d'information générale
-        """
         print(f"\n{'='*60}")
-        print(f"📊 ANALYSE: Baseline vs Modèle Entraîné - {dataset_name}")
         print(f"{'='*60}")
-        # Comparer les métriques pour chaque type d'embedding
         comparisons = []
         # Text Color
         trained_color_text_acc = results_trained.get('text_color', {}).get('accuracy', 0)
         baseline_color_text_acc = results_baseline.get('text', {}).get('color', {}).get('accuracy', 0)
@@ -894,10 +418,10 @@ class ColorEvaluator:
                 'trained': trained_color_text_acc,
                 'baseline': baseline_color_text_acc,
                 'diff': diff,
-                'trained_dims': '0-15 (16 dims)',
-                'baseline_dims': 'All dimensions (512 dims)'
             })
         # Image Color
         trained_color_img_acc = results_trained.get('image_color', {}).get('accuracy', 0)
         baseline_color_img_acc = results_baseline.get('image', {}).get('color', {}).get('accuracy', 0)
@@ -908,8 +432,8 @@ class ColorEvaluator:
                 'trained': trained_color_img_acc,
                 'baseline': baseline_color_img_acc,
                 'diff': diff,
-                'trained_dims': '0-15 (16 dims)',
-                'baseline_dims': 'All dimensions (512 dims)'
             })
         return comparisons
@@ -924,39 +448,11 @@ if __name__ == "__main__":
     max_samples = 10000
     local_max_samples = 10000
-    evaluator = ColorEvaluator(device=device, directory=directory, repo_id="Leacb4/gap-clip")
-    # # Evaluate KAGL Marqo (skipped — CMs already generated)
-    # print("\n" + "="*60)
-    # print("🚀 Starting evaluation of KAGL Marqo with Color embeddings")
-    # print("="*60)
-    # results_kaggle = evaluator.evaluate_kaggle_marqo(max_samples=max_samples)
-    #
-    # print(f"\n{'='*60}")
-    # print("KAGL MARQO EVALUATION SUMMARY")
-    # print(f"{'='*60}")
-    #
-    # print("\n🎨 COLOR CLASSIFICATION RESULTS:")
-    # print(f"  Text  - NN Acc: {results_kaggle['text_color']['accuracy']*100:.1f}% | Centroid Acc: {results_kaggle['text_color']['centroid_accuracy']*100:.1f}% | Separation: {results_kaggle['text_color']['separation_score']:.4f}")
-    # print(f"  Image - NN Acc: {results_kaggle['image_color']['accuracy']*100:.1f}% | Centroid Acc: {results_kaggle['image_color']['centroid_accuracy']*100:.1f}% | Separation: {results_kaggle['image_color']['separation_score']:.4f}")
-    #
-    # # Evaluate Baseline Fashion CLIP on KAGL Marqo
-    # print("\n" + "="*60)
-    # print("🚀 Starting evaluation of Baseline Fashion CLIP on KAGL Marqo")
-    # print("="*60)
-    # results_baseline_kaggle = evaluator.evaluate_baseline_kaggle_marqo(max_samples=max_samples)
-    #
-    # print(f"\n{'='*60}")
-    # print("BASELINE KAGL MARQO EVALUATION SUMMARY")
-    # print(f"{'='*60}")
-    #
-    # print("\n🎨 COLOR CLASSIFICATION RESULTS (Baseline):")
-    # print(f"  Text  - NN Acc: {results_baseline_kaggle['text']['color']['accuracy']*100:.1f}% | Centroid Acc: {results_baseline_kaggle['text']['color']['centroid_accuracy']*100:.1f}% | Separation: {results_baseline_kaggle['text']['color']['separation_score']:.4f}")
-    # print(f"  Image - NN Acc: {results_baseline_kaggle['image']['color']['accuracy']*100:.1f}% | Centroid Acc: {results_baseline_kaggle['image']['color']['centroid_accuracy']*100:.1f}% | Separation: {results_baseline_kaggle['image']['color']['separation_score']:.4f}")
     # Evaluate Local Validation Dataset
     print("\n" + "="*60)
-    print("🚀 Starting evaluation of Local Validation Dataset with Color embeddings")
     print("="*60)
     results_local = evaluator.evaluate_local_validation(max_samples=local_max_samples)
@@ -964,25 +460,25 @@ if __name__ == "__main__":
         print(f"\n{'='*60}")
         print("LOCAL VALIDATION DATASET EVALUATION SUMMARY")
         print(f"{'='*60}")
-        print("\n🎨 COLOR CLASSIFICATION RESULTS:")
         print(f"  Text  - NN Acc: {results_local['text_color']['accuracy']*100:.1f}% | Centroid Acc: {results_local['text_color']['centroid_accuracy']*100:.1f}% | Separation: {results_local['text_color']['separation_score']:.4f}")
         print(f"  Image - NN Acc: {results_local['image_color']['accuracy']*100:.1f}% | Centroid Acc: {results_local['image_color']['centroid_accuracy']*100:.1f}% | Separation: {results_local['image_color']['separation_score']:.4f}")
     # Evaluate Baseline Fashion CLIP on Local Validation
     print("\n" + "="*60)
-    print("🚀 Starting evaluation of Baseline Fashion CLIP on Local Validation")
     print("="*60)
     results_baseline_local = evaluator.evaluate_baseline_local_validation(max_samples=local_max_samples)
     if results_baseline_local is not None:
         print(f"\n{'='*60}")
         print("BASELINE LOCAL VALIDATION EVALUATION SUMMARY")
         print(f"{'='*60}")
-        print("\n🎨 COLOR CLASSIFICATION RESULTS (Baseline):")
         print(f"  Text  - NN Acc: {results_baseline_local['text']['color']['accuracy']*100:.1f}% | Centroid Acc: {results_baseline_local['text']['color']['centroid_accuracy']*100:.1f}% | Separation: {results_baseline_local['text']['color']['separation_score']:.4f}")
         print(f"  Image - NN Acc: {results_baseline_local['image']['color']['accuracy']*100:.1f}% | Centroid Acc: {results_baseline_local['image']['color']['centroid_accuracy']*100:.1f}% | Separation: {results_baseline_local['image']['color']['separation_score']:.4f}")
-    print(f"\n✅ Evaluation completed! Check '{directory}/' for visualization files.")

 """
 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import sys
 from pathlib import Path
 import torch
 import matplotlib.pyplot as plt
+from sklearn.metrics import classification_report, accuracy_score
+from torch.utils.data import DataLoader
 import warnings
 warnings.filterwarnings('ignore')
 # Ensure project root is importable when running this file directly.
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 from config import (
     color_model_path,
     color_emb_dim,
+    main_emb_dim,
 )
+from utils.datasets import (
+    load_kaggle_marqo_dataset,
+    load_local_validation_dataset,
+    collate_fn_filter_none,
+)
+from utils.embeddings import extract_clip_embeddings, extract_color_model_embeddings
+from utils.metrics import (
+    compute_similarity_metrics,
+    predict_labels_from_embeddings,
+    create_confusion_matrix,
+)
+from utils.model_loader import load_color_model, load_baseline_fashion_clip
 class ColorEvaluator:
         self,
         device='mps',
         directory="figures/confusion_matrices/cm_color",
+        baseline_model=None,
+        baseline_processor=None,
+        color_model=None,
+        kaggle_raw_df=None,
+        local_raw_df=None,
     ):
         self.device = torch.device(device)
         self.directory = directory
         self.color_emb_dim = color_emb_dim
+        self.main_emb_dim = main_emb_dim
+        self.kaggle_raw_df = kaggle_raw_df
+        self.local_raw_df = local_raw_df
         os.makedirs(self.directory, exist_ok=True)
+        # Load baseline Fashion CLIP model (or reuse pre-loaded)
+        if baseline_model is not None and baseline_processor is not None:
+            self.baseline_model = baseline_model
+            self.baseline_processor = baseline_processor
+        else:
+            print("Loading baseline Fashion CLIP model...")
+            self.baseline_model, self.baseline_processor = load_baseline_fashion_clip(self.device)
+            print("Baseline Fashion CLIP model loaded successfully")
+        # Load specialized color model (or reuse pre-loaded)
+        if color_model is not None:
+            self.color_model = color_model
+        else:
+            self.color_model, _ = load_color_model(color_model_path, self.device)
     def evaluate_classification_performance(self, embeddings, labels, embedding_type="Embeddings", label_type="Label"):
         """
         Evaluate classification performance and create confusion matrix.
         Args:
             embeddings: Embeddings
             labels: True labels
             embedding_type: Type of embeddings for display
             label_type: Type of labels (Color)
         """
+        predictions = predict_labels_from_embeddings(embeddings, labels)
         # Filter out None values from labels and predictions
+        valid_indices = [i for i, (label, pred) in enumerate(zip(labels, predictions))
                         if label is not None and pred is not None]
         if len(valid_indices) == 0:
+            print(f"Warning: No valid labels/predictions found (all are None)")
             return {
                 'accuracy': 0.0,
                 'predictions': predictions,
                 'classification_report': None,
                 'figure': None,
             }
         filtered_labels = [labels[i] for i in valid_indices]
         filtered_predictions = [predictions[i] for i in valid_indices]
         accuracy = accuracy_score(filtered_labels, filtered_predictions)
+        fig, _, cm = create_confusion_matrix(
             filtered_labels, filtered_predictions,
             embedding_type,
             label_type
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
+        kaggle_dataset = load_kaggle_marqo_dataset(max_samples, raw_df=self.kaggle_raw_df)
         if kaggle_dataset is None:
+            print("Failed to load KAGL dataset")
             return None
         dataloader = DataLoader(kaggle_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn_filter_none)
         results = {}
+        # ========== EXTRACT COLOR MODEL EMBEDDINGS ==========
+        print("\nExtracting color model embeddings...")
+        text_full_embeddings, text_colors_full = extract_color_model_embeddings(
+            self.color_model, dataloader, self.device, embedding_type='text', max_samples=max_samples
+        )
+        image_full_embeddings, image_colors_full = extract_color_model_embeddings(
+            self.color_model, dataloader, self.device, embedding_type='image', max_samples=max_samples
+        )
+        text_color_metrics = compute_similarity_metrics(text_full_embeddings, text_colors_full)
         text_color_class = self.evaluate_classification_performance(
             text_full_embeddings, text_colors_full,
             "KAGL Marqo, text, color confusion matrix", "Color",
         )
         text_color_metrics.update(text_color_class)
         results['text_color'] = text_color_metrics
+        image_color_metrics = compute_similarity_metrics(image_full_embeddings, image_colors_full)
         image_color_class = self.evaluate_classification_performance(
             image_full_embeddings, image_colors_full,
             "KAGL Marqo, image, color confusion matrix", "Color",
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
+        local_dataset = load_local_validation_dataset(max_samples, raw_df=self.local_raw_df)
         dataloader = DataLoader(local_dataset, batch_size=8, shuffle=False, num_workers=0)
         results = {}
         # ========== COLOR EVALUATION  ==========
+        print("\nCOLOR EVALUATION")
         print("=" * 50)
         # Text color embeddings
+        print("\nExtracting text color embeddings...")
+        text_color_embeddings, text_colors = extract_color_model_embeddings(
+            self.color_model, dataloader, self.device, embedding_type='text', max_samples=max_samples
+        )
         print(f"   Text color embeddings shape: {text_color_embeddings.shape}")
+        text_color_metrics = compute_similarity_metrics(text_color_embeddings, text_colors)
         text_color_class = self.evaluate_classification_performance(
             text_color_embeddings, text_colors, "Test Dataset, text, color confusion matrix", "Color"
         )
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # Image color embeddings
+        print("\nExtracting image color embeddings...")
+        image_color_embeddings, image_colors = extract_color_model_embeddings(
+            self.color_model, dataloader, self.device, embedding_type='image', max_samples=max_samples
+        )
         print(f"   Image color embeddings shape: {image_color_embeddings.shape}")
+        image_color_metrics = compute_similarity_metrics(image_color_embeddings, image_colors)
         image_color_class = self.evaluate_classification_performance(
             image_color_embeddings, image_colors, "Test Dataset, image, color confusion matrix", "Color"
         )
         print("Evaluating Baseline Fashion CLIP on KAGL Marqo Dataset")
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
         # Load KAGL Marqo dataset
+        kaggle_dataset = load_kaggle_marqo_dataset(max_samples, raw_df=self.kaggle_raw_df)
         if kaggle_dataset is None:
+            print("Failed to load KAGL dataset")
             return None
         # Create dataloader
         dataloader = DataLoader(kaggle_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn_filter_none)
         results = {}
         # Evaluate text embeddings
+        print("\nExtracting baseline text embeddings from KAGL Marqo...")
+        text_embeddings, text_colors, _ = extract_clip_embeddings(
+            self.baseline_model, self.baseline_processor, dataloader, self.device,
+            embedding_type='text', max_samples=max_samples
+        )
         print(f"   Baseline text embeddings shape: {text_embeddings.shape} (using all {text_embeddings.shape[1]} dimensions)")
+        text_color_metrics = compute_similarity_metrics(text_embeddings, text_colors)
         text_color_classification = self.evaluate_classification_performance(
             text_embeddings, text_colors, "KAGL Marqo, text, color confusion matrix", "Color"
         )
         results['text'] = {
             'color': text_color_metrics
         }
         # Clear memory
         del text_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # Evaluate image embeddings
+        print("\nExtracting baseline image embeddings from KAGL Marqo...")
+        image_embeddings, image_colors, _ = extract_clip_embeddings(
+            self.baseline_model, self.baseline_processor, dataloader, self.device,
+            embedding_type='image', max_samples=max_samples
+        )
         print(f"   Baseline image embeddings shape: {image_embeddings.shape} (using all {image_embeddings.shape[1]} dimensions)")
+        image_color_metrics = compute_similarity_metrics(image_embeddings, image_colors)
         image_color_classification = self.evaluate_classification_performance(
             image_embeddings, image_colors, "KAGL Marqo, image, color confusion matrix", "Color"
         )
         results['image'] = {
             'color': image_color_metrics
         }
         # Clear memory
         del image_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # ========== SAVE VISUALIZATIONS ==========
         os.makedirs(self.directory, exist_ok=True)
         for key in ['text', 'image']:
                     bbox_inches='tight',
                 )
                 plt.close(figure)
         return results
     def evaluate_baseline_local_validation(self, max_samples=5000):
         print("Evaluating Baseline Fashion CLIP on Local Validation Dataset")
         print(f"Max samples: {max_samples}")
         print(f"{'='*60}")
         # Load local validation dataset
+        local_dataset = load_local_validation_dataset(max_samples, raw_df=self.local_raw_df)
         if local_dataset is None:
+            print("Failed to load local validation dataset")
             return None
         # Create dataloader
         dataloader = DataLoader(local_dataset, batch_size=8, shuffle=False, num_workers=0)
         results = {}
         # Evaluate text embeddings
+        print("\nExtracting baseline text embeddings from Local Validation...")
+        text_embeddings, text_colors, _ = extract_clip_embeddings(
+            self.baseline_model, self.baseline_processor, dataloader, self.device,
+            embedding_type='text', max_samples=max_samples
+        )
         print(f"   Baseline text embeddings shape: {text_embeddings.shape} (using all {text_embeddings.shape[1]} dimensions)")
+        text_color_metrics = compute_similarity_metrics(text_embeddings, text_colors)
         text_color_classification = self.evaluate_classification_performance(
             text_embeddings, text_colors, "Test Dataset, text, color confusion matrix", "Color"
         )
         results['text'] = {
             'color': text_color_metrics
         }
         # Clear memory
         del text_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # Evaluate image embeddings
+        print("\nExtracting baseline image embeddings from Local Validation...")
+        image_embeddings, image_colors, _ = extract_clip_embeddings(
+            self.baseline_model, self.baseline_processor, dataloader, self.device,
+            embedding_type='image', max_samples=max_samples
+        )
         print(f"   Baseline image embeddings shape: {image_embeddings.shape} (using all {image_embeddings.shape[1]} dimensions)")
+        image_color_metrics = compute_similarity_metrics(image_embeddings, image_colors)
         image_color_classification = self.evaluate_classification_performance(
             image_embeddings, image_colors, "Test Dataset, image, color confusion matrix", "Color"
         )
         results['image'] = {
             'color': image_color_metrics
         }
         # Clear memory
         del image_embeddings
         torch.cuda.empty_cache() if torch.cuda.is_available() else None
         # ========== SAVE VISUALIZATIONS ==========
         os.makedirs(self.directory, exist_ok=True)
         for key in ['text', 'image']:
                     bbox_inches='tight',
                 )
                 plt.close(figure)
         return results
     def analyze_baseline_vs_trained_performance(self, results_trained, results_baseline, dataset_name):
+        """Analyse baseline vs trained model performance."""
         print(f"\n{'='*60}")
+        print(f"ANALYSE: Baseline vs Trained - {dataset_name}")
         print(f"{'='*60}")
         comparisons = []
         # Text Color
         trained_color_text_acc = results_trained.get('text_color', {}).get('accuracy', 0)
         baseline_color_text_acc = results_baseline.get('text', {}).get('color', {}).get('accuracy', 0)
                 'trained': trained_color_text_acc,
                 'baseline': baseline_color_text_acc,
                 'diff': diff,
+                'trained_dims': f'0-{self.color_emb_dim - 1} ({self.color_emb_dim} dims)',
+                'baseline_dims': f'All dimensions ({self.main_emb_dim} dims)'
             })
         # Image Color
         trained_color_img_acc = results_trained.get('image_color', {}).get('accuracy', 0)
         baseline_color_img_acc = results_baseline.get('image', {}).get('color', {}).get('accuracy', 0)
                 'trained': trained_color_img_acc,
                 'baseline': baseline_color_img_acc,
                 'diff': diff,
+                'trained_dims': f'0-{self.color_emb_dim - 1} ({self.color_emb_dim} dims)',
+                'baseline_dims': f'All dimensions ({self.main_emb_dim} dims)'
             })
         return comparisons
     max_samples = 10000
     local_max_samples = 10000
+    evaluator = ColorEvaluator(device=device, directory=directory)
     # Evaluate Local Validation Dataset
     print("\n" + "="*60)
+    print("Starting evaluation of Local Validation Dataset with Color embeddings")
     print("="*60)
     results_local = evaluator.evaluate_local_validation(max_samples=local_max_samples)
         print(f"\n{'='*60}")
         print("LOCAL VALIDATION DATASET EVALUATION SUMMARY")
         print(f"{'='*60}")
+        print("\nCOLOR CLASSIFICATION RESULTS:")
         print(f"  Text  - NN Acc: {results_local['text_color']['accuracy']*100:.1f}% | Centroid Acc: {results_local['text_color']['centroid_accuracy']*100:.1f}% | Separation: {results_local['text_color']['separation_score']:.4f}")
         print(f"  Image - NN Acc: {results_local['image_color']['accuracy']*100:.1f}% | Centroid Acc: {results_local['image_color']['centroid_accuracy']*100:.1f}% | Separation: {results_local['image_color']['separation_score']:.4f}")
     # Evaluate Baseline Fashion CLIP on Local Validation
     print("\n" + "="*60)
+    print("Starting evaluation of Baseline Fashion CLIP on Local Validation")
     print("="*60)
     results_baseline_local = evaluator.evaluate_baseline_local_validation(max_samples=local_max_samples)
     if results_baseline_local is not None:
         print(f"\n{'='*60}")
         print("BASELINE LOCAL VALIDATION EVALUATION SUMMARY")
         print(f"{'='*60}")
+        print("\nCOLOR CLASSIFICATION RESULTS (Baseline):")
         print(f"  Text  - NN Acc: {results_baseline_local['text']['color']['accuracy']*100:.1f}% | Centroid Acc: {results_baseline_local['text']['color']['centroid_accuracy']*100:.1f}% | Separation: {results_baseline_local['text']['color']['separation_score']:.4f}")
         print(f"  Image - NN Acc: {results_baseline_local['image']['color']['accuracy']*100:.1f}% | Centroid Acc: {results_baseline_local['image']['color']['centroid_accuracy']*100:.1f}% | Separation: {results_baseline_local['image']['color']['separation_score']:.4f}")
+    print(f"\nEvaluation completed! Check '{directory}/' for visualization files.")