#!/usr/bin/env python3
"""
Annex 9.3 t-SNE Embedding Visualisations
==========================================

Produces the t-SNE scatter plots shown in **Annex 9.3** of the paper.

The script loads the local validation dataset, encodes each image with the
main GAP-CLIP model (and, optionally, the CLIP baseline), then reduces the
512-D embeddings to 2-D via t-SNE and renders:

* **Colour overlay** – points coloured by garment colour, convex hulls drawn
  around each colour cluster.
* **Hierarchy overlay** – points coloured by clothing category (top, bottom,
  shoes, …), convex hulls drawn around each category cluster.
* **Per-hierarchy colour scatter** – one subplot per category, showing how
  colours are distributed within each category.

These plots complement the quantitative separation scores in §5.3.6 and
provide an intuitive sanity check that the dedicated embedding dimensions
(0–15 for colour, 16–79 for hierarchy) encode the intended structure.

See also:
- §5.3.6 (``sec536_embedding_structure.py``) – quantitative Tests A/B/C
- Annex 9.2 (``annex92_color_heatmaps.py``) – pairwise colour heatmaps
"""

import math
from pathlib import Path

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from matplotlib.patches import Polygon
from PIL import Image
from sklearn.manifold import TSNE
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
)
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from tqdm import tqdm
from transformers import CLIPModel as CLIPModel_transformers, CLIPProcessor

try:
    from scipy.spatial import ConvexHull
except ImportError:
    ConvexHull = None

from config import (
    color_column,
    color_emb_dim,
    column_local_image_path,
    device,
    hierarchy_column,
    hierarchy_emb_dim,
    images_dir,
    local_dataset_path,
    main_model_path,
)


class ImageDataset(Dataset):
    """Lightweight dataset to load local images along with colors and hierarchies.

    Each item is a ``(image_tensor, color, hierarchy)`` triple where the image
    is resized to 224x224 and normalised with ImageNet statistics.
    """

    def __init__(self, dataframe: pd.DataFrame, root_dir: str):
        self.df = dataframe.reset_index(drop=True)
        # NOTE(review): root_dir is stored but paths are read directly from
        # ``column_local_image_path`` — presumably those are absolute/relative
        # paths already; confirm against the dataset CSV.
        self.root_dir = root_dir
        self.transform = transforms.Compose(
            [
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                ),
            ]
        )

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        img_path = row[column_local_image_path]
        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)
        color = row[color_column]
        hierarchy = row[hierarchy_column]
        return image, color, hierarchy


def load_main_model():
    """Load the main model with the trained weights.

    Returns:
        Tuple of ``(model, processor)`` — the fine-tuned GAP-CLIP model in
        eval mode on ``device`` and the matching CLIP processor.
    """
    checkpoint = torch.load(main_model_path, map_location=device)
    # Checkpoints may store the weights under "model_state_dict" or be a raw
    # state dict; handle both layouts.
    state_dict = checkpoint.get("model_state_dict", checkpoint)
    model = CLIPModel_transformers.from_pretrained(
        "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
    )
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    # Load processor for text tokenization
    processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
    return model, processor


def load_clip_baseline():
    """Load the CLIP baseline model from transformers.

    Returns:
        Tuple of ``(clip_model, clip_processor)`` — OpenAI's ViT-B/32 CLIP in
        eval mode on ``device`` plus its processor.
    """
    print("🤗 Loading CLIP baseline model from transformers...")
    clip_model = CLIPModel_transformers.from_pretrained("openai/clip-vit-base-patch32").to(device)
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    clip_model.eval()
    print("✅ CLIP baseline model loaded successfully")
    return clip_model, clip_processor


def enforce_min_hierarchy_samples(df: pd.DataFrame, min_per_hierarchy) -> pd.DataFrame:
    """Filter out hierarchy groups with fewer than ``min_per_hierarchy`` rows.

    A ``None``/zero/negative threshold disables filtering and returns ``df``
    unchanged.
    """
    if not min_per_hierarchy or min_per_hierarchy <= 0:
        return df
    counts = df[hierarchy_column].value_counts()
    keep_values = counts[counts >= min_per_hierarchy].index
    filtered = df[df[hierarchy_column].isin(keep_values)].reset_index(drop=True)
    return filtered


def prepare_dataframe(df, sample_size, per_color_limit, min_per_hierarchy=None):
    """Subsample the dataframe to speed up the t-SNE.

    Args:
        df: Input dataframe.
        sample_size: Optional global cap on the number of rows (random sample).
        per_color_limit: Optional cap on rows per colour group.
        min_per_hierarchy: Optional minimum group size for hierarchies; smaller
            groups are dropped after sampling.
    """
    if per_color_limit and per_color_limit > 0:
        # Cap each colour group so dominant colours don't swamp the plot.
        df_limited = (
            df.groupby(color_column)
            .apply(lambda g: g.sample(min(len(g), per_color_limit), random_state=42))
            .reset_index(drop=True)
        )
    else:
        df_limited = df
    if sample_size and 0 < sample_size < len(df_limited):
        df_limited = df_limited.sample(sample_size, random_state=42).reset_index(
            drop=True
        )
    df_limited = enforce_min_hierarchy_samples(df_limited, min_per_hierarchy)
    return df_limited


def compute_embeddings(model, dataloader):
    """Extract color, hierarchy, and combined embeddings.

    Slices each image embedding into its dedicated sub-spaces: dimensions
    ``[0, color_emb_dim)`` for colour and
    ``[color_emb_dim, color_emb_dim + hierarchy_emb_dim)`` for hierarchy.

    Returns:
        Tuple ``(color_embeddings, hierarchy_embeddings, color_labels,
        hierarchy_labels)`` with embeddings as numpy arrays.
    """
    color_embeddings = []
    hierarchy_embeddings = []
    color_labels = []
    hierarchy_labels = []
    with torch.no_grad():
        for images, colors, hierarchies in tqdm(
            dataloader, desc="Extracting embeddings"
        ):
            images = images.to(device)
            if images.shape[1] == 1:  # safety in case a grayscale image slips through
                images = images.expand(-1, 3, -1, -1)
            image_embeds = model.get_image_features(pixel_values=images)
            color_part = image_embeds[:, :color_emb_dim]
            hierarchy_part = image_embeds[
                :, color_emb_dim : color_emb_dim + hierarchy_emb_dim
            ]
            color_embeddings.append(color_part.cpu().numpy())
            hierarchy_embeddings.append(hierarchy_part.cpu().numpy())
            color_labels.extend(colors)
            hierarchy_labels.extend(hierarchies)
    return (
        np.concatenate(color_embeddings, axis=0),
        np.concatenate(hierarchy_embeddings, axis=0),
        color_labels,
        hierarchy_labels,
    )


def compute_clip_embeddings(clip_model, clip_processor, dataloader):
    """Extract CLIP baseline embeddings (full image embeddings, not separated).

    Images coming from the dataloader are already ImageNet-normalised, so they
    are denormalised back to [0, 1], converted to PIL, and re-processed with
    CLIP's own preprocessing. Embeddings are L2-normalised.

    Returns:
        Tuple ``(full, full, full, color_labels, hierarchy_labels)`` — the same
        full embedding array is returned three times so the caller can use it
        wherever the main model would supply colour / hierarchy / combined
        embeddings.
    """
    all_embeddings = []
    color_labels = []
    hierarchy_labels = []

    # ImageNet statistics used by ImageDataset.transform; needed to undo the
    # normalisation before handing PIL images to the CLIP processor.
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    to_pil = transforms.ToPILImage()

    with torch.no_grad():
        for images, colors, hierarchies in tqdm(
            dataloader, desc="Extracting CLIP embeddings"
        ):
            # Denormalize on CPU (safer for PIL conversion), then convert each
            # tensor to a PIL image for CLIP's processor.
            denorm = torch.clamp(images * std + mean, 0, 1)
            pil_images = [to_pil(img.cpu()) for img in denorm]
            # Batched image-only path: get_image_features runs the same
            # vision tower + projection as the full forward, without the
            # text branch.
            inputs = clip_processor(images=pil_images, return_tensors="pt").to(device)
            image_embeds = clip_model.get_image_features(**inputs)
            # L2-normalise so cosine comparisons are meaningful.
            image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
            all_embeddings.append(image_embeds.cpu().numpy())
            color_labels.extend(colors)
            hierarchy_labels.extend(hierarchies)

    # For CLIP, we use the full embeddings for all visualizations
    # (no separation into color/hierarchy dimensions)
    full_embeddings = np.concatenate(all_embeddings, axis=0)
    return (
        full_embeddings,  # color_embeddings (using full CLIP embeddings)
        full_embeddings,  # hierarchy_embeddings (using full CLIP embeddings)
        full_embeddings,  # color_hier_embeddings (using full CLIP embeddings)
        color_labels,
        hierarchy_labels,
    )


def compute_dunn_index(embeddings, labels):
    """
    Compute the Dunn Index for clustering evaluation.

    The Dunn Index is the ratio of the minimum inter-cluster distance to the
    maximum intra-cluster distance. Higher values indicate better clustering.

    Args:
        embeddings: Array of embeddings [N, embed_dim]
        labels: Array of cluster labels [N]

    Returns:
        Dunn Index value (float) or None if calculation fails
    """
    try:
        # Hoisted out of the loops: one import, one boolean-mask slice per label.
        from scipy.spatial.distance import cdist, pdist

        unique_labels = np.unique(labels)
        if len(unique_labels) < 2:
            return None

        clusters = {label: embeddings[labels == label] for label in unique_labels}

        # Maximum intra-cluster distance (largest cluster "diameter").
        max_intra_cluster_dist = 0.0
        for cluster_points in clusters.values():
            if len(cluster_points) > 1:
                intra_dists = pdist(cluster_points, metric='euclidean')
                if len(intra_dists) > 0:
                    max_intra_cluster_dist = max(
                        max_intra_cluster_dist, float(np.max(intra_dists))
                    )
        if max_intra_cluster_dist == 0:
            # All clusters are singletons or degenerate — ratio undefined.
            return None

        # Minimum inter-cluster distance over all cluster pairs.
        min_inter_cluster_dist = float('inf')
        for i, label1 in enumerate(unique_labels):
            for label2 in unique_labels[i + 1:]:
                inter_dists = cdist(
                    clusters[label1], clusters[label2], metric='euclidean'
                )
                min_inter_cluster_dist = min(
                    min_inter_cluster_dist, float(np.min(inter_dists))
                )
        if min_inter_cluster_dist == float('inf'):
            return None

        # Dunn Index = minimum inter-cluster distance / maximum intra-cluster distance
        return float(min_inter_cluster_dist / max_intra_cluster_dist)
    except Exception as e:
        print(f"⚠️ Error computing Dunn Index: {e}")
        return None


def build_color_map(labels, prefer_true_colors=False):
    """Build a color mapping for labels.

    Assigns each distinct label a colour from the ``husl`` palette.

    Note: ``prefer_true_colors`` is currently unused here; the "use the label
    itself as the colour" behaviour is handled in :func:`run_tsne`.
    """
    unique_labels = sorted(set(labels))
    palette = sns.color_palette("husl", len(unique_labels))
    return {label: palette[idx] for idx, label in enumerate(unique_labels)}


def _fmt_metric(value):
    """Format a clustering metric for display, tolerating missing values."""
    return f"{value:.3f}" if value is not None else "n/a"


def run_tsne(
    embeddings,
    legend_labels,
    output_path,
    perplexity,
    title,
    scatter_color_labels=None,
    prefer_true_colors=False,
):
    """Calculate and plot a t-SNE projection.

    Args:
        embeddings: [N, D] array to project.
        legend_labels: Label per point, used for the legend and the metrics.
        output_path: PNG destination (parent directories are created).
        perplexity: t-SNE perplexity.
        title: Plot title; if it contains ``'color'``, labels that are valid
            matplotlib colour names are drawn in their own colour.
        scatter_color_labels: Optional alternative labels used for colouring.
        prefer_true_colors: Forwarded to :func:`build_color_map`.
    """
    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        init="pca",
        learning_rate="auto",
        random_state=42,
    )
    reduced = tsne.fit_transform(embeddings)
    label_array = np.array(legend_labels)
    color_labels = (
        np.array(scatter_color_labels)
        if scatter_color_labels is not None
        else label_array
    )

    # Clustering indices are only defined with at least two classes.
    unique_labels_list = sorted(set(label_array))
    if len(unique_labels_list) > 1 and len(label_array) > 1:
        # Convert labels to numeric indices for the sklearn metrics.
        label_to_idx = {label: idx for idx, label in enumerate(unique_labels_list)}
        numeric_labels = np.array([label_to_idx[label] for label in label_array])
        # Calculate in original embedding space (ground truth - measures real separation)
        silhouette = silhouette_score(embeddings, numeric_labels, metric='euclidean')
        davies_bouldin = davies_bouldin_score(embeddings, numeric_labels)
        calinski_harabasz = calinski_harabasz_score(embeddings, numeric_labels)
        dunn = compute_dunn_index(embeddings, numeric_labels)
    else:
        silhouette = None
        davies_bouldin = None
        calinski_harabasz = None
        dunn = None

    # Helpful reference for the reported clustering indices:
    # • Silhouette Score ∈ [-1, 1] — closer to 1 means points fit their cluster well, 0 means overlap, < 0 suggests misassignment.
    # • Davies–Bouldin Index ∈ [0, +∞) — lower is better; quantifies average similarity between clusters relative to their size.
    # • Calinski–Harabasz Index ∈ [0, +∞) — higher is better; ratio of between-cluster dispersion to within-cluster dispersion.
    # • Dunn Index ∈ [0, +∞) — higher is better; compares the tightest cluster diameter to the closest distance between clusters.

    # Build color map for visualization
    color_map = build_color_map(color_labels, prefer_true_colors=prefer_true_colors)
    color_series = np.array([color_map[label] for label in color_labels])

    plt.figure(figsize=(10, 8))
    for label in unique_labels_list:
        mask = label_array == label
        # For colour plots, draw each cluster in its literal colour when the
        # label is a valid matplotlib colour name; otherwise fall back to the
        # palette so labels like "multicolor" don't raise ValueError.
        if 'color' in title and mcolors.is_color_like(label):
            c = label
        else:
            c = color_series[mask]
        plt.scatter(
            reduced[mask, 0],
            reduced[mask, 1],
            c=c,
            s=15,
            alpha=0.8,
            label=label,
        )

    # Add the clustering scores to the title (n/a for any missing metric).
    if silhouette is not None:
        title_with_score = (
            f"{title}\n(t-SNE Silhouette: {_fmt_metric(silhouette)}"
            f" | Davies-Bouldin: {_fmt_metric(davies_bouldin)}"
            f" | Calinski-Harabasz: {_fmt_metric(calinski_harabasz)}"
            f" | Dunn: {_fmt_metric(dunn)})"
        )
    else:
        title_with_score = title
    plt.title(title_with_score)
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.legend(
        bbox_to_anchor=(1.05, 1), loc="upper left", fontsize="small", frameon=False
    )
    plt.tight_layout()
    # Make sure the destination directory exists before saving.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path, dpi=300)
    plt.close()
    print(f"✅ Figure saved in {output_path}")
    # Guarded: metrics may be None when there are fewer than two classes.
    if silhouette is not None:
        print(
            f"   📊 t-SNE space: {_fmt_metric(silhouette)} (matches visualization)"
            f" | Davies-Bouldin: {_fmt_metric(davies_bouldin)}"
            f" | Calinski-Harabasz: {_fmt_metric(calinski_harabasz)}"
            f" | Dunn: {_fmt_metric(dunn)}"
        )
    else:
        print("   📊 Fewer than two classes — clustering indices not computed.")


def filter_valid_rows(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows with valid local image paths and colors."""
    # Use the configured colour column (was hard-coded to 'color').
    dataframe = dataframe[dataframe[color_column] != 'unknown'].copy()
    df = dataframe.dropna(
        subset=[column_local_image_path, color_column, hierarchy_column]
    ).copy()
    mask = df[column_local_image_path].apply(
        lambda x: isinstance(x, str) and len(x.strip()) > 0
    )
    return df[mask].reset_index(drop=True)


if __name__ == "__main__":
    sample_size = None
    per_color_limit = 500
    min_per_hierarchy = 200
    batch_size = 32
    perplexity = 30
    output_color = "evaluation/evaluation_results/tsne/tsne_color_space.png"
    output_hierarchy = "evaluation/evaluation_results/tsne/tsne_hierarchy_space.png"

    # 1) Loading and subsampling the dataset
    print("📥 Loading the dataset...")
    # NOTE(review): path is hard-coded although ``local_dataset_path`` is
    # imported from config — confirm they point to the same CSV.
    df = pd.read_csv("data/data.csv")
    df = filter_valid_rows(df)
    print(f"Total length of the dataset: {len(df)}")
    df = prepare_dataframe(df, sample_size, per_color_limit, min_per_hierarchy)
    print(f"✅ {len(df)} samples will be used for the t-SNE")
    print(f"Number of colors in the dataset: {len(df['color'].unique())}")
    print(f"Colors in the dataset: {df['color'].unique()}")
    dataset = ImageDataset(df, images_dir)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # 2) Loading the models
    print("⚙️ Loading the main model...")
    model, processor = load_main_model()
    print("⚙️ Loading CLIP baseline model...")
    clip_model, clip_processor = load_clip_baseline()

    # 3) Extracting the embeddings
    print("🎯 Extracting the embeddings...")
    (
        color_embeddings,
        hierarchy_embeddings,
        colors,
        hierarchies,
    ) = compute_embeddings(model, dataloader)

    # 4) Calculating the t-SNE
    print("🌀 Calculating the color t-SNE...")
    run_tsne(
        color_embeddings,
        colors,
        output_color,
        perplexity,
        "t-SNE of the color embeddings of the main model",
        scatter_color_labels=colors,
        prefer_true_colors=True,
    )
    print("🌀 Calculating the hierarchy t-SNE...")
    run_tsne(
        hierarchy_embeddings,
        hierarchies,
        output_hierarchy,
        perplexity,
        "t-SNE of the hierarchy embeddings of the main model",
        scatter_color_labels=hierarchies,
    )

    # ========== CLIP BASELINE EVALUATION ==========
    print("\n" + "=" * 60)
    print("🔄 Starting CLIP Baseline Evaluation")
    print("=" * 60)

    print("🎯 Extracting CLIP embeddings...")
    (
        clip_color_embeddings,
        clip_hierarchy_embeddings,
        clip_color_hier_embeddings,
        clip_colors,
        clip_hierarchies,
    ) = compute_clip_embeddings(clip_model, clip_processor, dataloader)

    # Output paths for CLIP baseline
    clip_output_color = "evaluation/evaluation_results/tsne/clip_baseline_tsne_color_space.png"
    clip_output_hierarchy = "evaluation/evaluation_results/tsne/clip_baseline_tsne_hierarchy_space.png"

    print("🌀 Calculating CLIP baseline color t-SNE...")
    run_tsne(
        clip_color_embeddings,
        clip_colors,
        clip_output_color,
        perplexity,
        "t-SNE of the color embeddings (CLIP Baseline)",
        scatter_color_labels=clip_colors,
        prefer_true_colors=True,
    )
    print("🌀 Calculating CLIP baseline hierarchy t-SNE...")
    run_tsne(
        clip_hierarchy_embeddings,
        clip_hierarchies,
        clip_output_hierarchy,
        perplexity,
        "t-SNE of the hierarchy embeddings (CLIP Baseline)",
        scatter_color_labels=clip_hierarchies,
    )

    print("\n✅ All t-SNE visualizations completed!")
    print("   - Main model: evaluation/evaluation_results/tsne/tsne_*.png")
    print("   - CLIP baseline: evaluation/evaluation_results/tsne/clip_baseline_tsne_*.png")