Leacb4
/

gap-clip

+#!/usr/bin/env python3
+"""
+Color retrieval accuracy across different hierarchies — Baseline vs GAP-CLIP.
+For each color, pairs it with every hierarchy category and measures how well
+each model classifies the correct color and hierarchy via nearest-neighbor.
+Three classification strategies are compared:
+  1. Naive       — bare label words ("dress", "shirt", ...) as label embeddings
+  2. Ensembled   — average of multiple prompt templates per label (standard CLIP trick)
+  3. Structured  — (GAP-CLIP only) color-marginalized label centroids in the
+                   hierarchy subspace. For each hierarchy, embed "{c} {h}" for
+                   ALL colors, extract the 64D hierarchy slice, and average.
+                   This builds color-agnostic hierarchy prototypes that exploit
+                   GAP-CLIP's learned subspace decomposition.
+Run:
+    python3 -m evaluation.test_color_across_hierarchies                  # single color (red)
+    python3 -m evaluation.test_color_across_hierarchies --color blue
+    python3 -m evaluation.test_color_across_hierarchies --all-colors     # full sweep + graph
+"""
+from __future__ import annotations
+import argparse
+import os
+import sys
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from pathlib import Path
+from typing import Dict, List, Tuple
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import CLIPModel as CLIPModelTransformers, CLIPProcessor
+_PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(_PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(_PROJECT_ROOT))
+import config
+from evaluation.utils.model_loader import (
+    load_baseline_fashion_clip,
+    load_gap_clip,
+    get_text_embedding,
+    get_text_embeddings_batch,
+)
+# ── Constants ────────────────────────────────────────────────────────────────
+# Color vocabulary: the candidate labels for color classification, the values
+# accepted by --color, and the x-axis categories of the comparison figure.
+COLORS = [
+    "beige", "black", "blue", "brown", "green",
+    "orange", "pink", "purple", "red", "white", "yellow",
+]
+# Garment categories ("hierarchies"): the candidate labels for hierarchy
+# classification. Every evaluated query pairs one color with one of these.
+HIERARCHIES = [
+    "dress", "shirt", "pants", "skirt", "jacket",
+    "coat", "jeans", "sweater", "shorts", "top",
+]
+# Templates used to build query sentences; each is formatted with the keyword
+# fields `color` and `hierarchy`, so one (color, hierarchy) pair yields
+# len(QUERY_TEMPLATES) queries.
+QUERY_TEMPLATES = [
+    "{color} {hierarchy}",
+    "a {color} {hierarchy}",
+    "{color} {hierarchy} for women",
+    "casual {color} {hierarchy}",
+    "elegant {color} {hierarchy}",
+]
+# Templates for label ensembling (strategy 2); each takes the label word as a
+# single positional field, and the resulting prompt embeddings are averaged.
+LABEL_TEMPLATES = [
+    "{}",
+    "a {}",
+    "a photo of a {}",
+    "a fashion {}",
+    "a piece of clothing: {}",
+]
+# Output directory for generated figures, located under the project root.
+FIGURES_DIR = _PROJECT_ROOT / "figures"
+# ── Helpers ──────────────────────────────────────────────────────────────────
+def classify_nearest(
+    query_emb: torch.Tensor,
+    label_embs: torch.Tensor,
+    labels: List[str],
+) -> Tuple[str, float]:
+    sims = F.cosine_similarity(query_emb.unsqueeze(0), label_embs, dim=1)
+    idx = sims.argmax().item()
+    return labels[idx], sims[idx].item()
+# ── Label embedding builders ─────────────────────────────────────────────────
+def build_naive_labels(model, processor, device, labels):
+    """Strategy 1: bare words."""
+    return get_text_embeddings_batch(model, processor, device, labels)
+def build_ensembled_labels(model, processor, device, labels):
+    """Strategy 2: average of LABEL_TEMPLATES per label."""
+    out = []
+    for label in labels:
+        prompts = [t.format(label) for t in LABEL_TEMPLATES]
+        embs = get_text_embeddings_batch(model, processor, device, prompts)
+        out.append(F.normalize(embs.mean(dim=0), dim=-1))
+    return torch.stack(out)
+def build_color_marginalized_labels(model, processor, device, hier_start, hier_end):
+    """Strategy 3 (GAP-CLIP only): for each hierarchy, embed '{c} {h}' for all
+    colors, extract the hierarchy subspace, average → color-agnostic centroid."""
+    out = []
+    for h in HIERARCHIES:
+        all_embs = []
+        for c in COLORS:
+            for tmpl in QUERY_TEMPLATES:
+                query = tmpl.format(color=c, hierarchy=h)
+                emb = get_text_embedding(model, processor, device, query)
+                all_embs.append(emb[hier_start:hier_end])
+        stacked = torch.stack(all_embs)
+        centroid = F.normalize(stacked.mean(dim=0), dim=-1)
+        out.append(centroid)
+    return torch.stack(out)
+# ── Per-model evaluation ─────────────────────────────────────────────────────
+def evaluate_model(
+    model, processor, device, target_color, model_name,
+    color_dim=0, hier_start=0, hier_end=0,
+) -> Dict:
+    is_gap_clip = color_dim > 0
+    # Build all label embedding variants
+    naive_color_labels = build_naive_labels(model, processor, device, COLORS)
+    naive_hier_labels = build_naive_labels(model, processor, device, HIERARCHIES)
+    ens_color_labels = build_ensembled_labels(model, processor, device, COLORS)
+    ens_hier_labels = build_ensembled_labels(model, processor, device, HIERARCHIES)
+    if is_gap_clip:
+        naive_color_sub = F.normalize(naive_color_labels[:, :color_dim], dim=-1)
+        naive_hier_sub = F.normalize(naive_hier_labels[:, hier_start:hier_end], dim=-1)
+        ens_hier_sub = F.normalize(ens_hier_labels[:, hier_start:hier_end], dim=-1)
+        marg_hier_sub = build_color_marginalized_labels(
+            model, processor, device, hier_start, hier_end
+        )
+    rows: List[Dict] = []
+    for hierarchy in HIERARCHIES:
+        for template in QUERY_TEMPLATES:
+            query = template.format(color=target_color, hierarchy=hierarchy)
+            emb = get_text_embedding(model, processor, device, query)
+            # ── Strategy 1: naive 512D ──
+            pc_naive, _ = classify_nearest(emb, naive_color_labels, COLORS)
+            ph_naive, _ = classify_nearest(emb, naive_hier_labels, HIERARCHIES)
+            # ── Strategy 2: ensembled 512D ──
+            pc_ens, _ = classify_nearest(emb, ens_color_labels, COLORS)
+            ph_ens, _ = classify_nearest(emb, ens_hier_labels, HIERARCHIES)
+            row = {
+                "query": query,
+                "true_color": target_color,
+                "true_hierarchy": hierarchy,
+                "color_naive": pc_naive == target_color,
+                "hier_naive": ph_naive == hierarchy,
+                "color_ens": pc_ens == target_color,
+                "hier_ens": ph_ens == hierarchy,
+            }
+            if is_gap_clip:
+                # ── Naive subspace ──
+                c_sub = F.normalize(emb[:color_dim].unsqueeze(0), dim=-1).squeeze(0)
+                h_sub = F.normalize(emb[hier_start:hier_end].unsqueeze(0), dim=-1).squeeze(0)
+                pc_sub, _ = classify_nearest(c_sub, naive_color_sub, COLORS)
+                ph_sub, _ = classify_nearest(h_sub, naive_hier_sub, HIERARCHIES)
+                # ── Ensembled subspace ──
+                ph_ens_sub, _ = classify_nearest(h_sub, ens_hier_sub, HIERARCHIES)
+                # ── Strategy 3: color-marginalized subspace ──
+                ph_marg, _ = classify_nearest(h_sub, marg_hier_sub, HIERARCHIES)
+                row.update({
+                    "color_sub_naive": pc_sub == target_color,
+                    "hier_sub_naive": ph_sub == hierarchy,
+                    "hier_sub_ens": ph_ens_sub == hierarchy,
+                    "hier_sub_marg": ph_marg == hierarchy,
+                })
+            rows.append(row)
+    # Aggregate
+    n = len(rows)
+    summary = {
+        "model": model_name,
+        "target_color": target_color,
+        "n": n,
+        "color_naive": sum(r["color_naive"] for r in rows) / n,
+        "hier_naive": sum(r["hier_naive"] for r in rows) / n,
+        "color_ens": sum(r["color_ens"] for r in rows) / n,
+        "hier_ens": sum(r["hier_ens"] for r in rows) / n,
+    }
+    if is_gap_clip:
+        summary.update({
+            "color_sub_naive": sum(r["color_sub_naive"] for r in rows) / n,
+            "hier_sub_naive": sum(r["hier_sub_naive"] for r in rows) / n,
+            "hier_sub_ens": sum(r["hier_sub_ens"] for r in rows) / n,
+            "hier_sub_marg": sum(r["hier_sub_marg"] for r in rows) / n,
+        })
+    return {"summary": summary, "rows": rows}
+# ── Pretty printing ──────────────────────────────────────────────────────────
+def print_single_color(bl, gc):
+    bs, gs = bl["summary"], gc["summary"]
+    color = bs["target_color"]
+    print("\n" + "=" * 92)
+    print(f"  COLOR ACROSS HIERARCHIES — target: \"{color}\"")
+    print(f"  {bs['n']} queries ({len(HIERARCHIES)} hierarchies x {len(QUERY_TEMPLATES)} templates)")
+    print("=" * 92)
+    print(f"\n  {'Strategy':<40} {'Baseline':<14} {'GAP-CLIP':<14}")
+    print(f"  {'-' * 68}")
+    def row(label, bk, gk):
+        print(f"  {label:<40} {bs[bk]:>8.1%}{'':6} {gs[gk]:>8.1%}")
+    row("Color acc — naive (512D)", "color_naive", "color_naive")
+    row("Color acc — ensembled (512D)", "color_ens", "color_ens")
+    print(f"  {'Color acc — subspace (16D)':<40} {'N/A':>8}{'':6} {gs['color_sub_naive']:>8.1%}")
+    print()
+    row("Hier  acc — naive (512D)", "hier_naive", "hier_naive")
+    row("Hier  acc — ensembled (512D)", "hier_ens", "hier_ens")
+    print(f"  {'Hier  acc — subspace naive (64D)':<40} {'N/A':>8}{'':6} {gs['hier_sub_naive']:>8.1%}")
+    print(f"  {'Hier  acc — subspace ensembled (64D)':<40} {'N/A':>8}{'':6} {gs['hier_sub_ens']:>8.1%}")
+    print(f"  {'Hier  acc — subspace marginalized (64D)':<40} {'N/A':>8}{'':6} {gs['hier_sub_marg']:>8.1%}")
+    # Per-hierarchy breakdown for the best strategies
+    print(f"\n  Per-hierarchy (best strategies):")
+    print(f"  {'Hierarchy':<12} {'BL ens(512)':<14} {'GC ens(512)':<14} {'GC marg(64)':<14}")
+    print(f"  {'-' * 54}")
+    for h in HIERARCHIES:
+        bl_rows = [r for r in bl["rows"] if r["true_hierarchy"] == h]
+        gc_rows = [r for r in gc["rows"] if r["true_hierarchy"] == h]
+        nh = len(bl_rows)
+        b = sum(r["hier_ens"] for r in bl_rows) / nh
+        g512 = sum(r["hier_ens"] for r in gc_rows) / nh
+        g64 = sum(r["hier_sub_marg"] for r in gc_rows) / nh
+        print(f"  {h:<12} {b:>8.1%}{'':6} {g512:>8.1%}{'':6} {g64:>8.1%}")
+    print("=" * 92)
+# ── Graph ────────────────────────────────────────────────────────────────────
+def plot_all_colors_graph(all_bl, all_gc):
+    """Create a publication-quality comparison chart."""
+    FIGURES_DIR.mkdir(exist_ok=True)
+    bl_color_naive = [all_bl[c]["color_naive"] for c in COLORS]
+    bl_hier_naive = [all_bl[c]["hier_naive"] for c in COLORS]
+    bl_hier_ens = [all_bl[c]["hier_ens"] for c in COLORS]
+    gc_color_naive = [all_gc[c]["color_naive"] for c in COLORS]
+    gc_color_sub = [all_gc[c]["color_sub_naive"] for c in COLORS]
+    gc_hier_naive = [all_gc[c]["hier_naive"] for c in COLORS]
+    gc_hier_ens = [all_gc[c]["hier_ens"] for c in COLORS]
+    gc_hier_marg = [all_gc[c]["hier_sub_marg"] for c in COLORS]
+    # Use a clean style
+    plt.rcParams.update({
+        "font.family": "sans-serif",
+        "axes.facecolor": "#FAFAFA",
+        "figure.facecolor": "white",
+    })
+    fig = plt.figure(figsize=(20, 14))
+    gs = fig.add_gridspec(2, 2, hspace=0.42, wspace=0.28,
+                          height_ratios=[1, 1.1])
+    x = np.arange(len(COLORS))
+    color_labels = [c.capitalize() for c in COLORS]
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    # TOP-LEFT: Color accuracy (zoomed to 85-102%)
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    ax1 = fig.add_subplot(gs[0, 0])
+    bar_w = 0.22
+    b1 = ax1.bar(x - bar_w, bl_color_naive, bar_w, label="Baseline (512D)",
+                 color="#5B9BD5", edgecolor="white", linewidth=0.6, zorder=3)
+    b2 = ax1.bar(x,         gc_color_naive, bar_w, label="GAP-CLIP (512D)",
+                 color="#ED7D31", edgecolor="white", linewidth=0.6, zorder=3)
+    b3 = ax1.bar(x + bar_w, gc_color_sub,   bar_w, label="GAP-CLIP 16D subspace",
+                 color="#70AD47", edgecolor="white", linewidth=0.6, zorder=3)
+    ax1.set_title("A.  Color Classification Accuracy", fontsize=14, fontweight="bold",
+                  loc="left", pad=12)
+    ax1.set_xticks(x)
+    ax1.set_xticklabels(color_labels, rotation=35, ha="right", fontsize=10)
+    ax1.set_ylabel("Accuracy", fontsize=11)
+    ax1.set_ylim(0.85, 1.04)
+    ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
+    ax1.legend(fontsize=9, framealpha=0.95, loc="lower left")
+    ax1.grid(axis="y", alpha=0.25, linestyle="--", zorder=0)
+    ax1.spines["top"].set_visible(False)
+    ax1.spines["right"].set_visible(False)
+    # Annotate means
+    for vals, clr, lbl, yoff in [
+        (bl_color_naive, "#5B9BD5", "BL", 0.006),
+        (gc_color_sub, "#70AD47", "GC-16D", -0.012),
+    ]:
+        m = np.mean(vals)
+        ax1.axhline(m, color=clr, linestyle=":", alpha=0.5, linewidth=1.0, zorder=1)
+        ax1.text(len(COLORS) - 0.3, m + yoff, f"{lbl} mean: {m:.1%}",
+                 fontsize=8, color=clr, ha="right", fontstyle="italic")
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    # TOP-RIGHT: Hierarchy accuracy — zoomed to 70-102%
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    ax2 = fig.add_subplot(gs[0, 1])
+    bar_w = 0.14
+    offsets = np.array([-2, -1, 0, 1, 2])
+    bars_cfg = [
+        (bl_hier_naive, "Baseline naive (512D)",        "#93C4ED"),
+        (bl_hier_ens,   "Baseline ensembled (512D)",    "#2E75B6"),
+        (gc_hier_naive, "GAP-CLIP naive (512D)",        "#F4B183"),
+        (gc_hier_ens,   "GAP-CLIP ensembled (512D)",    "#C55A11"),
+        (gc_hier_marg,  "GAP-CLIP structured (64D)",    "#70AD47"),
+    ]
+    for i, (data, label, color) in enumerate(bars_cfg):
+        ax2.bar(x + offsets[i] * bar_w, data, bar_w, label=label, color=color,
+                edgecolor="white", linewidth=0.6, zorder=3)
+    ax2.set_title("B.  Hierarchy Classification Accuracy", fontsize=14,
+                  fontweight="bold", loc="left", pad=12)
+    ax2.set_xticks(x)
+    ax2.set_xticklabels(color_labels, rotation=35, ha="right", fontsize=10)
+    ax2.set_ylabel("Accuracy", fontsize=11)
+    ax2.set_ylim(0.70, 1.05)
+    ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
+    ax2.legend(fontsize=8.5, framealpha=0.95, loc="lower left", ncol=1)
+    ax2.grid(axis="y", alpha=0.25, linestyle="--", zorder=0)
+    ax2.spines["top"].set_visible(False)
+    ax2.spines["right"].set_visible(False)
+    bl_hm = np.mean(bl_hier_ens)
+    gc_hm = np.mean(gc_hier_marg)
+    ax2.axhline(bl_hm, color="#2E75B6", linestyle="--", alpha=0.6, linewidth=1.2, zorder=1)
+    ax2.axhline(gc_hm, color="#70AD47", linestyle="--", alpha=0.6, linewidth=1.2, zorder=1)
+    ax2.text(len(COLORS) - 0.3, bl_hm - 0.016, f"BL-ens mean: {bl_hm:.1%}",
+             fontsize=8.5, color="#2E75B6", ha="right", fontweight="bold")
+    ax2.text(len(COLORS) - 0.3, gc_hm + 0.006, f"GC-struct mean: {gc_hm:.1%}",
+             fontsize=8.5, color="#70AD47", ha="right", fontweight="bold")
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    # BOTTOM: Mean accuracy summary bar chart
+    # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+    ax3 = fig.add_subplot(gs[1, :])
+    metrics = [
+        ("Color\n(Naive 512D)",     np.mean(bl_color_naive), np.mean(gc_color_naive)),
+        ("Color\n(16D Subspace)",   None,                    np.mean(gc_color_sub)),
+        ("Hierarchy\n(Naive 512D)", np.mean(bl_hier_naive),  np.mean(gc_hier_naive)),
+        ("Hierarchy\n(Ens. 512D)",  np.mean(bl_hier_ens),    np.mean(gc_hier_ens)),
+        ("Hierarchy\n(Structured 64D)", None,                np.mean(gc_hier_marg)),
+    ]
+    xm = np.arange(len(metrics))
+    bar_w = 0.30
+    bl_vals = [m[1] for m in metrics]
+    gc_vals = [m[2] for m in metrics]
+    for i, (label, bv, gv) in enumerate(metrics):
+        if bv is not None:
+            bar_bl = ax3.bar(i - bar_w / 2, bv, bar_w, color="#2E75B6",
+                             edgecolor="white", linewidth=0.8, zorder=3,
+                             label="Baseline" if i == 0 else "")
+            ax3.text(i - bar_w / 2, bv + 0.008, f"{bv:.1%}", ha="center",
+                     fontsize=10, fontweight="bold", color="#2E75B6", zorder=4)
+        bar_gc = ax3.bar(i + (bar_w / 2 if bv is not None else 0), gv, bar_w,
+                         color="#70AD47", edgecolor="white", linewidth=0.8, zorder=3,
+                         label="GAP-CLIP" if i == 0 else "")
+        xpos = i + (bar_w / 2 if bv is not None else 0)
+        ax3.text(xpos, gv + 0.008, f"{gv:.1%}", ha="center",
+                 fontsize=10, fontweight="bold", color="#70AD47", zorder=4)
+        # Delta annotation for hierarchy metrics where both exist
+        if bv is not None and "Hierarchy" in label:
+            delta = gv - bv
+            sign = "+" if delta >= 0 else ""
+            clr = "#70AD47" if delta > 0 else "#C00000"
+            ax3.annotate(
+                f"{sign}{delta:.1%}",
+                xy=(i + bar_w / 2, gv),
+                xytext=(i + bar_w / 2 + 0.25, gv + 0.03),
+                fontsize=9, fontweight="bold", color=clr,
+                arrowprops=dict(arrowstyle="->", color=clr, lw=1.2),
+                zorder=5,
+            )
+    ax3.set_title("C.  Mean Accuracy Summary (across all 11 colors)",
+                  fontsize=14, fontweight="bold", loc="left", pad=12)
+    ax3.set_xticks(xm)
+    ax3.set_xticklabels([m[0] for m in metrics], fontsize=10.5)
+    ax3.set_ylabel("Mean Accuracy", fontsize=11)
+    ax3.set_ylim(0.75, 1.08)
+    ax3.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))
+    ax3.legend(fontsize=11, framealpha=0.95, loc="lower left")
+    ax3.grid(axis="y", alpha=0.25, linestyle="--", zorder=0)
+    ax3.spines["top"].set_visible(False)
+    ax3.spines["right"].set_visible(False)
+    # Global title
+    fig.suptitle(
+        "Color Retrieval Test — Baseline (Fashion-CLIP) vs GAP-CLIP\n"
+        f"{len(COLORS)} colors  x  {len(HIERARCHIES)} hierarchies  x  "
+        f"{len(QUERY_TEMPLATES)} templates  =  {len(COLORS)*len(HIERARCHIES)*len(QUERY_TEMPLATES)} queries per model",
+        fontsize=16, fontweight="bold", y=1.01,
+    )
+    out_path = FIGURES_DIR / "color_across_hierarchies.png"
+    fig.savefig(out_path, dpi=200, bbox_inches="tight", facecolor="white")
+    plt.close(fig)
+    print(f"\nFigure saved -> {out_path}")
+    return out_path
+# ── All-colors sweep ──────────────────────────────────────────────────────────
+def run_all_colors(device):
+    print("Loading models...")
+    bl_model, bl_proc = load_baseline_fashion_clip(device)
+    gc_model, gc_proc = load_gap_clip(config.main_model_path, device)
+    all_bl, all_gc = {}, {}
+    for color in COLORS:
+        print(f"\n--- Evaluating: {color} ---")
+        bl = evaluate_model(bl_model, bl_proc, device, color, "Baseline")
+        gc = evaluate_model(
+            gc_model, gc_proc, device, color, "GAP-CLIP",
+            color_dim=config.color_emb_dim,
+            hier_start=config.color_emb_dim,
+            hier_end=config.color_emb_dim + config.hierarchy_emb_dim,
+        )
+        all_bl[color] = bl["summary"]
+        all_gc[color] = gc["summary"]
+    # ── Summary table ──
+    print("\n" + "=" * 115)
+    print("  ALL-COLORS SUMMARY")
+    print("=" * 115)
+    print(f"\n  {'':12}"
+          f"{'--- COLOR ACC ---':^36}"
+          f"{'--- HIERARCHY ACC ---':^60}")
+    print(f"  {'Color':<12}"
+          f"{'BL(512)':>10} {'GC(512)':>10} {'GC(16D)':>10}  "
+          f"{'BL naive':>10} {'BL ens':>10} {'GC naive':>10} {'GC ens':>10} {'GC struct':>10}")
+    print(f"  {'-' * 105}")
+    totals = {k: 0.0 for k in [
+        "bl_cn", "gc_cn", "gc_cs",
+        "bl_hn", "bl_he", "gc_hn", "gc_he", "gc_hm",
+    ]}
+    for color in COLORS:
+        b, g = all_bl[color], all_gc[color]
+        totals["bl_cn"] += b["color_naive"]
+        totals["gc_cn"] += g["color_naive"]
+        totals["gc_cs"] += g["color_sub_naive"]
+        totals["bl_hn"] += b["hier_naive"]
+        totals["bl_he"] += b["hier_ens"]
+        totals["gc_hn"] += g["hier_naive"]
+        totals["gc_he"] += g["hier_ens"]
+        totals["gc_hm"] += g["hier_sub_marg"]
+        print(
+            f"  {color:<12}"
+            f"{b['color_naive']:>9.1%} {g['color_naive']:>10.1%} {g['color_sub_naive']:>10.1%}  "
+            f"{b['hier_naive']:>9.1%} {b['hier_ens']:>10.1%} {g['hier_naive']:>10.1%} "
+            f"{g['hier_ens']:>10.1%} {g['hier_sub_marg']:>10.1%}"
+        )
+    n = len(COLORS)
+    print(f"  {'-' * 105}")
+    print(
+        f"  {'MEAN':<12}"
+        f"{totals['bl_cn']/n:>9.1%} {totals['gc_cn']/n:>10.1%} {totals['gc_cs']/n:>10.1%}  "
+        f"{totals['bl_hn']/n:>9.1%} {totals['bl_he']/n:>10.1%} {totals['gc_hn']/n:>10.1%} "
+        f"{totals['gc_he']/n:>10.1%} {totals['gc_hm']/n:>10.1%}"
+    )
+    print("=" * 115)
+    # ── Graph ──
+    plot_all_colors_graph(all_bl, all_gc)
+# ── Main ─────────────────────────────────────────────────────────────────────
+def main():
+    parser = argparse.ArgumentParser(
+        description="Color retrieval accuracy across hierarchies — Baseline vs GAP-CLIP"
+    )
+    parser.add_argument(
+        "--color", type=str, default="red",
+        help=f"Target color (default: red). Choices: {', '.join(COLORS)}",
+    )
+    parser.add_argument(
+        "--all-colors", action="store_true",
+        help="Run for all 11 colors and produce a comparison graph",
+    )
+    args = parser.parse_args()
+    device = config.device
+    print(f"Device: {device}")
+    if args.all_colors:
+        run_all_colors(device)
+        return
+    target_color = args.color.lower()
+    if target_color not in COLORS:
+        print(f"Error: '{target_color}' not in {COLORS}")
+        sys.exit(1)
+    print("Loading Baseline (Fashion-CLIP)...")
+    bl_model, bl_proc = load_baseline_fashion_clip(device)
+    print("Loading GAP-CLIP...")
+    gc_model, gc_proc = load_gap_clip(config.main_model_path, device)
+    print(f"\nEvaluating \"{target_color}\" across {len(HIERARCHIES)} hierarchies...\n")
+    bl = evaluate_model(bl_model, bl_proc, device, target_color, "Baseline")
+    gc = evaluate_model(
+        gc_model, gc_proc, device, target_color, "GAP-CLIP",
+        color_dim=config.color_emb_dim,
+        hier_start=config.color_emb_dim,
+        hier_end=config.color_emb_dim + config.hierarchy_emb_dim,
+    )
+    print_single_color(bl, gc)
+if __name__ == "__main__":
+    main()