Upload evaluation/sec536_embedding_structure.py with huggingface_hub
Browse files- evaluation/sec536_embedding_structure.py +496 -749
evaluation/sec536_embedding_structure.py
CHANGED
|
@@ -21,12 +21,7 @@ designed for, and tests zero-shot vision-language alignment.
|
|
| 21 |
for items sharing a color but differing in category.
|
| 22 |
Expected result: 1000/1000 pass.
|
| 23 |
|
| 24 |
-
Test
|
| 25 |
-
Each image is used as a query; the highest-scoring text label (cosine in
|
| 26 |
-
shared latent space) is the predicted class. Accuracy is computed across
|
| 27 |
-
three datasets (Fashion-MNIST, KAGL Marqo, Internal).
|
| 28 |
-
|
| 29 |
-
Test D — Subspace Decomposition Consistency:
|
| 30 |
Encode a full description (e.g. "red dress in cotton"), a standalone color
|
| 31 |
("red"), and a standalone hierarchy ("dress"). Verify that:
|
| 32 |
- The color subspace (first 16D) of the full embedding is more similar
|
|
@@ -35,6 +30,11 @@ designed for, and tests zero-shot vision-language alignment.
|
|
| 35 |
similar to the hierarchy-only embedding than to the color-only embedding.
|
| 36 |
Expected result: 1000/1000 pass.
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
Paper reference: Section 5.3.6 and Table 4.
|
| 39 |
|
| 40 |
Run directly:
|
|
@@ -51,6 +51,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
| 51 |
from dataclasses import dataclass
|
| 52 |
from pathlib import Path
|
| 53 |
import random
|
|
|
|
|
|
|
|
|
|
| 54 |
from typing import Dict, List, Optional, Sequence, Tuple
|
| 55 |
|
| 56 |
import numpy as np
|
|
@@ -62,16 +65,39 @@ import torch.nn.functional as F
|
|
| 62 |
from io import BytesIO
|
| 63 |
from PIL import Image
|
| 64 |
from torchvision import transforms
|
|
|
|
|
|
|
|
|
|
| 65 |
from transformers import CLIPModel as CLIPModelTransformers
|
| 66 |
from transformers import CLIPProcessor
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
@dataclass
|
| 70 |
class RuntimeConfig:
|
| 71 |
-
color_emb_dim: int =
|
| 72 |
-
hierarchy_emb_dim: int =
|
| 73 |
-
|
| 74 |
-
|
|
|
|
| 75 |
|
| 76 |
DEFAULT_NUM_EXAMPLES = 10000
|
| 77 |
DEFAULT_NUM_PRINTED = 3
|
|
@@ -106,6 +132,7 @@ def resolve_runtime_config() -> RuntimeConfig:
|
|
| 106 |
|
| 107 |
cfg.color_emb_dim = getattr(config, "color_emb_dim", cfg.color_emb_dim)
|
| 108 |
cfg.hierarchy_emb_dim = getattr(config, "hierarchy_emb_dim", cfg.hierarchy_emb_dim)
|
|
|
|
| 109 |
cfg.main_model_path = getattr(config, "main_model_path", cfg.main_model_path)
|
| 110 |
cfg.device = getattr(config, "device", cfg.device)
|
| 111 |
except Exception:
|
|
@@ -120,27 +147,50 @@ def resolve_runtime_config() -> RuntimeConfig:
|
|
| 120 |
|
| 121 |
|
| 122 |
def load_main_model(device: torch.device, main_model_path: str) -> Tuple[CLIPModelTransformers, CLIPProcessor]:
|
| 123 |
-
"""Load GAP-CLIP
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
def get_text_embedding(
|
| 132 |
-
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, text: str
|
| 133 |
-
) -> torch.Tensor:
|
| 134 |
-
"""Extract normalized text embedding for a single query."""
|
| 135 |
-
text_inputs = processor(text=[text], padding=True, return_tensors="pt")
|
| 136 |
-
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
with torch.no_grad():
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
|
| 146 |
def cosine(a: torch.Tensor, b: torch.Tensor) -> float:
|
|
@@ -179,8 +229,7 @@ def run_test_a(
|
|
| 179 |
cfg: RuntimeConfig,
|
| 180 |
num_examples: int,
|
| 181 |
num_printed: int,
|
| 182 |
-
test_name: str = "Test A",
|
| 183 |
-
) -> Dict[str, bool]:
|
| 184 |
"""
|
| 185 |
A: different colors + same hierarchy.
|
| 186 |
Expect hierarchy subspace to be more similar than color subspace.
|
|
@@ -292,8 +341,7 @@ def run_test_b(
|
|
| 292 |
cfg: RuntimeConfig,
|
| 293 |
num_examples: int,
|
| 294 |
num_printed: int,
|
| 295 |
-
test_name: str = "Test B",
|
| 296 |
-
) -> Dict[str, bool]:
|
| 297 |
"""
|
| 298 |
B: same color + different hierarchies.
|
| 299 |
Expect similarity in first16 (color) to be higher than full512.
|
|
@@ -398,16 +446,15 @@ def run_test_b(
|
|
| 398 |
|
| 399 |
|
| 400 |
|
| 401 |
-
def
|
| 402 |
model: CLIPModelTransformers,
|
| 403 |
processor: CLIPProcessor,
|
| 404 |
cfg: RuntimeConfig,
|
| 405 |
num_examples: int,
|
| 406 |
num_printed: int,
|
| 407 |
-
test_name: str = "Test
|
| 408 |
-
) -> Dict[str, object]:
|
| 409 |
"""
|
| 410 |
-
|
| 411 |
Encode a full description (e.g. "red dress in cotton"), a standalone color
|
| 412 |
("red"), and a standalone hierarchy ("dress"). Then verify:
|
| 413 |
- The color subspace (first 16D) of the full embedding aligns with the
|
|
@@ -568,36 +615,26 @@ def fashion_mnist_pixels_to_tensor(pixel_values: np.ndarray, image_size: int = 2
|
|
| 568 |
def get_image_embedding(
|
| 569 |
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, image_tensor: torch.Tensor
|
| 570 |
) -> torch.Tensor:
|
|
|
|
| 571 |
image_tensor = image_tensor.unsqueeze(0).to(device)
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
return image_features.squeeze(0)
|
| 577 |
|
| 578 |
|
| 579 |
def get_image_embedding_from_pil(
|
| 580 |
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, image: Image.Image
|
| 581 |
) -> torch.Tensor:
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
with torch.no_grad():
|
| 585 |
-
vision_outputs = model.vision_model(**image_inputs)
|
| 586 |
-
image_features = model.visual_projection(vision_outputs.pooler_output)
|
| 587 |
-
image_features = F.normalize(image_features, dim=-1)
|
| 588 |
-
return image_features.squeeze(0)
|
| 589 |
|
| 590 |
|
| 591 |
def get_text_embeddings_batch(
|
| 592 |
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, texts: List[str]
|
| 593 |
) -> torch.Tensor:
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
with torch.no_grad():
|
| 597 |
-
text_outputs = model.text_model(**text_inputs)
|
| 598 |
-
text_features = model.text_projection(text_outputs.pooler_output)
|
| 599 |
-
text_features = F.normalize(text_features, dim=-1)
|
| 600 |
-
return text_features
|
| 601 |
|
| 602 |
|
| 603 |
def get_prompt_ensembled_text_embeddings(
|
|
@@ -678,79 +715,187 @@ def get_adaptive_label_prior(labels: List[str]) -> Tuple[torch.Tensor, float]:
|
|
| 678 |
return probs, recommended_weight
|
| 679 |
|
| 680 |
|
| 681 |
-
def
|
| 682 |
-
model
|
| 683 |
-
processor
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
df = pd.read_csv(csv_path)
|
| 700 |
-
df = df.sample(n=min(num_examples, len(df)), random_state=42).reset_index(drop=True)
|
| 701 |
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
text_embs =
|
| 705 |
|
| 706 |
-
pixel_cols = [f"pixel{i}" for i in range(1, 785)]
|
| 707 |
-
rows: List[List[str]] = []
|
| 708 |
-
failed_rows: List[List[str]] = []
|
| 709 |
correct = 0
|
|
|
|
| 710 |
|
| 711 |
-
for
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
img_emb = get_image_embedding(model, processor, cfg.device, img_tensor)
|
| 719 |
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
predicted = candidate_labels[best_idx]
|
| 723 |
-
best_sim = sims[best_idx].item()
|
| 724 |
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
|
| 729 |
-
rows.append([
|
| 730 |
-
str(idx),
|
| 731 |
-
ground_truth,
|
| 732 |
-
predicted,
|
| 733 |
-
f"{best_sim:.4f}",
|
| 734 |
-
format_bool(ok),
|
| 735 |
-
])
|
| 736 |
-
if not ok:
|
| 737 |
-
failed_rows.append([
|
| 738 |
-
str(idx),
|
| 739 |
-
ground_truth,
|
| 740 |
-
predicted,
|
| 741 |
-
f"{best_sim:.4f}",
|
| 742 |
-
])
|
| 743 |
|
| 744 |
-
accuracy = correct / len(df)
|
| 745 |
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 752 |
|
| 753 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
|
| 755 |
|
| 756 |
def normalize_hierarchy_label(raw_label: str) -> str:
|
|
@@ -805,544 +950,203 @@ def normalize_hierarchy_label(raw_label: str) -> str:
|
|
| 805 |
"innerwear": "underwear",
|
| 806 |
"loungewear and nightwear": "underwear",
|
| 807 |
"saree": "dress",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
}
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
model_path = Path(getattr(_cfg, "hierarchy_model_path", "models/hierarchy_model.pth"))
|
| 830 |
-
if not model_path.exists():
|
| 831 |
-
return None
|
| 832 |
-
try:
|
| 833 |
-
checkpoint = torch.load(str(model_path), map_location=device)
|
| 834 |
-
hierarchy_classes = checkpoint.get("hierarchy_classes", [])
|
| 835 |
-
if not hierarchy_classes:
|
| 836 |
-
return None
|
| 837 |
-
_model = _HierarchyModel(
|
| 838 |
-
num_hierarchy_classes=len(hierarchy_classes),
|
| 839 |
-
embed_dim=getattr(_cfg, "hierarchy_emb_dim", 64),
|
| 840 |
-
).to(device)
|
| 841 |
-
_model.load_state_dict(checkpoint["model_state"])
|
| 842 |
-
_model.set_hierarchy_extractor(_HierarchyExtractor(hierarchy_classes, verbose=False))
|
| 843 |
-
_model.eval()
|
| 844 |
-
return _model
|
| 845 |
-
except Exception:
|
| 846 |
-
return None
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
def evaluate_zero_shot_gap(
|
| 850 |
-
model: CLIPModelTransformers,
|
| 851 |
-
processor: CLIPProcessor,
|
| 852 |
-
device: torch.device,
|
| 853 |
-
samples: List[Tuple[Image.Image, str]],
|
| 854 |
-
candidate_labels: List[str],
|
| 855 |
-
title_prefix: str,
|
| 856 |
-
num_printed: int,
|
| 857 |
-
color_emb_dim: int = 16,
|
| 858 |
-
hierarchy_emb_dim: int = 64,
|
| 859 |
-
hierarchy_model=None,
|
| 860 |
-
) -> Dict[str, Optional[float]]:
|
| 861 |
-
if len(samples) == 0:
|
| 862 |
-
print(f" Skipping {title_prefix}: no valid samples")
|
| 863 |
-
return {"accuracy_c1": None, "strategy": None}
|
| 864 |
-
|
| 865 |
-
# Strategy 1 (baseline prompt) and prompt-ensemble embeddings.
|
| 866 |
-
base_templates = ["a photo of a {label}"]
|
| 867 |
-
ensemble_templates = [
|
| 868 |
-
"a photo of a {label}",
|
| 869 |
-
"a product photo of a {label}",
|
| 870 |
-
"a studio photo of a {label}",
|
| 871 |
-
"a fashion item: {label}",
|
| 872 |
-
"an image of a {label}",
|
| 873 |
-
]
|
| 874 |
-
text_embs_single = get_prompt_ensembled_text_embeddings(
|
| 875 |
-
model=model,
|
| 876 |
-
processor=processor,
|
| 877 |
-
device=device,
|
| 878 |
-
labels=candidate_labels,
|
| 879 |
-
templates=base_templates,
|
| 880 |
-
)
|
| 881 |
-
text_embs_ensemble = get_prompt_ensembled_text_embeddings(
|
| 882 |
-
model=model,
|
| 883 |
-
processor=processor,
|
| 884 |
-
device=device,
|
| 885 |
-
labels=candidate_labels,
|
| 886 |
-
templates=ensemble_templates,
|
| 887 |
-
)
|
| 888 |
-
|
| 889 |
-
# Precompute image embeddings once for C1.
|
| 890 |
-
image_embs: List[torch.Tensor] = []
|
| 891 |
-
for image, _ in samples:
|
| 892 |
-
image_embs.append(get_image_embedding_from_pil(model, processor, device, image))
|
| 893 |
-
image_embs_tensor = torch.stack(image_embs, dim=0)
|
| 894 |
-
|
| 895 |
-
# Similarity matrices (N images x C labels)
|
| 896 |
-
sims_single = image_embs_tensor @ text_embs_single.T
|
| 897 |
-
sims_ensemble = image_embs_tensor @ text_embs_ensemble.T
|
| 898 |
-
|
| 899 |
-
# Calibration and prior terms.
|
| 900 |
-
class_bias = sims_ensemble.mean(dim=0, keepdim=True)
|
| 901 |
-
class_prior = get_internal_label_prior(candidate_labels).to(device)
|
| 902 |
-
log_prior = torch.log(class_prior + 1e-8).unsqueeze(0)
|
| 903 |
-
|
| 904 |
-
# Baseline inference-time strategies (full 512-d embedding).
|
| 905 |
-
strategy_scores: Dict[str, torch.Tensor] = {
|
| 906 |
-
"single_prompt": sims_single,
|
| 907 |
-
"prompt_ensemble": sims_ensemble,
|
| 908 |
-
"ensemble_plus_calibration": sims_ensemble - 0.2 * class_bias,
|
| 909 |
-
"ensemble_plus_prior": sims_ensemble + 0.15 * log_prior,
|
| 910 |
-
"ensemble_calibration_plus_prior": sims_ensemble - 0.2 * class_bias + 0.15 * log_prior,
|
| 911 |
-
}
|
| 912 |
-
|
| 913 |
-
# Extended prompt ensemble for broader category coverage.
|
| 914 |
-
extended_templates = [
|
| 915 |
-
"a photo of a {label}",
|
| 916 |
-
"a product photo of a {label}",
|
| 917 |
-
"a studio photo of a {label}",
|
| 918 |
-
"a fashion item: {label}",
|
| 919 |
-
"an image of a {label}",
|
| 920 |
-
"{label}",
|
| 921 |
-
"a picture of a {label}",
|
| 922 |
-
"this is a {label}",
|
| 923 |
-
"a fashion product: {label}",
|
| 924 |
-
"a {label} clothing item",
|
| 925 |
]
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
)
|
| 930 |
-
sims_extended = image_embs_tensor @ text_embs_extended.T
|
| 931 |
-
|
| 932 |
-
# Subspace: exclude color dimensions (keep hierarchy + residual).
|
| 933 |
-
hier_end = color_emb_dim + hierarchy_emb_dim
|
| 934 |
-
img_no_color = F.normalize(image_embs_tensor[:, color_emb_dim:], dim=-1)
|
| 935 |
-
text_ext_no_color = F.normalize(text_embs_extended[:, color_emb_dim:], dim=-1)
|
| 936 |
-
text_ens_no_color = F.normalize(text_embs_ensemble[:, color_emb_dim:], dim=-1)
|
| 937 |
-
sims_no_color = img_no_color @ text_ens_no_color.T
|
| 938 |
-
sims_no_color_ext = img_no_color @ text_ext_no_color.T
|
| 939 |
-
|
| 940 |
-
# Subspace: hierarchy-only dimensions.
|
| 941 |
-
img_hier = F.normalize(image_embs_tensor[:, color_emb_dim:hier_end], dim=-1)
|
| 942 |
-
text_ens_hier = F.normalize(text_embs_ensemble[:, color_emb_dim:hier_end], dim=-1)
|
| 943 |
-
text_ext_hier = F.normalize(text_embs_extended[:, color_emb_dim:hier_end], dim=-1)
|
| 944 |
-
sims_hier_ens = img_hier @ text_ens_hier.T
|
| 945 |
-
sims_hier_ext = img_hier @ text_ext_hier.T
|
| 946 |
-
|
| 947 |
-
# Adaptive prior (reduces influence for out-of-domain label sets).
|
| 948 |
-
adaptive_prior, adaptive_weight = get_adaptive_label_prior(candidate_labels)
|
| 949 |
-
adaptive_prior = adaptive_prior.to(device)
|
| 950 |
-
log_adaptive_prior = torch.log(adaptive_prior + 1e-8).unsqueeze(0)
|
| 951 |
-
|
| 952 |
-
class_bias_no_color = sims_no_color.mean(dim=0, keepdim=True)
|
| 953 |
-
|
| 954 |
-
strategy_scores.update({
|
| 955 |
-
"extended_ensemble": sims_extended,
|
| 956 |
-
"no_color_ensemble": sims_no_color,
|
| 957 |
-
"no_color_extended": sims_no_color_ext,
|
| 958 |
-
"hierarchy_only_ensemble": sims_hier_ens,
|
| 959 |
-
"hierarchy_only_extended": sims_hier_ext,
|
| 960 |
-
"no_color_calibrated": sims_no_color - 0.2 * class_bias_no_color,
|
| 961 |
-
"no_color_adaptive_prior": sims_no_color + adaptive_weight * log_adaptive_prior,
|
| 962 |
-
"no_color_ext_adaptive_prior": sims_no_color_ext + adaptive_weight * log_adaptive_prior,
|
| 963 |
-
"extended_adaptive_prior": sims_extended + adaptive_weight * log_adaptive_prior,
|
| 964 |
-
})
|
| 965 |
-
|
| 966 |
-
# Weighted embeddings: amplify hierarchy dims relative to residual.
|
| 967 |
-
for amp_factor in (2.0, 4.0):
|
| 968 |
-
weights = torch.ones(image_embs_tensor.shape[1], device=device)
|
| 969 |
-
weights[:color_emb_dim] = 0.0
|
| 970 |
-
weights[color_emb_dim:hier_end] = amp_factor
|
| 971 |
-
weighted_img = F.normalize(image_embs_tensor * weights.unsqueeze(0), dim=-1)
|
| 972 |
-
weighted_text = F.normalize(text_embs_extended * weights.unsqueeze(0), dim=-1)
|
| 973 |
-
tag = f"weighted_hier_{amp_factor:.0f}x"
|
| 974 |
-
strategy_scores[tag] = weighted_img @ weighted_text.T
|
| 975 |
-
|
| 976 |
-
# Hierarchy model direct strategy (uses dedicated hierarchy encoder).
|
| 977 |
-
if hierarchy_model is not None:
|
| 978 |
-
hier_text_embs: List[torch.Tensor] = []
|
| 979 |
-
known_label_mask: List[bool] = []
|
| 980 |
-
for label in candidate_labels:
|
| 981 |
-
try:
|
| 982 |
-
emb = hierarchy_model.get_text_embeddings(label).squeeze(0)
|
| 983 |
-
hier_text_embs.append(emb)
|
| 984 |
-
known_label_mask.append(True)
|
| 985 |
-
except (ValueError, Exception):
|
| 986 |
-
hier_text_embs.append(text_ext_hier[candidate_labels.index(label)])
|
| 987 |
-
known_label_mask.append(False)
|
| 988 |
-
hier_text_matrix = F.normalize(torch.stack(hier_text_embs).to(device), dim=-1)
|
| 989 |
-
sims_hier_model = img_hier @ hier_text_matrix.T
|
| 990 |
-
strategy_scores["hierarchy_model_direct"] = sims_hier_model
|
| 991 |
-
class_bias_hier = sims_hier_model.mean(dim=0, keepdim=True)
|
| 992 |
-
strategy_scores["hier_model_calibrated"] = sims_hier_model - 0.2 * class_bias_hier
|
| 993 |
-
strategy_scores["hier_model_adaptive_prior"] = sims_hier_model + adaptive_weight * log_adaptive_prior
|
| 994 |
-
|
| 995 |
-
# Hybrid: hierarchy model scores for known labels, CLIP for unknown.
|
| 996 |
-
hybrid_scores = sims_no_color_ext.clone()
|
| 997 |
-
for label_idx, is_known in enumerate(known_label_mask):
|
| 998 |
-
if is_known:
|
| 999 |
-
hybrid_scores[:, label_idx] = sims_hier_model[:, label_idx]
|
| 1000 |
-
strategy_scores["hybrid_hier_clip"] = hybrid_scores
|
| 1001 |
-
|
| 1002 |
-
# Blended: z-score-normalised mix of hierarchy and full-space scores.
|
| 1003 |
-
hier_mu = sims_hier_model.mean()
|
| 1004 |
-
hier_std = sims_hier_model.std() + 1e-8
|
| 1005 |
-
full_mu = sims_extended.mean()
|
| 1006 |
-
full_std = sims_extended.std() + 1e-8
|
| 1007 |
-
hier_z = (sims_hier_model - hier_mu) / hier_std
|
| 1008 |
-
full_z = (sims_extended - full_mu) / full_std
|
| 1009 |
-
for alpha in (0.3, 0.5, 0.7):
|
| 1010 |
-
strategy_scores[f"blend_hier_full_{alpha:.1f}"] = alpha * hier_z + (1 - alpha) * full_z
|
| 1011 |
-
|
| 1012 |
-
# Select best strategy for C1.
|
| 1013 |
-
|
| 1014 |
-
best_strategy_c1 = "single_prompt"
|
| 1015 |
-
best_acc_c1 = -1.0
|
| 1016 |
-
best_scores_c1 = sims_single
|
| 1017 |
-
|
| 1018 |
-
# Track per-strategy accuracies and weighted-F1 for fair comparison.
|
| 1019 |
-
all_strategy_acc_c1: Dict[str, float] = {}
|
| 1020 |
-
all_strategy_wf1_c1: Dict[str, float] = {}
|
| 1021 |
-
ground_truths = [gt for _, gt in samples]
|
| 1022 |
-
|
| 1023 |
-
for strategy_name, score_mat in strategy_scores.items():
|
| 1024 |
-
pred_idx = score_mat.argmax(dim=1).tolist()
|
| 1025 |
-
preds = [candidate_labels[i] for i in pred_idx]
|
| 1026 |
-
correct = sum(1 for p, g in zip(preds, ground_truths) if p == g)
|
| 1027 |
-
acc = correct / len(samples)
|
| 1028 |
-
wf1 = f1_score(ground_truths, preds, average="weighted", zero_division=0)
|
| 1029 |
-
|
| 1030 |
-
all_strategy_acc_c1[strategy_name] = acc
|
| 1031 |
-
all_strategy_wf1_c1[strategy_name] = wf1
|
| 1032 |
-
|
| 1033 |
-
if acc > best_acc_c1:
|
| 1034 |
-
best_acc_c1 = acc
|
| 1035 |
-
best_strategy_c1 = strategy_name
|
| 1036 |
-
best_scores_c1 = score_mat
|
| 1037 |
-
|
| 1038 |
-
best_wf1_c1 = all_strategy_wf1_c1[best_strategy_c1]
|
| 1039 |
-
print(f"{title_prefix} selected C1 strategy: {best_strategy_c1} (acc={best_acc_c1:.2%}, wF1={best_wf1_c1:.2%})")
|
| 1040 |
-
|
| 1041 |
-
# C1: image -> all texts (classification)
|
| 1042 |
-
rows: List[List[str]] = []
|
| 1043 |
-
correct = 0
|
| 1044 |
-
all_preds: List[str] = []
|
| 1045 |
-
|
| 1046 |
-
for idx, (_, ground_truth) in enumerate(samples):
|
| 1047 |
-
sims = best_scores_c1[idx]
|
| 1048 |
-
best_idx = int(sims.argmax().item())
|
| 1049 |
-
predicted = candidate_labels[best_idx]
|
| 1050 |
-
best_sim = float(sims[best_idx].item())
|
| 1051 |
-
|
| 1052 |
-
ok = predicted == ground_truth
|
| 1053 |
-
if ok:
|
| 1054 |
-
correct += 1
|
| 1055 |
-
all_preds.append(predicted)
|
| 1056 |
-
|
| 1057 |
-
rows.append([str(idx), ground_truth, predicted, f"{best_sim:.4f}", format_bool(ok)])
|
| 1058 |
-
|
| 1059 |
-
accuracy_c1 = correct / len(samples)
|
| 1060 |
-
wf1_c1 = f1_score(ground_truths, all_preds, average="weighted", zero_division=0)
|
| 1061 |
-
|
| 1062 |
-
print_table(
|
| 1063 |
-
f"{title_prefix} C1 image->texts (showing {min(num_printed, len(rows))}/{len(rows)} examples)",
|
| 1064 |
-
["#", "Ground Truth", "Predicted", "Best CosSim", "Result"],
|
| 1065 |
-
rows[:num_printed],
|
| 1066 |
-
)
|
| 1067 |
-
print(f"{title_prefix} C1 aggregate: {correct}/{len(samples)} correct (acc={accuracy_c1:.2%}, wF1={wf1_c1:.2%})")
|
| 1068 |
-
|
| 1069 |
-
return {
|
| 1070 |
-
"accuracy_c1": accuracy_c1,
|
| 1071 |
-
"wf1_c1": wf1_c1,
|
| 1072 |
-
"strategy": best_strategy_c1,
|
| 1073 |
-
"all_strategy_acc_c1": all_strategy_acc_c1,
|
| 1074 |
-
"all_strategy_wf1_c1": all_strategy_wf1_c1,
|
| 1075 |
-
}
|
| 1076 |
-
|
| 1077 |
|
| 1078 |
-
|
| 1079 |
-
baseline_model: CLIPModelTransformers,
|
| 1080 |
-
baseline_processor: CLIPProcessor,
|
| 1081 |
-
device: torch.device,
|
| 1082 |
-
samples: List[Tuple[Image.Image, str]],
|
| 1083 |
-
candidate_labels: List[str],
|
| 1084 |
-
title_prefix: str,
|
| 1085 |
-
num_printed: int,
|
| 1086 |
-
) -> Dict[str, Optional[float]]:
|
| 1087 |
-
if len(samples) == 0:
|
| 1088 |
-
print(f" Skipping baseline {title_prefix}: no valid samples")
|
| 1089 |
-
return {"accuracy_c1": None}
|
| 1090 |
-
|
| 1091 |
-
candidate_texts = [f"a photo of a {label}" for label in candidate_labels]
|
| 1092 |
-
text_inputs = baseline_processor(text=candidate_texts, return_tensors="pt", padding=True, truncation=True)
|
| 1093 |
-
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
|
| 1094 |
-
with torch.no_grad():
|
| 1095 |
-
text_embs = baseline_model.get_text_features(**text_inputs)
|
| 1096 |
-
text_embs = F.normalize(text_embs, dim=-1)
|
| 1097 |
-
|
| 1098 |
-
# Precompute image embeddings once for C1.
|
| 1099 |
-
image_embs: List[torch.Tensor] = []
|
| 1100 |
-
for image, _ in samples:
|
| 1101 |
-
image_inputs = baseline_processor(images=[image], return_tensors="pt")
|
| 1102 |
-
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
|
| 1103 |
-
with torch.no_grad():
|
| 1104 |
-
img_emb = baseline_model.get_image_features(**image_inputs)
|
| 1105 |
-
img_emb = F.normalize(img_emb, dim=-1)
|
| 1106 |
-
image_embs.append(img_emb.squeeze(0))
|
| 1107 |
-
image_embs_tensor = torch.stack(image_embs, dim=0)
|
| 1108 |
-
|
| 1109 |
-
# C1: image -> all texts (classification)
|
| 1110 |
-
rows: List[List[str]] = []
|
| 1111 |
-
correct = 0
|
| 1112 |
-
all_preds: List[str] = []
|
| 1113 |
-
ground_truths = [gt for _, gt in samples]
|
| 1114 |
-
|
| 1115 |
-
for idx, (_, ground_truth) in enumerate(samples):
|
| 1116 |
-
img_emb = image_embs_tensor[idx].unsqueeze(0)
|
| 1117 |
-
sims = F.cosine_similarity(img_emb, text_embs, dim=1)
|
| 1118 |
-
best_idx = sims.argmax().item()
|
| 1119 |
-
predicted = candidate_labels[best_idx]
|
| 1120 |
-
best_sim = sims[best_idx].item()
|
| 1121 |
-
|
| 1122 |
-
ok = predicted == ground_truth
|
| 1123 |
-
if ok:
|
| 1124 |
-
correct += 1
|
| 1125 |
-
all_preds.append(predicted)
|
| 1126 |
-
|
| 1127 |
-
rows.append([str(idx), ground_truth, predicted, f"{best_sim:.4f}", format_bool(ok)])
|
| 1128 |
-
|
| 1129 |
-
accuracy_c1 = correct / len(samples)
|
| 1130 |
-
wf1_c1 = f1_score(ground_truths, all_preds, average="weighted", zero_division=0)
|
| 1131 |
-
baseline_title = f"Baseline {title_prefix}"
|
| 1132 |
-
print_table(
|
| 1133 |
-
f"{baseline_title} C1 image->texts (showing {min(num_printed, len(rows))}/{len(rows)} examples)",
|
| 1134 |
-
["#", "Ground Truth", "Predicted", "Best CosSim", "Result"],
|
| 1135 |
-
rows[:num_printed],
|
| 1136 |
-
)
|
| 1137 |
-
print(f"{baseline_title} C1 aggregate: {correct}/{len(samples)} correct (acc={accuracy_c1:.2%}, wF1={wf1_c1:.2%})")
|
| 1138 |
|
| 1139 |
-
return {"accuracy_c1": accuracy_c1, "wf1_c1": wf1_c1}
|
| 1140 |
|
| 1141 |
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
""
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
return [], []
|
| 1149 |
-
df = pd.read_csv(FASHION_MNIST_CSV)
|
| 1150 |
-
df = df.sample(n=min(num_examples, len(df)), random_state=42).reset_index(drop=True)
|
| 1151 |
-
pixel_cols = [f"pixel{i}" for i in range(1, 785)]
|
| 1152 |
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
for _, row in df.iterrows():
|
| 1156 |
-
label_id = int(row["label"])
|
| 1157 |
-
pixels = row[pixel_cols].values.astype(float)
|
| 1158 |
-
img_array = pixels.reshape(28, 28).astype(np.uint8)
|
| 1159 |
-
img_array = np.stack([img_array] * 3, axis=-1)
|
| 1160 |
-
image = Image.fromarray(img_array)
|
| 1161 |
-
baseline_samples.append((image, FASHION_MNIST_ORIGINAL_LABELS.get(label_id, "unknown")))
|
| 1162 |
-
gap_samples.append((image, FASHION_MNIST_LABELS.get(label_id, "unknown")))
|
| 1163 |
-
return baseline_samples, gap_samples
|
| 1164 |
|
| 1165 |
|
| 1166 |
-
def
|
| 1167 |
num_examples: int,
|
| 1168 |
-
) -> Tuple[List[Tuple[Image.Image, str]], List[Tuple[Image.Image, str]]]:
|
| 1169 |
-
"""Return (baseline_samples, gap_samples)
|
| 1170 |
-
try:
|
| 1171 |
-
from datasets import load_dataset # type: ignore
|
| 1172 |
-
except Exception:
|
| 1173 |
-
print(" Skipping KAGL Marqo: datasets package not available")
|
| 1174 |
-
return [], []
|
| 1175 |
|
| 1176 |
-
|
| 1177 |
-
|
| 1178 |
-
|
| 1179 |
-
|
| 1180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1181 |
|
| 1182 |
-
dataset = dataset.shuffle(seed=42).select(range(min(num_examples, len(dataset))))
|
| 1183 |
baseline_samples: List[Tuple[Image.Image, str]] = []
|
| 1184 |
gap_samples: List[Tuple[Image.Image, str]] = []
|
| 1185 |
-
for item in dataset:
|
| 1186 |
-
raw_label = item.get("category2")
|
| 1187 |
-
if raw_label is None:
|
| 1188 |
-
continue
|
| 1189 |
-
native_label = str(raw_label).strip()
|
| 1190 |
-
gap_label = normalize_hierarchy_label(native_label)
|
| 1191 |
-
image_obj = item.get("image")
|
| 1192 |
-
if image_obj is None:
|
| 1193 |
-
continue
|
| 1194 |
-
if hasattr(image_obj, "convert"):
|
| 1195 |
-
image = image_obj.convert("RGB")
|
| 1196 |
-
elif isinstance(image_obj, dict) and "bytes" in image_obj:
|
| 1197 |
-
image = Image.open(BytesIO(image_obj["bytes"])).convert("RGB")
|
| 1198 |
-
else:
|
| 1199 |
-
continue
|
| 1200 |
-
baseline_samples.append((image, native_label))
|
| 1201 |
-
gap_samples.append((image, gap_label))
|
| 1202 |
-
return baseline_samples, gap_samples
|
| 1203 |
-
|
| 1204 |
-
|
| 1205 |
-
def load_internal_samples(
|
| 1206 |
-
num_examples: int,
|
| 1207 |
-
) -> Tuple[List[Tuple[Image.Image, str]], List[Tuple[Image.Image, str]]]:
|
| 1208 |
-
"""Return (baseline_samples, gap_samples) — same labels for both on this dataset."""
|
| 1209 |
-
csv_file = Path(INTERNAL_DATASET_CSV)
|
| 1210 |
-
if not csv_file.exists():
|
| 1211 |
-
print(f" Skipping internal dataset: {INTERNAL_DATASET_CSV} not found")
|
| 1212 |
-
return [], []
|
| 1213 |
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
print(" Skipping internal dataset: missing 'hierarchy' column")
|
| 1217 |
-
return [], []
|
| 1218 |
-
|
| 1219 |
-
df = df.dropna(subset=["hierarchy", "image_url"]).sample(frac=1.0, random_state=42)
|
| 1220 |
-
samples: List[Tuple[Image.Image, str]] = []
|
| 1221 |
-
|
| 1222 |
-
for _, row in df.iterrows():
|
| 1223 |
-
if len(samples) >= num_examples:
|
| 1224 |
break
|
| 1225 |
-
|
| 1226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1227 |
try:
|
| 1228 |
-
|
| 1229 |
-
response.raise_for_status()
|
| 1230 |
-
image = Image.open(BytesIO(response.content)).convert("RGB")
|
| 1231 |
-
samples.append((image, ground_truth))
|
| 1232 |
except Exception:
|
| 1233 |
continue
|
| 1234 |
-
return samples, samples
|
| 1235 |
-
|
| 1236 |
-
|
| 1237 |
-
def run_test_c_baseline_fashion_clip(
|
| 1238 |
-
device: torch.device,
|
| 1239 |
-
num_examples: int,
|
| 1240 |
-
num_printed: int,
|
| 1241 |
-
csv_path: str = FASHION_MNIST_CSV,
|
| 1242 |
-
) -> Dict[str, Optional[float]]:
|
| 1243 |
-
"""
|
| 1244 |
-
Same zero-shot protocol as Test C, but using baseline Fashion-CLIP.
|
| 1245 |
-
"""
|
| 1246 |
-
csv_file = Path(csv_path)
|
| 1247 |
-
if not csv_file.exists():
|
| 1248 |
-
print(f" Skipping Baseline Test C: {csv_path} not found")
|
| 1249 |
-
return {"accuracy": None}
|
| 1250 |
-
|
| 1251 |
-
print("\nLoading baseline model (patrickjohncyh/fashion-clip)...")
|
| 1252 |
-
baseline_name = "patrickjohncyh/fashion-clip"
|
| 1253 |
-
baseline_processor = CLIPProcessor.from_pretrained(baseline_name)
|
| 1254 |
-
baseline_model = CLIPModelTransformers.from_pretrained(baseline_name).to(device)
|
| 1255 |
-
baseline_model.eval()
|
| 1256 |
-
print("Baseline model loaded.")
|
| 1257 |
|
| 1258 |
-
|
| 1259 |
-
|
| 1260 |
-
|
| 1261 |
-
|
| 1262 |
-
|
| 1263 |
-
|
| 1264 |
-
text_inputs = baseline_processor(text=candidate_texts, return_tensors="pt", padding=True, truncation=True)
|
| 1265 |
-
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
|
| 1266 |
-
with torch.no_grad():
|
| 1267 |
-
text_embs = baseline_model.get_text_features(**text_inputs)
|
| 1268 |
-
text_embs = F.normalize(text_embs, dim=-1)
|
| 1269 |
-
|
| 1270 |
-
pixel_cols = [f"pixel{i}" for i in range(1, 785)]
|
| 1271 |
-
rows: List[List[str]] = []
|
| 1272 |
-
failed_rows: List[List[str]] = []
|
| 1273 |
-
correct = 0
|
| 1274 |
-
|
| 1275 |
-
for idx in range(len(df)):
|
| 1276 |
-
row = df.iloc[idx]
|
| 1277 |
-
label_id = int(row["label"])
|
| 1278 |
-
ground_truth = FASHION_MNIST_ORIGINAL_LABELS.get(label_id, "unknown")
|
| 1279 |
-
|
| 1280 |
-
pixels = row[pixel_cols].values.astype(float)
|
| 1281 |
-
img_array = pixels.reshape(28, 28).astype(np.uint8)
|
| 1282 |
-
img_array = np.stack([img_array] * 3, axis=-1)
|
| 1283 |
-
image = Image.fromarray(img_array)
|
| 1284 |
-
|
| 1285 |
-
image_inputs = baseline_processor(images=[image], return_tensors="pt")
|
| 1286 |
-
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
|
| 1287 |
-
with torch.no_grad():
|
| 1288 |
-
img_emb = baseline_model.get_image_features(**image_inputs)
|
| 1289 |
-
img_emb = F.normalize(img_emb, dim=-1)
|
| 1290 |
-
|
| 1291 |
-
sims = F.cosine_similarity(img_emb, text_embs, dim=1)
|
| 1292 |
-
best_idx = sims.argmax().item()
|
| 1293 |
-
predicted = candidate_labels[best_idx]
|
| 1294 |
-
best_sim = sims[best_idx].item()
|
| 1295 |
-
|
| 1296 |
-
ok = predicted == ground_truth
|
| 1297 |
-
if ok:
|
| 1298 |
-
correct += 1
|
| 1299 |
-
|
| 1300 |
-
rows.append([str(idx), ground_truth, predicted, f"{best_sim:.4f}", format_bool(ok)])
|
| 1301 |
-
if not ok:
|
| 1302 |
-
failed_rows.append([str(idx), ground_truth, predicted, f"{best_sim:.4f}"])
|
| 1303 |
-
|
| 1304 |
-
accuracy = correct / len(df)
|
| 1305 |
-
|
| 1306 |
-
print_table(
|
| 1307 |
-
f"Baseline Test C (Fashion-CLIP): zero-shot (showing {min(num_printed, len(rows))}/{len(rows)} examples)",
|
| 1308 |
-
["#", "Ground Truth", "Predicted", "Best CosSim", "Result"],
|
| 1309 |
-
rows[:num_printed],
|
| 1310 |
-
)
|
| 1311 |
-
print(f"Baseline Test C aggregate: {correct}/{len(df)} correct ({accuracy:.2%})")
|
| 1312 |
-
|
| 1313 |
-
return {"accuracy": accuracy}
|
| 1314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1315 |
|
| 1316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1317 |
random.seed(42)
|
| 1318 |
cfg = resolve_runtime_config()
|
| 1319 |
-
model_path = Path(cfg.main_model_path)
|
| 1320 |
-
if not model_path.exists():
|
| 1321 |
-
raise FileNotFoundError(f"Main model checkpoint not found: {cfg.main_model_path}")
|
| 1322 |
|
| 1323 |
-
|
| 1324 |
-
|
| 1325 |
-
|
| 1326 |
-
|
| 1327 |
-
|
| 1328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1329 |
|
| 1330 |
result_a: Optional[Dict[str, object]] = None
|
| 1331 |
result_b: Optional[Dict[str, object]] = None
|
| 1332 |
-
|
| 1333 |
baseline_result_a: Optional[Dict[str, object]] = None
|
| 1334 |
baseline_result_b: Optional[Dict[str, object]] = None
|
| 1335 |
-
|
| 1336 |
-
|
| 1337 |
-
baseline_processor
|
| 1338 |
-
|
| 1339 |
-
|
| 1340 |
-
|
| 1341 |
-
|
| 1342 |
-
|
| 1343 |
-
|
| 1344 |
-
|
| 1345 |
-
print("Baseline model loaded.")
|
| 1346 |
|
| 1347 |
if "A" in selected_tests:
|
| 1348 |
result_a = run_test_a(
|
|
@@ -1378,8 +1182,8 @@ def main(selected_tests: set[str]) -> None:
|
|
| 1378 |
num_printed=DEFAULT_NUM_PRINTED,
|
| 1379 |
test_name="Baseline Test B",
|
| 1380 |
)
|
| 1381 |
-
if "
|
| 1382 |
-
|
| 1383 |
model,
|
| 1384 |
processor,
|
| 1385 |
cfg,
|
|
@@ -1387,83 +1191,59 @@ def main(selected_tests: set[str]) -> None:
|
|
| 1387 |
num_printed=DEFAULT_NUM_PRINTED,
|
| 1388 |
)
|
| 1389 |
if baseline_model is not None and baseline_processor is not None:
|
| 1390 |
-
|
| 1391 |
baseline_model,
|
| 1392 |
baseline_processor,
|
| 1393 |
cfg,
|
| 1394 |
num_examples=DEFAULT_NUM_EXAMPLES,
|
| 1395 |
num_printed=DEFAULT_NUM_PRINTED,
|
| 1396 |
-
test_name="Baseline Test
|
| 1397 |
)
|
| 1398 |
|
| 1399 |
-
|
| 1400 |
-
gap_fixed_c1: Dict[str, Optional[float]] = {}
|
| 1401 |
-
gap_fixed_wf1_c1: Dict[str, Optional[float]] = {}
|
| 1402 |
-
gap_best_wf1_c1: Dict[str, Optional[float]] = {}
|
| 1403 |
-
gap_best_strategy: Dict[str, Optional[str]] = {}
|
| 1404 |
-
base_fixed_c1: Dict[str, Optional[float]] = {}
|
| 1405 |
-
base_fixed_wf1_c1: Dict[str, Optional[float]] = {}
|
| 1406 |
-
|
| 1407 |
-
if "C" in selected_tests:
|
| 1408 |
assert baseline_model is not None and baseline_processor is not None
|
| 1409 |
|
| 1410 |
-
|
| 1411 |
-
|
| 1412 |
-
|
| 1413 |
-
|
| 1414 |
-
|
| 1415 |
-
|
| 1416 |
-
|
| 1417 |
-
|
| 1418 |
-
"KAGL Marqo":
|
| 1419 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1420 |
}
|
| 1421 |
-
for dataset_name, (baseline_samples, gap_samples) in datasets_for_c.items():
|
| 1422 |
-
print(f"\n{'=' * 120}")
|
| 1423 |
-
print(f"Test C on {dataset_name}")
|
| 1424 |
-
print(f"{'=' * 120}")
|
| 1425 |
-
print(f"Valid samples: {len(baseline_samples)} (baseline), {len(gap_samples)} (GAP-CLIP)")
|
| 1426 |
-
|
| 1427 |
-
# Baseline uses dataset-native labels (matches published benchmarks).
|
| 1428 |
-
baseline_candidate_labels = sorted(set(label for _, label in baseline_samples))
|
| 1429 |
-
# GAP-CLIP uses its training-vocabulary labels.
|
| 1430 |
-
gap_candidate_labels = sorted(set(label for _, label in gap_samples))
|
| 1431 |
-
print(f"Baseline candidate labels ({len(baseline_candidate_labels)}): {baseline_candidate_labels}")
|
| 1432 |
-
print(f"GAP-CLIP candidate labels ({len(gap_candidate_labels)}): {gap_candidate_labels}")
|
| 1433 |
-
|
| 1434 |
-
# GAP-CLIP: full strategy search with its own label vocabulary.
|
| 1435 |
-
gap_metrics = evaluate_zero_shot_gap(
|
| 1436 |
-
model=model,
|
| 1437 |
-
processor=processor,
|
| 1438 |
-
device=cfg.device,
|
| 1439 |
-
samples=gap_samples,
|
| 1440 |
-
candidate_labels=gap_candidate_labels,
|
| 1441 |
-
title_prefix=f"Test C ({dataset_name}) GAP-CLIP",
|
| 1442 |
-
num_printed=DEFAULT_NUM_PRINTED,
|
| 1443 |
-
color_emb_dim=cfg.color_emb_dim,
|
| 1444 |
-
hierarchy_emb_dim=cfg.hierarchy_emb_dim,
|
| 1445 |
-
hierarchy_model=hierarchy_model_eval,
|
| 1446 |
-
)
|
| 1447 |
-
|
| 1448 |
-
# Baseline: single_prompt with native labels.
|
| 1449 |
-
baseline_metrics = evaluate_zero_shot_baseline(
|
| 1450 |
-
baseline_model=baseline_model,
|
| 1451 |
-
baseline_processor=baseline_processor,
|
| 1452 |
-
device=cfg.device,
|
| 1453 |
-
samples=baseline_samples,
|
| 1454 |
-
candidate_labels=baseline_candidate_labels,
|
| 1455 |
-
title_prefix=f"Test C ({dataset_name}) Baseline",
|
| 1456 |
-
num_printed=DEFAULT_NUM_PRINTED,
|
| 1457 |
-
)
|
| 1458 |
-
|
| 1459 |
-
# Store results.
|
| 1460 |
-
gap_fixed_c1[dataset_name] = gap_metrics.get("all_strategy_acc_c1", {}).get("single_prompt")
|
| 1461 |
-
gap_fixed_wf1_c1[dataset_name] = gap_metrics.get("all_strategy_wf1_c1", {}).get("single_prompt")
|
| 1462 |
-
gap_best_wf1_c1[dataset_name] = gap_metrics.get("wf1_c1")
|
| 1463 |
-
gap_best_strategy[dataset_name] = gap_metrics.get("strategy")
|
| 1464 |
-
base_fixed_c1[dataset_name] = baseline_metrics["accuracy_c1"]
|
| 1465 |
-
base_fixed_wf1_c1[dataset_name] = baseline_metrics.get("wf1_c1")
|
| 1466 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1467 |
print("\n" + "=" * 120)
|
| 1468 |
print("Final Summary")
|
| 1469 |
print("=" * 120)
|
|
@@ -1478,70 +1258,37 @@ def main(selected_tests: set[str]) -> None:
|
|
| 1478 |
print(f"Test B full512 accuracy: {float(result_b['accuracy_full512']):.2%}")
|
| 1479 |
if baseline_result_b is not None:
|
| 1480 |
print(f"Baseline Test B full512 accuracy: {float(baseline_result_b['accuracy_full512']):.2%}")
|
| 1481 |
-
if
|
| 1482 |
-
|
| 1483 |
-
|
| 1484 |
-
|
| 1485 |
-
|
| 1486 |
-
|
| 1487 |
-
|
| 1488 |
-
|
| 1489 |
-
|
| 1490 |
-
print("\n" + "-" * 120)
|
| 1491 |
-
print("Test C — Fair comparison (weighted F1): single_prompt for both models")
|
| 1492 |
-
print("-" * 120)
|
| 1493 |
-
fair_rows: List[List[str]] = []
|
| 1494 |
-
for ds in ["Fashion-MNIST", "KAGL Marqo", "Internal dataset"]:
|
| 1495 |
-
fair_rows.append([
|
| 1496 |
-
ds,
|
| 1497 |
-
_pct(gap_fixed_wf1_c1.get(ds)), _pct(base_fixed_wf1_c1.get(ds)),
|
| 1498 |
-
_gain(gap_fixed_wf1_c1.get(ds), base_fixed_wf1_c1.get(ds)),
|
| 1499 |
-
])
|
| 1500 |
-
print_table(
|
| 1501 |
-
"Test C: single_prompt weighted F1 for both",
|
| 1502 |
-
["Dataset", "GAP-CLIP (wF1)", "Baseline (wF1)", "Gain"],
|
| 1503 |
-
fair_rows,
|
| 1504 |
-
)
|
| 1505 |
-
|
| 1506 |
-
print("\n" + "-" * 120)
|
| 1507 |
-
print("Test C — GAP-CLIP best strategy vs Baseline")
|
| 1508 |
-
print("-" * 120)
|
| 1509 |
-
best_rows: List[List[str]] = []
|
| 1510 |
-
for ds in ["Fashion-MNIST", "KAGL Marqo", "Internal dataset"]:
|
| 1511 |
-
strat = gap_best_strategy.get(ds) or "N/A"
|
| 1512 |
-
best_rows.append([
|
| 1513 |
-
ds,
|
| 1514 |
-
f"{strat}",
|
| 1515 |
-
_pct(gap_best_wf1_c1.get(ds)), _pct(base_fixed_wf1_c1.get(ds)),
|
| 1516 |
-
_gain(gap_best_wf1_c1.get(ds), base_fixed_wf1_c1.get(ds)),
|
| 1517 |
-
])
|
| 1518 |
-
print_table(
|
| 1519 |
-
"Test C: GAP-CLIP best strategy vs Baseline single_prompt",
|
| 1520 |
-
["Dataset", "GAP-CLIP strategy", "GAP-CLIP (wF1)", "Baseline (wF1)", "Gain"],
|
| 1521 |
-
best_rows,
|
| 1522 |
-
)
|
| 1523 |
-
|
| 1524 |
-
if result_d is not None:
|
| 1525 |
-
print(f"Test D overall: {format_bool(bool(result_d['overall']))}")
|
| 1526 |
-
print(f" pass rate: {float(result_d['pass_rate']):.2%}")
|
| 1527 |
-
print(f" avg color_match={float(result_d['avg_color_match']):.4f} vs cross={float(result_d['avg_color_cross']):.4f}")
|
| 1528 |
-
print(f" avg hier_match={float(result_d['avg_hier_match']):.4f} vs cross={float(result_d['avg_hier_cross']):.4f}")
|
| 1529 |
-
if baseline_result_d is not None:
|
| 1530 |
-
print(f"Baseline Test D overall: {format_bool(bool(baseline_result_d['overall']))}")
|
| 1531 |
-
print(f" baseline pass rate: {float(baseline_result_d['pass_rate']):.2%}")
|
| 1532 |
|
| 1533 |
if result_a is not None:
|
| 1534 |
-
assert
|
|
|
|
|
|
|
| 1535 |
if result_b is not None:
|
| 1536 |
-
assert
|
| 1537 |
-
|
| 1538 |
-
|
| 1539 |
-
|
|
|
|
|
|
|
| 1540 |
)
|
| 1541 |
|
| 1542 |
print("\nAll embedding-structure tests passed.")
|
| 1543 |
|
| 1544 |
|
| 1545 |
if __name__ == "__main__":
|
| 1546 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1547 |
main(selected_tests)
|
|
|
|
| 21 |
for items sharing a color but differing in category.
|
| 22 |
Expected result: 1000/1000 pass.
|
| 23 |
|
| 24 |
+
Test C — Subspace Decomposition Consistency:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
Encode a full description (e.g. "red dress in cotton"), a standalone color
|
| 26 |
("red"), and a standalone hierarchy ("dress"). Verify that:
|
| 27 |
- The color subspace (first 16D) of the full embedding is more similar
|
|
|
|
| 30 |
similar to the hierarchy-only embedding than to the color-only embedding.
|
| 31 |
Expected result: 1000/1000 pass.
|
| 32 |
|
| 33 |
+
Test D — Zero-shot image-to-text classification:
|
| 34 |
+
Each image is used as a query; the highest-scoring text label (cosine in
|
| 35 |
+
shared latent space) is the predicted class. Accuracy is computed across
|
| 36 |
+
three datasets (Fashion-MNIST, KAGL Marqo, Internal).
|
| 37 |
+
|
| 38 |
Paper reference: Section 5.3.6 and Table 4.
|
| 39 |
|
| 40 |
Run directly:
|
|
|
|
| 51 |
from dataclasses import dataclass
|
| 52 |
from pathlib import Path
|
| 53 |
import random
|
| 54 |
+
import sys
|
| 55 |
+
|
| 56 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 57 |
from typing import Dict, List, Optional, Sequence, Tuple
|
| 58 |
|
| 59 |
import numpy as np
|
|
|
|
| 65 |
from io import BytesIO
|
| 66 |
from PIL import Image
|
| 67 |
from torchvision import transforms
|
| 68 |
+
from torchvision import datasets
|
| 69 |
+
from torch.utils.data import DataLoader
|
| 70 |
+
from tqdm import tqdm
|
| 71 |
from transformers import CLIPModel as CLIPModelTransformers
|
| 72 |
from transformers import CLIPProcessor
|
| 73 |
|
| 74 |
+
from training.hierarchy_model import HierarchyExtractor
|
| 75 |
+
|
| 76 |
+
try:
|
| 77 |
+
import config as project_config # type: ignore
|
| 78 |
+
except Exception:
|
| 79 |
+
project_config = None
|
| 80 |
+
|
| 81 |
+
DEFAULT_COLOR_EMB_DIM = getattr(project_config, "color_emb_dim", 16)
|
| 82 |
+
DEFAULT_HIERARCHY_EMB_DIM = getattr(project_config, "hierarchy_emb_dim", 64)
|
| 83 |
+
DEFAULT_MAIN_EMB_DIM = getattr(project_config, "main_emb_dim", 512)
|
| 84 |
+
DEFAULT_MAIN_MODEL_PATH = getattr(project_config, "main_model_path", "models/gap_clip.pth")
|
| 85 |
+
DEFAULT_DEVICE = getattr(project_config, "device", torch.device("cpu"))
|
| 86 |
+
|
| 87 |
+
_HIERARCHY_EXTRACTOR = HierarchyExtractor([
|
| 88 |
+
"accessories", "bodysuits", "bras", "coat", "dress", "jacket",
|
| 89 |
+
"legging", "pant", "polo", "shirt", "shoes", "short", "skirt",
|
| 90 |
+
"socks", "sweater", "swimwear", "top", "underwear",
|
| 91 |
+
], verbose=False)
|
| 92 |
+
|
| 93 |
|
| 94 |
@dataclass
|
| 95 |
class RuntimeConfig:
|
| 96 |
+
color_emb_dim: int = DEFAULT_COLOR_EMB_DIM
|
| 97 |
+
hierarchy_emb_dim: int = DEFAULT_HIERARCHY_EMB_DIM
|
| 98 |
+
main_emb_dim: int = DEFAULT_MAIN_EMB_DIM
|
| 99 |
+
main_model_path: str = DEFAULT_MAIN_MODEL_PATH
|
| 100 |
+
device: torch.device = DEFAULT_DEVICE
|
| 101 |
|
| 102 |
DEFAULT_NUM_EXAMPLES = 10000
|
| 103 |
DEFAULT_NUM_PRINTED = 3
|
|
|
|
| 132 |
|
| 133 |
cfg.color_emb_dim = getattr(config, "color_emb_dim", cfg.color_emb_dim)
|
| 134 |
cfg.hierarchy_emb_dim = getattr(config, "hierarchy_emb_dim", cfg.hierarchy_emb_dim)
|
| 135 |
+
cfg.main_emb_dim = getattr(config, "main_emb_dim", cfg.main_emb_dim)
|
| 136 |
cfg.main_model_path = getattr(config, "main_model_path", cfg.main_model_path)
|
| 137 |
cfg.device = getattr(config, "device", cfg.device)
|
| 138 |
except Exception:
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
def load_main_model(device: torch.device, main_model_path: str) -> Tuple[CLIPModelTransformers, CLIPProcessor]:
|
| 150 |
+
"""Load GAP-CLIP from local checkpoint path only."""
|
| 151 |
+
model_path = Path(main_model_path)
|
| 152 |
+
if not model_path.exists():
|
| 153 |
+
raise FileNotFoundError(f"Main model checkpoint not found: {main_model_path}")
|
| 154 |
+
|
| 155 |
+
clip_name = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
|
| 156 |
+
model = CLIPModelTransformers.from_pretrained(clip_name)
|
| 157 |
+
checkpoint = torch.load(str(model_path), map_location=device)
|
| 158 |
+
if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
|
| 159 |
+
model.load_state_dict(checkpoint["model_state_dict"], strict=False)
|
| 160 |
+
else:
|
| 161 |
+
model.load_state_dict(checkpoint, strict=False)
|
| 162 |
+
model = model.to(device)
|
| 163 |
+
model.eval()
|
| 164 |
+
processor = CLIPProcessor.from_pretrained(clip_name)
|
| 165 |
+
return model, processor
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def encode_text(model, processor, text_queries, device):
|
| 169 |
+
"""Encode text queries into embeddings (unnormalized)."""
|
| 170 |
+
if isinstance(text_queries, str):
|
| 171 |
+
text_queries = [text_queries]
|
| 172 |
+
inputs = processor(text=text_queries, return_tensors="pt", padding=True, truncation=True)
|
| 173 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 174 |
+
with torch.no_grad():
|
| 175 |
+
text_features = model.get_text_features(**inputs)
|
| 176 |
+
return text_features
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
+
def encode_image(model, processor, images, device):
|
| 180 |
+
"""Encode images into embeddings (unnormalized)."""
|
| 181 |
+
if not isinstance(images, list):
|
| 182 |
+
images = [images]
|
| 183 |
+
inputs = processor(images=images, return_tensors="pt")
|
| 184 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 185 |
with torch.no_grad():
|
| 186 |
+
image_features = model.get_image_features(**inputs)
|
| 187 |
+
return image_features
|
| 188 |
+
|
| 189 |
|
| 190 |
+
def get_text_embedding(
|
| 191 |
+
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, text: str) -> torch.Tensor:
|
| 192 |
+
"""Normalized single text embedding (shape: [512])."""
|
| 193 |
+
return F.normalize(encode_text(model, processor, text, device), dim=-1).squeeze(0)
|
| 194 |
|
| 195 |
|
| 196 |
def cosine(a: torch.Tensor, b: torch.Tensor) -> float:
|
|
|
|
| 229 |
cfg: RuntimeConfig,
|
| 230 |
num_examples: int,
|
| 231 |
num_printed: int,
|
| 232 |
+
test_name: str = "Test A") -> Dict[str, bool]:
|
|
|
|
| 233 |
"""
|
| 234 |
A: different colors + same hierarchy.
|
| 235 |
Expect hierarchy subspace to be more similar than color subspace.
|
|
|
|
| 341 |
cfg: RuntimeConfig,
|
| 342 |
num_examples: int,
|
| 343 |
num_printed: int,
|
| 344 |
+
test_name: str = "Test B",) -> Dict[str, bool]:
|
|
|
|
| 345 |
"""
|
| 346 |
B: same color + different hierarchies.
|
| 347 |
Expect similarity in first16 (color) to be higher than full512.
|
|
|
|
| 446 |
|
| 447 |
|
| 448 |
|
| 449 |
+
def run_test_c(
|
| 450 |
model: CLIPModelTransformers,
|
| 451 |
processor: CLIPProcessor,
|
| 452 |
cfg: RuntimeConfig,
|
| 453 |
num_examples: int,
|
| 454 |
num_printed: int,
|
| 455 |
+
test_name: str = "Test C",) -> Dict[str, object]:
|
|
|
|
| 456 |
"""
|
| 457 |
+
C: Subspace Decomposition Consistency.
|
| 458 |
Encode a full description (e.g. "red dress in cotton"), a standalone color
|
| 459 |
("red"), and a standalone hierarchy ("dress"). Then verify:
|
| 460 |
- The color subspace (first 16D) of the full embedding aligns with the
|
|
|
|
| 615 |
def get_image_embedding(
|
| 616 |
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, image_tensor: torch.Tensor
|
| 617 |
) -> torch.Tensor:
|
| 618 |
+
"""Normalized image embedding from a preprocessed tensor (shape: [512])."""
|
| 619 |
image_tensor = image_tensor.unsqueeze(0).to(device)
|
| 620 |
+
# Convert tensor to PIL for encode_image
|
| 621 |
+
from torchvision.transforms.functional import to_pil_image
|
| 622 |
+
pil_img = to_pil_image(image_tensor.squeeze(0).cpu())
|
| 623 |
+
return F.normalize(encode_image(model, processor, pil_img, device), dim=-1).squeeze(0)
|
|
|
|
| 624 |
|
| 625 |
|
| 626 |
def get_image_embedding_from_pil(
|
| 627 |
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, image: Image.Image
|
| 628 |
) -> torch.Tensor:
|
| 629 |
+
"""Normalized image embedding from a PIL image (shape: [512])."""
|
| 630 |
+
return F.normalize(encode_image(model, processor, image, device), dim=-1).squeeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
|
| 632 |
|
| 633 |
def get_text_embeddings_batch(
|
| 634 |
model: CLIPModelTransformers, processor: CLIPProcessor, device: torch.device, texts: List[str]
|
| 635 |
) -> torch.Tensor:
|
| 636 |
+
"""Normalized text embeddings for a batch (shape: [N, 512])."""
|
| 637 |
+
return F.normalize(encode_text(model, processor, texts, device), dim=-1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
|
| 640 |
def get_prompt_ensembled_text_embeddings(
|
|
|
|
| 715 |
return probs, recommended_weight
|
| 716 |
|
| 717 |
|
| 718 |
+
def zero_shot_fashion_mnist(
|
| 719 |
+
model,
|
| 720 |
+
processor,
|
| 721 |
+
device,
|
| 722 |
+
batch_size: int = 64,
|
| 723 |
+
data_root: str = "./data") -> float:
|
| 724 |
+
"""Notebook-equivalent zero-shot accuracy on all Fashion-MNIST test samples."""
|
| 725 |
+
dataset = datasets.FashionMNIST(
|
| 726 |
+
root=data_root, train=False, download=True,
|
| 727 |
+
transform=transforms.Grayscale(num_output_channels=3),
|
| 728 |
+
)
|
| 729 |
+
loader = DataLoader(
|
| 730 |
+
dataset, batch_size=batch_size, shuffle=False,
|
| 731 |
+
collate_fn=lambda batch: (
|
| 732 |
+
[item[0] for item in batch],
|
| 733 |
+
torch.tensor([item[1] for item in batch]),
|
| 734 |
+
),
|
| 735 |
+
)
|
|
|
|
|
|
|
| 736 |
|
| 737 |
+
prompts = [f"a photo of a {label}" for label in dataset.classes]
|
| 738 |
+
text_embs = encode_text(model, processor, prompts, device).to(device).float()
|
| 739 |
+
text_embs = F.normalize(text_embs, dim=-1)
|
| 740 |
|
|
|
|
|
|
|
|
|
|
| 741 |
correct = 0
|
| 742 |
+
total = 0
|
| 743 |
|
| 744 |
+
for pil_images, labels in tqdm(loader, desc="Zero-shot Fashion-MNIST"):
|
| 745 |
+
img_embs = encode_image(model, processor, pil_images, device)
|
| 746 |
+
img_embs = img_embs.to(device).float()
|
| 747 |
+
img_embs = F.normalize(img_embs, dim=-1)
|
| 748 |
|
| 749 |
+
sim = img_embs @ text_embs.T
|
| 750 |
+
preds = sim.argmax(dim=-1).cpu()
|
|
|
|
| 751 |
|
| 752 |
+
correct += (preds == labels).sum().item()
|
| 753 |
+
total += labels.size(0)
|
|
|
|
|
|
|
| 754 |
|
| 755 |
+
accuracy = correct / total
|
| 756 |
+
print(f"Zero-shot accuracy on Fashion MNIST: {accuracy:.4f} ({correct}/{total})")
|
| 757 |
+
return accuracy
|
| 758 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
|
|
|
|
| 760 |
|
| 761 |
+
def zero_shot_kagl(
|
| 762 |
+
model,
|
| 763 |
+
processor,
|
| 764 |
+
device,
|
| 765 |
+
batch_size: int = 64,
|
| 766 |
+
num_examples: int = 10000,
|
| 767 |
+
) -> Optional[Dict[str, float]]:
|
| 768 |
+
"""Notebook-equivalent zero-shot accuracy/F1 on KAGL Marqo (category2)."""
|
| 769 |
+
try:
|
| 770 |
+
from datasets import load_dataset # type: ignore
|
| 771 |
+
except Exception:
|
| 772 |
+
print("Skipping zero_shot_kagl: datasets package not available")
|
| 773 |
+
return None
|
| 774 |
+
|
| 775 |
+
try:
|
| 776 |
+
dataset = load_dataset("Marqo/KAGL", split="data")
|
| 777 |
+
except Exception as exc:
|
| 778 |
+
print(f"Skipping zero_shot_kagl: failed to load dataset ({exc})")
|
| 779 |
+
return None
|
| 780 |
+
|
| 781 |
+
dataset = dataset.shuffle(seed=42).select(range(min(num_examples, len(dataset))))
|
| 782 |
+
|
| 783 |
+
pil_images: List[Image.Image] = []
|
| 784 |
+
labels_text: List[str] = []
|
| 785 |
+
for item in dataset:
|
| 786 |
+
raw_label = item.get("category2")
|
| 787 |
+
image_obj = item.get("image")
|
| 788 |
+
if raw_label is None or image_obj is None:
|
| 789 |
+
continue
|
| 790 |
+
|
| 791 |
+
if hasattr(image_obj, "convert"):
|
| 792 |
+
image = image_obj.convert("RGB")
|
| 793 |
+
elif isinstance(image_obj, dict) and "bytes" in image_obj:
|
| 794 |
+
image = Image.open(BytesIO(image_obj["bytes"])).convert("RGB")
|
| 795 |
+
else:
|
| 796 |
+
continue
|
| 797 |
+
pil_images.append(image)
|
| 798 |
+
labels_text.append(str(raw_label).strip())
|
| 799 |
+
|
| 800 |
+
if not pil_images:
|
| 801 |
+
print("Skipping zero_shot_kagl: no valid samples")
|
| 802 |
+
return None
|
| 803 |
|
| 804 |
+
candidate_labels = sorted(set(labels_text))
|
| 805 |
+
label_to_idx = {label: idx for idx, label in enumerate(candidate_labels)}
|
| 806 |
+
all_labels = np.array([label_to_idx[label] for label in labels_text], dtype=np.int64)
|
| 807 |
+
|
| 808 |
+
prompts = [f"a photo of a {label}" for label in candidate_labels]
|
| 809 |
+
text_embs = encode_text(model, processor, prompts, device).to(device).float()
|
| 810 |
+
text_embs = F.normalize(text_embs, dim=-1)
|
| 811 |
+
|
| 812 |
+
all_preds: List[np.ndarray] = []
|
| 813 |
+
for start in tqdm(range(0, len(pil_images), batch_size), desc="Zero-shot KAGL"):
|
| 814 |
+
batch_images = pil_images[start : start + batch_size]
|
| 815 |
+
img_embs = encode_image(model, processor, batch_images, device).to(device).float()
|
| 816 |
+
img_embs = F.normalize(img_embs, dim=-1)
|
| 817 |
+
sim = img_embs @ text_embs.T
|
| 818 |
+
preds = sim.argmax(dim=-1).cpu().numpy()
|
| 819 |
+
all_preds.append(preds)
|
| 820 |
+
|
| 821 |
+
pred_array = np.concatenate(all_preds, axis=0) if all_preds else np.array([], dtype=np.int64)
|
| 822 |
+
accuracy = float((pred_array == all_labels).mean()) if len(all_labels) else 0.0
|
| 823 |
+
weighted_f1 = f1_score(all_labels, pred_array, average="weighted") if len(all_labels) else 0.0
|
| 824 |
+
print(f"KAGL accuracy: {accuracy:.4f}")
|
| 825 |
+
print(f"KAGL weighted macro F1: {weighted_f1:.4f}")
|
| 826 |
+
return {"accuracy": accuracy, "weighted_f1": float(weighted_f1)}
|
| 827 |
+
|
| 828 |
+
|
| 829 |
+
def zero_shot_internal(
|
| 830 |
+
model,
|
| 831 |
+
processor,
|
| 832 |
+
device,
|
| 833 |
+
batch_size: int = 64,
|
| 834 |
+
num_examples: int = 10000,
|
| 835 |
+
csv_path: str = INTERNAL_DATASET_CSV,) -> Optional[Dict[str, float]]:
|
| 836 |
+
"""Notebook-equivalent zero-shot accuracy/F1 on internal dataset."""
|
| 837 |
+
csv_file = Path(csv_path)
|
| 838 |
+
if not csv_file.exists():
|
| 839 |
+
print(f"Skipping zero_shot_internal: {csv_path} not found")
|
| 840 |
+
return None
|
| 841 |
+
|
| 842 |
+
df = pd.read_csv(csv_file)
|
| 843 |
+
use_local = "local_image_path" in df.columns
|
| 844 |
+
required_cols = {"hierarchy", "local_image_path"} if use_local else {"hierarchy", "image_url"}
|
| 845 |
+
if not required_cols.issubset(df.columns):
|
| 846 |
+
print(f"Skipping zero_shot_internal: missing required columns {required_cols}")
|
| 847 |
+
return None
|
| 848 |
+
|
| 849 |
+
img_col = "local_image_path" if use_local else "image_url"
|
| 850 |
+
df = df.dropna(subset=["hierarchy", img_col]).sample(frac=1.0, random_state=42)
|
| 851 |
+
pil_images: List[Image.Image] = []
|
| 852 |
+
labels_text: List[str] = []
|
| 853 |
+
for _, row in df.iterrows():
|
| 854 |
+
if len(pil_images) >= num_examples:
|
| 855 |
+
break
|
| 856 |
+
try:
|
| 857 |
+
if use_local:
|
| 858 |
+
img_path = Path(str(row["local_image_path"]))
|
| 859 |
+
if not img_path.exists():
|
| 860 |
+
continue
|
| 861 |
+
image = Image.open(img_path).convert("RGB")
|
| 862 |
+
else:
|
| 863 |
+
response = requests.get(str(row["image_url"]), timeout=5)
|
| 864 |
+
response.raise_for_status()
|
| 865 |
+
image = Image.open(BytesIO(response.content)).convert("RGB")
|
| 866 |
+
except Exception:
|
| 867 |
+
continue
|
| 868 |
+
label = normalize_hierarchy_label(str(row["hierarchy"]))
|
| 869 |
+
pil_images.append(image)
|
| 870 |
+
labels_text.append(label)
|
| 871 |
+
|
| 872 |
+
if not pil_images:
|
| 873 |
+
print("Skipping zero_shot_internal: no valid samples")
|
| 874 |
+
return None
|
| 875 |
+
|
| 876 |
+
candidate_labels = sorted(set(labels_text))
|
| 877 |
+
label_to_idx = {label: idx for idx, label in enumerate(candidate_labels)}
|
| 878 |
+
all_labels = np.array([label_to_idx[label] for label in labels_text], dtype=np.int64)
|
| 879 |
+
|
| 880 |
+
prompts = [f"a photo of a {label}" for label in candidate_labels]
|
| 881 |
+
text_embs = encode_text(model, processor, prompts, device).to(device).float()
|
| 882 |
+
text_embs = F.normalize(text_embs, dim=-1)
|
| 883 |
+
|
| 884 |
+
all_preds: List[np.ndarray] = []
|
| 885 |
+
for start in tqdm(range(0, len(pil_images), batch_size), desc="Zero-shot Internal"):
|
| 886 |
+
batch_images = pil_images[start : start + batch_size]
|
| 887 |
+
img_embs = encode_image(model, processor, batch_images, device).to(device).float()
|
| 888 |
+
img_embs = F.normalize(img_embs, dim=-1)
|
| 889 |
+
sim = img_embs @ text_embs.T
|
| 890 |
+
preds = sim.argmax(dim=-1).cpu().numpy()
|
| 891 |
+
all_preds.append(preds)
|
| 892 |
+
|
| 893 |
+
pred_array = np.concatenate(all_preds, axis=0) if all_preds else np.array([], dtype=np.int64)
|
| 894 |
+
accuracy = float((pred_array == all_labels).mean()) if len(all_labels) else 0.0
|
| 895 |
+
weighted_f1 = f1_score(all_labels, pred_array, average="weighted") if len(all_labels) else 0.0
|
| 896 |
+
print(f"Internal accuracy: {accuracy:.4f}")
|
| 897 |
+
print(f"Internal weighted macro F1: {weighted_f1:.4f}")
|
| 898 |
+
return {"accuracy": accuracy, "weighted_f1": float(weighted_f1)}
|
| 899 |
|
| 900 |
|
| 901 |
def normalize_hierarchy_label(raw_label: str) -> str:
|
|
|
|
| 950 |
"innerwear": "underwear",
|
| 951 |
"loungewear and nightwear": "underwear",
|
| 952 |
"saree": "dress",
|
| 953 |
+
"boots": "shoes",
|
| 954 |
+
"outer": "coat",
|
| 955 |
+
"sunglasses": "accessories",
|
| 956 |
+
"scarf & tie": "accessories",
|
| 957 |
+
"scarf/tie": "accessories",
|
| 958 |
+
"belt": "accessories",
|
| 959 |
}
|
| 960 |
+
exact = synonyms.get(label, None)
|
| 961 |
+
if exact is not None:
|
| 962 |
+
return exact
|
| 963 |
+
|
| 964 |
+
# Phase 2: substring/regex fallback via HierarchyExtractor
|
| 965 |
+
# Handles Internal dataset's multi-word hierarchy strings like
|
| 966 |
+
# "womens wms woven shirts sleeveless linen" -> "shirt"
|
| 967 |
+
result = _HIERARCHY_EXTRACTOR.extract_hierarchy(label)
|
| 968 |
+
if result:
|
| 969 |
+
return result
|
| 970 |
+
|
| 971 |
+
# Phase 3: extra keywords for the ~9 labels HierarchyExtractor misses
|
| 972 |
+
_EXTRA_KEYWORDS = [
|
| 973 |
+
("capri", "pant"),
|
| 974 |
+
("denim", "pant"),
|
| 975 |
+
("skinny", "pant"),
|
| 976 |
+
("boyfriend", "pant"),
|
| 977 |
+
("graphic", "top"),
|
| 978 |
+
("longsleeve", "top"),
|
| 979 |
+
("leather", "jacket"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 980 |
]
|
| 981 |
+
for keyword, category in _EXTRA_KEYWORDS:
|
| 982 |
+
if keyword in label:
|
| 983 |
+
return category
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 984 |
|
| 985 |
+
return label
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 986 |
|
|
|
|
| 987 |
|
| 988 |
|
| 989 |
+
# ModaNet 13 categories (category_id -> label)
|
| 990 |
+
MODANET_CATEGORIES = {
|
| 991 |
+
1: "bag", 2: "belt", 3: "boots", 4: "footwear", 5: "outer",
|
| 992 |
+
6: "dress", 7: "sunglasses", 8: "pants", 9: "top", 10: "shorts",
|
| 993 |
+
11: "skirt", 12: "headwear", 13: "scarf/tie",
|
| 994 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 995 |
|
| 996 |
+
MODANET_ANNOTATIONS_JSON = "data/modanet_instances_train.json"
|
| 997 |
+
MODANET_IMAGES_DIR = "data/modanet_images/images"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 998 |
|
| 999 |
|
| 1000 |
+
def load_modanet_samples(
|
| 1001 |
num_examples: int,
|
| 1002 |
+
) -> Tuple[List[Tuple[Image.Image, str]], List[Tuple[Image.Image, str]], List[Tuple[Image.Image, str]]]:
|
| 1003 |
+
"""Return (baseline_samples, gap_samples, color_samples) from ModaNet.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1004 |
|
| 1005 |
+
Loads from local COCO JSON annotations + image directory.
|
| 1006 |
+
Each image may have multiple annotations — we pick the largest bbox area.
|
| 1007 |
+
"""
|
| 1008 |
+
import json as _json
|
| 1009 |
+
|
| 1010 |
+
ann_path = Path(MODANET_ANNOTATIONS_JSON)
|
| 1011 |
+
img_dir = Path(MODANET_IMAGES_DIR)
|
| 1012 |
+
|
| 1013 |
+
if not ann_path.exists():
|
| 1014 |
+
print(f" Skipping ModaNet: annotations not found at {MODANET_ANNOTATIONS_JSON}")
|
| 1015 |
+
return [], [], []
|
| 1016 |
+
if not img_dir.exists():
|
| 1017 |
+
print(f" Skipping ModaNet: images directory not found at {MODANET_IMAGES_DIR}")
|
| 1018 |
+
return [], [], []
|
| 1019 |
+
|
| 1020 |
+
print(" Loading ModaNet annotations...")
|
| 1021 |
+
with open(ann_path) as f:
|
| 1022 |
+
coco = _json.load(f)
|
| 1023 |
+
|
| 1024 |
+
cat_map = {c["id"]: c["name"] for c in coco["categories"]}
|
| 1025 |
+
img_map = {img["id"]: img["file_name"] for img in coco["images"]}
|
| 1026 |
+
|
| 1027 |
+
# For each image, find the annotation with the largest area.
|
| 1028 |
+
best_per_image: Dict[int, Tuple[int, float]] = {} # image_id -> (category_id, area)
|
| 1029 |
+
for ann in coco["annotations"]:
|
| 1030 |
+
img_id = ann["image_id"]
|
| 1031 |
+
cat_id = ann["category_id"]
|
| 1032 |
+
area = ann.get("area", 0)
|
| 1033 |
+
if img_id not in best_per_image or area > best_per_image[img_id][1]:
|
| 1034 |
+
best_per_image[img_id] = (cat_id, area)
|
| 1035 |
+
|
| 1036 |
+
# Shuffle deterministically and load images.
|
| 1037 |
+
image_ids = list(best_per_image.keys())
|
| 1038 |
+
rng = random.Random(42)
|
| 1039 |
+
rng.shuffle(image_ids)
|
| 1040 |
|
|
|
|
| 1041 |
baseline_samples: List[Tuple[Image.Image, str]] = []
|
| 1042 |
gap_samples: List[Tuple[Image.Image, str]] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1043 |
|
| 1044 |
+
for img_id in image_ids:
|
| 1045 |
+
if len(baseline_samples) >= num_examples:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1046 |
break
|
| 1047 |
+
file_name = img_map.get(img_id)
|
| 1048 |
+
if file_name is None:
|
| 1049 |
+
continue
|
| 1050 |
+
img_path = img_dir / file_name
|
| 1051 |
+
if not img_path.exists():
|
| 1052 |
+
continue
|
| 1053 |
try:
|
| 1054 |
+
image = Image.open(img_path).convert("RGB")
|
|
|
|
|
|
|
|
|
|
| 1055 |
except Exception:
|
| 1056 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1057 |
|
| 1058 |
+
cat_id, _ = best_per_image[img_id]
|
| 1059 |
+
native_label = cat_map.get(cat_id, "unknown")
|
| 1060 |
+
gap_label = normalize_hierarchy_label(native_label)
|
| 1061 |
+
baseline_samples.append((image, native_label))
|
| 1062 |
+
gap_samples.append((image, gap_label))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1063 |
|
| 1064 |
+
print(f" ModaNet: loaded {len(baseline_samples)} valid samples (from {len(best_per_image)} annotated images)")
|
| 1065 |
+
return baseline_samples, gap_samples, []
|
| 1066 |
+
|
| 1067 |
+
|
| 1068 |
+
def zero_shot_modanet(
    model,
    processor,
    device,
    batch_size: int = 64,
    num_examples: int = 10000,
    use_gap_labels: bool = True,
) -> Optional[Dict[str, float]]:
    """Zero-shot accuracy/F1 on ModaNet dataset.

    Each image is scored against text prompts ("a photo of a {label}") built
    from the set of labels present in the sample; the argmax over cosine
    similarity in the shared latent space is the predicted class.

    Args:
        model: vision-language model exposing the interface expected by
            ``encode_text`` / ``encode_image``.
        processor: matching tokenizer/image processor.
        device: torch device string or object for embedding computation.
        batch_size: number of images encoded per forward pass.
        num_examples: maximum number of ModaNet samples to load.
        use_gap_labels: if True, evaluate against GAP-normalized hierarchy
            labels; otherwise use the native ModaNet category names.

    Returns:
        Dict with ``accuracy`` and ``weighted_f1`` (F1 averaged with
        per-class support weights), or None if no valid samples were loaded.
    """
    baseline_samples, gap_samples, _ = load_modanet_samples(num_examples)
    samples = gap_samples if use_gap_labels else baseline_samples
    if not samples:
        print("Skipping zero_shot_modanet: no valid samples")
        return None

    pil_images = [img for img, _ in samples]
    labels_text = [label for _, label in samples]

    # Candidate label set is derived from the data itself (closed-set eval).
    candidate_labels = sorted(set(labels_text))
    label_to_idx = {label: idx for idx, label in enumerate(candidate_labels)}
    all_labels = np.array([label_to_idx[label] for label in labels_text], dtype=np.int64)

    # Encode all text prompts once; normalize so dot product == cosine sim.
    prompts = [f"a photo of a {label}" for label in candidate_labels]
    text_embs = encode_text(model, processor, prompts, device).to(device).float()
    text_embs = F.normalize(text_embs, dim=-1)

    all_preds: List[np.ndarray] = []
    for start in tqdm(range(0, len(pil_images), batch_size), desc="Zero-shot ModaNet"):
        batch_images = pil_images[start : start + batch_size]
        img_embs = encode_image(model, processor, batch_images, device).to(device).float()
        img_embs = F.normalize(img_embs, dim=-1)
        sim = img_embs @ text_embs.T
        preds = sim.argmax(dim=-1).cpu().numpy()
        all_preds.append(preds)

    pred_array = np.concatenate(all_preds, axis=0) if all_preds else np.array([], dtype=np.int64)
    # Guard against empty label arrays so .mean() / f1_score never see them.
    accuracy = float((pred_array == all_labels).mean()) if len(all_labels) else 0.0
    weighted_f1 = f1_score(all_labels, pred_array, average="weighted") if len(all_labels) else 0.0
    label_kind = "GAP" if use_gap_labels else "native"
    print(f"ModaNet ({label_kind}) accuracy: {accuracy:.4f}")
    # NOTE: average="weighted" is support-weighted F1, not macro F1 — the
    # message previously said "weighted macro F1", which conflated the two.
    print(f"ModaNet ({label_kind}) weighted F1: {weighted_f1:.4f}")
    return {"accuracy": accuracy, "weighted_f1": float(weighted_f1)}
| 1111 |
+
|
| 1112 |
+
def main(
|
| 1113 |
+
selected_tests: set[str],
|
| 1114 |
+
model=None,
|
| 1115 |
+
processor=None,
|
| 1116 |
+
baseline_model=None,
|
| 1117 |
+
baseline_processor=None,
|
| 1118 |
+
) -> None:
|
| 1119 |
random.seed(42)
|
| 1120 |
cfg = resolve_runtime_config()
|
|
|
|
|
|
|
|
|
|
| 1121 |
|
| 1122 |
+
if model is None or processor is None:
|
| 1123 |
+
model_path = Path(cfg.main_model_path)
|
| 1124 |
+
if not model_path.exists():
|
| 1125 |
+
raise FileNotFoundError(f"Main model checkpoint not found: {cfg.main_model_path}")
|
| 1126 |
+
print("Loading model...")
|
| 1127 |
+
print(f" device: {cfg.device}")
|
| 1128 |
+
print(f" checkpoint: {cfg.main_model_path}")
|
| 1129 |
+
print(f" dims: color={cfg.color_emb_dim}, hierarchy={cfg.hierarchy_emb_dim}, total={cfg.main_emb_dim}")
|
| 1130 |
+
model, processor = load_main_model(cfg.device, cfg.main_model_path)
|
| 1131 |
+
print("Model loaded.")
|
| 1132 |
+
else:
|
| 1133 |
+
print(f"Using pre-loaded GAP-CLIP model (dims: color={cfg.color_emb_dim}, hierarchy={cfg.hierarchy_emb_dim}, total={cfg.main_emb_dim})")
|
| 1134 |
|
| 1135 |
result_a: Optional[Dict[str, object]] = None
|
| 1136 |
result_b: Optional[Dict[str, object]] = None
|
| 1137 |
+
result_c: Optional[Dict[str, object]] = None
|
| 1138 |
baseline_result_a: Optional[Dict[str, object]] = None
|
| 1139 |
baseline_result_b: Optional[Dict[str, object]] = None
|
| 1140 |
+
baseline_result_c: Optional[Dict[str, object]] = None
|
| 1141 |
+
|
| 1142 |
+
if baseline_model is None or baseline_processor is None:
|
| 1143 |
+
if any(t in selected_tests for t in ("A", "B", "C", "D")):
|
| 1144 |
+
print("\nLoading baseline model (patrickjohncyh/fashion-clip)...")
|
| 1145 |
+
baseline_name = "patrickjohncyh/fashion-clip"
|
| 1146 |
+
baseline_processor = CLIPProcessor.from_pretrained(baseline_name)
|
| 1147 |
+
baseline_model = CLIPModelTransformers.from_pretrained(baseline_name).to(cfg.device)
|
| 1148 |
+
baseline_model.eval()
|
| 1149 |
+
print("Baseline model loaded.")
|
|
|
|
| 1150 |
|
| 1151 |
if "A" in selected_tests:
|
| 1152 |
result_a = run_test_a(
|
|
|
|
| 1182 |
num_printed=DEFAULT_NUM_PRINTED,
|
| 1183 |
test_name="Baseline Test B",
|
| 1184 |
)
|
| 1185 |
+
if "C" in selected_tests:
|
| 1186 |
+
result_c = run_test_c(
|
| 1187 |
model,
|
| 1188 |
processor,
|
| 1189 |
cfg,
|
|
|
|
| 1191 |
num_printed=DEFAULT_NUM_PRINTED,
|
| 1192 |
)
|
| 1193 |
if baseline_model is not None and baseline_processor is not None:
|
| 1194 |
+
baseline_result_c = run_test_c(
|
| 1195 |
baseline_model,
|
| 1196 |
baseline_processor,
|
| 1197 |
cfg,
|
| 1198 |
num_examples=DEFAULT_NUM_EXAMPLES,
|
| 1199 |
num_printed=DEFAULT_NUM_PRINTED,
|
| 1200 |
+
test_name="Baseline Test C",
|
| 1201 |
)
|
| 1202 |
|
| 1203 |
+
if "D" in selected_tests:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1204 |
assert baseline_model is not None and baseline_processor is not None
|
| 1205 |
|
| 1206 |
+
print("\n" + "=" * 120)
|
| 1207 |
+
print("Test D — Notebook-style zero-shot accuracy")
|
| 1208 |
+
print("=" * 120)
|
| 1209 |
+
d_results: Dict[str, Dict[str, Optional[Dict[str, float]]]] = {
|
| 1210 |
+
"Fashion-MNIST": {
|
| 1211 |
+
"gap": {"accuracy": zero_shot_fashion_mnist(model=model, processor=processor, device=cfg.device, batch_size=64)},
|
| 1212 |
+
"base": {"accuracy": zero_shot_fashion_mnist(model=baseline_model, processor=baseline_processor, device=cfg.device, batch_size=64)},
|
| 1213 |
+
},
|
| 1214 |
+
"KAGL Marqo": {
|
| 1215 |
+
"gap": zero_shot_kagl(model=model, processor=processor, device=cfg.device, batch_size=64, num_examples=DEFAULT_NUM_EXAMPLES),
|
| 1216 |
+
"base": zero_shot_kagl(model=baseline_model, processor=baseline_processor, device=cfg.device, batch_size=64, num_examples=DEFAULT_NUM_EXAMPLES),
|
| 1217 |
+
},
|
| 1218 |
+
"Internal dataset": {
|
| 1219 |
+
"gap": zero_shot_internal(model=model, processor=processor, device=cfg.device, batch_size=64, num_examples=DEFAULT_NUM_EXAMPLES),
|
| 1220 |
+
"base": zero_shot_internal(model=baseline_model, processor=baseline_processor, device=cfg.device, batch_size=64, num_examples=DEFAULT_NUM_EXAMPLES),
|
| 1221 |
+
},
|
| 1222 |
+
"ModaNet": {
|
| 1223 |
+
"gap": zero_shot_modanet(model=model, processor=processor, device=cfg.device, batch_size=64, num_examples=DEFAULT_NUM_EXAMPLES, use_gap_labels=True),
|
| 1224 |
+
"base": zero_shot_modanet(model=baseline_model, processor=baseline_processor, device=cfg.device, batch_size=64, num_examples=DEFAULT_NUM_EXAMPLES, use_gap_labels=True),
|
| 1225 |
+
},
|
| 1226 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1227 |
|
| 1228 |
+
print("\n" + "-" * 120)
|
| 1229 |
+
print("Test D summary")
|
| 1230 |
+
print("-" * 120)
|
| 1231 |
+
summary_rows: List[List[str]] = []
|
| 1232 |
+
for ds in ["Fashion-MNIST", "KAGL Marqo", "ModaNet", "Internal dataset"]:
|
| 1233 |
+
gap_result = d_results[ds]["gap"]
|
| 1234 |
+
base_result = d_results[ds]["base"]
|
| 1235 |
+
gap_acc = None if gap_result is None else gap_result.get("accuracy")
|
| 1236 |
+
base_acc = None if base_result is None else base_result.get("accuracy")
|
| 1237 |
+
summary_rows.append([
|
| 1238 |
+
ds,
|
| 1239 |
+
f"{gap_acc:.2%}" if gap_acc is not None else "N/A",
|
| 1240 |
+
f"{base_acc:.2%}" if base_acc is not None else "N/A",
|
| 1241 |
+
])
|
| 1242 |
+
print_table(
|
| 1243 |
+
"Test D — zero-shot accuracy (notebook protocol)",
|
| 1244 |
+
["Dataset", "GAP-CLIP", "Fashion-CLIP (baseline)"],
|
| 1245 |
+
summary_rows,
|
| 1246 |
+
)
|
| 1247 |
print("\n" + "=" * 120)
|
| 1248 |
print("Final Summary")
|
| 1249 |
print("=" * 120)
|
|
|
|
| 1258 |
print(f"Test B full512 accuracy: {float(result_b['accuracy_full512']):.2%}")
|
| 1259 |
if baseline_result_b is not None:
|
| 1260 |
print(f"Baseline Test B full512 accuracy: {float(baseline_result_b['accuracy_full512']):.2%}")
|
| 1261 |
+
if result_c is not None:
|
| 1262 |
+
print(f"Test C overall: {format_bool(bool(result_c['overall']))}")
|
| 1263 |
+
print(f" pass rate: {float(result_c['pass_rate']):.2%}")
|
| 1264 |
+
print(f" avg color_match={float(result_c['avg_color_match']):.4f} vs cross={float(result_c['avg_color_cross']):.4f}")
|
| 1265 |
+
print(f" avg hier_match={float(result_c['avg_hier_match']):.4f} vs cross={float(result_c['avg_hier_cross']):.4f}")
|
| 1266 |
+
if baseline_result_c is not None:
|
| 1267 |
+
print(f"Baseline Test C overall: {format_bool(bool(baseline_result_c['overall']))}")
|
| 1268 |
+
print(f" baseline pass rate: {float(baseline_result_c['pass_rate']):.2%}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1269 |
|
| 1270 |
if result_a is not None:
|
| 1271 |
+
assert float(result_a["pass_rate"]) >= 0.95, (
|
| 1272 |
+
f"Test A failed: pass rate {float(result_a['pass_rate']):.2%} < 95%."
|
| 1273 |
+
)
|
| 1274 |
if result_b is not None:
|
| 1275 |
+
assert float(result_b["pass_rate"]) >= 0.95, (
|
| 1276 |
+
f"Test B failed: pass rate {float(result_b['pass_rate']):.2%} < 95%."
|
| 1277 |
+
)
|
| 1278 |
+
if result_c is not None:
|
| 1279 |
+
assert float(result_c["pass_rate"]) >= 0.95, (
|
| 1280 |
+
f"Test C failed: subspace decomposition pass rate {float(result_c['pass_rate']):.2%} < 95%."
|
| 1281 |
)
|
| 1282 |
|
| 1283 |
print("\nAll embedding-structure tests passed.")
|
| 1284 |
|
| 1285 |
|
| 1286 |
if __name__ == "__main__":
    # Command-line entry point: pick which tests run and optionally shrink
    # the sample count for quick local runs.
    cli = argparse.ArgumentParser(description="Embedding structure evaluation")
    cli.add_argument("--tests", default="ABCD", help="Which tests to run, e.g. 'C' or 'ABCD'")
    cli.add_argument("--num-examples", type=int, default=None, help="Override DEFAULT_NUM_EXAMPLES")
    cli_args = cli.parse_args()
    if cli_args.num_examples is not None:
        # Rebind the module-level constant; callees read it at call time.
        DEFAULT_NUM_EXAMPLES = cli_args.num_examples
    main(set(cli_args.tests.upper()))