Upload evaluation/utils/model_loader.py with huggingface_hub
evaluation/utils/model_loader.py (CHANGED: +53, -116)
@@ -8,8 +8,6 @@ the loading logic.
 
 from __future__ import annotations
 
-import json
-import os
 import sys
 from pathlib import Path
 from typing import Tuple
@@ -44,7 +42,7 @@ def load_gap_clip(
         (model, processor) ready for inference.
     """
     model = CLIPModelTransformers.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
-    checkpoint = torch.load(model_path, map_location=device)
+    checkpoint = torch.load(model_path, map_location=device, weights_only=False)
 
     if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
         model.load_state_dict(checkpoint["model_state_dict"])
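Note on the change above: since PyTorch 2.6, torch.load() defaults to weights_only=True and refuses to unpickle anything beyond plain tensors and standard containers, so checkpoints that wrap their weights in a dict with extra objects need weights_only=False spelled out. A minimal sketch of the same pattern (the helper name load_checkpoint_state is illustrative only, not part of this file):

import torch

def load_checkpoint_state(path: str, device: torch.device) -> dict:
    # weights_only=False restores the pre-2.6 behaviour: the file may contain
    # arbitrary pickled objects, so only use it for checkpoints you trust.
    checkpoint = torch.load(path, map_location=device, weights_only=False)
    # Fine-tuned checkpoints here wrap the weights in "model_state_dict";
    # a raw state dict is returned unchanged.
    if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
        return checkpoint["model_state_dict"]
    return checkpoint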
@@ -82,140 +80,79 @@ def load_baseline_fashion_clip(
 
 def load_color_model(
     color_model_path: str,
-    tokenizer_path: str,
-    color_emb_dim: int,
     device: torch.device,
-    repo_id: str = "Leacb4/gap-clip",
-    cache_dir: str = "./models_cache",
 ):
-    """Load the specialized 16D color model (
+    """Load the specialized 16D color model (CLIP-backbone).
 
+    Returns:
+        (color_model, None) -- second element kept for API compatibility
+    """
+    from training.color_model import ColorCLIP  # type: ignore
+
+    print("Loading ColorCLIP (CLIP-backbone, 16D) ...")
+    color_model = ColorCLIP.from_checkpoint(color_model_path, device=device)
+    print("Color model loaded successfully")
+    return color_model, None
+
+
+def load_hierarchy_model(
+    hierarchy_model_path: str,
+    device: torch.device,
+):
+    """Load the hierarchy model (CLIP-backbone).
 
     Returns:
+        hierarchy_model ready for inference.
     """
-    from training.
+    from training.hierarchy_model import HierarchyModel  # type: ignore
 
+    print("Loading HierarchyModel (CLIP-backbone, 64D) ...")
+    model = HierarchyModel.from_checkpoint(hierarchy_model_path, device=device)
+    print("Hierarchy model loaded successfully")
+    return model
 
-    if local_model_exists and local_tokenizer_exists:
-        print("Loading specialized color model (16D) from local files...")
-        state_dict = torch.load(color_model_path, map_location=device)
-        with open(tokenizer_path, "r") as f:
-            vocab = json.load(f)
-    else:
-        from huggingface_hub import hf_hub_download  # type: ignore
-
-        print(f"Local color model/tokenizer not found. Loading from Hugging Face ({repo_id})...")
-        hf_model_path = hf_hub_download(
-            repo_id=repo_id, filename="color_model.pt", cache_dir=cache_dir
-        )
-        hf_vocab_path = hf_hub_download(
-            repo_id=repo_id, filename="tokenizer_vocab.json", cache_dir=cache_dir
-        )
-        state_dict = torch.load(hf_model_path, map_location=device)
-        with open(hf_vocab_path, "r") as f:
-            vocab = json.load(f)
-
-    vocab_size = state_dict["text_encoder.embedding.weight"].shape[0]
-    print(f"   Detected vocab size from checkpoint: {vocab_size}")
-
-    tokenizer = Tokenizer()
-    tokenizer.load_vocab(vocab)
-
-    color_model = ColorCLIP(vocab_size=vocab_size, embedding_dim=color_emb_dim)
-    color_model.load_state_dict(state_dict)
-    color_model.to(device)
-    color_model.eval()
-    print("Color model loaded successfully")
-    return color_model, tokenizer
 
 
 # ---------------------------------------------------------------------------
-#
+# Core encoding helpers (same as notebook)
 # ---------------------------------------------------------------------------
 
-def get_text_embedding(
-    model: CLIPModelTransformers,
-    processor: CLIPProcessor,
-    device: torch.device,
-    text: str,
-) -> torch.Tensor:
-    """Extract a single normalized text embedding (shape: [512])."""
-    text_inputs = processor(text=[text], padding=True, return_tensors="pt")
-    text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
-
+def encode_text(model, processor, text_queries, device):
+    """Encode text queries into embeddings (unnormalized)."""
+    if isinstance(text_queries, str):
+        text_queries = [text_queries]
+    inputs = processor(text=text_queries, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
     with torch.no_grad():
-        text_features = F.normalize(text_features, dim=-1)
-
-    return text_features.squeeze(0)
+        text_features = model.get_text_features(**inputs)
+    return text_features
 
 
-def get_text_embeddings_batch(
-    model: CLIPModelTransformers,
-    processor: CLIPProcessor,
-    device: torch.device,
-    texts,
-) -> torch.Tensor:
-    """Extract normalized text embeddings for a batch of strings (shape: [N, 512])."""
-    text_inputs = processor(text=texts, padding=True, return_tensors="pt", truncation=True, max_length=77)
-    text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
-
+def encode_image(model, processor, images, device):
+    """Encode images into embeddings (unnormalized)."""
+    if not isinstance(images, list):
+        images = [images]
+    inputs = processor(images=images, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}
     with torch.no_grad():
-        text_features = F.normalize(text_features, dim=-1)
+        image_features = model.get_image_features(**inputs)
+    return image_features
 
-    return text_features
 
+# ---------------------------------------------------------------------------
+# Normalized wrappers (preserve old call signatures used across eval scripts)
+# ---------------------------------------------------------------------------
 
-def get_image_embedding(
-    model: CLIPModelTransformers,
-    image: torch.Tensor,
-    device: torch.device,
-) -> torch.Tensor:
-    """Extract a normalized image embedding from a preprocessed tensor.
+def get_text_embedding(model, processor, device, text):
+    """Single normalized text embedding (shape: [512])."""
+    return F.normalize(encode_text(model, processor, text, device), dim=-1).squeeze(0)
 
-    Args:
-        model: GAP-CLIP model.
-        image: Tensor of shape (C, H, W) or (1, C, H, W) or (N, C, H, W).
-        device: Target device.
 
-    Returns:
-    """
-    model.eval()
-    with torch.no_grad():
-        if image.dim() == 3 and image.size(0) == 1:
-            image = image.expand(3, -1, -1)
-        elif image.dim() == 4 and image.size(1) == 1:
-            image = image.expand(-1, 3, -1, -1)
-        if image.dim() == 3:
-            image = image.unsqueeze(0)
-
-        image = image.to(device)
-        vision_outputs = model.vision_model(pixel_values=image)
-        image_features = model.visual_projection(vision_outputs.pooler_output)
-        return F.normalize(image_features, dim=-1)
-
-
-def get_image_embedding_from_pil(
-    model: CLIPModelTransformers,
-    processor: CLIPProcessor,
-    device: torch.device,
-    pil_image: Image.Image,
-) -> torch.Tensor:
-    """Extract a normalized image embedding from a PIL image (shape: [512])."""
-    inputs = processor(images=pil_image, return_tensors="pt")
-    inputs = {k: v.to(device) for k, v in inputs.items()}
+def get_text_embeddings_batch(model, processor, device, texts):
+    """Normalized text embeddings for a batch (shape: [N, 512])."""
+    return F.normalize(encode_text(model, processor, texts, device), dim=-1)
 
-    with torch.no_grad():
-        vision_outputs = model.vision_model(**inputs)
-        image_features = model.visual_projection(vision_outputs.pooler_output)
-        image_features = F.normalize(image_features, dim=-1)
 
-    return image_features.squeeze(0)
+def get_image_embedding_from_pil(model, processor, device, pil_image):
+    """Normalized image embedding from a PIL image (shape: [512])."""
+    return F.normalize(encode_image(model, processor, pil_image, device), dim=-1).squeeze(0)