lodestones committed on
Commit
99a41dc
·
verified ·
1 Parent(s): 85a4088

Update inference_tagger_standalone.py

Browse files
Files changed (1) hide show
  1. inference_tagger_standalone.py +79 -0
inference_tagger_standalone.py CHANGED
@@ -213,6 +213,20 @@ class DINOv3ViTH(nn.Module):
213
  x = block(x, cos, sin)
214
  return self.norm(x)
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
  # =============================================================================
218
  # Head — auto-detected from the checkpoint
@@ -526,6 +540,71 @@ class Tagger:
526
  self.model.eval()
527
  print(f"[Tagger] Ready on {self.device} (backbone={dtype}, head=fp32)")
528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  @torch.no_grad()
530
  def predict(self, image, topk: int | None = 30,
531
  threshold: float | None = None) -> list[tuple[str, float]]:
 
213
  x = block(x, cos, sin)
214
  return self.norm(x)
215
 
216
+ def get_image_tokens(self, pixel_values):
217
+ """Return patch tokens only (no CLS/registers) as [B, h_p*w_p, D_MODEL]
218
+ and the spatial grid dimensions (h_p, w_p)."""
219
+ _, _, H, W = pixel_values.shape
220
+ h_p, w_p = H // PATCH_SIZE, W // PATCH_SIZE
221
+ x = self.embeddings(pixel_values)
222
+ cos, sin = _build_rope(h_p, w_p, x.dtype, pixel_values.device)
223
+ for block in self.layer:
224
+ x = block(x, cos, sin)
225
+ x = self.norm(x)
226
+ # token layout: [CLS, reg_0..reg_R-1, patch_0..patch_N]
227
+ patch_tokens = x[:, 1 + N_REGISTERS:, :] # [B, h_p*w_p, D_MODEL]
228
+ return patch_tokens, h_p, w_p
229
+
230
 
231
  # =============================================================================
232
  # Head — auto-detected from the checkpoint
 
540
  self.model.eval()
541
  print(f"[Tagger] Ready on {self.device} (backbone={dtype}, head=fp32)")
542
 
543
+ @torch.no_grad()
544
+ def embed_pca(
545
+ self,
546
+ image,
547
+ n_components: int = 3,
548
+ max_size: int | None = None,
549
+ ) -> "Image.Image":
550
+ """Run PCA on the patch-token features of *image* and return a
551
+ false-colour RGB PIL image where R/G/B channels correspond to the
552
+ first three principal components, each normalised to [0, 255].
553
+
554
+ Parameters
555
+ ----------
556
+ image :
557
+ Local path, URL, or PIL.Image.Image.
558
+ n_components :
559
+ Number of PCA components (must be 3 for RGB output).
560
+ max_size :
561
+ Long-edge cap in pixels (defaults to ``self.max_size``).
562
+ """
563
+ if n_components != 3:
564
+ raise ValueError("n_components must be 3 for false-colour RGB output")
565
+ if max_size is None:
566
+ max_size = self.max_size
567
+
568
+ if isinstance(image, Image.Image):
569
+ img = image.convert("RGB")
570
+ w, h = img.size
571
+ scale = min(1.0, max_size / max(w, h))
572
+ new_w = _snap(round(w * scale), PATCH_SIZE)
573
+ new_h = _snap(round(h * scale), PATCH_SIZE)
574
+ pv = v2.Compose([
575
+ v2.Resize((new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS),
576
+ v2.ToImage(),
577
+ v2.ToDtype(torch.float32, scale=True),
578
+ v2.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
579
+ ])(img).unsqueeze(0).to(self.device)
580
+ else:
581
+ pv = preprocess_image(image, max_size=max_size).to(self.device)
582
+
583
+ with torch.autocast(device_type=self.device.type, dtype=self.dtype):
584
+ patch_tokens, h_p, w_p = self.model.backbone.get_image_tokens(pv)
585
+
586
+ # patch_tokens: [1, h_p*w_p, D_MODEL] → [N, D]
587
+ tokens = patch_tokens[0].float() # fp32 for PCA
588
+
589
+ # Centre
590
+ mean = tokens.mean(dim=0, keepdim=True)
591
+ tokens_c = tokens - mean
592
+
593
+ # PCA via SVD (economy)
594
+ _, _, Vt = torch.linalg.svd(tokens_c, full_matrices=False)
595
+ components = Vt[:n_components] # [3, D]
596
+ projected = tokens_c @ components.T # [N, 3]
597
+
598
+ # Normalise each component to [0, 1]
599
+ lo = projected.min(dim=0).values
600
+ hi = projected.max(dim=0).values
601
+ projected = (projected - lo) / (hi - lo + 1e-8)
602
+
603
+ # Reshape to spatial grid and convert to uint8 PIL image
604
+ rgb = projected.reshape(h_p, w_p, 3).cpu().numpy()
605
+ rgb_uint8 = (rgb * 255).clip(0, 255).astype("uint8")
606
+ return Image.fromarray(rgb_uint8, mode="RGB")
607
+
608
  @torch.no_grad()
609
  def predict(self, image, topk: int | None = 30,
610
  threshold: float | None = None) -> list[tuple[str, float]]: