lodestones
/

taggerine

@@ -64,17 +64,19 @@ from safetensors.torch import load_file
 # All hyperparameters match facebook/dinov3-vith16plus-pretrain-lvd1689m
 # =============================================================================
-D_MODEL     = 1280
-N_HEADS     = 20
-HEAD_DIM    = D_MODEL // N_HEADS   # 64
-N_LAYERS    = 32
-D_FFN       = 5120
 N_REGISTERS = 4
-PATCH_SIZE  = 16
-ROPE_THETA  = 100.0
-ROPE_RESCALE = 2.0   # pos_embed_rescale applied at inference
-LN_EPS      = 1e-5
-LAYERSCALE  = 1.0
 # ---------------------------------------------------------------------------
@@ -83,25 +85,23 @@ LAYERSCALE  = 1.0
 @lru_cache(maxsize=32)
 def _patch_coords_cached(h: int, w: int, device_str: str) -> torch.Tensor:
-    """Normalised [-1,+1] patch-centre coordinates (float32, cached)."""
     device = torch.device(device_str)
     cy = torch.arange(0.5, h, dtype=torch.float32, device=device) / h
     cx = torch.arange(0.5, w, dtype=torch.float32, device=device) / w
     coords = torch.stack(torch.meshgrid(cy, cx, indexing="ij"), dim=-1).flatten(0, 1)
-    coords = 2.0 * coords - 1.0   # [0,1] → [-1,+1]
     coords = coords * ROPE_RESCALE
     return coords  # [h*w, 2]
 def _build_rope(h_patches: int, w_patches: int,
                 dtype: torch.dtype, device: torch.device):
-    """Return (cos, sin) of shape [1, 1, h*w, HEAD_DIM] for broadcasting."""
-    coords = _patch_coords_cached(h_patches, w_patches, str(device))  # [P, 2]
     inv_freq = 1.0 / (ROPE_THETA ** torch.arange(
-        0, 1, 4 / HEAD_DIM, dtype=torch.float32, device=device))      # [D/4]
-    angles = 2 * math.pi * coords[:, :, None] * inv_freq[None, None, :]  # [P, 2, D/4]
-    angles = angles.flatten(1, 2).tile(2)                                 # [P, D]
-    cos = torch.cos(angles).to(dtype).unsqueeze(0).unsqueeze(0)  # [1,1,P,D]
     sin = torch.sin(angles).to(dtype).unsqueeze(0).unsqueeze(0)
     return cos, sin
@@ -113,7 +113,6 @@ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
 def _apply_rope(q: torch.Tensor, k: torch.Tensor,
                 cos: torch.Tensor, sin: torch.Tensor):
-    """Apply RoPE only to patch tokens (skip CLS + register prefix)."""
     n_pre = 1 + N_REGISTERS
     q_pre, q_pat = q[..., :n_pre, :], q[..., n_pre:, :]
     k_pre, k_pat = k[..., :n_pre, :], k[..., n_pre:, :]
@@ -123,7 +122,7 @@ def _apply_rope(q: torch.Tensor, k: torch.Tensor,
 # ---------------------------------------------------------------------------
-# Building blocks
 # ---------------------------------------------------------------------------
 class _Attention(nn.Module):
@@ -134,7 +133,7 @@ class _Attention(nn.Module):
         self.v_proj = nn.Linear(D_MODEL, D_MODEL, bias=True)
         self.o_proj = nn.Linear(D_MODEL, D_MODEL, bias=True)
-    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
         B, S, _ = x.shape
         q = self.q_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
         k = self.k_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
@@ -148,125 +147,259 @@ class _GatedMLP(nn.Module):
     def __init__(self):
         super().__init__()
         self.gate_proj = nn.Linear(D_MODEL, D_FFN, bias=True)
-        self.up_proj   = nn.Linear(D_MODEL, D_FFN, bias=True)
-        self.down_proj = nn.Linear(D_FFN,   D_MODEL, bias=True)
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
 class _Block(nn.Module):
     def __init__(self):
         super().__init__()
-        self.norm1        = nn.LayerNorm(D_MODEL, eps=LN_EPS)
-        self.attention    = _Attention()
         self.layer_scale1 = nn.Parameter(torch.full((D_MODEL,), LAYERSCALE))
-        self.norm2        = nn.LayerNorm(D_MODEL, eps=LN_EPS)
-        self.mlp          = _GatedMLP()
         self.layer_scale2 = nn.Parameter(torch.full((D_MODEL,), LAYERSCALE))
-    def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
         x = x + self.attention(self.norm1(x), cos, sin) * self.layer_scale1
         x = x + self.mlp(self.norm2(x)) * self.layer_scale2
         return x
-# ---------------------------------------------------------------------------
-# Full backbone
-# ---------------------------------------------------------------------------
 class DINOv3ViTH(nn.Module):
     """DINOv3 ViT-H/16+ backbone.
-    Accepts any H, W that are multiples of 16.
-    Returns last_hidden_state [B, 1+R+P, D_MODEL].
     Token layout: [CLS, reg_0..reg_3, patch_0..patch_N].
-    State-dict keys are intentionally identical to the HuggingFace
-    transformers layout so .safetensors checkpoints load without remapping.
     """
     def __init__(self):
         super().__init__()
-        # These names must match HF exactly
         self.embeddings = _Embeddings()
         self.layer = nn.ModuleList([_Block() for _ in range(N_LAYERS)])
-        self.norm  = nn.LayerNorm(D_MODEL, eps=LN_EPS)
-    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
-                               strict, missing_keys, unexpected_keys, error_msgs):
-        # HF stores layer_scale as a sub-module with a "lambda1" parameter;
-        # we store it as a plain Parameter directly on _Block.
-        # Remap "layer.i.layer_scale{1,2}.lambda1" → "layer.i.layer_scale{1,2}"
-        for k in list(state_dict.keys()):
-            if k.startswith(prefix) and ".layer_scale" in k and k.endswith(".lambda1"):
-                new_k = k[:-len(".lambda1")]
-                state_dict[new_k] = state_dict.pop(k)
-        # Drop rope_embeddings buffer (computed on-the-fly)
-        for k in list(state_dict.keys()):
-            if k.startswith(prefix) and "rope_embeddings" in k:
-                state_dict.pop(k)
-        super()._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict,
-            missing_keys, unexpected_keys, error_msgs)
-    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
-        B, _, H, W = pixel_values.shape
-        x = self.embeddings(pixel_values)  # [B, 1+R+P, D]
         h_p, w_p = H // PATCH_SIZE, W // PATCH_SIZE
         cos, sin = _build_rope(h_p, w_p, x.dtype, pixel_values.device)
         for block in self.layer:
             x = block(x, cos, sin)
         return self.norm(x)
-class _Embeddings(nn.Module):
-    """Patch + CLS + register token embeddings.
-    Key names match HF: embeddings.cls_token, embeddings.register_tokens,
-    embeddings.patch_embeddings.{weight,bias}.
     """
-    def __init__(self):
         super().__init__()
-        self.cls_token       = nn.Parameter(torch.empty(1, 1, D_MODEL))
-        self.mask_token      = nn.Parameter(torch.zeros(1, 1, D_MODEL))  # unused at inference
-        self.register_tokens = nn.Parameter(torch.empty(1, N_REGISTERS, D_MODEL))
-        self.patch_embeddings = nn.Conv2d(3, D_MODEL, kernel_size=PATCH_SIZE, stride=PATCH_SIZE)
-    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
-        B = pixel_values.shape[0]
-        dtype = self.patch_embeddings.weight.dtype
-        patches = self.patch_embeddings(pixel_values.to(dtype)).flatten(2).transpose(1, 2)
-        cls  = self.cls_token.expand(B, -1, -1)
-        regs = self.register_tokens.expand(B, -1, -1)
-        return torch.cat([cls, regs, patches], dim=1)
 # =============================================================================
-# Tagger head
 # =============================================================================
 class DINOv3Tagger(nn.Module):
-    """DINOv3 ViT-H/16+ backbone + linear projection head.
-    features = concat(CLS, reg_0..reg_3) → [B, (1+R)*D]
-    projection: Linear → [B, num_tags]
     """
-    def __init__(self, num_tags: int, projection_bias: bool = False):
-        super().__init__()
-        self.backbone   = DINOv3ViTH()
-        self.projection = nn.Linear((1 + N_REGISTERS) * D_MODEL, num_tags, bias=projection_bias)
-    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
-        hidden   = self.backbone(pixel_values)                      # [B, S, D]
-        cls      = hidden[:, 0, :]                                  # [B, D]
-        regs     = hidden[:, 1: 1 + N_REGISTERS, :].flatten(1)     # [B, R*D]
-        features = torch.cat([cls, regs], dim=-1)                   # [B, (1+R)*D]
-        return self.projection(features.float())                     # fp32 for stability
 # =============================================================================
@@ -274,7 +407,7 @@ class DINOv3Tagger(nn.Module):
 # =============================================================================
 _IMAGENET_MEAN = [0.485, 0.456, 0.406]
-_IMAGENET_STD  = [0.229, 0.224, 0.225]
 def _snap(x: int, m: int) -> int:
@@ -291,12 +424,22 @@ def _open_image(source) -> Image.Image:
 def preprocess_image(source, max_size: int = 1024) -> torch.Tensor:
-    """Load and preprocess an image → [1, 3, H, W] float32, ImageNet-normalised."""
     img = _open_image(source)
     w, h = img.size
-    scale = min(1.0, max_size / max(w, h))
-    new_w = _snap(round(w * scale), PATCH_SIZE)
-    new_h = _snap(round(h * scale), PATCH_SIZE)
     return v2.Compose([
         v2.Resize((new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS),
         v2.ToImage(),
@@ -315,13 +458,15 @@ class Tagger:
     Parameters
     ----------
     checkpoint_path : str
-        Path to a .safetensors or .pth checkpoint saved by TaggerTrainer.
     vocab_path : str
-        Path to tagger_vocab.json  ({"idx2tag": [...]}).
     device : str
-        "cuda", "cuda:0", "cpu", etc.
     dtype : torch.dtype
-        bfloat16 recommended on Ampere+; float16 for older GPUs; float32 for CPU.
     max_size : int
         Long-edge cap in pixels before feeding to the model.
     """
@@ -334,8 +479,13 @@ class Tagger:
         dtype: torch.dtype = torch.bfloat16,
         max_size: int = 1024,
     ):
-        self.device   = torch.device(device if torch.cuda.is_available() or device == "cpu" else "cpu")
-        self.dtype    = dtype
         self.max_size = max_size
         with open(vocab_path) as f:
@@ -344,36 +494,47 @@ class Tagger:
         self.num_tags = len(self.idx2tag)
         print(f"[Tagger] Vocabulary: {self.num_tags:,} tags")
-        self.model = DINOv3Tagger(num_tags=self.num_tags)
         print(f"[Tagger] Loading checkpoint: {checkpoint_path}")
         if checkpoint_path.endswith((".safetensors", ".sft")):
-            sd = load_file(checkpoint_path, device=str(self.device))
         else:
-            sd = torch.load(checkpoint_path, map_location=str(self.device))
-        missing, unexpected = self.model.load_state_dict(sd, strict=False, assign=True)
-        if missing:
-            print(f"[Tagger] Missing keys ({len(missing)}): {missing[:5]}{'...' if len(missing) > 5 else ''}")
-        if unexpected:
-            print(f"[Tagger] Unexpected keys ({len(unexpected)}): {unexpected[:5]}{'...' if len(unexpected) > 5 else ''}")
-        self.model.backbone = self.model.backbone.to(dtype=dtype)
-        self.model = self.model.to(self.device)
         self.model.eval()
-        print(f"[Tagger] Ready on {self.device} ({dtype})")
     @torch.no_grad()
     def predict(self, image, topk: int | None = 30,
                 threshold: float | None = None) -> list[tuple[str, float]]:
-        """Tag a single image (local path or URL).
-        Specify either topk OR threshold. Returns [(tag, score), ...] desc."""
         if topk is None and threshold is None:
             topk = 30
         pv = preprocess_image(image, max_size=self.max_size).to(self.device)
-        with torch.autocast(device_type=self.device.type, dtype=self.dtype):
-            logits = self.model(pv)[0]
         scores = torch.sigmoid(logits.float())
         if topk is not None:
@@ -381,17 +542,18 @@ class Tagger:
         else:
             assert threshold is not None
             indices = (scores >= threshold).nonzero(as_tuple=True)[0]
-            values  = scores[indices]
-            order   = values.argsort(descending=True)
             indices, values = indices[order], values[order]
-        return [(self.idx2tag[i], float(v)) for i, v in zip(indices.tolist(), values.tolist())]
     @torch.no_grad()
     def predict_batch(self, images, topk: int | None = 30,
-                      threshold: float | None = None) -> list[list[tuple[str, float]]]:
-        """Tag multiple images (processed individually for mixed resolutions)."""
-        return [self.predict(img, topk=topk, threshold=threshold) for img in images]
 # =============================================================================
@@ -399,17 +561,20 @@ class Tagger:
 # =============================================================================
 def _fmt_pretty(path: str, results) -> str:
-    lines = [f"\n{'─' * 60}", f"  {path}", f"{'─' * 60}"]
     for rank, (tag, score) in enumerate(results, 1):
         bar = "█" * int(score * 20)
-        lines.append(f"  {rank:>3}.  {score:.3f}  {bar:<20}  {tag}")
     return "\n".join(lines)
 def _fmt_tags(results) -> str:
     return ", ".join(tag for tag, _ in results)
 def _fmt_json(path: str, results) -> dict:
-    return {"file": path, "tags": [{"tag": t, "score": round(s, 4)} for t, s in results]}
 # =============================================================================
@@ -418,28 +583,40 @@ def _fmt_json(path: str, results) -> dict:
 def main():
     parser = argparse.ArgumentParser(
-        description="DINOv3 ViT-H/16+ tagger inference (standalone, no transformers dep)",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
-    parser.add_argument("--checkpoint", required=True, help="Path to .safetensors or .pth checkpoint")
-    parser.add_argument("--vocab",      required=True, help="Path to tagger_vocab.json")
-    parser.add_argument("--images", nargs="+", required=True, help="Image paths and/or http(s) URLs")
-    parser.add_argument("--device",   default="cuda",  help="Device: cuda, cuda:0, cpu, … (default: cuda)")
     parser.add_argument("--max-size", type=int, default=1024,
-                        help="Long-edge cap in pixels, multiple of 16 (default: 1024)")
     mode = parser.add_mutually_exclusive_group()
-    mode.add_argument("--topk",      type=int,   default=30, help="Return top-k tags (default: 30)")
-    mode.add_argument("--threshold", type=float,             help="Return all tags with score >= threshold")
     parser.add_argument("--format", choices=["pretty", "tags", "json"],
                         default="pretty", help="Output format (default: pretty)")
     args = parser.parse_args()
-    tagger = Tagger(checkpoint_path=args.checkpoint, vocab_path=args.vocab,
-                    device=args.device, max_size=args.max_size)
-    topk, threshold = (None, args.threshold) if args.threshold else (args.topk, None)
     json_out = []
     for src in args.images:
@@ -448,13 +625,16 @@ def main():
             print(f"[warning] File not found: {src}", file=sys.stderr)
             continue
         results = tagger.predict(src, topk=topk, threshold=threshold)
-        if   args.format == "pretty": print(_fmt_pretty(src, results))
-        elif args.format == "tags":   print(_fmt_tags(results))
-        elif args.format == "json":   json_out.append(_fmt_json(src, results))
     if args.format == "json":
         print(json.dumps(json_out, indent=2, ensure_ascii=False))
 if __name__ == "__main__":
-    main()

 # All hyperparameters match facebook/dinov3-vith16plus-pretrain-lvd1689m
 # =============================================================================
+D_MODEL = 1280
+N_HEADS = 20
+HEAD_DIM = D_MODEL // N_HEADS  # 64
+N_LAYERS = 32
+D_FFN = 5120
 N_REGISTERS = 4
+PATCH_SIZE = 16
+ROPE_THETA = 100.0
+ROPE_RESCALE = 2.0
+LN_EPS = 1e-5
+LAYERSCALE = 1.0
+FEATURE_DIM = (1 + N_REGISTERS) * D_MODEL  # 6400
 # ---------------------------------------------------------------------------
 @lru_cache(maxsize=32)
 def _patch_coords_cached(h: int, w: int, device_str: str) -> torch.Tensor:
     device = torch.device(device_str)
     cy = torch.arange(0.5, h, dtype=torch.float32, device=device) / h
     cx = torch.arange(0.5, w, dtype=torch.float32, device=device) / w
     coords = torch.stack(torch.meshgrid(cy, cx, indexing="ij"), dim=-1).flatten(0, 1)
+    coords = 2.0 * coords - 1.0
     coords = coords * ROPE_RESCALE
     return coords  # [h*w, 2]
 def _build_rope(h_patches: int, w_patches: int,
                 dtype: torch.dtype, device: torch.device):
+    coords = _patch_coords_cached(h_patches, w_patches, str(device))
     inv_freq = 1.0 / (ROPE_THETA ** torch.arange(
+        0, 1, 4 / HEAD_DIM, dtype=torch.float32, device=device))
+    angles = 2 * math.pi * coords[:, :, None] * inv_freq[None, None, :]
+    angles = angles.flatten(1, 2).tile(2)
+    cos = torch.cos(angles).to(dtype).unsqueeze(0).unsqueeze(0)
     sin = torch.sin(angles).to(dtype).unsqueeze(0).unsqueeze(0)
     return cos, sin
 def _apply_rope(q: torch.Tensor, k: torch.Tensor,
                 cos: torch.Tensor, sin: torch.Tensor):
     n_pre = 1 + N_REGISTERS
     q_pre, q_pat = q[..., :n_pre, :], q[..., n_pre:, :]
     k_pre, k_pat = k[..., :n_pre, :], k[..., n_pre:, :]
 # ---------------------------------------------------------------------------
+# Transformer blocks
 # ---------------------------------------------------------------------------
 class _Attention(nn.Module):
         self.v_proj = nn.Linear(D_MODEL, D_MODEL, bias=True)
         self.o_proj = nn.Linear(D_MODEL, D_MODEL, bias=True)
+    def forward(self, x, cos, sin):
         B, S, _ = x.shape
         q = self.q_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
         k = self.k_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
     def __init__(self):
         super().__init__()
         self.gate_proj = nn.Linear(D_MODEL, D_FFN, bias=True)
+        self.up_proj = nn.Linear(D_MODEL, D_FFN, bias=True)
+        self.down_proj = nn.Linear(D_FFN, D_MODEL, bias=True)
+    def forward(self, x):
         return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
 class _Block(nn.Module):
     def __init__(self):
         super().__init__()
+        self.norm1 = nn.LayerNorm(D_MODEL, eps=LN_EPS)
+        self.attention = _Attention()
         self.layer_scale1 = nn.Parameter(torch.full((D_MODEL,), LAYERSCALE))
+        self.norm2 = nn.LayerNorm(D_MODEL, eps=LN_EPS)
+        self.mlp = _GatedMLP()
         self.layer_scale2 = nn.Parameter(torch.full((D_MODEL,), LAYERSCALE))
+    def forward(self, x, cos, sin):
         x = x + self.attention(self.norm1(x), cos, sin) * self.layer_scale1
         x = x + self.mlp(self.norm2(x)) * self.layer_scale2
         return x
+class _Embeddings(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # zeros() rather than empty() so a forgotten checkpoint key fails
+        # predictably instead of producing undefined outputs.
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, D_MODEL))
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, D_MODEL))
+        self.register_tokens = nn.Parameter(torch.zeros(1, N_REGISTERS, D_MODEL))
+        self.patch_embeddings = nn.Conv2d(
+            3, D_MODEL, kernel_size=PATCH_SIZE, stride=PATCH_SIZE)
+    def forward(self, pixel_values):
+        B = pixel_values.shape[0]
+        dtype = self.patch_embeddings.weight.dtype
+        patches = self.patch_embeddings(
+            pixel_values.to(dtype)).flatten(2).transpose(1, 2)
+        cls = self.cls_token.expand(B, -1, -1)
+        regs = self.register_tokens.expand(B, -1, -1)
+        return torch.cat([cls, regs, patches], dim=1)
 class DINOv3ViTH(nn.Module):
     """DINOv3 ViT-H/16+ backbone.
     Token layout: [CLS, reg_0..reg_3, patch_0..patch_N].
+    Returns last_hidden_state [B, 1+R+P, D_MODEL].
     """
     def __init__(self):
         super().__init__()
         self.embeddings = _Embeddings()
         self.layer = nn.ModuleList([_Block() for _ in range(N_LAYERS)])
+        self.norm = nn.LayerNorm(D_MODEL, eps=LN_EPS)
+    def forward(self, pixel_values):
+        _, _, H, W = pixel_values.shape
+        x = self.embeddings(pixel_values)
         h_p, w_p = H // PATCH_SIZE, W // PATCH_SIZE
         cos, sin = _build_rope(h_p, w_p, x.dtype, pixel_values.device)
         for block in self.layer:
             x = block(x, cos, sin)
         return self.norm(x)
+# =============================================================================
+# Head — auto-detected from the checkpoint
+# =============================================================================
+class _LowRankHead(nn.Module):
+    """Two-matrix low-rank projection head.
+        features (in_dim)
+          → Linear(in_dim, rank, bias=?)
+          → Linear(rank, num_tags, bias=?)
     """
+    def __init__(self, in_dim: int, rank: int, num_tags: int,
+                 down_bias: bool, up_bias: bool):
         super().__init__()
+        self.proj_down = nn.Linear(in_dim, rank, bias=down_bias)
+        self.proj_up = nn.Linear(rank, num_tags, bias=up_bias)
+    def forward(self, x):
+        return self.proj_up(self.proj_down(x))
+def _build_head_from_checkpoint(
+    head_sd: dict,
+    in_dim: int,
+    num_tags: int,
+) -> tuple[nn.Module, dict]:
+    """Inspect head_sd and build a matching Module.
+    Supports two layouts, in order of preference:
+      1. Single linear          — any ``*.weight`` with shape [num_tags, in_dim]
+      2. Low-rank pair (2 mats) — one ``*.weight`` [rank, in_dim] plus
+                                   one ``*.weight`` [num_tags, rank]
+    Returns (module, remapped_state_dict) where the remapped state dict
+    matches the module's own key names so strict loading works.
+    """
+    weights_2d = [(k, v) for k, v in head_sd.items()
+                  if k.endswith(".weight") and v.ndim == 2]
+    # --- Case 1: single dense linear ---------------------------------------
+    singles = [(k, v) for k, v in weights_2d
+               if tuple(v.shape) == (num_tags, in_dim)]
+    if len(weights_2d) <= 2 and len(singles) == 1:
+        wkey, wval = singles[0]
+        base = wkey[:-len(".weight")]
+        bias_key = base + ".bias"
+        has_bias = bias_key in head_sd
+        module = nn.Linear(in_dim, num_tags, bias=has_bias)
+        remapped = {"weight": wval}
+        if has_bias:
+            remapped["bias"] = head_sd[bias_key]
+        # Sanity check: no extra keys we don't understand
+        expected_src = {wkey} | ({bias_key} if has_bias else set())
+        extra = set(head_sd) - expected_src
+        if extra:
+            raise RuntimeError(
+                f"Head has single-linear shape but extra unknown keys: {sorted(extra)}")
+        return module, remapped
+    # --- Case 2: low-rank pair ---------------------------------------------
+    down = None  # (key, tensor) with shape [rank, in_dim]
+    up = None    # (key, tensor) with shape [num_tags, rank]
+    for k, v in weights_2d:
+        if v.shape[1] == in_dim and v.shape[0] != num_tags:
+            down = (k, v)
+        elif v.shape[0] == num_tags and v.shape[1] != in_dim:
+            up = (k, v)
+    if down is not None and up is not None:
+        rank_down = down[1].shape[0]
+        rank_up = up[1].shape[1]
+        if rank_down != rank_up:
+            raise RuntimeError(
+                f"Low-rank head: inner dims disagree "
+                f"(down out={rank_down}, up in={rank_up})")
+        down_key, down_w = down
+        up_key, up_w = up
+        down_base = down_key[:-len(".weight")]
+        up_base = up_key[:-len(".weight")]
+        down_bias_key = down_base + ".bias"
+        up_bias_key = up_base + ".bias"
+        has_down_bias = down_bias_key in head_sd
+        has_up_bias = up_bias_key in head_sd
+        module = _LowRankHead(
+            in_dim=in_dim,
+            rank=rank_down,
+            num_tags=num_tags,
+            down_bias=has_down_bias,
+            up_bias=has_up_bias,
+        )
+        remapped = {
+            "proj_down.weight": down_w,
+            "proj_up.weight": up_w,
+        }
+        if has_down_bias:
+            remapped["proj_down.bias"] = head_sd[down_bias_key]
+        if has_up_bias:
+            remapped["proj_up.bias"] = head_sd[up_bias_key]
+        # Sanity check
+        expected_src = {down_key, up_key}
+        if has_down_bias:
+            expected_src.add(down_bias_key)
+        if has_up_bias:
+            expected_src.add(up_bias_key)
+        extra = set(head_sd) - expected_src
+        if extra:
+            raise RuntimeError(
+                f"Low-rank head detected but checkpoint has extra unknown "
+                f"head keys: {sorted(extra)}")
+        print(f"[Tagger] Detected low-rank head: "
+              f"in_dim={in_dim}, rank={rank_down}, num_tags={num_tags} "
+              f"(down_bias={has_down_bias}, up_bias={has_up_bias})")
+        return module, remapped
+    raise RuntimeError(
+        "Could not infer head architecture from checkpoint. "
+        f"Non-backbone keys found: {sorted(head_sd.keys())}"
+    )
 # =============================================================================
+# Tagger wrapper module
 # =============================================================================
 class DINOv3Tagger(nn.Module):
+    """Backbone + head. The head is attached after the checkpoint is
+    inspected (so we can build the right shape)."""
+    def __init__(self):
+        super().__init__()
+        self.backbone = DINOv3ViTH()
+        self.head: nn.Module | None = None  # attached by Tagger
+    def forward(self, pixel_values):
+        hidden = self.backbone(pixel_values)
+        cls = hidden[:, 0, :]
+        regs = hidden[:, 1: 1 + N_REGISTERS, :].flatten(1)
+        features = torch.cat([cls, regs], dim=-1).float()  # fp32 for head
+        return self.head(features)
+# =============================================================================
+# Checkpoint loading helpers
+# =============================================================================
+def _split_and_clean_state_dict(sd: dict) -> tuple[dict, dict]:
+    """Split full state dict into (backbone_sd, head_sd), stripping the
+    ``backbone.`` prefix and applying the remaps needed to match
+    ``DINOv3ViTH``'s parameter layout:
+      1. ``backbone.model.layer.N.*`` → ``layer.N.*``
+         (the checkpoint has an HF-style intermediate ``model`` wrapper
+         that our flat backbone class does not)
+      2. ``...layer_scale{1,2}.lambda1`` → ``...layer_scale{1,2}``
+         (HF stores layer_scale as a sub-module with a ``lambda1``
+         parameter; we use a plain ``nn.Parameter``)
+      3. Drop any ``rope_embeddings`` buffers (recomputed on the fly)
     """
+    backbone_sd: dict = {}
+    head_sd: dict = {}
+    for k, v in sd.items():
+        if k.startswith("backbone."):
+            nk = k[len("backbone."):]
+            # Remap (1): strip intermediate "model." before "layer."
+            if nk.startswith("model.layer."):
+                nk = nk[len("model."):]
+            backbone_sd[nk] = v
+        else:
+            head_sd[k] = v
+    # Remap (2): layer.N.layer_scale{1,2}.lambda1 → layer.N.layer_scale{1,2}
+    for k in list(backbone_sd.keys()):
+        if ".layer_scale" in k and k.endswith(".lambda1"):
+            backbone_sd[k[:-len(".lambda1")]] = backbone_sd.pop(k)
+    # Remap (3): drop rope buffers (recomputed on the fly)
+    for k in list(backbone_sd.keys()):
+        if "rope_embeddings" in k:
+            backbone_sd.pop(k)
+    return backbone_sd, head_sd
 # =============================================================================
 # =============================================================================
 _IMAGENET_MEAN = [0.485, 0.456, 0.406]
+_IMAGENET_STD = [0.229, 0.224, 0.225]
 def _snap(x: int, m: int) -> int:
 def preprocess_image(source, max_size: int = 1024) -> torch.Tensor:
+    """Load and preprocess an image → [1, 3, H, W] float32, ImageNet-normalised.
+    Aspect ratio is preserved: a single scale factor is chosen so that the
+    long edge fits inside max_size after snapping to a PATCH_SIZE multiple.
+    """
     img = _open_image(source)
     w, h = img.size
+    # Target long-edge (snapped to patch multiple).
+    long_edge = max(w, h)
+    target_long = _snap(min(long_edge, max_size), PATCH_SIZE)
+    scale = target_long / long_edge
+    new_w = _snap(max(PATCH_SIZE, round(w * scale)), PATCH_SIZE)
+    new_h = _snap(max(PATCH_SIZE, round(h * scale)), PATCH_SIZE)
     return v2.Compose([
         v2.Resize((new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS),
         v2.ToImage(),
     Parameters
     ----------
     checkpoint_path : str
+        Path to a .safetensors or .pt/.pth checkpoint.
     vocab_path : str
+        Path to tagger_vocab.json or tagger_vocab_with_categories.json
+        (either must contain an ``idx2tag`` list).
     device : str
+        "cuda", "cuda:0", "cpu", ...
     dtype : torch.dtype
+        Backbone precision. bfloat16 recommended on Ampere+, float16 for
+        older GPUs, float32 for CPU. The head always runs in fp32.
     max_size : int
         Long-edge cap in pixels before feeding to the model.
     """
         dtype: torch.dtype = torch.bfloat16,
         max_size: int = 1024,
     ):
+        want_cuda = device.startswith("cuda")
+        if want_cuda and not torch.cuda.is_available():
+            print("[Tagger] CUDA not available, falling back to CPU")
+            device = "cpu"
+            dtype = torch.float32
+        self.device = torch.device(device)
+        self.dtype = dtype
         self.max_size = max_size
         with open(vocab_path) as f:
         self.num_tags = len(self.idx2tag)
         print(f"[Tagger] Vocabulary: {self.num_tags:,} tags")
+        # --- Load checkpoint to CPU first so we can inspect shapes ---------
         print(f"[Tagger] Loading checkpoint: {checkpoint_path}")
         if checkpoint_path.endswith((".safetensors", ".sft")):
+            sd = load_file(checkpoint_path, device="cpu")
         else:
+            sd = torch.load(checkpoint_path, map_location="cpu")
+        backbone_sd, head_sd = _split_and_clean_state_dict(sd)
+        if not head_sd:
+            raise RuntimeError(
+                "Checkpoint contains no non-backbone keys — cannot build head.")
+        # --- Build model, inferring head shape from the checkpoint --------
+        self.model = DINOv3Tagger()
+        head_module, head_sd_remapped = _build_head_from_checkpoint(
+            head_sd, in_dim=FEATURE_DIM, num_tags=self.num_tags,
+        )
+        self.model.head = head_module
+        # --- Strict load — mismatches raise instead of silently passing ----
+        self.model.backbone.load_state_dict(backbone_sd, strict=True)
+        self.model.head.load_state_dict(head_sd_remapped, strict=True)
+        # --- Move to device. Backbone → bf16/fp16; head stays fp32. --------
+        self.model.backbone = self.model.backbone.to(
+            device=self.device, dtype=dtype)
+        self.model.head = self.model.head.to(
+            device=self.device, dtype=torch.float32)
         self.model.eval()
+        print(f"[Tagger] Ready on {self.device} (backbone={dtype}, head=fp32)")
     @torch.no_grad()
     def predict(self, image, topk: int | None = 30,
                 threshold: float | None = None) -> list[tuple[str, float]]:
+        """Tag a single image (local path or URL)."""
         if topk is None and threshold is None:
             topk = 30
         pv = preprocess_image(image, max_size=self.max_size).to(self.device)
+        logits = self.model(pv)[0]
         scores = torch.sigmoid(logits.float())
         if topk is not None:
         else:
             assert threshold is not None
             indices = (scores >= threshold).nonzero(as_tuple=True)[0]
+            values = scores[indices]
+            order = values.argsort(descending=True)
             indices, values = indices[order], values[order]
+        return [(self.idx2tag[i], float(v))
+                for i, v in zip(indices.tolist(), values.tolist())]
     @torch.no_grad()
     def predict_batch(self, images, topk: int | None = 30,
+                      threshold: float | None = None):
+        return [self.predict(img, topk=topk, threshold=threshold)
+                for img in images]
 # =============================================================================
 # =============================================================================
 def _fmt_pretty(path: str, results) -> str:
+    lines = [f"\n{'─' * 60}", f" {path}", f"{'─' * 60}"]
     for rank, (tag, score) in enumerate(results, 1):
         bar = "█" * int(score * 20)
+        lines.append(f" {rank:>3}. {score:.3f} {bar:<20} {tag}")
     return "\n".join(lines)
 def _fmt_tags(results) -> str:
     return ", ".join(tag for tag, _ in results)
 def _fmt_json(path: str, results) -> dict:
+    return {"file": path,
+            "tags": [{"tag": t, "score": round(s, 4)} for t, s in results]}
 # =============================================================================
 def main():
     parser = argparse.ArgumentParser(
+        description="DINOv3 ViT-H/16+ tagger inference (standalone)",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
+    parser.add_argument("--checkpoint", required=True,
+                        help="Path to .safetensors or .pt checkpoint")
+    parser.add_argument("--vocab", required=True,
+                        help="Path to tagger_vocab*.json")
+    parser.add_argument("--images", nargs="+", required=True,
+                        help="Image paths and/or http(s) URLs")
+    parser.add_argument("--device", default="cuda",
+                        help="Device: cuda, cuda:0, cpu (default: cuda)")
     parser.add_argument("--max-size", type=int, default=1024,
+                        help="Long-edge cap in pixels (default: 1024)")
     mode = parser.add_mutually_exclusive_group()
+    mode.add_argument("--topk", type=int, default=30,
+                      help="Return top-k tags (default: 30)")
+    mode.add_argument("--threshold", type=float,
+                      help="Return all tags with score >= threshold")
     parser.add_argument("--format", choices=["pretty", "tags", "json"],
                         default="pretty", help="Output format (default: pretty)")
     args = parser.parse_args()
+    tagger = Tagger(
+        checkpoint_path=args.checkpoint,
+        vocab_path=args.vocab,
+        device=args.device,
+        max_size=args.max_size,
+    )
+    topk, threshold = (
+        (None, args.threshold) if args.threshold else (args.topk, None)
+    )
     json_out = []
     for src in args.images:
             print(f"[warning] File not found: {src}", file=sys.stderr)
             continue
         results = tagger.predict(src, topk=topk, threshold=threshold)
+        if args.format == "pretty":
+            print(_fmt_pretty(src, results))
+        elif args.format == "tags":
+            print(_fmt_tags(results))
+        elif args.format == "json":
+            json_out.append(_fmt_json(src, results))
     if args.format == "json":
         print(json.dumps(json_out, indent=2, ensure_ascii=False))
 if __name__ == "__main__":
+    main()