lodestones
/

taggerine

+"""DINOv3 Tagger — FastAPI + Jinja2 Web UI
+Usage
+-----
+python tagger_ui_server.py \
+    --checkpoint tagger_dino_v3/checkpoints/2026-03-28_22-57-47.safetensors \
+    --vocab      tagger_dino_v3/tagger_vocab.json \
+    --host       0.0.0.0 \
+    --port       7860
+Then open http://localhost:7860 in your browser.
+"""
+from __future__ import annotations
+import argparse
+import io
+from pathlib import Path
+import torch
+import uvicorn
+from fastapi import FastAPI, File, HTTPException, Query, UploadFile
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+from fastapi.requests import Request
+from PIL import Image
+# Reuse the standalone inference code — no other deps needed
+from inference_tagger_standalone import Tagger, preprocess_image, _open_image
+# ---------------------------------------------------------------------------
+# App setup
+# ---------------------------------------------------------------------------
+# Single FastAPI application instance; the route decorators below register on it.
+app = FastAPI(title="DINOv3 Tagger UI")
+# Templates are resolved relative to this file (tagger_ui/templates) rather than
+# the current working directory.
+templates = Jinja2Templates(directory=Path(__file__).parent / "tagger_ui" / "templates")
+# Jinja filter that renders a number with thousands separators (12345 -> "12,345").
+templates.env.filters["format_number"] = lambda v: f"{v:,}"
+# Module-level state assigned by main(). Both keep these placeholder values when
+# the app object is served without going through main().
+_tagger: Tagger | None = None
+_vocab_path: str = ""
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+@app.get("/", response_class=HTMLResponse)
+async def index(request: Request):
+    return templates.TemplateResponse("index.html", {
+        "request":    request,
+        "num_tags":   _tagger.num_tags if _tagger else 0,
+        "vocab_path": _vocab_path,
+    })
+@app.post("/tag/url")
+async def tag_url(
+    url:       str   = Query(..., description="Image URL"),
+    topk:      int   | None = Query(default=40),
+    threshold: float | None = Query(default=None),
+    max_size:  int          = Query(default=1024),
+):
+    """Tag an image from a URL."""
+    assert _tagger is not None
+    try:
+        img = _open_image(url)
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Could not fetch image: {e}")
+    return _run_tagger(img, topk, threshold, max_size)
+@app.post("/tag/upload")
+async def tag_upload(
+    file:      UploadFile    = File(...),
+    topk:      int   | None  = Query(default=40),
+    threshold: float | None  = Query(default=None),
+    max_size:  int           = Query(default=1024),
+):
+    """Tag an uploaded image file."""
+    assert _tagger is not None
+    try:
+        data = await file.read()
+        img = Image.open(io.BytesIO(data)).convert("RGB")
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Could not read image: {e}")
+    return _run_tagger(img, topk, threshold, max_size)
+# ---------------------------------------------------------------------------
+# Shared inference helper
+# ---------------------------------------------------------------------------
+def _run_tagger(
+    img:       Image.Image,
+    topk:      int | None,
+    threshold: float | None,
+    max_size:  int,
+) -> dict:
+    assert _tagger is not None
+    if topk is None and threshold is None:
+        topk = 40
+    # Preprocess from PIL directly (avoids re-opening)
+    from inference_tagger_standalone import _snap, PATCH_SIZE, _IMAGENET_MEAN, _IMAGENET_STD
+    import torchvision.transforms.v2 as v2
+    w, h = img.size
+    scale = min(1.0, max_size / max(w, h))
+    new_w = _snap(round(w * scale), PATCH_SIZE)
+    new_h = _snap(round(h * scale), PATCH_SIZE)
+    transform = v2.Compose([
+        v2.Resize((new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS),
+        v2.ToImage(),
+        v2.ToDtype(torch.float32, scale=True),
+        v2.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
+    ])
+    pixel_values = transform(img).unsqueeze(0).to(_tagger.device)
+    with torch.no_grad(), torch.autocast(device_type=_tagger.device.type, dtype=_tagger.dtype):
+        logits = _tagger.model(pixel_values)[0]
+    scores = torch.sigmoid(logits.float())
+    if topk is not None:
+        values, indices = scores.topk(min(topk, _tagger.num_tags))
+    else:
+        assert threshold is not None
+        indices = (scores >= threshold).nonzero(as_tuple=True)[0]
+        values  = scores[indices]
+        order   = values.argsort(descending=True)
+        indices, values = indices[order], values[order]
+    tags = [
+        {"tag": _tagger.idx2tag[i], "score": round(float(v), 4)}
+        for i, v in zip(indices.tolist(), values.tolist())
+    ]
+    return {"tags": tags, "count": len(tags)}
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+def main():
+    global _tagger, _vocab_path
+    parser = argparse.ArgumentParser(description="DINOv3 Tagger Web UI")
+    parser.add_argument("--checkpoint", required=True, help="Path to .safetensors checkpoint")
+    parser.add_argument("--vocab",      required=True, help="Path to tagger_vocab.json")
+    parser.add_argument("--device",     default="cuda", help="cuda / cpu (default: cuda)")
+    parser.add_argument("--max-size",   type=int, default=1024, help="Default long-edge cap")
+    parser.add_argument("--host",       default="0.0.0.0")
+    parser.add_argument("--port",       type=int, default=7860)
+    args = parser.parse_args()
+    _vocab_path = args.vocab
+    _tagger = Tagger(
+        checkpoint_path=args.checkpoint,
+        vocab_path=args.vocab,
+        device=args.device,
+        max_size=args.max_size,
+    )
+    print(f"\n  Tagger UI running at  http://{args.host}:{args.port}\n")
+    uvicorn.run(app, host=args.host, port=args.port)
+if __name__ == "__main__":
+    main()