Spaces:
Running on Zero
Use AutoModel for model loading, remove 2200+ LOC of dead code, add DPT seg legend, add smaller resolutions
#2
by gberton - opened
- .ruff_cache/.gitignore +2 -0
- .ruff_cache/0.15.9/8006769214093067198 +0 -0
- .ruff_cache/CACHEDIR.TAG +1 -0
- app.py +85 -547
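The headline change is swapping the hand-rolled checkpoint download-and-build path for `AutoModel` with remote code. A minimal sketch of the loading pattern this PR adopts (the repo id and the `vision_encoder`/`text_encoder` attributes are taken from the diff below; treat their availability and exact names as assumptions, not a documented public API):

```python
# Sketch only: mirrors the pattern added in app.py below. Assumes the
# google/tipsv2-* repos ship their modeling code via trust_remote_code.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("google/tipsv2-l14", trust_remote_code=True)
model.eval()

# The remote-code model bundles both towers; the Space pulls them out
# directly (attribute names as used in the diff, not a guaranteed API).
vision, text = model.vision_encoder, model.text_encoder
```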
.ruff_cache/.gitignore
ADDED
```diff
@@ -0,0 +1,2 @@
+# Automatically created by ruff.
+*
```
.ruff_cache/0.15.9/8006769214093067198
ADDED
Binary file (61 Bytes).
.ruff_cache/CACHEDIR.TAG
ADDED
```diff
@@ -0,0 +1 @@
+Signature: 8a477f597d28d172789f06886806bc55
```
app.py
CHANGED
```diff
@@ -1,9 +1,6 @@
 """TIPS Feature Explorer (GPU) — Hugging Face Space demo with ZeroGPU."""

 import colorsys
-import io
-import os
-import urllib.request

 import gradio as gr
 import matplotlib.cm as cm
```
```diff
@@ -16,97 +13,35 @@ from PIL import Image, ImageDraw, ImageFont
 from fast_pytorch_kmeans import KMeans as TorchKMeans
 from sklearn.decomposition import PCA
 from torchvision import transforms
-
-import dpt_head
-import image_encoder
-import text_encoder as text_encoder_mod
+from transformers import AutoModel

 # ── Constants ────────────────────────────────────────────────────────────────

 DEFAULT_IMAGE_SIZE = 896
-MODEL_IMAGE_SIZE = 448
 PATCH_SIZE = 14
-RESOLUTIONS = [896, 1120, 1372, 1792]
+RESOLUTIONS = [224, 336, 448, 672, 896, 1120, 1372, 1792]

 ZEROSEG_IMAGE_SIZE = 1372
-ZEROSEG_SPATIAL = ZEROSEG_IMAGE_SIZE // PATCH_SIZE  # 96
-DEPTH_IMAGE_SIZE = 1036  # must be divisible by PATCH_SIZE=14 → 74×14
-DEPTH_SPATIAL = DEPTH_IMAGE_SIZE // PATCH_SIZE  # 74
-VOCAB_SIZE = 32000
 MAX_LEN = 64
-CKPT_DIR = "checkpoints"
-GCS = "https://storage.googleapis.com/tips_data"
-
-# Per-variant DPT config: embed_dim, block_indices, checkpoint URLs
-DPT_CONFIGS = {
-    "TIPS v2 — B/14": dict(
-        embed_dim=768, block_indices=[2, 5, 8, 11],
-        depth_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_b14_depth_dpt.zip",
-        normals_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_b14_normals_dpt.zip",
-        seg_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_b14_segmentation_dpt.zip",
-    ),
-    "TIPS v2 — L/14": dict(
-        embed_dim=1024, block_indices=[5, 11, 17, 23],
-        depth_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_l14_depth_dpt.zip",
-        normals_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_l14_normals_dpt.zip",
-        seg_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_l14_segmentation_dpt.zip",
-    ),
-    "TIPS v2 — SO400m/14": dict(
-        embed_dim=1152, block_indices=[6, 13, 20, 26],
-        depth_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_so400m14_depth_dpt.zip",
-        normals_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_so400m14_normals_dpt.zip",
-        seg_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_so400m14_segmentation_dpt.zip",
-    ),
-    "TIPS v2 — g/14": dict(
-        embed_dim=1536, block_indices=[9, 19, 29, 39],
-        depth_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_g14_depth_dpt.zip",
-        normals_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_g14_normals_dpt.zip",
-        seg_url=f"{GCS}/v2_0/checkpoints/scenic/tips_v2_g14_segmentation_dpt.zip",
-    ),
-}
-DPT_VARIANT_CHOICES = list(DPT_CONFIGS.keys())
-DEFAULT_DPT_VARIANT = "TIPS v2 — L/14"
-
-
-def _device():
-    """Resolve device dynamically — GPU is only available inside @spaces.GPU."""
-    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# ── Model variants ──────────────────────────────────────────────────────────

+# HF model repos
 VARIANTS = {
-    "TIPS v2 — B/14": dict(
-        …  # (entries lost in page extraction)
-    ),
-    "TIPS v2 — L/14": dict(
-        …  # (URL entries lost in page extraction)
-        vision_fn="vit_large",
-        text_cfg=dict(hidden_size=1024, mlp_dim=4096, num_heads=16, num_layers=12),
-        ffn="mlp",
-    ),
-    "TIPS v2 — SO400m/14": dict(
-        vision_url=f"{GCS}/v2_0/checkpoints/pytorch/tips_v2_oss_so14_vision.npz",
-        text_url=f"{GCS}/v2_0/checkpoints/pytorch/tips_v2_oss_so14_text.npz",
-        vision_fn="vit_so400m",
-        text_cfg=dict(hidden_size=1152, mlp_dim=4304, num_heads=16, num_layers=27),
-        ffn="mlp",
-    ),
-    "TIPS v2 — g/14": dict(
-        vision_url=f"{GCS}/v2_0/checkpoints/pytorch/tips_v2_oss_g14_vision.npz",
-        text_url=f"{GCS}/v2_0/checkpoints/pytorch/tips_v2_oss_g14_text.npz",
-        vision_fn="vit_giant2",
-        text_cfg=dict(hidden_size=1536, mlp_dim=6144, num_heads=24, num_layers=12),
-        ffn="swiglu",
-    ),
+    "TIPS v2 — B/14": "google/tipsv2-b14",
+    "TIPS v2 — L/14": "google/tipsv2-l14",
+    "TIPS v2 — SO400m/14": "google/tipsv2-so400m14",
+    "TIPS v2 — g/14": "google/tipsv2-g14",
+}
+DPT_VARIANTS = {
+    "TIPS v2 — B/14": "google/tipsv2-b14-dpt",
+    "TIPS v2 — L/14": "google/tipsv2-l14-dpt",
+    "TIPS v2 — SO400m/14": "google/tipsv2-so400m14-dpt",
+    "TIPS v2 — g/14": "google/tipsv2-g14-dpt",
 }
-
 DEFAULT_VARIANT = "TIPS v2 — L/14"

+def _device():
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 # ── Pascal Context (59 classes) ──────────────────────────────────────────────

 # TCL prompt templates (from the Scenic zero-shot seg evaluator).
```
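The widened `RESOLUTIONS` list is the "smaller resolutions" part of the PR title. Every entry has to stay a multiple of `PATCH_SIZE = 14`, because the encoder reshapes its patch tokens into an `sp × sp` grid with `sp = resolution // PATCH_SIZE` (see `extract_features` further down). A quick self-check:

```python
# All eight resolutions tile cleanly into 14-px patches; this verifies the
# invariant that extract_features() relies on (sp = resolution // 14).
PATCH_SIZE = 14
RESOLUTIONS = [224, 336, 448, 672, 896, 1120, 1372, 1792]
for r in RESOLUTIONS:
    assert r % PATCH_SIZE == 0, f"{r} is not a multiple of {PATCH_SIZE}"
    print(f"{r:>4} px -> {r // PATCH_SIZE} x {r // PATCH_SIZE} patch tokens")
```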
```diff
@@ -135,57 +70,6 @@ PASCAL_CONTEXT_CLASSES = (
     "wood",
 )

-# ── Pascal VOC (20 foreground classes) ──────────────────────────────────────
-
-PASCAL_VOC_CLASSES = (
-    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
-    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
-    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
-)
-
-PASCAL_VOC_PALETTE = np.array([
-    [128, 0, 0],      # aeroplane
-    [0, 128, 0],      # bicycle
-    [128, 128, 0],    # bird
-    [0, 0, 128],      # boat
-    [128, 0, 128],    # bottle
-    [0, 128, 128],    # bus
-    [128, 128, 128],  # car
-    [64, 0, 0],       # cat
-    [192, 0, 0],      # chair
-    [64, 128, 0],     # cow
-    [192, 128, 0],    # diningtable
-    [64, 0, 128],     # dog
-    [192, 0, 128],    # horse
-    [64, 128, 128],   # motorbike
-    [192, 128, 128],  # person
-    [0, 64, 0],       # pottedplant
-    [128, 64, 0],     # sheep
-    [0, 192, 0],      # sofa
-    [128, 192, 0],    # train
-    [0, 64, 128],     # tvmonitor
-], dtype=np.uint8)
-
-# Colors from segmentation_dataset_info.py (matching class order above,
-# i.e. index 0 = aeroplane, etc.).
-PASCAL_CONTEXT_PALETTE = np.array([
-    [128, 0, 0], [214, 35, 42], [142, 28, 102], [39, 158, 136],
-    [195, 112, 211], [0, 128, 0], [128, 128, 0], [0, 0, 128],
-    [127, 34, 91], [128, 0, 128], [83, 137, 118], [0, 128, 128],
-    [165, 86, 86], [128, 128, 128], [64, 0, 0], [106, 30, 114],
-    [192, 0, 0], [226, 154, 154], [67, 11, 127], [64, 128, 0],
-    [14, 242, 18], [155, 9, 121], [64, 0, 128], [131, 76, 67],
-    [229, 106, 184], [37, 131, 150], [160, 150, 59], [154, 176, 215],
-    [255, 255, 222], [106, 160, 142], [192, 0, 128], [214, 35, 42],
-    [141, 90, 178], [64, 128, 128], [229, 106, 184], [116, 116, 116],
-    [192, 128, 128], [0, 182, 198], [21, 106, 168], [0, 64, 0],
-    [6, 151, 48], [214, 35, 42], [128, 64, 0], [131, 76, 67],
-    [229, 106, 184], [116, 116, 116], [0, 182, 198], [0, 182, 198],
-    [0, 192, 0], [255, 117, 39], [6, 151, 48], [128, 192, 0],
-    [141, 90, 178], [131, 76, 6], [0, 64, 128], [116, 116, 116],
-    [178, 182, 50], [0, 182, 198], [21, 106, 168],
-], dtype=np.uint8)
-
 ADE20K_CLASSES = (
     'wall', 'building', 'sky', 'floor', 'tree',
     'ceiling', 'road', 'bed', 'windowpane', 'grass',
```
```diff
@@ -238,144 +122,49 @@ _model = {
     "text": None,
     "tokenizer": None,
     "temperature": None,
-    "ade20k_embs": None,
-    "voc_embs": None,
+    "ade20k_embs": None,
+    "_hf_model": None,
 }

-# DPT depth head — keyed per variant
 _dpt = {
-    "variant": None,
-    "model": None,
-    "normals_model": None,
-    "segmentation_model": None,  # DPTSegmentationHead on CPU
-    "vision": None,  # vision encoder for current DPT variant
+    "variant": None,
+    "model": None,
+    "_hf_dpt": None,
 }

-
-def _download(url):
-    """Download a file to CKPT_DIR if not already present. Return local path."""
-    fname = url.rsplit("/", 1)[-1]
-    path = os.path.join(CKPT_DIR, fname)
-    if not os.path.exists(path):
-        print(f"Downloading {fname} ...")
-        urllib.request.urlretrieve(url, path)
-    return path
-
-
 def load_variant(name):
-    """
-
-    Models are kept on CPU for storage. They are moved to GPU dynamically
-    inside @spaces.GPU-decorated callbacks via _move_models_to_device().
-    """
+    """Load a model variant from HuggingFace."""
     global _model
     if _model["name"] == name:
         return
-
-
-    cfg = VARIANTS[name]
-
-    # -- vision encoder (load on CPU) --
-    vis_path = _download(cfg["vision_url"])
-    weights_v = {k: torch.tensor(v) for k, v in np.load(vis_path, allow_pickle=False).items()}
-    build_vision = getattr(image_encoder, cfg["vision_fn"])
-    model_v = build_vision(
-        img_size=MODEL_IMAGE_SIZE, patch_size=PATCH_SIZE, ffn_layer=cfg["ffn"],
-        block_chunks=0, init_values=1.0,
-        interpolate_antialias=True, interpolate_offset=0.0,
-    )
-    model_v.load_state_dict(weights_v)
-    model_v.eval()
-
-    # -- text encoder (load on CPU) --
-    txt_path = _download(cfg["text_url"])
-    with open(txt_path, "rb") as f:
-        weights_t = {k: torch.from_numpy(v) for k, v in np.load(io.BytesIO(f.read()), allow_pickle=False).items()}
-    temperature = weights_t.pop("temperature")
-    model_t = text_encoder_mod.TextEncoder(cfg["text_cfg"], vocab_size=VOCAB_SIZE)
-    model_t.load_state_dict(weights_t)
-    model_t.eval()
-
-    # -- tokenizer (shared across variants) --
-    tok_path = _download(f"{GCS}/v1_0/checkpoints/tokenizer.model")
-    tokenizer = text_encoder_mod.Tokenizer(tok_path)
-
+    hf_model = AutoModel.from_pretrained(VARIANTS[name], trust_remote_code=True)
+    hf_model.eval()
     _model.update(
-        name=name,
-        …  # (remaining keyword arguments lost in page extraction)
+        name=name,
+        vision=hf_model.vision_encoder,
+        text=hf_model.text_encoder,
+        tokenizer=hf_model._load_tokenizer(),
+        temperature=hf_model.config.temperature,
+        ade20k_embs=None,
+        voc_embs=None,
+        _hf_model=hf_model,
     )
-    print(f"Loaded {name} …")
-
+    print(f"Loaded {name}")

 def _load_dpt(variant_name=None):
-    """…"""
+    """Load DPT heads from HuggingFace."""
     global _dpt
     if variant_name is None:
-        variant_name = DEFAULT_DPT_VARIANT
-    cfg = DPT_CONFIGS[variant_name]
-    embed_dim = cfg["embed_dim"]
-
-    # Skip reload if same variant is already loaded
+        variant_name = DEFAULT_VARIANT
     if _dpt["variant"] == variant_name and _dpt["model"] is not None:
         return
-
-
-
-    # Load DPT depth head
-    zip_path = _download(cfg["depth_url"])
-    dpt_model = dpt_head.DPTDepthHead(
-        input_embed_dim=embed_dim, channels=256,
-        post_process_channels=(128, 256, 512, 1024),
-        readout_type="project", num_depth_bins=256,
-        min_depth=1e-3, max_depth=10.0,
-    )
-    dpt_head.load_dpt_weights(dpt_model, zip_path)
-    dpt_model.eval()
-    _dpt["model"] = dpt_model
-
-    # Load DPT normals head
-    normals_zip = _download(cfg["normals_url"])
-    normals_model = dpt_head.DPTNormalsHead(
-        input_embed_dim=embed_dim, channels=256,
-        post_process_channels=(128, 256, 512, 1024),
-        readout_type="project",
-    )
-    dpt_head.load_normals_weights(normals_model, normals_zip)
-    normals_model.eval()
-    _dpt["normals_model"] = normals_model
-
-    # Load DPT segmentation head
-    seg_zip = _download(cfg["seg_url"])
-    seg_model = dpt_head.DPTSegmentationHead(
-        input_embed_dim=embed_dim, channels=256,
-        post_process_channels=(128, 256, 512, 1024),
-        readout_type="project", num_classes=150,
-    )
-    dpt_head.load_segmentation_weights(seg_model, seg_zip)
-    seg_model.eval()
-    _dpt["segmentation_model"] = seg_model
-
-    # Vision encoder — reuse if the main model matches
-    var_cfg = VARIANTS[variant_name]
+    hf_dpt = AutoModel.from_pretrained(DPT_VARIANTS[variant_name], trust_remote_code=True)
+    hf_dpt.eval()
+    # Reuse backbone from main model if variants match to save memory
     if _model["name"] == variant_name and _model["vision"] is not None:
-        …  # (reuse/else branch lost in page extraction)
-    weights_v = {k: torch.tensor(v) for k, v in np.load(vis_path, allow_pickle=False).items()}
-    build_fn = getattr(image_encoder, var_cfg["vision_fn"])
-    vision = build_fn(
-        img_size=MODEL_IMAGE_SIZE, patch_size=PATCH_SIZE,
-        ffn_layer=var_cfg["ffn"], block_chunks=0, init_values=1.0,
-        interpolate_antialias=True, interpolate_offset=0.0,
-    )
-    vision.load_state_dict(weights_v)
-    vision.eval()
-    _dpt["vision"] = vision
-    _dpt["variant"] = variant_name
-
-    print(f"Loaded DPT heads + {variant_name} vision encoder (on CPU)")
-
+        hf_dpt._backbone = _model["_hf_model"]
+    _dpt.update(variant=variant_name, model=hf_dpt, _hf_dpt=hf_dpt)
+    print(f"Loaded DPT heads for {variant_name}")

 def _move_models_to_device():
     """Move models to the current device (GPU inside @spaces.GPU, else CPU)."""
```
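Both loaders are idempotent caches keyed on the variant name, so callbacks can call them unconditionally. How they compose at call time (a usage sketch; the backbone sharing relies on the `_backbone` assignment shown above):

```python
# Usage sketch of the cached loaders defined above.
load_variant("TIPS v2 — L/14")   # fills _model via AutoModel (CPU-resident)
_load_dpt("TIPS v2 — L/14")      # loads the DPT heads; shares the backbone
_load_dpt("TIPS v2 — L/14")      # cache hit in _dpt: returns immediately
```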
```diff
@@ -385,7 +174,6 @@ def _move_models_to_device():
     if _model["text"] is not None:
         _model["text"].to(dev)

-
 def _ensure_ade20k_embs():
     """Pre-compute Pascal Context text embeddings if not yet done (must run on GPU)."""
     if _model["ade20k_embs"] is not None:
```
```diff
@@ -403,32 +191,11 @@
     _model["ade20k_embs"] = l2_normalize(np.mean(all_embs, axis=0))
     print("Pascal Context text embeddings computed.")

-
-def _ensure_voc_embs():
-    """Pre-compute Pascal VOC text embeddings if not yet done (must run on GPU)."""
-    if _model["voc_embs"] is not None:
-        return
-    dev = _device()
-    model_t = _model["text"]
-    tokenizer = _model["tokenizer"]
-    all_embs = []
-    for template in TCL_PROMPTS:
-        prompts = [template.format(c) for c in PASCAL_VOC_CLASSES]
-        ids, paddings = tokenizer.tokenize(prompts, max_len=MAX_LEN)
-        with torch.no_grad():
-            embs = model_t(torch.from_numpy(ids).to(dev), torch.from_numpy(paddings).to(dev))
-        all_embs.append(embs.cpu().numpy())
-    _model["voc_embs"] = l2_normalize(np.mean(all_embs, axis=0))
-    print("Pascal VOC text embeddings computed.")
-
-
 def _init_model():
     """Load model + move to GPU + compute text embeddings."""
     load_variant(_model["name"] or DEFAULT_VARIANT)
     _move_models_to_device()
     _ensure_ade20k_embs()
-    _ensure_voc_embs()
-

 # ── Preprocessing & helpers ──────────────────────────────────────────────────
```
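`_ensure_ade20k_embs` builds the class embeddings as a prompt ensemble: each class is embedded once per TCL template, the per-template embeddings are averaged, and the mean is L2-normalized. The same recipe in isolation (`embed_fn` is a hypothetical stand-in for the tokenizer plus text encoder):

```python
import numpy as np

def ensemble_embeddings(embed_fn, templates, classes):
    """embed_fn(prompts) -> (num_classes, D); returns L2-normalized means."""
    # One (num_classes, D) matrix per template, averaged over the ensemble.
    all_embs = [embed_fn([t.format(c) for c in classes]) for t in templates]
    mean = np.mean(all_embs, axis=0)
    return mean / np.linalg.norm(mean, axis=-1, keepdims=True).clip(min=1e-3)
```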
```diff
@@ -438,16 +205,9 @@ def preprocess(img, size=DEFAULT_IMAGE_SIZE):
         transforms.ToTensor(),
     ])(img)

-preprocess_zeroseg = transforms.Compose([
-    transforms.Resize((ZEROSEG_IMAGE_SIZE, ZEROSEG_IMAGE_SIZE)),
-    transforms.ToTensor(),
-])
-
-
 def l2_normalize(x, axis=-1):
     return x / np.linalg.norm(x, ord=2, axis=axis, keepdims=True).clip(min=1e-3)

-
 def upsample(arr, h, w, mode="bilinear"):
     """Upsample (H, W, C) or (H, W) numpy array to (h, w, ...)."""
     t = torch.from_numpy(arr).float()
```
```diff
@@ -458,11 +218,9 @@ def upsample(arr, h, w, mode="bilinear"):
     up = F.interpolate(t, size=(h, w), mode=mode, **kwargs)
     return up[0].permute(1, 2, 0).numpy()

-
 def to_uint8(x):
     return (x * 255).clip(0, 255).astype(np.uint8)

-
 # ── Feature extraction (GPU-accelerated) ─────────────────────────────────────

 @torch.no_grad()
```
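`upsample` is the one resizing primitive everything else leans on: it round-trips an HWC numpy array through an NCHW tensor for `F.interpolate`. A self-contained check of that contract (shapes chosen arbitrarily):

```python
import numpy as np
import torch
import torch.nn.functional as F

arr = np.random.rand(64, 64, 3).astype(np.float32)       # (H, W, C) input
t = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0)  # -> (1, C, H, W)
up = F.interpolate(t, size=(896, 896), mode="bilinear", align_corners=False)
out = up[0].permute(1, 2, 0).numpy()                     # back to (H, W, C)
assert out.shape == (896, 896, 3)
```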
```diff
@@ -475,7 +233,6 @@ def extract_features(image_np, resolution=DEFAULT_IMAGE_SIZE):
     sp = resolution // PATCH_SIZE
     return patch_tokens.cpu().reshape(sp, sp, -1).numpy()

-
 @torch.no_grad()
 def extract_features_value_attention(image_np, resolution=ZEROSEG_IMAGE_SIZE):
     """Return spatial features (sp, sp, D) using Value Attention on GPU.
```
```diff
@@ -527,7 +284,6 @@ def extract_features_value_attention(image_np, resolution=ZEROSEG_IMAGE_SIZE):
     spatial = patch_tokens.cpu().reshape(sp, sp, -1).numpy()
     return spatial

-
 # ── PCA Visualisations ───────────────────────────────────────────────────────

 def vis_pca(spatial, h, w):
```
```diff
@@ -540,9 +296,8 @@ def vis_pca(spatial, h, w):
     rgb = 1 / (1 + np.exp(-2.0 * rgb))
     return to_uint8(upsample(rgb, h, w))

-
 def vis_depth(spatial, h, w):
-    """1st PCA component …"""
+    """1st PCA component visualized with inferno colormap."""
     feat = spatial.reshape(-1, spatial.shape[-1])
     H, W = spatial.shape[0], spatial.shape[1]
     depth = PCA(n_components=1).fit_transform(feat).reshape(H, W)
```
```diff
@@ -550,7 +305,6 @@ def vis_depth(spatial, h, w):
     colored = cm.get_cmap("inferno")(depth)[:, :, :3].astype(np.float32)
     return to_uint8(upsample(colored, h, w))

-
 def vis_kmeans(spatial, h, w, n_clusters=6):
     """K-means clustering of spatial features."""
     H, W = spatial.shape[:2]
```
```diff
@@ -566,186 +320,8 @@ def vis_kmeans(spatial, h, w, n_clusters=6):
     seg = palette[labels].astype(np.float32)
     return to_uint8(seg)

-
 # ── Zero-shot Segmentation ───────────────────────────────────────────────────

-def vis_pascal_context_semseg(spatial, orig_image):
-    """Zero-shot semantic segmentation with Pascal Context 59 classes.
-
-    Uses value-attention features and TCL prompt templates (9-template
-    ensemble) following the Scenic zero-shot seg evaluator.
-
-    For each spatial position, pick the Pascal Context class whose text
-    embedding has the highest cosine similarity with the image feature.
-    Returns (labelled image, raw mask, detected string, undetected string).
-    """
-    h, w = orig_image.shape[:2]
-    S_h, S_w = spatial.shape[:2]
-    feat = l2_normalize(spatial.reshape(-1, spatial.shape[-1]))  # (N, D)
-    sim = feat @ _model["ade20k_embs"].T  # (N, 59)
-    sim_map = sim.reshape(S_h, S_w, -1)
-
-    # Bilinear upsample similarities then argmax for smooth boundaries
-    sim_up = upsample(sim_map, h, w, mode="bilinear")
-    labels = sim_up.argmax(axis=-1)  # (h, w)
-
-    # --- raw segmentation mask (no blend) ---
-    seg_rgb = PASCAL_CONTEXT_PALETTE[labels].astype(np.float32) / 255.0
-    mask_img = to_uint8(seg_rgb)
-
-    # --- blended overlay with legend ---
-    blend = 0.1 * orig_image.astype(np.float32) / 255.0 + 0.9 * seg_rgb
-    blend_img = Image.fromarray(to_uint8(blend))
-
-    # count pixels per class, sorted by area (descending)
-    unique_ids, counts = np.unique(labels, return_counts=True)
-    order = np.argsort(-counts)
-    unique_ids, counts = unique_ids[order], counts[order]
-    total = counts.sum()
-
-    # build a legend panel on the right side
-    try:
-        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 60)
-    except OSError:
-        font = ImageFont.load_default()
-
-    # show top 5 classes by area
-    n_legend = min(len(unique_ids), 5)
-    legend_ids = [(unique_ids[i], counts[i]) for i in range(n_legend)]
-    row_h = 80      # height per legend row
-    swatch_w = 60   # color swatch width
-    pad = 12        # padding
-    legend_w = 450  # legend panel width
-
-    legend_h = max(h, n_legend * row_h + pad * 2)
-    canvas = Image.new("RGB", (w + legend_w, legend_h), (255, 255, 255))
-    canvas.paste(blend_img, (0, 0))
-    draw = ImageDraw.Draw(canvas)
-
-    for i, (cid, cnt) in enumerate(legend_ids):
-        pct = cnt / total * 100
-        color = tuple(PASCAL_CONTEXT_PALETTE[cid].tolist())
-        name = PASCAL_CONTEXT_CLASSES[cid]
-
-        y_top = pad + i * row_h
-        # draw color swatch
-        draw.rectangle(
-            [w + pad, y_top, w + pad + swatch_w, y_top + swatch_w],
-            fill=color, outline=(0, 0, 0),
-        )
-        # draw class name + percentage
-        draw.text(
-            (w + pad + swatch_w + 8, y_top + 6),
-            f"{name}",
-            fill="black", font=font,
-        )
-
-    overlay_out = np.array(canvas)
-
-    # format detected (>=2%) / undetected (<2% or absent) strings
-    detected_parts, minor_parts = [], []
-    for i, cid in enumerate(unique_ids):
-        pct = counts[i] / total * 100
-        name = PASCAL_CONTEXT_CLASSES[cid]
-        if pct >= 2:
-            detected_parts.append(f"{name} ({pct:.1f}%)")
-        else:
-            minor_parts.append(f"{name} ({pct:.1f}%)")
-    absent = [
-        f"{PASCAL_CONTEXT_CLASSES[i]} (0.0%)"
-        for i in range(len(PASCAL_CONTEXT_CLASSES))
-        if i not in set(unique_ids.tolist())
-    ]
-    detected_str = ", ".join(detected_parts)
-    undetected_str = ", ".join(minor_parts + absent)
-    return overlay_out, mask_img, detected_str, undetected_str
-
-
-def vis_pascal_voc_semseg(spatial, orig_image):
-    """Zero-shot semantic segmentation with Pascal VOC 20 classes.
-
-    Same approach as Pascal Context but with VOC classes and palette.
-    Returns (labelled image, raw mask, detected string, undetected string).
-    """
-    h, w = orig_image.shape[:2]
-    S_h, S_w = spatial.shape[:2]
-    feat = l2_normalize(spatial.reshape(-1, spatial.shape[-1]))  # (N, D)
-    sim = feat @ _model["voc_embs"].T  # (N, 20)
-    sim_map = sim.reshape(S_h, S_w, -1)
-
-    # Bilinear upsample similarities then argmax for smooth boundaries
-    sim_up = upsample(sim_map, h, w, mode="bilinear")
-    labels = sim_up.argmax(axis=-1)  # (h, w)
-
-    # --- raw segmentation mask (no blend) ---
-    seg_rgb = PASCAL_VOC_PALETTE[labels].astype(np.float32) / 255.0
-    mask_img = to_uint8(seg_rgb)
-
-    # --- blended overlay with legend ---
-    blend = 0.1 * orig_image.astype(np.float32) / 255.0 + 0.9 * seg_rgb
-    blend_img = Image.fromarray(to_uint8(blend))
-
-    # count pixels per class, sorted by area (descending)
-    unique_ids, counts = np.unique(labels, return_counts=True)
-    order = np.argsort(-counts)
-    unique_ids, counts = unique_ids[order], counts[order]
-    total = counts.sum()
-
-    # build a legend panel on the right side
-    try:
-        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 60)
-    except OSError:
-        font = ImageFont.load_default()
-
-    n_legend = min(len(unique_ids), 5)
-    legend_ids = [(unique_ids[i], counts[i]) for i in range(n_legend)]
-    row_h = 80
-    swatch_w = 60
-    pad = 12
-    legend_w = 450
-
-    legend_h = max(h, n_legend * row_h + pad * 2)
-    canvas = Image.new("RGB", (w + legend_w, legend_h), (255, 255, 255))
-    canvas.paste(blend_img, (0, 0))
-    draw = ImageDraw.Draw(canvas)
-
-    for i, (cid, cnt) in enumerate(legend_ids):
-        pct = cnt / total * 100
-        color = tuple(PASCAL_VOC_PALETTE[cid].tolist())
-        name = PASCAL_VOC_CLASSES[cid]
-
-        y_top = pad + i * row_h
-        draw.rectangle(
-            [w + pad, y_top, w + pad + swatch_w, y_top + swatch_w],
-            fill=color, outline=(0, 0, 0),
-        )
-        draw.text(
-            (w + pad + swatch_w + 8, y_top + 6),
-            f"{name}",
-            fill="black", font=font,
-        )
-
-    overlay_out = np.array(canvas)
-
-    # format detected (>=2%) / undetected (<2% or absent) strings
-    detected_parts, minor_parts = [], []
-    for i, cid in enumerate(unique_ids):
-        pct = counts[i] / total * 100
-        name = PASCAL_VOC_CLASSES[cid]
-        if pct >= 2:
-            detected_parts.append(f"{name} ({pct:.1f}%)")
-        else:
-            minor_parts.append(f"{name} ({pct:.1f}%)")
-    absent = [
-        f"{PASCAL_VOC_CLASSES[i]} (0.0%)"
-        for i in range(len(PASCAL_VOC_CLASSES))
-        if i not in set(unique_ids.tolist())
-    ]
-    detected_str = ", ".join(detected_parts)
-    undetected_str = ", ".join(minor_parts + absent)
-    return overlay_out, mask_img, detected_str, undetected_str
-
-
 def vis_custom_semseg(spatial, orig_image, classes, class_embs):
     """Zero-shot semantic segmentation with user-defined classes."""
     h, w = orig_image.shape[:2]
```
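The two dataset-specific functions above are deleted, but their similarity-argmax core survives in `vis_custom_semseg`: L2-normalize the patch features, take cosine similarity against the class text embeddings, and argmax per position (the app additionally upsamples the similarity map before the argmax for smoother boundaries). The core in isolation:

```python
import numpy as np

def zero_shot_labels(spatial, class_embs):
    """spatial: (S_h, S_w, D) patch features; class_embs: (K, D), L2-normed."""
    S_h, S_w, D = spatial.shape
    feat = spatial.reshape(-1, D)
    feat = feat / np.linalg.norm(feat, axis=-1, keepdims=True).clip(min=1e-3)
    sim = feat @ class_embs.T                          # cosine sim, (N, K)
    return sim.reshape(S_h, S_w, -1).argmax(axis=-1)   # (S_h, S_w) class ids
```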
```diff
@@ -820,15 +396,8 @@ def vis_custom_semseg(spatial, orig_image, classes, class_embs):
     undetected_str = ", ".join(minor_parts + absent)
     return overlay_out, mask_img, detected_str, undetected_str

-
 # ── DPT Depth Inference ──────────────────────────────────────────────────────

-preprocess_depth = transforms.Compose([
-    transforms.Resize((DEPTH_IMAGE_SIZE, DEPTH_IMAGE_SIZE)),
-    transforms.ToTensor(),
-])
-
-
 def vis_depth_dpt(depth_map, h, w):
     """Colour a depth map with the turbo colormap → PIL Image."""
     d = depth_map.squeeze()
```
```diff
@@ -836,7 +405,6 @@ def vis_depth_dpt(depth_map, h, w):
     colored = cm.get_cmap("turbo")(d)[:, :, :3].astype(np.float32)
     return to_uint8(upsample(colored, h, w))

-
 def vis_normals_dpt(normals_map, h, w):
     """Map normals from [-1, 1] to [0, 1] and resize to original size."""
     # normals_map shape is (3, H, W)
```
```diff
@@ -845,16 +413,43 @@ def vis_normals_dpt(normals_map, h, w):
     n = np.transpose(n, (1, 2, 0))  # (H, W, 3)
     return to_uint8(upsample(n, h, w))

-
-…  # (old signature and docstring of vis_segmentation_dpt lost in page extraction)
-    # seg_map shape is (150, H, W) — bilinear upsample logits then argmax
+def vis_segmentation_dpt(seg_map, orig_image):
+    """Colour a segmentation map with the ADE20K colormap + legend."""
+    h, w = orig_image.shape[:2]
     logits = seg_map.cpu().numpy().transpose(1, 2, 0)  # (H, W, 150)
     logits_up = upsample(logits, h, w, mode="bilinear")
     pred = logits_up.argmax(axis=-1)  # (h, w)
-
-    …  # (remainder of the old body lost in page extraction)
+    seg_rgb = ADE20K_PALETTE[pred.astype(np.int32) + 1].astype(np.float32) / 255.0
+
+    blend = 0.15 * orig_image.astype(np.float32) / 255.0 + 0.85 * seg_rgb
+    blend_img = Image.fromarray(to_uint8(blend))
+
+    # Legend: top-10 classes by area
+    unique_ids, counts = np.unique(pred, return_counts=True)
+    order = np.argsort(-counts)
+    unique_ids, counts = unique_ids[order], counts[order]
+
+    try:
+        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 60)
+    except OSError:
+        font = ImageFont.load_default()
+
+    n_legend = min(len(unique_ids), 10)
+    row_h, swatch_w, pad, legend_w = 80, 60, 12, 450
+    legend_h = max(h, n_legend * row_h + pad * 2)
+    canvas = Image.new("RGB", (w + legend_w, legend_h), (255, 255, 255))
+    canvas.paste(blend_img, (0, 0))
+    draw = ImageDraw.Draw(canvas)
+
+    for i in range(n_legend):
+        cid = unique_ids[i]
+        color = tuple(ADE20K_PALETTE[cid + 1].tolist())
+        name = ADE20K_CLASSES[cid] if cid < len(ADE20K_CLASSES) else f"class_{cid}"
+        y_top = pad + i * row_h
+        draw.rectangle([w + pad, y_top, w + pad + swatch_w, y_top + swatch_w], fill=color, outline=(0, 0, 0))
+        draw.text((w + pad + swatch_w + 8, y_top + 6), name, fill="black", font=font)

+    return np.array(canvas)

 # ── Gradio callbacks ─────────────────────────────────────────────────────────
```
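The legend layout in the new `vis_segmentation_dpt` is plain PIL geometry: a 450-px white panel appended on the right, one row per class with a 60-px swatch and its label; the real function also pastes the blended overlay at (0, 0) first. A standalone sketch with hypothetical entries:

```python
from PIL import Image, ImageDraw

entries = [("wall", (120, 120, 120)), ("sky", (6, 230, 230))]  # hypothetical
h, w = 300, 400                              # size of the blended overlay
row_h, swatch_w, pad, legend_w = 80, 60, 12, 450
canvas = Image.new("RGB",
                   (w + legend_w, max(h, len(entries) * row_h + 2 * pad)),
                   (255, 255, 255))
draw = ImageDraw.Draw(canvas)
for i, (name, color) in enumerate(entries):
    y = pad + i * row_h
    draw.rectangle([w + pad, y, w + pad + swatch_w, y + swatch_w],
                   fill=color, outline=(0, 0, 0))
    draw.text((w + pad + swatch_w + 8, y + 6), name, fill="black")
```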
```diff
@@ -869,7 +464,6 @@ def on_variant_change(variant_name):
             None,  # pca_state
             None, None, "", "")  # custom outputs

-
 # --- PCA tab callbacks ---

 @spaces.GPU
```
```diff
@@ -886,7 +480,6 @@ def on_pca_extract(image, resolution, pca_state):
     state = {"spatial": spatial, "orig_image": image, "variant": _model["name"], "resolution": resolution}
     return pca, depth, kmeans, state

-
 @spaces.GPU
 def on_recluster(image, resolution, n_clusters, pca_state):
     if image is None:
```
```diff
@@ -904,29 +497,8 @@ def on_recluster(image, resolution, n_clusters, pca_state):
     h, w = image.shape[:2]
     return vis_kmeans(spatial, h, w, int(n_clusters)), pca_state

-
 # --- Zero-shot Segmentation tab callbacks ---

-@spaces.GPU
-def on_zeroseg(image, resolution):
-    if image is None:
-        return None, None, "", ""
-    _init_model()
-    spatial = extract_features_value_attention(image, int(resolution))
-    blend, mask, detected, undetected = vis_pascal_context_semseg(spatial, image)
-    return blend, mask, detected, undetected
-
-
-@spaces.GPU
-def on_zeroseg_voc(image, resolution):
-    if image is None:
-        return None, None, "", ""
-    _init_model()
-    spatial = extract_features_value_attention(image, int(resolution))
-    blend, mask, detected, undetected = vis_pascal_voc_semseg(spatial, image)
-    return blend, mask, detected, undetected
-
-
 @spaces.GPU
 def on_zeroseg_custom(image, resolution, class_names_str):
     if image is None or not class_names_str or not class_names_str.strip():
```
```diff
@@ -953,75 +525,41 @@ def on_zeroseg_custom(image, resolution, class_names_str):
     overlay, mask, detected, undetected = vis_custom_semseg(spatial, image, classes, class_embs)
     return overlay, mask, detected, undetected

-
 # --- Depth Feature Visualization tab callbacks ---

 @spaces.GPU
 def on_depth_normals_predict(image, dpt_variant, resolution):
-    """Run DPT depth and normals prediction …"""
+    """Run DPT depth and normals prediction."""
     if image is None:
         return None, None
     _load_dpt(dpt_variant)
     dev = _device()
-
-
-    # Move DPT models to GPU
-    _dpt["model"].to(dev)
-    _dpt["normals_model"].to(dev)
-    _dpt["vision"].to(dev)
+    dpt = _dpt["model"].to(dev)

     h, w = image.shape[:2]
     img = Image.fromarray(image).convert("RGB")
     tensor = preprocess(img, int(resolution)).unsqueeze(0).to(dev)

-    …  # (two lines lost in page extraction)
-        tensor, n=block_indices,
-        reshape=True, return_class_token=True, norm=True,
-    )
-    dpt_inputs = [(cls_tok, patch_feat)
-                  for patch_feat, cls_tok in intermediate]
-
-    depth_map = _dpt["model"](dpt_inputs, image_size=(h, w))
-    normals_map = _dpt["normals_model"](dpt_inputs, image_size=(h, w))
-
-    depth_np = depth_map[0, 0].cpu().numpy()
-    normals_np = normals_map[0]
-
-    return vis_depth_dpt(depth_np, h, w), vis_normals_dpt(normals_np, h, w)
+    depth_map = dpt.predict_depth(tensor)
+    normals_map = dpt.predict_normals(tensor)
+
+    return vis_depth_dpt(depth_map[0, 0].cpu().numpy(), h, w), vis_normals_dpt(normals_map[0], h, w)

 @spaces.GPU
 def on_segmentation_predict(image, dpt_variant, resolution):
-    """Run DPT segmentation prediction …"""
+    """Run DPT segmentation prediction."""
     if image is None:
         return None
     _load_dpt(dpt_variant)
     dev = _device()
-
-
-    # Move DPT models to GPU
-    _dpt["segmentation_model"].to(dev)
-    _dpt["vision"].to(dev)
+    dpt = _dpt["model"].to(dev)

     h, w = image.shape[:2]
     img = Image.fromarray(image).convert("RGB")
     tensor = preprocess(img, int(resolution)).unsqueeze(0).to(dev)

-    …  # (two lines lost in page extraction)
-        tensor, n=block_indices,
-        reshape=True, return_class_token=True, norm=True,
-    )
-    dpt_inputs = [(cls_tok, patch_feat)
-                  for patch_feat, cls_tok in intermediate]
-
-    seg_map = _dpt["segmentation_model"](dpt_inputs, image_size=(h, w))
-
-    seg_np = seg_map[0]
-
-    return vis_segmentation_dpt(seg_np, h, w)
-
+    seg_map = dpt.predict_segmentation(tensor)
+    return vis_segmentation_dpt(seg_map[0], image)

 # ── UI ───────────────────────────────────────────────────────────────────────
```
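With the heads folded into one remote-code model, the callbacks shrink to preprocess, `predict_*`, colorize. End to end that looks roughly like this (the `predict_*` method names come from the diff; the output shapes are inferred from how the callbacks index the results, so treat them as assumptions):

```python
import torch
from transformers import AutoModel

# Assumes the google/tipsv2-l14-dpt repo resolves via trust_remote_code.
dpt = AutoModel.from_pretrained("google/tipsv2-l14-dpt",
                                trust_remote_code=True).eval()
x = torch.rand(1, 3, 896, 896)  # any resolution that is a multiple of 14

with torch.no_grad():
    depth = dpt.predict_depth(x)        # indexed as depth[0, 0] -> (H, W)
    normals = dpt.predict_normals(x)    # indexed as normals[0] -> (3, H, W)
    seg = dpt.predict_segmentation(x)   # indexed as seg[0] -> (150, H, W)
```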
```diff
@@ -1071,7 +609,7 @@ with gr.Blocks(head=head, title="TIPSv2 Feature Explorer") as demo:
             with gr.Tab("PCA"):
                 pca_out = gr.Image(label="PCA (3 components → RGB)")
             with gr.Tab("PCA (1st component)"):
-                depth_out = gr.Image(label="…")
+                depth_out = gr.Image(label="1st PCA component")
             with gr.Tab("K-means Clustering"):
                 n_clusters = gr.Slider(2, 20, value=6, step=1, label="Clusters")
                 recluster_btn = gr.Button("Re-cluster")
```