Rawal Khirodkar commited on
Commit
2482c8d
·
1 Parent(s): 2593450

Pointmap: pivot to depth-z heatmap (turbo); drop Model3D + Open3D + .ply pipeline

Browse files
Files changed (2) hide show
  1. app.py +45 -86
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,9 +1,8 @@
1
  """Sapiens2 pointmap Gradio Space.
2
 
3
- Image → per-pixel 3D pointmap (camera frame, metric units). The result is
4
- exported as a .ply point cloud and rendered with Gradio's Model3D component
5
- for interactive 3D viewing. Optionally applies a v1 binary fg/bg mask so only
6
- foreground points end up in the cloud.
7
  """
8
 
9
  import sys
@@ -15,7 +14,6 @@ import tempfile
15
  import cv2
16
  import gradio as gr
17
  import numpy as np
18
- import open3d as o3d
19
  import spaces
20
  import torch
21
  import torch.nn.functional as F
@@ -95,20 +93,16 @@ def _get_fg_model():
95
 
96
 
97
  # Iteration mode: only preload the default (0.4B) for fast Space boot.
98
- # Re-enable full preload by uncommenting the loop below.
99
  print("[startup] pre-loading 0.4B (iteration mode) + fg/bg ...")
100
  _get_pointmap_model(DEFAULT_SIZE)
101
  _get_fg_model()
102
- # for _size in POINTMAP_MODELS:
103
- # _get_pointmap_model(_size)
104
  print("[startup] ready.")
105
 
106
 
107
  # -----------------------------------------------------------------------------
108
- # Inference
109
 
110
  def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:
111
- h0, w0 = image_bgr.shape[:2]
112
  data = model.pipeline(dict(img=image_bgr))
113
  data = model.data_preprocessor(data)
114
  inputs, data_samples = data["inputs"], data["data_samples"]
@@ -119,15 +113,13 @@ def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:
119
  pointmap, scale = model(inputs)
120
  pointmap = pointmap / scale # → metric units
121
 
122
- pad = data_samples["meta"]["padding_size"]
123
- pad_left, pad_right, pad_top, pad_bottom = pad
124
  pointmap = pointmap[
125
  :, :,
126
  pad_top : inputs.shape[2] - pad_bottom,
127
  pad_left : inputs.shape[3] - pad_right,
128
  ]
129
- pointmap = F.interpolate(pointmap, size=(h0, w0), mode="bilinear", align_corners=False)
130
- return pointmap.squeeze(0).cpu().float().numpy().transpose(1, 2, 0) # (H, W, 3)
131
 
132
 
133
  def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np.ndarray:
@@ -139,81 +131,58 @@ def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np
139
  return (out.argmax(dim=1)[0] > 0).cpu().numpy()
140
 
141
 
142
- # -----------------------------------------------------------------------------
143
- # Point cloud export
144
-
145
- def _camera_marker(radius: float = 0.04, n_points: int = 800,
146
- color=(0.20, 0.55, 0.96)) -> o3d.geometry.PointCloud:
147
- """Small uniformly-blue sphere at the world origin marking the camera.
148
 
149
- Manual Fibonacci-sphere sampling — instant, vs Open3D's poisson-disk which
150
- can take seconds per call.
151
  """
152
- rng = np.random.default_rng(0)
153
- i = np.arange(n_points)
154
- phi = np.arccos(1 - 2 * (i + 0.5) / n_points) # latitude
155
- theta = np.pi * (1 + 5 ** 0.5) * (i + 0.5) # golden-angle longitude
156
- pts = np.stack([
157
- radius * np.sin(phi) * np.cos(theta),
158
- radius * np.sin(phi) * np.sin(theta),
159
- radius * np.cos(phi),
160
- ], axis=1)
161
- pc = o3d.geometry.PointCloud()
162
- pc.points = o3d.utility.Vector3dVector(pts.astype(np.float64))
163
- pc.colors = o3d.utility.Vector3dVector(np.tile(color, (n_points, 1)).astype(np.float64))
164
- return pc
165
-
166
-
167
- def _make_ply(image_rgb: np.ndarray, pointmap_hwc: np.ndarray, mask_hw: np.ndarray | None = None,
168
- max_points: int = 200_000) -> str:
169
- pts = pointmap_hwc.reshape(-1, 3)
170
- cols = (image_rgb.reshape(-1, 3).astype(np.float32) / 255.0)
171
-
172
- z = pts[:, 2]
173
- finite = np.isfinite(pts).all(axis=1) & (z > 0.05) & (z < 25.0)
174
- if mask_hw is not None:
175
- finite &= mask_hw.reshape(-1)
176
- pts, cols = pts[finite], cols[finite]
177
-
178
- if len(pts) > max_points:
179
- idx = np.random.default_rng(0).choice(len(pts), size=max_points, replace=False)
180
- pts, cols = pts[idx], cols[idx]
181
-
182
- pc = o3d.geometry.PointCloud()
183
- pc.points = o3d.utility.Vector3dVector(pts.astype(np.float64))
184
- pc.colors = o3d.utility.Vector3dVector(cols.astype(np.float64))
185
-
186
- # Add the camera marker (blue ball at origin) so users see where the
187
- # observer is in the reconstructed 3D scene.
188
- pc += _camera_marker()
189
-
190
- out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".ply").name
191
- o3d.io.write_point_cloud(out_path, pc, write_ascii=False)
192
- return out_path
193
 
194
 
195
  # -----------------------------------------------------------------------------
196
  # Gradio handler
197
 
198
- @spaces.GPU(duration=180)
199
  def predict(image: Image.Image, size: str):
200
  if image is None:
201
- return None, None
202
 
203
  image_pil = image.convert("RGB")
204
- image_rgb = np.array(image_pil)
205
- image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
206
- h0, w0 = image_rgb.shape[:2]
207
 
208
  model = _get_pointmap_model(size)
209
- pointmap = _estimate_pointmap(image_bgr, model)
 
210
 
211
- Foreground masking is mandatory — keeps the cloud clean and the camera
212
- # marker meaningful (background depth is unreliable).
213
- mask = _foreground_mask(image_pil, h0, w0)
214
- ply_path = _make_ply(image_rgb, pointmap, mask)
215
 
216
- return ply_path, ply_path
 
 
 
217
 
218
 
219
  # -----------------------------------------------------------------------------
@@ -281,14 +250,7 @@ with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS
281
 
282
  with gr.Row(equal_height=True):
283
  inp = gr.Image(label="Input", type="pil", height=640)
284
- out_ply = gr.Model3D(
285
- label="Point cloud — drag to rotate, scroll to zoom, shift+drag to pan",
286
- height=640,
287
- clear_color=[0.07, 0.09, 0.13, 1.0], # subtle slate-900 backdrop
288
- display_mode="point_cloud",
289
- zoom_speed=0.7,
290
- pan_speed=0.5,
291
- )
292
 
293
  with gr.Row():
294
  size = gr.Radio(
@@ -301,10 +263,7 @@ with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS
301
 
302
  gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
303
 
304
- with gr.Accordion("Raw Pointmap", open=False):
305
- out_ply_file = gr.File(label="Point cloud (.ply — open in MeshLab/CloudCompare/Blender)")
306
-
307
- run.click(predict, inputs=[inp, size], outputs=[out_ply, out_ply_file])
308
 
309
 
310
  if __name__ == "__main__":
 
1
  """Sapiens2 pointmap Gradio Space.
2
 
3
+ Image → per-pixel 3D pointmap (camera frame, metric units). For now we just
4
+ visualize the depth (z) channel as a colored heatmap, matching the look of the
5
+ normal demo. The 3D point-cloud viewer can be re-enabled later.
 
6
  """
7
 
8
  import sys
 
14
  import cv2
15
  import gradio as gr
16
  import numpy as np
 
17
  import spaces
18
  import torch
19
  import torch.nn.functional as F
 
93
 
94
 
95
  # Iteration mode: only preload the default (0.4B) for fast Space boot.
 
96
  print("[startup] pre-loading 0.4B (iteration mode) + fg/bg ...")
97
  _get_pointmap_model(DEFAULT_SIZE)
98
  _get_fg_model()
 
 
99
  print("[startup] ready.")
100
 
101
 
102
  # -----------------------------------------------------------------------------
103
+ # Inference (operates at the model's native resolution — no big upsamples)
104
 
105
  def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:
 
106
  data = model.pipeline(dict(img=image_bgr))
107
  data = model.data_preprocessor(data)
108
  inputs, data_samples = data["inputs"], data["data_samples"]
 
113
  pointmap, scale = model(inputs)
114
  pointmap = pointmap / scale # → metric units
115
 
116
+ pad_left, pad_right, pad_top, pad_bottom = data_samples["meta"]["padding_size"]
 
117
  pointmap = pointmap[
118
  :, :,
119
  pad_top : inputs.shape[2] - pad_bottom,
120
  pad_left : inputs.shape[3] - pad_right,
121
  ]
122
+ return pointmap.squeeze(0).cpu().float().numpy().transpose(1, 2, 0) # (H_native, W_native, 3)
 
123
 
124
 
125
  def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np.ndarray:
 
131
  return (out.argmax(dim=1)[0] > 0).cpu().numpy()
132
 
133
 
134
+ def _depth_to_rgb(depth: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray:
135
+ """Depth (H, W) → RGB (H, W, 3) uint8 via inverse-depth turbo colormap.
 
 
 
 
136
 
137
+ Inverse-depth (1/z) gives more contrast on near surfaces (where humans tend
138
+ to be), which matches what most SfM/depth viewers show.
139
  """
140
+ valid = np.isfinite(depth) & (depth > 1e-3)
141
+ if mask is not None:
142
+ valid &= mask
143
+ if not valid.any():
144
+ return np.zeros((*depth.shape, 3), dtype=np.uint8)
145
+
146
+ inv = np.zeros_like(depth, dtype=np.float32)
147
+ inv[valid] = 1.0 / depth[valid]
148
+ p1, p99 = np.percentile(inv[valid], [1, 99])
149
+ lo, hi = float(p1), float(p99)
150
+ if hi <= lo:
151
+ hi = lo + 1e-3
152
+ norm = np.zeros_like(inv, dtype=np.float32)
153
+ norm[valid] = ((inv[valid] - lo) / (hi - lo)).clip(0, 1)
154
+ grey = (norm * 255.0).astype(np.uint8)
155
+
156
+ # cv2.applyColorMap returns BGR flip to RGB for Gradio.
157
+ rgb = cv2.applyColorMap(grey, cv2.COLORMAP_TURBO)[:, :, ::-1].copy()
158
+ if mask is not None:
159
+ rgb[~mask] = 0 # background → black
160
+ return rgb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
 
163
  # -----------------------------------------------------------------------------
164
  # Gradio handler
165
 
166
@spaces.GPU(duration=120)
def predict(image: Image.Image, size: str):
    """Run pointmap inference on ``image`` and return a turbo depth heatmap.

    The z channel of the predicted pointmap is colored (foreground only) at the
    model's native resolution, then Lanczos-upsampled back to the input size.
    Returns ``None`` when no image is provided.
    """
    if image is None:
        return None

    rgb_input = image.convert("RGB")
    bgr_input = cv2.cvtColor(np.array(rgb_input), cv2.COLOR_RGB2BGR)

    # Pointmap at the model's native resolution — shape (H_native, W_native, 3).
    pointmap = _estimate_pointmap(bgr_input, _get_pointmap_model(size))
    native_h, native_w = pointmap.shape[:2]

    # Color the z channel, masked to the foreground (native-res mask, fast).
    fg = _foreground_mask(rgb_input, native_h, native_w)
    heatmap = _depth_to_rgb(pointmap[:, :, 2], fg)

    # Lanczos upsample the RGB heatmap to the original image size — sharp.
    full_w, full_h = rgb_input.size
    return Image.fromarray(heatmap).resize((full_w, full_h), Image.LANCZOS)
186
 
187
 
188
  # -----------------------------------------------------------------------------
 
250
 
251
  with gr.Row(equal_height=True):
252
  inp = gr.Image(label="Input", type="pil", height=640)
253
+ out_img = gr.Image(label="Depth (turbo)", type="pil", height=640)
 
 
 
 
 
 
 
254
 
255
  with gr.Row():
256
  size = gr.Radio(
 
263
 
264
  gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
265
 
266
+ run.click(predict, inputs=[inp, size], outputs=[out_img])
 
 
 
267
 
268
 
269
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -19,4 +19,3 @@ prettytable
19
  termcolor
20
  accelerate
21
  rich
22
- open3d
 
19
  termcolor
20
  accelerate
21
  rich