Rawal Khirodkar committed
Commit 2c70f2e · 1 Parent(s): 380dd37

Pointmap: bring back .ply + Model3D, but native-res only (max 1024×768 grid → 200K pts)

Files changed (2)
  1. app.py +71 -43
  2. requirements.txt +1 -0
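
The point budget in the title is straightforward to verify (a quick back-of-the-envelope check, not part of the diff; the variable names below are illustrative):

    # A full native-res grid is 1024 x 768 = 786,432 candidate points, which
    # matches the "~786K" in the new docstring; _make_ply keeps at most 200K.
    grid_points = 1024 * 768
    max_points = 200_000
    print(grid_points)                        # 786432
    print(f"{max_points / grid_points:.1%}")  # ~25.4% of a full grid survives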
app.py CHANGED
@@ -1,8 +1,11 @@
 """Sapiens2 pointmap Gradio Space.

-Image → per-pixel 3D pointmap (camera frame, metric units). For now we just
-visualize the depth (z) channel as a colored heatmap, matching the look of the
-normal demo. The 3D point-cloud viewer can be re-enabled later.
+Image → per-pixel 3D pointmap (camera frame, metric units). Visualized as a
+.ply point cloud rendered with Gradio's Model3D component for interactive 3D
+viewing. Foreground mask is mandatory.
+
+Everything runs at the model's NATIVE resolution (max 1024×768 grid → at most
+~786K points before subsampling to 200K). No huge interpolations.
 """

 import sys
@@ -14,6 +17,7 @@ import tempfile
 import cv2
 import gradio as gr
 import numpy as np
+import open3d as o3d
 import spaces
 import torch
 import torch.nn.functional as F
@@ -92,7 +96,6 @@ def _get_fg_model():
     return _fg_model


-# Iteration mode: only preload the default (0.4B) for fast Space boot.
 print("[startup] pre-loading 0.4B (iteration mode) + fg/bg ...")
 _get_pointmap_model(DEFAULT_SIZE)
 _get_fg_model()
@@ -100,7 +103,7 @@ print("[startup] ready.")


 # -----------------------------------------------------------------------------
-# Inference (operates at the model's native resolution — no big upsamples)
+# Inference (always at native resolution)

 def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:
     data = model.pipeline(dict(img=image_bgr))
@@ -111,7 +114,7 @@ def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:

     with torch.no_grad():
         pointmap, scale = model(inputs)
-    pointmap = pointmap / scale  # → metric units
+    pointmap = pointmap / scale  # → metric

     pad_left, pad_right, pad_top, pad_bottom = data_samples["meta"]["padding_size"]
     pointmap = pointmap[
@@ -131,33 +134,52 @@ def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np
     return (out.argmax(dim=1)[0] > 0).cpu().numpy()


-def _depth_to_rgb(depth: np.ndarray, mask: np.ndarray | None = None) -> np.ndarray:
-    """Depth (H, W) → RGB (H, W, 3) uint8 via inverse-depth turbo colormap.
-
-    Inverse-depth (1/z) gives more contrast on near surfaces (where humans tend
-    to be), which matches what most SfM/depth viewers show.
-    """
-    valid = np.isfinite(depth) & (depth > 1e-3)
-    if mask is not None:
-        valid &= mask
-    if not valid.any():
-        return np.zeros((*depth.shape, 3), dtype=np.uint8)
-
-    inv = np.zeros_like(depth, dtype=np.float32)
-    inv[valid] = 1.0 / depth[valid]
-    p1, p99 = np.percentile(inv[valid], [1, 99])
-    lo, hi = float(p1), float(p99)
-    if hi <= lo:
-        hi = lo + 1e-3
-    norm = np.zeros_like(inv, dtype=np.float32)
-    norm[valid] = ((inv[valid] - lo) / (hi - lo)).clip(0, 1)
-    grey = (norm * 255.0).astype(np.uint8)
-
-    # cv2.applyColorMap returns BGR → flip to RGB for Gradio.
-    rgb = cv2.applyColorMap(grey, cv2.COLORMAP_TURBO)[:, :, ::-1].copy()
-    if mask is not None:
-        rgb[~mask] = 0  # background → black
-    return rgb
+# -----------------------------------------------------------------------------
+# Point cloud export (camera marker + cloud, native-res grid)
+
+def _camera_marker(radius: float = 0.04, n_points: int = 800,
+                   color=(0.20, 0.55, 0.96)) -> o3d.geometry.PointCloud:
+    """Tiny slate-blue Fibonacci sphere at the world origin."""
+    i = np.arange(n_points)
+    phi = np.arccos(1 - 2 * (i + 0.5) / n_points)
+    theta = np.pi * (1 + 5 ** 0.5) * (i + 0.5)
+    pts = np.stack([
+        radius * np.sin(phi) * np.cos(theta),
+        radius * np.sin(phi) * np.sin(theta),
+        radius * np.cos(phi),
+    ], axis=1)
+    pc = o3d.geometry.PointCloud()
+    pc.points = o3d.utility.Vector3dVector(pts.astype(np.float64))
+    pc.colors = o3d.utility.Vector3dVector(np.tile(color, (n_points, 1)).astype(np.float64))
+    return pc
+
+
+def _make_ply(image_pil_native: Image.Image, pointmap_hwc: np.ndarray,
+              mask_hw: np.ndarray, max_points: int = 200_000) -> str:
+    """`image_pil_native` MUST already be sized to `pointmap_hwc.shape[:2]` so
+    point colors line up. Output .ply: foreground points + camera marker."""
+    h, w = pointmap_hwc.shape[:2]
+    image_rgb = np.asarray(image_pil_native.resize((w, h), Image.LANCZOS))
+
+    pts = pointmap_hwc.reshape(-1, 3)
+    cols = image_rgb.reshape(-1, 3).astype(np.float32) / 255.0
+
+    z = pts[:, 2]
+    finite = np.isfinite(pts).all(axis=1) & (z > 0.05) & (z < 25.0) & mask_hw.reshape(-1)
+    pts, cols = pts[finite], cols[finite]
+
+    if len(pts) > max_points:
+        idx = np.random.default_rng(0).choice(len(pts), size=max_points, replace=False)
+        pts, cols = pts[idx], cols[idx]
+
+    pc = o3d.geometry.PointCloud()
+    pc.points = o3d.utility.Vector3dVector(pts.astype(np.float64))
+    pc.colors = o3d.utility.Vector3dVector(cols.astype(np.float64))
+    pc += _camera_marker()
+
+    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".ply").name
+    o3d.io.write_point_cloud(out_path, pc, write_ascii=False)
+    return out_path


 # -----------------------------------------------------------------------------
@@ -166,23 +188,19 @@ def _depth_to_rgb(depth: np.ndarray, mask: np.ndarray | None = None) -> np.ndarr
 @spaces.GPU(duration=120)
 def predict(image: Image.Image, size: str):
     if image is None:
-        return None
+        return None, None

     image_pil = image.convert("RGB")
     image_bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)

     model = _get_pointmap_model(size)
-    pointmap = _estimate_pointmap(image_bgr, model)  # (H_native, W_native, 3)
+    pointmap = _estimate_pointmap(image_bgr, model)  # (H_n, W_n, 3) — at most 1024 in either dim
     h_n, w_n = pointmap.shape[:2]

     mask = _foreground_mask(image_pil, h_n, w_n)  # native-res mask, fast
-    depth = pointmap[:, :, 2]  # z channel
-    rgb_native = _depth_to_rgb(depth, mask)  # (H_native, W_native, 3) uint8
+    ply_path = _make_ply(image_pil, pointmap, mask)  # native-res .ply

-    # Lanczos upsample the RGB heatmap to the original image size — sharp.
-    w0, h0 = image_pil.size
-    rgb_pil = Image.fromarray(rgb_native).resize((w0, h0), Image.LANCZOS)
-    return rgb_pil
+    return ply_path, ply_path


 # -----------------------------------------------------------------------------
@@ -250,7 +268,14 @@ with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS

     with gr.Row(equal_height=True):
         inp = gr.Image(label="Input", type="pil", height=640)
-        out_img = gr.Image(label="Depth (Z)", type="pil", height=640)
+        out_ply = gr.Model3D(
+            label="Point cloud — drag to rotate, scroll to zoom, shift+drag to pan",
+            height=640,
+            clear_color=[0.07, 0.09, 0.13, 1.0],
+            display_mode="point_cloud",
+            zoom_speed=0.7,
+            pan_speed=0.5,
+        )

     with gr.Row():
         size = gr.Radio(
@@ -263,7 +288,10 @@ with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS

     gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)

-    run.click(predict, inputs=[inp, size], outputs=[out_img])
+    with gr.Accordion("Raw Pointmap", open=False):
+        out_ply_file = gr.File(label="Point cloud (.ply — open in MeshLab/CloudCompare/Blender)")
+
+    run.click(predict, inputs=[inp, size], outputs=[out_ply, out_ply_file])


 if __name__ == "__main__":
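
To eyeball the exported cloud outside the Space, a minimal sketch (assumes a local open3d install; "pointmap.ply" is a stand-in for the temp path `_make_ply` returns):

    import numpy as np
    import open3d as o3d

    pc = o3d.io.read_point_cloud("pointmap.ply")  # hypothetical path from the gr.File output
    pts = np.asarray(pc.points)
    # Expect at most 200_000 subsampled foreground points plus the
    # 800-point camera marker appended by _camera_marker().
    assert len(pts) <= 200_000 + 800
    o3d.visualization.draw_geometries([pc])  # interactive viewer window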
requirements.txt CHANGED
@@ -19,3 +19,4 @@ prettytable
 termcolor
 accelerate
 rich
+open3d
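
A closing note on `_camera_marker`: the Fibonacci-sphere construction puts every marker point exactly on the radius-0.04 sphere, since sin²(phi)·(cos²(theta) + sin²(theta)) + cos²(phi) = 1. A minimal numpy check, standalone but using the same formulas as the diff:

    import numpy as np

    n, r = 800, 0.04
    i = np.arange(n)
    phi = np.arccos(1 - 2 * (i + 0.5) / n)      # uniform in cos(phi): even latitude coverage
    theta = np.pi * (1 + 5 ** 0.5) * (i + 0.5)  # golden-angle longitude spacing
    pts = np.stack([r * np.sin(phi) * np.cos(theta),
                    r * np.sin(phi) * np.sin(theta),
                    r * np.cos(phi)], axis=1)
    assert np.allclose(np.linalg.norm(pts, axis=1), r)  # all points on the sphere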