Spaces:

facebook
/

sapiens2-pointmap

Running on Zero

App Files Files Community

Rawal Khirodkar committed 13 days ago

Commit

5dd5fbb

1 Parent(s): 824c1d9

Pointmap: trimesh→.glb (MoGe-2 pattern), Model3D back, cap input height to 1024

Browse files

Files changed (2) hide show

app.py +65 -76
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 """Sapiens2 pointmap Gradio Space.
-Image → per-pixel 3D pointmap (camera frame, metric units). Visualized as a
-.ply point cloud rendered with Gradio's Model3D component for interactive 3D
-viewing. Foreground mask is mandatory.
-Everything runs at the model's NATIVE resolution (max 1024×768 grid → at most
-~786K points before subsampling to 200K). No huge interpolations.
 """
 import sys
@@ -13,14 +14,15 @@ import os
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 import tempfile
 import cv2
 import gradio as gr
 import numpy as np
-import open3d as o3d
 import spaces
 import torch
 import torch.nn.functional as F
 from PIL import Image
 from torchvision import transforms
@@ -57,12 +59,13 @@ POINTMAP_MODELS = {
         "config": os.path.join(CONFIGS_DIR, "sapiens2_5b_pointmap_render_people-1024x768.py"),
     },
 }
-DEFAULT_SIZE = "0.4B"  # iteration mode — only this is preloaded; others lazy-load on click
 FG_REPO = "facebook/sapiens-seg-foreground-1b-torchscript"
 FG_FILENAME = "sapiens_1b_seg_foreground_epoch_8_torchscript.pt2"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 _fg_transform = transforms.Compose([
     transforms.Resize((1024, 768)),
@@ -103,7 +106,15 @@ print("[startup] ready.")
 # -----------------------------------------------------------------------------
-# Inference (always at native resolution)
 def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:
     data = model.pipeline(dict(img=image_bgr))
@@ -134,116 +145,87 @@ def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np
     return (out.argmax(dim=1)[0] > 0).cpu().numpy()
-def _depth_to_rgb(depth: np.ndarray, mask: np.ndarray) -> np.ndarray:
-    """Inverse-depth turbo colormap (matches sapiens2 vis_pointmap.py).
-    Background pixels are left at 0 — caller should overlay them."""
-    valid = np.isfinite(depth) & (depth > 1e-3) & mask
-    rgb = np.zeros((*depth.shape, 3), dtype=np.uint8)
-    if not valid.any():
-        return rgb
-    inv = np.zeros_like(depth, dtype=np.float32)
-    inv[valid] = 1.0 / depth[valid]
-    p1, p99 = np.percentile(inv[valid], [1, 99])
-    lo, hi = float(p1), float(p99)
-    if hi <= lo:
-        hi = lo + 1e-3
-    norm = ((inv - lo) / (hi - lo)).clip(0, 1)
-    grey = (norm * 255.0).astype(np.uint8)
-    color = cv2.applyColorMap(grey, cv2.COLORMAP_TURBO)[:, :, ::-1]  # cv2 is BGR → RGB
-    rgb[valid] = color[valid]
-    return rgb
 # -----------------------------------------------------------------------------
# Point cloud export (camera marker + cloud, native-res grid)
def _camera_marker(radius: float = 0.04, n_points: int = 800,
                   color=(0.20, 0.55, 0.96)) -> o3d.geometry.PointCloud:
    """Return a small single-colour Fibonacci sphere centred on the world origin.

    Args:
        radius: Sphere radius, in the same units as the point cloud.
        n_points: Number of points sampled on the sphere.
        color: RGB colour applied to every point (default is a slate blue).

    Returns:
        An Open3D point cloud holding ``n_points`` points and colours.
    """
    k = np.arange(n_points)
    # Fibonacci lattice: uniform steps in cos(polar), irrational steps in azimuth.
    polar = np.arccos(1 - 2 * (k + 0.5) / n_points)
    azimuth = np.pi * (1 + 5 ** 0.5) * (k + 0.5)
    xs = radius * np.sin(polar) * np.cos(azimuth)
    ys = radius * np.sin(polar) * np.sin(azimuth)
    zs = radius * np.cos(polar)
    xyz = np.column_stack([xs, ys, zs])
    rgb = np.tile(color, (n_points, 1))
    marker = o3d.geometry.PointCloud()
    marker.points = o3d.utility.Vector3dVector(xyz.astype(np.float64))
    marker.colors = o3d.utility.Vector3dVector(rgb.astype(np.float64))
    return marker
def _make_ply(image_pil_native: Image.Image, pointmap_hwc: np.ndarray,
              mask_hw: np.ndarray, max_points: int = 200_000) -> str:
    """Write the foreground point cloud plus a camera marker to a binary .ply.

    Args:
        image_pil_native: Source image used for per-point colours. It is
            resized to the pointmap grid here so colours line up with points.
        pointmap_hwc: ``(H, W, 3)`` array of per-pixel XYZ coordinates.
        mask_hw: ``(H, W)`` boolean foreground mask.
        max_points: Upper bound on exported cloud points (excluding the
            marker); larger clouds are randomly subsampled with a fixed seed.

    Returns:
        Path of the written temporary ``.ply`` file. The file is not deleted
        automatically.
    """
    h, w = pointmap_hwc.shape[:2]
    image_rgb = np.asarray(image_pil_native.resize((w, h), Image.LANCZOS))
    pts = pointmap_hwc.reshape(-1, 3)
    cols = image_rgb.reshape(-1, 3).astype(np.float32) / 255.0
    z = pts[:, 2]
    # Keep finite foreground points whose depth lies inside (0.05, 25.0).
    finite = np.isfinite(pts).all(axis=1) & (z > 0.05) & (z < 25.0) & mask_hw.reshape(-1)
    pts, cols = pts[finite], cols[finite]
    if len(pts) > max_points:
        # Fixed seed keeps the subsample reproducible across calls.
        idx = np.random.default_rng(0).choice(len(pts), size=max_points, replace=False)
        pts, cols = pts[idx], cols[idx]
    pc = o3d.geometry.PointCloud()
    pc.points = o3d.utility.Vector3dVector(pts.astype(np.float64))
    pc.colors = o3d.utility.Vector3dVector(cols.astype(np.float64))
    pc += _camera_marker()
    # Reserve a unique path and close the handle straight away; the original
    # left the NamedTemporaryFile object open, leaking a file descriptor.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".ply") as tmp:
        out_path = tmp.name
    o3d.io.write_point_cloud(out_path, pc, write_ascii=False)
    return out_path
 # -----------------------------------------------------------------------------
 # Gradio handler
import time as _t
@spaces.GPU(duration=120)
def predict(image: Image.Image, size: str):
    """Gradio handler: image → (depth heatmap, path to a .ply point cloud).

    Prints a ``[time]`` line for every stage. Returns ``(None, None)`` when no
    image was supplied.

    Args:
        image: Uploaded PIL image, or ``None``.
        size: Model-size key forwarded to ``_get_pointmap_model``.

    Returns:
        ``(depth_pil, ply_path)`` — the heatmap resized to the input image
        size, and the exported point-cloud file path.
    """
    if image is None:
        return None, None

    def _ms(since: float) -> float:
        # Milliseconds elapsed since the given perf_counter reading.
        return (_t.perf_counter() - since) * 1000

    t_start = _t.perf_counter()
    rgb_pil = image.convert("RGB")
    bgr = cv2.cvtColor(np.array(rgb_pil), cv2.COLOR_RGB2BGR)
    print(f"[time] convert+bgr           {_ms(t_start):.0f} ms  (input {rgb_pil.size})")

    tick = _t.perf_counter()
    net = _get_pointmap_model(size)
    print(f"[time] _get_pointmap_model   {_ms(tick):.0f} ms")

    tick = _t.perf_counter()
    points = _estimate_pointmap(bgr, net)
    native_h, native_w = points.shape[:2]
    print(f"[time] _estimate_pointmap    {_ms(tick):.0f} ms  (native {native_w}x{native_h})")

    tick = _t.perf_counter()
    fg = _foreground_mask(rgb_pil, native_h, native_w)
    print(f"[time] _foreground_mask      {_ms(tick):.0f} ms")

    tick = _t.perf_counter()
    heat = _depth_to_rgb(points[:, :, 2], fg)
    heat[~fg] = 200  # flat light grey for background pixels
    in_w, in_h = rgb_pil.size
    depth_pil = Image.fromarray(heat).resize((in_w, in_h), Image.LANCZOS)
    print(f"[time] depth heatmap+resize  {_ms(tick):.0f} ms  (target {in_w}x{in_h})")

    tick = _t.perf_counter()
    ply_path = _make_ply(rgb_pil, points, fg)
    print(f"[time] _make_ply             {_ms(tick):.0f} ms")
    print(f"[time] TOTAL                 {_ms(t_start):.0f} ms")
    return depth_pil, ply_path
 # -----------------------------------------------------------------------------
@@ -311,7 +293,14 @@ with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS
     with gr.Row(equal_height=True):
         inp = gr.Image(label="Input", type="pil", height=640)
-        out_img = gr.Image(label="Depth (Z)", type="pil", height=640)
     with gr.Row():
         size = gr.Radio(
@@ -325,9 +314,9 @@ with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS
     gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
     with gr.Accordion("Raw Pointmap", open=False):
-        out_ply_file = gr.File(label="Point cloud (.ply — open in MeshLab/CloudCompare/Blender)")
-    run.click(predict, inputs=[inp, size], outputs=[out_img, out_ply_file])
 if __name__ == "__main__":

 """Sapiens2 pointmap Gradio Space.
+Image → per-pixel 3D pointmap (camera frame, metric units). Right pane is an
+interactive 3D point-cloud viewer rendering a `.glb` exported via trimesh
+(MoGe-2's approach — much faster than Open3D's `.ply` for Three.js viewers).
+All work happens at the model's NATIVE resolution. We additionally cap the
+input image to height=1024 before processing so 4K uploads don't blow up
+downstream sizes.
 """
 import sys
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 import tempfile
+import time as _t
 import cv2
 import gradio as gr
 import numpy as np
 import spaces
 import torch
 import torch.nn.functional as F
+import trimesh
 from PIL import Image
 from torchvision import transforms
         "config": os.path.join(CONFIGS_DIR, "sapiens2_5b_pointmap_render_people-1024x768.py"),
     },
 }
+DEFAULT_SIZE = "0.4B"  # iteration mode
 FG_REPO = "facebook/sapiens-seg-foreground-1b-torchscript"
 FG_FILENAME = "sapiens_1b_seg_foreground_epoch_8_torchscript.pt2"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MAX_HEIGHT = 1024  # cap input height before processing — keeps everything fast
 _fg_transform = transforms.Compose([
     transforms.Resize((1024, 768)),
 # -----------------------------------------------------------------------------
+# Helpers
def _cap_height(image_pil: Image.Image, max_h: int = MAX_HEIGHT) -> Image.Image:
    """Downscale ``image_pil`` so its height is at most ``max_h``.

    The aspect ratio is preserved. Images that already fit are returned
    unchanged (same object, no copy).

    Args:
        image_pil: Input PIL image.
        max_h: Maximum allowed height in pixels.

    Returns:
        The original image, or a LANCZOS-resampled copy of height ``max_h``.
    """
    w, h = image_pil.size
    if h <= max_h:
        return image_pil
    # Clamp to 1 px: for very tall, narrow inputs (e.g. 1x3000) the rounded
    # width would be 0, which is not a valid resize target.
    new_w = max(1, int(round(w * max_h / h)))
    return image_pil.resize((new_w, max_h), Image.LANCZOS)
 def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:
     data = model.pipeline(dict(img=image_bgr))
     return (out.argmax(dim=1)[0] > 0).cpu().numpy()
 # -----------------------------------------------------------------------------
+# Point cloud export — trimesh → .glb (much faster than Open3D .ply for Three.js)
 def _camera_marker(radius: float = 0.04, n_points: int = 800,
+                   color=(51, 140, 245)):
+    """Tiny slate-blue Fibonacci sphere at the world origin. Returns (verts, cols)."""
     i = np.arange(n_points)
     phi = np.arccos(1 - 2 * (i + 0.5) / n_points)
     theta = np.pi * (1 + 5 ** 0.5) * (i + 0.5)
+    verts = np.stack([
         radius * np.sin(phi) * np.cos(theta),
         radius * np.sin(phi) * np.sin(theta),
         radius * np.cos(phi),
+    ], axis=1).astype(np.float32)
+    cols = np.tile(np.array(color + (255,), dtype=np.uint8), (n_points, 1))
+    return verts, cols
def _make_glb(image_pil_native: Image.Image, pointmap_hwc: np.ndarray,
              mask_hw: np.ndarray, max_points: int = 200_000) -> str:
    """Export the foreground point cloud plus a camera marker as a .glb file.

    Args:
        image_pil_native: Source image used for per-point colours. It is
            resized to the pointmap grid here so colours line up with points.
        pointmap_hwc: ``(H, W, 3)`` array of per-pixel XYZ coordinates.
        mask_hw: ``(H, W)`` boolean foreground mask.
        max_points: Upper bound on exported cloud points (excluding the
            marker); larger clouds are randomly subsampled with a fixed seed.

    Returns:
        Path of the written temporary ``.glb`` file. The file is not deleted
        automatically.
    """
    h, w = pointmap_hwc.shape[:2]
    image_rgb = np.asarray(image_pil_native.resize((w, h), Image.LANCZOS))
    pts = pointmap_hwc.reshape(-1, 3).astype(np.float32)
    cols_rgb = image_rgb.reshape(-1, 3).astype(np.uint8)
    z = pts[:, 2]
    # Keep finite foreground points whose depth lies inside (0.05, 25.0).
    finite = np.isfinite(pts).all(axis=1) & (z > 0.05) & (z < 25.0) & mask_hw.reshape(-1)
    pts, cols_rgb = pts[finite], cols_rgb[finite]
    if len(pts) > max_points:
        # Fixed seed keeps the subsample reproducible across calls.
        idx = np.random.default_rng(0).choice(len(pts), size=max_points, replace=False)
        pts, cols_rgb = pts[idx], cols_rgb[idx]
    cam_verts, cam_cols = _camera_marker()
    verts = np.concatenate([pts, cam_verts], axis=0)
    # Append an opaque alpha channel so cloud colours match the marker's RGBA.
    alpha = np.full((len(cols_rgb), 1), 255, dtype=np.uint8)
    cols_rgba = np.concatenate(
        [np.concatenate([cols_rgb, alpha], axis=1), cam_cols], axis=0,
    )
    # Three.js viewers (and gr.Model3D) typically use Y-up. Sapiens2 pointmaps
    # come in camera frame with Y down, Z forward — negate both Y and Z so the
    # viewer's default orientation matches photographic intuition.
    verts = verts * np.array([1.0, -1.0, -1.0], dtype=np.float32)
    pc = trimesh.PointCloud(vertices=verts, colors=cols_rgba)
    # Reserve a unique path and close the handle straight away; the original
    # left the NamedTemporaryFile object open, leaking a file descriptor.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".glb") as tmp:
        out_path = tmp.name
    pc.export(out_path)
    return out_path
 # -----------------------------------------------------------------------------
 # Gradio handler
 @spaces.GPU(duration=120)
 def predict(image: Image.Image, size: str):
     """Gradio handler: image → .glb point cloud for the viewer and the download.

     Prints a ``[time]`` line for every stage. Returns ``(None, None)`` when no
     image was supplied.

     Args:
         image: Uploaded PIL image, or ``None``.
         size: Model-size key forwarded to ``_get_pointmap_model``.

     Returns:
         The same ``.glb`` path twice — one value per output component.
     """
     if image is None:
         return None, None

     def _ms(since: float) -> float:
         # Milliseconds elapsed since the given perf_counter reading.
         return (_t.perf_counter() - since) * 1000

     t_start = _t.perf_counter()
     rgb_pil = _cap_height(image.convert("RGB"))  # cap to 1024px height
     bgr = cv2.cvtColor(np.array(rgb_pil), cv2.COLOR_RGB2BGR)
     print(f"[time] convert+cap     {_ms(t_start):.0f} ms  (input {rgb_pil.size})")

     # Model lookup and inference are timed together as one stage.
     tick = _t.perf_counter()
     net = _get_pointmap_model(size)
     points = _estimate_pointmap(bgr, net)
     native_h, native_w = points.shape[:2]
     print(f"[time] pointmap        {_ms(tick):.0f} ms  (native {native_w}x{native_h})")

     tick = _t.perf_counter()
     fg = _foreground_mask(rgb_pil, native_h, native_w)
     print(f"[time] fg mask         {_ms(tick):.0f} ms")

     tick = _t.perf_counter()
     glb_path = _make_glb(rgb_pil, points, fg)
     print(f"[time] glb export      {_ms(tick):.0f} ms")
     print(f"[time] TOTAL           {_ms(t_start):.0f} ms")
     return glb_path, glb_path
 # -----------------------------------------------------------------------------
     with gr.Row(equal_height=True):
         inp = gr.Image(label="Input", type="pil", height=640)
+        out_glb = gr.Model3D(
+            label="Point cloud — drag to rotate, scroll to zoom, shift+drag to pan",
+            height=640,
+            clear_color=[0.07, 0.09, 0.13, 1.0],
+            display_mode="point_cloud",
+            zoom_speed=0.7,
+            pan_speed=0.5,
+        )
     with gr.Row():
         size = gr.Radio(
     gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
     with gr.Accordion("Raw Pointmap", open=False):
+        out_glb_file = gr.File(label="Point cloud (.glb — open in Blender/MeshLab/web viewers)")
+    run.click(predict, inputs=[inp, size], outputs=[out_glb, out_glb_file])
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -19,4 +19,4 @@ prettytable
 termcolor
 accelerate
 rich
-open3d

 termcolor
 accelerate
 rich
+trimesh