sapiens2-pointmap

Runtime error

App Files Files Community

Rawal Khirodkar committed 14 days ago

Commit

a0fd52f

1 Parent(s): 2f57cf8

Add fg-bg dropdown using v1 binary segmentation TorchScript model

Browse files

Files changed (1) hide show

app.py +61 -14

app.py CHANGED Viewed

@@ -3,6 +3,9 @@
 Image → per-pixel 3D pointmap (camera frame, metric units). The result is
 exported as a .ply point cloud and rendered with Gradio's Model3D component
 for interactive 3D viewing.
 """
 import sys
@@ -19,6 +22,7 @@ import spaces
 import torch
 import torch.nn.functional as F
 from PIL import Image
 from huggingface_hub import hf_hub_download
 from sapiens.dense.models import PointmapEstimator, init_model  # registers in registry
@@ -55,13 +59,26 @@ POINTMAP_MODELS = {
 }
 DEFAULT_SIZE = "1B"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # -----------------------------------------------------------------------------
 # Model cache
 _pointmap_model_cache: dict = {}
 def _get_pointmap_model(size: str):
@@ -73,9 +90,18 @@ def _get_pointmap_model(size: str):
     return _pointmap_model_cache[size]
-print("[startup] pre-loading all pointmap sizes ...")
 for _size in POINTMAP_MODELS:
     _get_pointmap_model(_size)
 print("[startup] ready.")
@@ -105,17 +131,28 @@ def _estimate_pointmap(image_bgr: np.ndarray, model) -> np.ndarray:
     return pointmap.squeeze(0).cpu().float().numpy().transpose(1, 2, 0)  # (H, W, 3)
 # -----------------------------------------------------------------------------
 # Point cloud export
-def _make_ply(image_rgb: np.ndarray, pointmap_hwc: np.ndarray, max_points: int = 200_000) -> str:
-    """Subsample, filter to a reasonable depth range, and write a .ply file."""
     pts = pointmap_hwc.reshape(-1, 3)
     cols = (image_rgb.reshape(-1, 3).astype(np.float32) / 255.0)
-    # Drop points with non-finite or extreme depth
     z = pts[:, 2]
     finite = np.isfinite(pts).all(axis=1) & (z > 0.05) & (z < 25.0)
     pts, cols = pts[finite], cols[finite]
     if len(pts) > max_points:
@@ -135,16 +172,20 @@ def _make_ply(image_rgb: np.ndarray, pointmap_hwc: np.ndarray, max_points: int =
 # Gradio handler
 @spaces.GPU(duration=180)
-def predict(image: Image.Image, size: str):
     if image is None:
         return None, None
-    image_rgb = np.array(image.convert("RGB"))
     image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
     model = _get_pointmap_model(size)
-    pointmap = _estimate_pointmap(image_bgr, model)  # (H, W, 3) metric, camera frame
-    ply_path = _make_ply(image_rgb, pointmap)
     npy_path = tempfile.NamedTemporaryFile(delete=False, suffix=".npy").name
     np.save(npy_path, pointmap.astype(np.float32))
@@ -173,18 +214,24 @@ with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Default()) as demo:
     with gr.Row():
         with gr.Column():
             inp = gr.Image(label="Input", type="pil")
-            size = gr.Radio(
-                choices=list(POINTMAP_MODELS.keys()),
-                value=DEFAULT_SIZE,
-                label="Model size",
-            )
             run = gr.Button("Run", variant="primary")
             gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
         with gr.Column():
             out_ply = gr.Model3D(label="Point cloud (drag to rotate)", clear_color=[0.05, 0.05, 0.05, 1.0])
             out_npy = gr.File(label="Raw pointmap (.npy float32 [H, W, 3] in meters)")
-    run.click(predict, inputs=[inp, size], outputs=[out_ply, out_npy])
 if __name__ == "__main__":

 Image → per-pixel 3D pointmap (camera frame, metric units). The result is
 exported as a .ply point cloud and rendered with Gradio's Model3D component
 for interactive 3D viewing.
+Optionally applies a v1 foreground/background mask so only person points end
+up in the cloud (background is dropped entirely).
 """
 import sys
 import torch
 import torch.nn.functional as F
 from PIL import Image
+from torchvision import transforms
 from huggingface_hub import hf_hub_download
 from sapiens.dense.models import PointmapEstimator, init_model  # registers in registry
 }
 DEFAULT_SIZE = "1B"
+FG_REPO = "facebook/sapiens-seg-foreground-1b-torchscript"
+FG_FILENAME = "sapiens_1b_seg_foreground_epoch_8_torchscript.pt2"
+BG_OPTIONS = ["fg-bg", "no-bg-removal"]
+DEFAULT_BG = "fg-bg"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Preprocessing pipeline applied to the PIL image before it is fed to the
+# foreground/background model: resize to a fixed 1024x768 size, convert to a
+# tensor, then normalize per channel. The mean/std constants are written on a
+# 0-255 scale and divided by 255 because ToTensor() yields values in [0, 1].
+# NOTE(review): torchvision's Resize takes (height, width), so this is
+# presumably 1024 high by 768 wide — confirm it matches the model's expected input.
+_fg_transform = transforms.Compose([
+    transforms.Resize((1024, 768)),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[123.5 / 255, 116.5 / 255, 103.5 / 255],
+                         std=[58.5 / 255, 57.0 / 255, 57.5 / 255]),
+])
 # -----------------------------------------------------------------------------
 # Model cache
 _pointmap_model_cache: dict = {}
+_fg_model = None
 def _get_pointmap_model(size: str):
     return _pointmap_model_cache[size]
+def _get_fg_model():
+    """Return the foreground/background TorchScript model, loading it on first use.
+
+    On the first call the checkpoint ``FG_FILENAME`` is fetched from ``FG_REPO``
+    with ``hf_hub_download``, loaded with ``torch.jit.load``, switched to eval
+    mode and moved to ``DEVICE``. The loaded model is stored in the
+    module-level ``_fg_model`` so that later calls return the same object.
+    """
+    global _fg_model
+    # Load only once; subsequent calls reuse the cached module-level instance.
+    if _fg_model is None:
+        ckpt = hf_hub_download(repo_id=FG_REPO, filename=FG_FILENAME)
+        _fg_model = torch.jit.load(ckpt).eval().to(DEVICE)
+    return _fg_model
+print("[startup] pre-loading all pointmap sizes + fg/bg model ...")
 for _size in POINTMAP_MODELS:
     _get_pointmap_model(_size)
+_get_fg_model()
 print("[startup] ready.")
     return pointmap.squeeze(0).cpu().float().numpy().transpose(1, 2, 0)  # (H, W, 3)
+def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np.ndarray:
+    """Run the foreground/background model and return a boolean mask.
+
+    Args:
+        image_pil: Input image; it is preprocessed with ``_fg_transform``.
+        target_h: Height the model output is resized to before thresholding.
+        target_w: Width the model output is resized to before thresholding.
+
+    Returns:
+        A NumPy boolean array that is True wherever the arg-max over
+        dimension 1 of the resized model output is non-zero.
+        NOTE(review): presumably shape (target_h, target_w) with class 0
+        meaning background — confirm against the model's output layout.
+    """
+    fg = _get_fg_model()
+    # Add a leading batch dimension and move the input to the model's device.
+    inputs = _fg_transform(image_pil).unsqueeze(0).to(DEVICE)
+    # Inference only: no gradients are needed.
+    with torch.no_grad():
+        out = fg(inputs)
+    # Resize the raw model output to the requested size before taking the arg-max.
+    out = F.interpolate(out, size=(target_h, target_w), mode="bilinear", align_corners=False)
+    # Arg-max over dim 1, keep the first batch item, and treat any non-zero index as foreground.
+    return (out.argmax(dim=1)[0] > 0).cpu().numpy()
 # -----------------------------------------------------------------------------
 # Point cloud export
+def _make_ply(image_rgb: np.ndarray, pointmap_hwc: np.ndarray, mask_hw: np.ndarray | None = None,
+              max_points: int = 200_000) -> str:
+    """Subsample, optionally mask to foreground, and write a .ply file."""
     pts = pointmap_hwc.reshape(-1, 3)
     cols = (image_rgb.reshape(-1, 3).astype(np.float32) / 255.0)
     z = pts[:, 2]
     finite = np.isfinite(pts).all(axis=1) & (z > 0.05) & (z < 25.0)
+    if mask_hw is not None:
+        finite &= mask_hw.reshape(-1)
     pts, cols = pts[finite], cols[finite]
     if len(pts) > max_points:
 # Gradio handler
 @spaces.GPU(duration=180)
+def predict(image: Image.Image, size: str, bg_mode: str):
     if image is None:
         return None, None
+    image_pil = image.convert("RGB")
+    image_rgb = np.array(image_pil)
     image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
+    h0, w0 = image_rgb.shape[:2]
     model = _get_pointmap_model(size)
+    pointmap = _estimate_pointmap(image_bgr, model)
+    mask = _foreground_mask(image_pil, h0, w0) if bg_mode == "fg-bg" else None
+    ply_path = _make_ply(image_rgb, pointmap, mask)
     npy_path = tempfile.NamedTemporaryFile(delete=False, suffix=".npy").name
     np.save(npy_path, pointmap.astype(np.float32))
     with gr.Row():
         with gr.Column():
             inp = gr.Image(label="Input", type="pil")
+            with gr.Row():
+                size = gr.Radio(
+                    choices=list(POINTMAP_MODELS.keys()),
+                    value=DEFAULT_SIZE,
+                    label="Model size",
+                )
+                bg = gr.Radio(
+                    choices=BG_OPTIONS,
+                    value=DEFAULT_BG,
+                    label="Background",
+                )
             run = gr.Button("Run", variant="primary")
             gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
         with gr.Column():
             out_ply = gr.Model3D(label="Point cloud (drag to rotate)", clear_color=[0.05, 0.05, 0.05, 1.0])
             out_npy = gr.File(label="Raw pointmap (.npy float32 [H, W, 3] in meters)")
+    run.click(predict, inputs=[inp, size, bg], outputs=[out_ply, out_npy])
 if __name__ == "__main__":