sapiens2-pointmap

Running on Zero

App Files Files Community

solrz committed on 5 days ago

Commit

82e2bb0

1 Parent(s): b9c8e7d

Add video pointmap sequence endpoint

Browse files

Files changed (1) hide show

app.py +213 -32

app.py CHANGED Viewed

@@ -13,8 +13,10 @@ import sys
 import os
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 import tempfile
 import time as _t
 import cv2
 import gradio as gr
@@ -59,13 +61,17 @@ POINTMAP_MODELS = {
         "config": os.path.join(CONFIGS_DIR, "sapiens2_5b_pointmap_render_people-1024x768.py"),
     },
 }
-DEFAULT_SIZE = "1B"
 FG_REPO = "facebook/sapiens-seg-foreground-1b-torchscript"
 FG_FILENAME = "sapiens_1b_seg_foreground_epoch_8_torchscript.pt2"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MAX_HEIGHT = 1024  # cap input height before processing — keeps everything fast
 _fg_transform = transforms.Compose([
     transforms.Resize((1024, 768)),
@@ -99,11 +105,7 @@ def _get_fg_model():
     return _fg_model
-print("[startup] pre-loading all pointmap sizes + fg/bg ...")
-for _size in POINTMAP_MODELS:
-    _get_pointmap_model(_size)
-_get_fg_model()
-print("[startup] ready.")
 # -----------------------------------------------------------------------------
@@ -255,6 +257,129 @@ def _make_glb(image_pil_texture: Image.Image, pointmap_hwc: np.ndarray,
     return out_path
 # -----------------------------------------------------------------------------
 # Gradio handler
@@ -295,6 +420,27 @@ def predict(image: Image.Image, size: str):
     return depth_pil, glb_path
 # -----------------------------------------------------------------------------
 # UI
@@ -358,33 +504,68 @@ HEADER_HTML = """
 with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
     gr.HTML(HEADER_HTML)
-    # Row 1: input ↔ 3D mesh, equal height
-    with gr.Row(equal_height=True):
-        inp = gr.Image(label="Input", type="pil", height=640, scale=2)
-        out_glb = gr.Model3D(
-            label="Pointmap",
-            height=640,
-            clear_color=[0.97, 0.97, 0.97, 1.0],   # cinematic studio white
-            camera_position=(35, 70, 1.6),  # closer, since scene is centered on the human
-            zoom_speed=0.7,
-            pan_speed=0.5,
-            scale=3,
-        )
-    # Row 2: controls (with examples below them) on the left | depth heatmap on the right.
-    with gr.Row():
-        with gr.Column(scale=2, min_width=320):
-            size = gr.Radio(
-                choices=list(POINTMAP_MODELS.keys()),
-                value=DEFAULT_SIZE,
-                label="Model",
-                container=False,
             )
-            run = gr.Button("Run", variant="primary")
-            gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=16)
-        out_depth = gr.Image(label="Depth (Z)", type="pil", height=640, scale=3)
-    run.click(predict, inputs=[inp, size], outputs=[out_depth, out_glb])
 if __name__ == "__main__":

 import os
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import json
 import tempfile
 import time as _t
+import zipfile
 import cv2
 import gradio as gr
         "config": os.path.join(CONFIGS_DIR, "sapiens2_5b_pointmap_render_people-1024x768.py"),
     },
 }
+DEFAULT_SIZE = "0.4B"
 FG_REPO = "facebook/sapiens-seg-foreground-1b-torchscript"
 FG_FILENAME = "sapiens_1b_seg_foreground_epoch_8_torchscript.pt2"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MAX_HEIGHT = 1024  # cap input height before processing — keeps everything fast
+VIDEO_MAX_HEIGHT = 512
+VIDEO_DEFAULT_FRAMES = 36
+VIDEO_DEFAULT_STRIDE = 5
+VIDEO_MAX_POINTS = 45_000
 _fg_transform = transforms.Compose([
     transforms.Resize((1024, 768)),
     return _fg_model
+print("[startup] ready; models will load lazily on first request.")
 # -----------------------------------------------------------------------------
     return out_path
+def _sample_video_frames(video_path: str, max_frames: int) -> tuple[list[Image.Image], float, int, int]:
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise gr.Error("Could not open the uploaded video.")
+    fps = float(cap.get(cv2.CAP_PROP_FPS) or 0) or 24.0
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
+    frame_limit = int(max(1, min(max_frames, 120)))
+    if total_frames > 0:
+        indices = np.linspace(0, max(total_frames - 1, 0), min(frame_limit, total_frames), dtype=np.int32)
+    else:
+        indices = np.arange(frame_limit, dtype=np.int32)
+    frames: list[Image.Image] = []
+    source_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) or 0)
+    source_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) or 0)
+    for index in indices:
+        if total_frames > 0:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, int(index))
+        ok, frame_bgr = cap.read()
+        if not ok:
+            if total_frames <= 0:
+                break
+            continue
+        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
+        frames.append(Image.fromarray(frame_rgb))
+        if total_frames <= 0 and len(frames) >= frame_limit:
+            break
+    cap.release()
+    if not frames:
+        raise gr.Error("No readable frames were found in the uploaded video.")
+    return frames, fps, source_w, source_h
+def _select_point_coords(mask: np.ndarray, stride: int, max_points: int) -> tuple[np.ndarray, np.ndarray]:
+    stride = int(max(1, min(stride, 16)))
+    grid = np.zeros_like(mask, dtype=bool)
+    grid[::stride, ::stride] = True
+    selected = mask & grid
+    yy, xx = np.where(selected)
+    if len(yy) < 128:
+        yy, xx = np.where(grid)
+    if len(yy) > max_points:
+        keep = np.linspace(0, len(yy) - 1, max_points, dtype=np.int64)
+        yy = yy[keep]
+        xx = xx[keep]
+    return yy.astype(np.int32), xx.astype(np.int32)
+def _point_sequence_zip(
+    frames: list[Image.Image],
+    size: str,
+    max_frames: int,
+    point_stride: int,
+    fps: float,
+    source_w: int,
+    source_h: int,
+) -> str:
+    model = _get_pointmap_model(size)
+    sampled = frames[: int(max(1, min(max_frames, len(frames))))]
+    positions_frames: list[np.ndarray] = []
+    colors_frames: list[np.ndarray] = []
+    sample_y: np.ndarray | None = None
+    sample_x: np.ndarray | None = None
+    native_w = 0
+    native_h = 0
+    for frame_index, frame in enumerate(sampled):
+        t = _t.perf_counter()
+        image_pil = _cap_height(frame.convert("RGB"), VIDEO_MAX_HEIGHT)
+        image_bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
+        pointmap = _estimate_pointmap(image_bgr, model)
+        native_h, native_w = pointmap.shape[:2]
+        if sample_y is None or sample_x is None:
+            mask = _foreground_mask(image_pil, native_h, native_w)
+            sample_y, sample_x = _select_point_coords(mask, point_stride, VIDEO_MAX_POINTS)
+            print(f"[video] selected {len(sample_y)} points at native {native_w}x{native_h}")
+        image_native = np.array(image_pil.resize((native_w, native_h), Image.LANCZOS))
+        points = pointmap[sample_y, sample_x].astype(np.float32)
+        finite = np.isfinite(points).all(axis=1) & (points[:, 2] > 0.05) & (points[:, 2] < 25.0)
+        if finite.any():
+            centroid = points[finite].mean(axis=0).astype(np.float32)
+        else:
+            centroid = np.zeros(3, dtype=np.float32)
+        points = (points - centroid) * np.array([1.0, -1.0, -1.0], dtype=np.float32)
+        points[~finite] = 0
+        colors = image_native[sample_y, sample_x, :3].astype(np.uint8)
+        colors[~finite] = 0
+        positions_frames.append(points)
+        colors_frames.append(colors)
+        print(f"[video] frame {frame_index + 1}/{len(sampled)} {(_t.perf_counter() - t) * 1000:.0f} ms")
+    if not positions_frames:
+        raise gr.Error("Pointmap inference did not produce any frames.")
+    positions = np.stack(positions_frames, axis=0).astype(np.float32)
+    colors = np.stack(colors_frames, axis=0).astype(np.uint8)
+    metadata = {
+        "format": "fpbox-sapiens-pointmap-sequence-v1",
+        "model": f"sapiens2-pointmap-{size}",
+        "frameCount": int(positions.shape[0]),
+        "fps": float(min(fps, max(1, positions.shape[0]))),
+        "pointCount": int(positions.shape[1]),
+        "width": int(native_w),
+        "height": int(native_h),
+        "sourceWidth": int(source_w),
+        "sourceHeight": int(source_h),
+        "coordinateSystem": "x, -y, -z, centered per frame",
+        "dtype": {"positions": "float32", "colors": "uint8"},
+    }
+    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".zip").name
+    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
+        zf.writestr("metadata.json", json.dumps(metadata, indent=2))
+        zf.writestr("positions_f32.bin", positions.tobytes(order="C"))
+        zf.writestr("colors_u8.bin", colors.tobytes(order="C"))
+    return out_path
 # -----------------------------------------------------------------------------
 # Gradio handler
     return depth_pil, glb_path
+@spaces.GPU(duration=300)
+def predict_video(video_path: str, size: str, max_frames: int, point_stride: int):
+    if video_path is None:
+        return None
+    t0 = _t.perf_counter()
+    frames, fps, source_w, source_h = _sample_video_frames(video_path, max_frames)
+    print(f"[video] sampled {len(frames)} frames from {source_w}x{source_h} video at {fps:.2f} fps")
+    zip_path = _point_sequence_zip(
+        frames=frames,
+        size=size,
+        max_frames=max_frames,
+        point_stride=point_stride,
+        fps=fps,
+        source_w=source_w,
+        source_h=source_h,
+    )
+    print(f"[video] TOTAL {(_t.perf_counter() - t0) * 1000:.0f} ms")
+    return zip_path
 # -----------------------------------------------------------------------------
 # UI
 with gr.Blocks(title="Sapiens2 Pointmap", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
     gr.HTML(HEADER_HTML)
+    # One tab per workflow: single-image inference and video sequence export.
+    with gr.Tabs():
+        with gr.Tab("Image"):
+            # Row 1: input ↔ 3D mesh, equal height
+            with gr.Row(equal_height=True):
+                inp = gr.Image(label="Input", type="pil", height=640, scale=2)
+                out_glb = gr.Model3D(
+                    label="Pointmap",
+                    height=640,
+                    clear_color=[0.97, 0.97, 0.97, 1.0],   # cinematic studio white
+                    camera_position=(35, 70, 1.6),  # closer, since scene is centered on the human
+                    zoom_speed=0.7,
+                    pan_speed=0.5,
+                    scale=3,
+                )
+            # Row 2: controls (with examples below them) on the left | depth heatmap on the right.
+            with gr.Row():
+                with gr.Column(scale=2, min_width=320):
+                    size = gr.Radio(
+                        choices=list(POINTMAP_MODELS.keys()),
+                        value=DEFAULT_SIZE,
+                        label="Model",
+                        container=False,
+                    )
+                    run = gr.Button("Run", variant="primary")
+                    gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=16)
+                out_depth = gr.Image(label="Depth (Z)", type="pil", height=640, scale=3)
+            # predict returns (depth image, GLB path), matching the two outputs.
+            run.click(predict, inputs=[inp, size], outputs=[out_depth, out_glb])
+        # Video tab: upload a clip, choose model / frame count / stride, and
+        # download the generated pointmap sequence archive.
+        with gr.Tab("Video"):
+            with gr.Row():
+                video_inp = gr.Video(label="Input Video", height=420)
+                sequence_zip = gr.File(label="Pointmap Sequence (.zip)")
+            with gr.Row():
+                video_size = gr.Radio(
+                    choices=list(POINTMAP_MODELS.keys()),
+                    value=DEFAULT_SIZE,
+                    label="Model",
+                    container=False,
+                )
+                # Maximum of 120 matches the clamp in _sample_video_frames.
+                video_frames = gr.Slider(
+                    minimum=1,
+                    maximum=120,
+                    step=1,
+                    value=VIDEO_DEFAULT_FRAMES,
+                    label="Sampled Frames",
+                )
+                # Maximum of 16 matches the clamp in _select_point_coords.
+                video_stride = gr.Slider(
+                    minimum=1,
+                    maximum=16,
+                    step=1,
+                    value=VIDEO_DEFAULT_STRIDE,
+                    label="Point Stride",
+                )
+            run_video = gr.Button("Run Video Pointmap", variant="primary")
+            # Wire the button to predict_video; the handler is registered under
+            # the explicit API name "video_pointmap".
+            run_video.click(
+                predict_video,
+                inputs=[video_inp, video_size, video_frames, video_stride],
+                outputs=[sequence_zip],
+                api_name="video_pointmap",
             )
 if __name__ == "__main__":