Spaces:

facebook
/

sapiens2-normal

Running on Zero

App Files Community

Rawal Khirodkar committed 14 days ago

Commit

d84d54c

1 Parent(s): 977839e

Add fg-bg dropdown using v1 binary segmentation TorchScript model

Browse files

Files changed (1) hide show

app.py +70 -15

app.py CHANGED Viewed

@@ -2,6 +2,9 @@
 Image → per-pixel surface normals. Visualized by RGB-encoding the unit-length
 (x, y, z) normal: r = (x + 1) / 2, g = (y + 1) / 2, b = (z + 1) / 2.
 """
 import sys
@@ -17,6 +20,7 @@ import spaces
 import torch
 import torch.nn.functional as F
 from PIL import Image
 from huggingface_hub import hf_hub_download
 from sapiens.dense.models import NormalEstimator, init_model  # NormalEstimator triggers registry
@@ -53,13 +57,28 @@ NORMAL_MODELS = {
 }
 DEFAULT_SIZE = "1B"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # -----------------------------------------------------------------------------
 # Model cache
 _normal_model_cache: dict = {}
 def _get_normal_model(size: str):
@@ -71,9 +90,19 @@ def _get_normal_model(size: str):
     return _normal_model_cache[size]
-print("[startup] pre-loading all normal sizes ...")
 for _size in NORMAL_MODELS:
     _get_normal_model(_size)
 print("[startup] ready.")
@@ -91,35 +120,55 @@ def _estimate_normal(image_bgr: np.ndarray, model) -> np.ndarray:
     with torch.no_grad():
         normals = model(inputs)  # (1, 3, H, W)
-    # Unit-length normalization, interpolate to original size, cast to numpy
     normals = normals / normals.norm(dim=1, keepdim=True).clamp_min(1e-6)
     normals = F.interpolate(normals, size=(h0, w0), mode="bilinear", align_corners=False)
-    normals = normals[0].cpu().float().numpy()  # (3, H, W) in [-1, 1]
     return normals.transpose(1, 2, 0)  # (H, W, 3)
 def _normal_to_rgb(normal_hwc: np.ndarray) -> np.ndarray:
     rgb = (((normal_hwc + 1.0) / 2.0) * 255.0).clip(0, 255).astype(np.uint8)
-    return rgb[:, :, ::-1]  # match training viz channel order
 # -----------------------------------------------------------------------------
 # Gradio handler
 @spaces.GPU(duration=120)
-def predict(image: Image.Image, size: str):
     if image is None:
         return None, None
-    image_rgb = np.array(image.convert("RGB"))
     image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
     model = _get_normal_model(size)
     normals = _estimate_normal(image_bgr, model)  # (H, W, 3) in [-1, 1]
-    rgb = _normal_to_rgb(normals)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as f:
-        np.save(f.name, normals.astype(np.float32))
         npy_path = f.name
     return Image.fromarray(rgb), npy_path
@@ -146,18 +195,24 @@ with gr.Blocks(title="Sapiens2 Normal", theme=gr.themes.Default()) as demo:
     with gr.Row():
         with gr.Column():
             inp = gr.Image(label="Input", type="pil")
-            size = gr.Radio(
-                choices=list(NORMAL_MODELS.keys()),
-                value=DEFAULT_SIZE,
-                label="Model size",
-            )
             run = gr.Button("Run", variant="primary")
             gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
         with gr.Column():
             out_img = gr.Image(label="Surface normal (RGB-encoded)", type="pil")
-            out_npy = gr.File(label="Raw normals (.npy float32 [-1, 1])")
-    run.click(predict, inputs=[inp, size], outputs=[out_img, out_npy])
 if __name__ == "__main__":

 Image → per-pixel surface normals. Visualized by RGB-encoding the unit-length
 (x, y, z) normal: r = (x + 1) / 2, g = (y + 1) / 2, b = (z + 1) / 2.
+Optionally applies a v1 foreground/background mask so only person pixels are
+shown (background reads as a flat colour).
 """
 import sys
 import torch
 import torch.nn.functional as F
 from PIL import Image
+from torchvision import transforms
 from huggingface_hub import hf_hub_download
 from sapiens.dense.models import NormalEstimator, init_model  # NormalEstimator triggers registry
 }
 DEFAULT_SIZE = "1B"
+# v1 binary fg/bg TorchScript model — uses a different normalization (PIL → tensor → ImageNet).
+FG_REPO = "facebook/sapiens-seg-foreground-1b-torchscript"
+FG_FILENAME = "sapiens_1b_seg_foreground_epoch_8_torchscript.pt2"
+BG_OPTIONS = ["fg-bg", "no-bg-removal"]
+DEFAULT_BG = "fg-bg"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Pre-process for v1 fg-bg model (matches v1 sapiens-normal Space recipe).
+_fg_transform = transforms.Compose([
+    transforms.Resize((1024, 768)),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[123.5 / 255, 116.5 / 255, 103.5 / 255],
+                         std=[58.5 / 255, 57.0 / 255, 57.5 / 255]),
+])
 # -----------------------------------------------------------------------------
 # Model cache
 _normal_model_cache: dict = {}
+_fg_model = None
 def _get_normal_model(size: str):
     return _normal_model_cache[size]
+def _get_fg_model():
+    global _fg_model
+    if _fg_model is None:
+        ckpt = hf_hub_download(repo_id=FG_REPO, filename=FG_FILENAME)
+        model = torch.jit.load(ckpt).eval().to(DEVICE)
+        _fg_model = model
+    return _fg_model
+print("[startup] pre-loading all normal sizes + fg/bg model ...")
 for _size in NORMAL_MODELS:
     _get_normal_model(_size)
+_get_fg_model()
 print("[startup] ready.")
     with torch.no_grad():
         normals = model(inputs)  # (1, 3, H, W)
     normals = normals / normals.norm(dim=1, keepdim=True).clamp_min(1e-6)
     normals = F.interpolate(normals, size=(h0, w0), mode="bilinear", align_corners=False)
+    normals = normals[0].cpu().float().numpy()
     return normals.transpose(1, 2, 0)  # (H, W, 3)
+def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np.ndarray:
+    """Returns a (H, W) bool mask using the v1 binary fg/bg torchscript model."""
+    fg = _get_fg_model()
+    inputs = _fg_transform(image_pil).unsqueeze(0).to(DEVICE)
+    with torch.no_grad():
+        out = fg(inputs)  # (1, K, H, W) logits
+    out = F.interpolate(out, size=(target_h, target_w), mode="bilinear", align_corners=False)
+    return (out.argmax(dim=1)[0] > 0).cpu().numpy()
 def _normal_to_rgb(normal_hwc: np.ndarray) -> np.ndarray:
     rgb = (((normal_hwc + 1.0) / 2.0) * 255.0).clip(0, 255).astype(np.uint8)
+    return rgb[:, :, ::-1]
 # -----------------------------------------------------------------------------
 # Gradio handler
 @spaces.GPU(duration=120)
+def predict(image: Image.Image, size: str, bg_mode: str):
     if image is None:
         return None, None
+    image_pil = image.convert("RGB")
+    image_rgb = np.array(image_pil)
     image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
+    h0, w0 = image_rgb.shape[:2]
     model = _get_normal_model(size)
     normals = _estimate_normal(image_bgr, model)  # (H, W, 3) in [-1, 1]
+    raw = normals.copy()
+    if bg_mode == "fg-bg":
+        mask = _foreground_mask(image_pil, h0, w0)
+        raw[~mask] = np.nan
+        # For viz, show background as middle-grey rather than a saturated colour.
+        rgb = _normal_to_rgb(normals)
+        rgb[~mask] = 128
+    else:
+        rgb = _normal_to_rgb(normals)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".npy") as f:
+        np.save(f.name, raw.astype(np.float32))
         npy_path = f.name
     return Image.fromarray(rgb), npy_path
     with gr.Row():
         with gr.Column():
             inp = gr.Image(label="Input", type="pil")
+            with gr.Row():
+                size = gr.Radio(
+                    choices=list(NORMAL_MODELS.keys()),
+                    value=DEFAULT_SIZE,
+                    label="Model size",
+                )
+                bg = gr.Radio(
+                    choices=BG_OPTIONS,
+                    value=DEFAULT_BG,
+                    label="Background",
+                )
             run = gr.Button("Run", variant="primary")
             gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
         with gr.Column():
             out_img = gr.Image(label="Surface normal (RGB-encoded)", type="pil")
+            out_npy = gr.File(label="Raw normals (.npy float32 [-1, 1]; NaN where bg)")
+    run.click(predict, inputs=[inp, size, bg], outputs=[out_img, out_npy])
 if __name__ == "__main__":