Spaces:
Running on Zero
Rawal Khirodkar committed on
Commit 2070091 · 1 Parent(s): 9884195
Normal: copy seg aesthetic; fix output (unpad + drop bogus channel swap); 0.4B-only preload
app.py
CHANGED
@@ -3,8 +3,7 @@
 Image → per-pixel surface normals. Visualized by RGB-encoding the unit-length
 (x, y, z) normal: r = (x + 1) / 2, g = (y + 1) / 2, b = (z + 1) / 2.
 
-Optionally applies a v1
-shown (background reads as a flat colour).
+Optionally applies a v1 binary fg/bg mask so background pixels are blacked out.
 """
 
 import sys
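A quick illustration of the encoding convention the docstring pins down (hypothetical helpers, not part of app.py):

```python
import numpy as np

def encode_normal(n: np.ndarray) -> np.ndarray:
    """(..., 3) unit normals in [-1, 1] -> uint8 RGB per r = (x + 1) / 2 etc."""
    return np.clip((n + 1.0) / 2.0 * 255.0, 0, 255).astype(np.uint8)

def decode_normal(rgb: np.ndarray) -> np.ndarray:
    """Inverse map; re-normalizes to undo uint8 quantization error."""
    n = rgb.astype(np.float32) / 255.0 * 2.0 - 1.0
    return n / np.maximum(np.linalg.norm(n, axis=-1, keepdims=True), 1e-8)

up = np.array([0.0, 1.0, 0.0])                    # normal along +y
assert tuple(encode_normal(up)) == (127, 255, 127)
```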
@@ -23,7 +22,7 @@ from PIL import Image
 from torchvision import transforms
 
 from huggingface_hub import hf_hub_download
-from sapiens.dense.models import NormalEstimator, init_model  #
+from sapiens.dense.models import NormalEstimator, init_model  # registers NormalEstimator
 _ = NormalEstimator
 
 
@@ -55,9 +54,9 @@ NORMAL_MODELS = {
         "config": os.path.join(CONFIGS_DIR, "sapiens2_5b_normal_metasim_render_people-1024x768.py"),
     },
 }
-DEFAULT_SIZE = "
+DEFAULT_SIZE = "0.4B"  # iteration mode → only this is preloaded; others lazy-load on click
 
-# v1 binary fg/bg TorchScript model
+# v1 binary fg/bg TorchScript model.
 FG_REPO = "facebook/sapiens-seg-foreground-1b-torchscript"
 FG_FILENAME = "sapiens_1b_seg_foreground_epoch_8_torchscript.pt2"
 BG_OPTIONS = ["fg-bg", "no-bg-removal"]
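`_get_normal_model` itself sits outside these hunks; a minimal sketch of the lazy, cached loader that the `DEFAULT_SIZE` comment implies, reusing app.py's `NORMAL_MODELS`, `DEVICE`, `hf_hub_download`, and `init_model` (the mmseg-style `init_model(config, checkpoint, device)` signature and the `repo`/`filename` fields are assumptions):

```python
_normal_models: dict = {}

def _get_normal_model(size: str):
    """Load-once cache: the first call per size downloads + builds; later calls hit the dict."""
    if size not in _normal_models:
        entry = NORMAL_MODELS[size]
        ckpt = hf_hub_download(repo_id=entry["repo"], filename=entry["filename"])  # fields assumed
        _normal_models[size] = init_model(entry["config"], ckpt, device=DEVICE)
    return _normal_models[size]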
@@ -65,7 +64,6 @@ DEFAULT_BG = "fg-bg"
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Pre-process for v1 fg-bg model (matches v1 sapiens-normal Space recipe).
 _fg_transform = transforms.Compose([
     transforms.Resize((1024, 768)),
     transforms.ToTensor(),
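The hunk's context window ends inside the Compose, so the tail of `_fg_transform` isn't shown. A plausible full version for orientation (the Normalize statistics are an assumption matching common Sapiens v1 recipes, not read from this diff):

```python
_fg_transform = transforms.Compose([
    transforms.Resize((1024, 768)),
    transforms.ToTensor(),
    # Assumed ImageNet-magnitude statistics, divided by 255 since ToTensor scales to [0, 1].
    transforms.Normalize(mean=[123.5 / 255, 116.5 / 255, 103.5 / 255],
                         std=[58.5 / 255, 57.0 / 255, 57.5 / 255]),
])
```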
@@ -94,51 +92,56 @@ def _get_fg_model():
     global _fg_model
     if _fg_model is None:
         ckpt = hf_hub_download(repo_id=FG_REPO, filename=FG_FILENAME)
-
-        _fg_model = model
+        _fg_model = torch.jit.load(ckpt).eval().to(DEVICE)
     return _fg_model
 
 
-
-
-
+# Iteration mode: only preload the default (0.4B) for fast Space boot.
+# Re-enable full preload by uncommenting the loop below.
+print("[startup] pre-loading 0.4B (iteration mode) + fg/bg ...")
+_get_normal_model(DEFAULT_SIZE)
 _get_fg_model()
+# for _size in NORMAL_MODELS:
+#     _get_normal_model(_size)
 print("[startup] ready.")
 
 
 # -----------------------------------------------------------------------------
-# Inference
+# Inference (mirrors sapiens/dense/tools/vis/vis_normal.py)
 
 def _estimate_normal(image_bgr: np.ndarray, model) -> np.ndarray:
     h0, w0 = image_bgr.shape[:2]
-    data = model.pipeline(dict(img=image_bgr))
-    data = model.data_preprocessor(data)
-    inputs = data["inputs"]
-    if inputs.ndim == 3:
-        inputs = inputs.unsqueeze(0)
+    data = model.pipeline(dict(img=image_bgr))  # resize + pad
+    data = model.data_preprocessor(data)  # normalize + batch
+    inputs, data_samples = data["inputs"], data["data_samples"]
 
     with torch.no_grad():
-
+        normal = model(inputs)  # (1, 3, padded_H, padded_W)
+        normal = normal / normal.norm(dim=1, keepdim=True).clamp_min(1e-8)
 
-
-
-
-
+    pad_left, pad_right, pad_top, pad_bottom = data_samples["meta"]["padding_size"]
+    normal = normal[
+        :, :,
+        pad_top : inputs.shape[2] - pad_bottom,
+        pad_left : inputs.shape[3] - pad_right,
+    ]
+    normal = F.interpolate(normal, size=(h0, w0), mode="bilinear", align_corners=False)
+    return normal.squeeze(0).cpu().float().numpy().transpose(1, 2, 0)  # (H, W, 3) in [-1, 1]
 
 
 def _foreground_mask(image_pil: Image.Image, target_h: int, target_w: int) -> np.ndarray:
-    """Returns a (H, W) bool mask using the v1 binary fg/bg torchscript model."""
     fg = _get_fg_model()
     inputs = _fg_transform(image_pil).unsqueeze(0).to(DEVICE)
     with torch.no_grad():
-        out = fg(inputs)
+        out = fg(inputs)  # (1, K, H, W) logits
     out = F.interpolate(out, size=(target_h, target_w), mode="bilinear", align_corners=False)
     return (out.argmax(dim=1)[0] > 0).cpu().numpy()
 
 
 def _normal_to_rgb(normal_hwc: np.ndarray) -> np.ndarray:
-
-
+    """(H, W, 3) in [-1, 1] → (H, W, 3) uint8 RGB. NO channel swap (the swap in
+    vis_normal.py is purely for cv2.imwrite's BGR convention)."""
+    return (((normal_hwc + 1.0) / 2.0) * 255.0).clip(0, 255).astype(np.uint8)
 
 
 # -----------------------------------------------------------------------------
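The headline fix in this commit is the unpad step: the model predicts on a padded canvas, so the padding must be cropped before resizing back to the input resolution. A self-contained shape check of that logic with a dummy tensor (arbitrary pad values and sizes; no model involved):

```python
import torch
import torch.nn.functional as F

h0, w0 = 700, 500                                  # original image size
pad_left, pad_right, pad_top, pad_bottom = 10, 10, 0, 0
normal = torch.randn(1, 3, 1024, 768)              # stand-in for model(inputs)
normal = normal / normal.norm(dim=1, keepdim=True).clamp_min(1e-8)  # unit length

cropped = normal[:, :, pad_top:1024 - pad_bottom, pad_left:768 - pad_right]
resized = F.interpolate(cropped, size=(h0, w0), mode="bilinear", align_corners=False)
out = resized.squeeze(0).numpy().transpose(1, 2, 0)
assert out.shape == (700, 500, 3)                  # matches the input image again
```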
@@ -155,21 +158,17 @@ def predict(image: Image.Image, size: str, bg_mode: str):
     h0, w0 = image_rgb.shape[:2]
 
     model = _get_normal_model(size)
-
+    normal = _estimate_normal(image_bgr, model)  # (H, W, 3) in [-1, 1]
 
-    raw =
+    raw = normal.copy()
     if bg_mode == "fg-bg":
         mask = _foreground_mask(image_pil, h0, w0)
         raw[~mask] = np.nan
-
-
-        rgb[~mask] = 128
-    else:
-        rgb = _normal_to_rgb(normals)
+        normal[~mask] = -1.0  # → RGB(0,0,0) after vis
+    rgb = _normal_to_rgb(normal)
 
-
-
-    npy_path = f.name
+    npy_path = tempfile.NamedTemporaryFile(delete=False, suffix=".npy").name
+    np.save(npy_path, raw.astype(np.float32))
 
     return Image.fromarray(rgb), npy_path
 
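Downstream, the raw `.npy` that `predict` returns can be consumed like this (illustrative; the file path is hypothetical). NaNs mark background pixels whenever fg-bg was selected:

```python
import numpy as np

raw = np.load("normals.npy")            # (H, W, 3) float32 in [-1, 1]; NaN where bg removed
valid = ~np.isnan(raw).any(axis=-1)     # (H, W) bool foreground mask
mean_fg_normal = raw[valid].mean(axis=0)
print(f"foreground coverage: {valid.mean():.1%}, mean normal: {mean_fg_normal}")
```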
@@ -183,34 +182,83 @@ EXAMPLES = sorted(
     if n.lower().endswith((".jpg", ".jpeg", ".png"))
 )
 
-
-
-
-
-
-
-
-
-
+CUSTOM_CSS = """
+:root, body, .gradio-container, button, input, select, textarea,
+.gradio-container *:not(code):not(pre) {
+  font-family: "Helvetica Neue", Helvetica, Arial, sans-serif !important;
+  -webkit-font-smoothing: antialiased;
+  -moz-osx-font-smoothing: grayscale;
+}
+
+#title { text-align: center; font-size: 44px; font-weight: 700;
+         letter-spacing: -0.01em; margin: 28px 0 4px;
+         background: linear-gradient(90deg, #1d4ed8 0%, #6d28d9 50%, #be185d 100%);
+         -webkit-background-clip: text; -webkit-text-fill-color: transparent;
+         background-clip: text; }
+#subtitle { text-align: center; font-size: 12px; color: #64748b;
+            letter-spacing: 0.18em; margin: 0 0 14px; text-transform: uppercase;
+            font-weight: 500; }
+#badges { display: flex; justify-content: center; flex-wrap: wrap;
+          gap: 8px; margin: 0 0 32px; }
+.pill { display: inline-flex; align-items: center; gap: 6px;
+        padding: 7px 14px; border-radius: 999px;
+        background: #f1f5f9; color: #0f172a !important;
+        font-size: 13px; font-weight: 500; letter-spacing: 0.01em;
+        text-decoration: none !important; border: 1px solid #e2e8f0;
+        transition: background 150ms ease, transform 150ms ease, border-color 150ms ease; }
+.pill:hover { background: #0f172a; color: #f8fafc !important;
+              border-color: #0f172a; transform: translateY(-1px); }
+.pill svg { width: 14px; height: 14px; }
+"""
+
+HEADER_HTML = """
+<div id="title">Sapiens2: Normal</div>
+<div id="subtitle">ICLR 2026</div>
+<div id="badges">
+  <a class="pill" href="https://github.com/facebookresearch/sapiens2" target="_blank" rel="noopener">
+    <svg viewBox="0 0 24 24" fill="currentColor"><path d="M12 .3a12 12 0 0 0-3.8 23.4c.6.1.8-.3.8-.6v-2c-3.3.7-4-1.6-4-1.6-.6-1.4-1.4-1.8-1.4-1.8-1.1-.7.1-.7.1-.7 1.3.1 2 1.3 2 1.3 1.1 1.9 3 1.4 3.7 1 .1-.8.4-1.4.8-1.7-2.7-.3-5.5-1.3-5.5-5.9 0-1.3.5-2.4 1.3-3.2-.1-.4-.6-1.6.1-3.2 0 0 1-.3 3.3 1.2a11.5 11.5 0 0 1 6 0c2.3-1.5 3.3-1.2 3.3-1.2.7 1.6.2 2.8.1 3.2.8.8 1.3 1.9 1.3 3.2 0 4.6-2.8 5.6-5.5 5.9.4.4.8 1.1.8 2.2v3.3c0 .3.2.7.8.6A12 12 0 0 0 12 .3"/></svg>
+    Code
+  </a>
+  <a class="pill" href="https://huggingface.co/facebook/sapiens2" target="_blank" rel="noopener">
+    🤗 Models
+  </a>
+  <a class="pill" href="https://openreview.net/pdf?id=IVAlYCqdvW" target="_blank" rel="noopener">
+    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"/><polyline points="14 2 14 8 20 8"/><line x1="9" y1="13" x2="15" y2="13"/><line x1="9" y1="17" x2="15" y2="17"/></svg>
+    Paper
+  </a>
+  <a class="pill" href="https://rawalkhirodkar.github.io/sapiens2" target="_blank" rel="noopener">
+    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="12" r="10"/><line x1="2" y1="12" x2="22" y2="12"/><path d="M12 2a15.3 15.3 0 0 1 4 10 15.3 15.3 0 0 1-4 10 15.3 15.3 0 0 1-4-10 15.3 15.3 0 0 1 4-10z"/></svg>
+    Project
+  </a>
+</div>
+"""
+
+with gr.Blocks(title="Sapiens2 Normal", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
+    gr.HTML(HEADER_HTML)
+
+    with gr.Row(equal_height=True):
+        inp = gr.Image(label="Input", type="pil", height=640)
+        out_img = gr.Image(label="Surface normal (RGB-encoded)", type="pil", height=640)
+
     with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        size = gr.Radio(
+            choices=list(NORMAL_MODELS.keys()),
+            value=DEFAULT_SIZE,
+            label="Model",
+            scale=2,
+        )
+        bg = gr.Radio(
+            choices=BG_OPTIONS,
+            value=DEFAULT_BG,
+            label="Background",
+            scale=2,
+        )
+        run = gr.Button("Run", variant="primary", size="lg", scale=1)
+
+    gr.Examples(examples=EXAMPLES, inputs=inp, examples_per_page=14)
+
+    with gr.Accordion("Original Res + Raw Normals", open=False):
+        out_npy = gr.File(label="Raw normals (.npy float32 [-1, 1]; NaN where bg removed)")
 
     run.click(predict, inputs=[inp, size, bg], outputs=[out_img, out_npy])
 
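The diff ends at the `run.click` wiring; a Spaces `app.py` conventionally closes with a launch guard along these lines (an assumption — not shown in this commit):

```python
if __name__ == "__main__":
    demo.launch()  # serves the Blocks app; Spaces also auto-launch `demo`
```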