Daankular committed on
Commit
4a05450
·
1 Parent(s): 60f5fd7

Replace runtime TripoSG string-patching with pre-patched files in patches/triposg/

Browse files

All three TripoSG scripts that previously required runtime str.replace() hacks are
now committed as fixed files under patches/triposg/ (mirroring upstream layout).
load_triposg() now simply copies them over the cloned repo (via shutil.copy2) after checkout.

Patches applied in the files:
- scripts/inference_triposg.py: remove pymeshlab import + helper fns, use trimesh.simplify_quadric_decimation
- scripts/image_process.py: empty contours guard, rmbg_net=None guard, all-zero alpha fallback
- triposg/inference_utils.py: diso optional try/except, hierarchical wrapper, queries dtype cast

app.py CHANGED
@@ -365,6 +365,7 @@ def load_triposg():
365
  return _triposg_pipe, _rmbg_net
366
 
367
  print("[load_triposg] Loading TripoSG pipeline...")
 
368
  from huggingface_hub import snapshot_download
369
 
370
  # TripoSG source has no setup.py — clone GitHub repo and add to sys.path
@@ -377,160 +378,19 @@ def load_triposg():
377
  str(triposg_src)],
378
  check=True
379
  )
380
- if str(triposg_src) not in sys.path:
381
- sys.path.insert(0, str(triposg_src))
382
 
383
- # Patch image_process.py: guard rmbg_net=None in load_image.
384
- # TripoSG calls rmbg(rgb_image_resized) unconditionally when alpha is None,
385
- # with no check for rmbg_net being None. Fallback: all-white alpha (full foreground).
386
- _ip_path = triposg_src / "scripts" / "image_process.py"
387
- if _ip_path.exists():
388
- _ip_text = _ip_path.read_text()
389
- if "rmbg_net_none_guard_v2" not in _ip_text:
390
- _ip_text = _ip_text.replace(
391
- " # seg from rmbg\n alpha_gpu_rmbg = rmbg(rgb_image_resized)",
392
- " # seg from rmbg\n"
393
- " if rmbg_net is None: # rmbg_net_none_guard_v2\n"
394
- " alpha_gpu_rmbg = torch.ones(\n"
395
- " 1, 1, rgb_image_resized.shape[1], rgb_image_resized.shape[2],\n"
396
- " device=rgb_image_resized.device)\n"
397
- " else:\n"
398
- " alpha_gpu_rmbg = rmbg(rgb_image_resized)",
399
- )
400
- _ip_path.write_text(_ip_text)
401
- print("[load_triposg] Patched image_process.py: rmbg_net None guard")
402
-
403
- # Patch find_bounding_box: guard against empty contours (blank alpha mask).
404
- # When RMBG produces an all-black mask, findContours returns [] and max() raises.
405
- # Fallback: return the full image bounding box so pipeline can continue.
406
- # NOTE: parameter is gray_image, not alpha.
407
- _ip_text2 = _ip_path.read_text()
408
- if "empty_contours_guard" not in _ip_text2:
409
- _ip_text2 = _ip_text2.replace(
410
- " max_contour = max(contours, key=cv2.contourArea)",
411
- " if not contours: # empty_contours_guard\n"
412
- " h, w = gray_image.shape[:2]\n"
413
- " return 0, 0, w, h\n"
414
- " max_contour = max(contours, key=cv2.contourArea)",
415
- )
416
- _ip_path.write_text(_ip_text2)
417
- print("[load_triposg] Patched image_process.py: empty contours guard")
418
-
419
- # Patch all-zero alpha guard: instead of raising ValueError("input image too small"),
420
- # fall back to full-foreground alpha so the pipeline can continue with the whole image.
421
- # Happens when RMBG produces a blank mask (e.g. remove_small_objects wipes everything).
422
- _ip_text3 = _ip_path.read_text()
423
- if "all_zero_alpha_guard" not in _ip_text3:
424
- _ip_text3 = _ip_text3.replace(
425
- ' if np.all(alpha==0):\n raise ValueError(f"input image too small")',
426
- " if np.all(alpha==0): # all_zero_alpha_guard\n"
427
- " h_full, w_full = alpha.shape[:2]\n"
428
- " alpha = np.full((h_full, w_full), 255, dtype=np.uint8)\n"
429
- " alpha_gpu = torch.ones(1, h_full, w_full, dtype=torch.float32,\n"
430
- " device=rgb_image_gpu.device)\n"
431
- " x, y, w, h = 0, 0, w_full, h_full",
432
- )
433
- _ip_path.write_text(_ip_text3)
434
- print("[load_triposg] Patched image_process.py: all-zero alpha fallback")
435
-
436
- # Safety net: patch inference_utils.py to make diso import optional.
437
- # Even if diso compiled with submodules, guard against any residual link errors.
438
- _iu_path = triposg_src / "triposg" / "inference_utils.py"
439
- if _iu_path.exists():
440
- _iu_text = _iu_path.read_text()
441
- if "queries.to(dtype=batch_latents.dtype)" not in _iu_text:
442
- _iu_text = _iu_text.replace(
443
- "from diso import DiffDMC",
444
- "try:\n from diso import DiffDMC\n"
445
- "except Exception as _diso_err:\n"
446
- " print(f'[TripoSG] diso unavailable ({_diso_err}), using flash fallback')\n"
447
- " DiffDMC = None",
448
- )
449
- if ("def hierarchical_extract_geometry(" in _iu_text
450
- and "flash_extract_geometry" in _iu_text):
451
- _iu_text = _iu_text.replace(
452
- "def hierarchical_extract_geometry(",
453
- "def _hierarchical_extract_geometry_impl(",
454
- )
455
- _iu_text += (
456
- "\n\n"
457
- "def hierarchical_extract_geometry(*args, **kwargs):\n"
458
- " if DiffDMC is None:\n"
459
- " return flash_extract_geometry(*args, **kwargs)\n"
460
- " return _hierarchical_extract_geometry_impl(*args, **kwargs)\n"
461
- )
462
- # Also cast queries to match batch_latents dtype before vae.decode.
463
- # TripoSGPipeline loads as float16 but flash_extract_geometry creates
464
- # query grids as float32, causing a dtype mismatch in F.linear.
465
- _iu_text = _iu_text.replace(
466
- "logits = vae.decode(batch_latents, queries).sample",
467
- "logits = vae.decode(batch_latents, queries.to(dtype=batch_latents.dtype)).sample",
468
- )
469
- _iu_path.write_text(_iu_text)
470
- print("[load_triposg] Patched inference_utils.py: diso optional + queries dtype cast")
471
-
472
- # Patch inference_triposg.py: replace pymeshlab (no py3.13 wheels) with trimesh.
473
- # pymeshlab is only used for simplify_mesh() — QEM decimation + vertex merging.
474
- # trimesh.simplify_quadric_decimation() is a direct equivalent (needs fast-simplification).
475
- _it_path = triposg_src / "scripts" / "inference_triposg.py"
476
- if _it_path.exists():
477
- _it_text = _it_path.read_text()
478
- if "pymeshlab_replaced_v1" not in _it_text:
479
- # Step 1: always strip the top-level pymeshlab import
480
- _it_text = _it_text.replace("import pymeshlab\n", "")
481
-
482
- # Step 2: try to replace the pymeshlab helper functions with trimesh equivalent
483
- _new_simplify = (
484
- "# pymeshlab_replaced_v1: replaced with trimesh (no py3.13 wheels for pymeshlab)\n"
485
- "def simplify_mesh(mesh: trimesh.Trimesh, n_faces):\n"
486
- " if mesh.faces.shape[0] > n_faces:\n"
487
- " mesh = trimesh.Trimesh(vertices=mesh.vertices, faces=mesh.faces, process=True)\n"
488
- " mesh = mesh.simplify_quadric_decimation(n_faces)\n"
489
- " return mesh\n"
490
- )
491
- # Try the exact upstream string first
492
- _old_exact = (
493
- "def mesh_to_pymesh(vertices, faces):\n"
494
- " mesh = pymeshlab.Mesh(vertex_matrix=vertices, face_matrix=faces)\n"
495
- " ms = pymeshlab.MeshSet()\n"
496
- " ms.add_mesh(mesh)\n"
497
- " return ms\n"
498
- "\n"
499
- "\n"
500
- "def pymesh_to_trimesh(mesh):\n"
501
- " verts = mesh.vertex_matrix()\n"
502
- " faces = mesh.face_matrix()\n"
503
- " return trimesh.Trimesh(vertices=verts, faces=faces)\n"
504
- "\n"
505
- "\n"
506
- "def simplify_mesh(mesh: trimesh.Trimesh, n_faces):\n"
507
- " if mesh.faces.shape[0] > n_faces:\n"
508
- " ms = mesh_to_pymesh(mesh.vertices, mesh.faces)\n"
509
- " ms.meshing_merge_close_vertices()\n"
510
- " ms.meshing_decimation_quadric_edge_collapse(targetfacenum = n_faces)\n"
511
- " return pymesh_to_trimesh(ms.current_mesh())\n"
512
- " else:\n"
513
- " return mesh\n"
514
- )
515
- if _old_exact in _it_text:
516
- _it_text = _it_text.replace(_old_exact, _new_simplify)
517
- print("[load_triposg] Patched inference_triposg.py: pymeshlab → trimesh (exact match)")
518
- else:
519
- # Fallback: regex — handles whitespace/indent variants
520
- import re as _re
521
- _it_text = _re.sub(
522
- r"def mesh_to_pymesh\(.*?\n.*?\n.*?\n.*?\n.*?\n\n\ndef pymesh_to_trimesh\(.*?\n.*?\n.*?\n.*?\n\n\ndef simplify_mesh\([^)]*\):[^\n]*\n(?: [^\n]*\n)*",
523
- _new_simplify,
524
- _it_text,
525
- flags=_re.DOTALL,
526
- )
527
- print("[load_triposg] Patched inference_triposg.py: pymeshlab → trimesh (regex fallback)")
528
 
529
- # Step 3: always write — import removal alone fixes the crash even if
530
- # function replacement didn't match (simplify_mesh just won't exist,
531
- # which is a NameError only if called, not at import time)
532
- _it_path.write_text(_it_text)
533
- print("[load_triposg] inference_triposg.py written.")
534
 
535
  weights_path = snapshot_download("VAST-AI/TripoSG")
536
 
 
365
  return _triposg_pipe, _rmbg_net
366
 
367
  print("[load_triposg] Loading TripoSG pipeline...")
368
+ import shutil as _shutil
369
  from huggingface_hub import snapshot_download
370
 
371
  # TripoSG source has no setup.py — clone GitHub repo and add to sys.path
 
378
  str(triposg_src)],
379
  check=True
380
  )
 
 
381
 
382
+ # Overwrite upstream scripts with pre-patched versions committed to this repo.
383
+ # Patches live in patches/triposg/ and mirror the upstream directory layout.
384
+ _patches_dir = HERE / "patches" / "triposg"
385
+ for _pf in _patches_dir.rglob("*"):
386
+ if _pf.is_file():
387
+ _dest = triposg_src / _pf.relative_to(_patches_dir)
388
+ _dest.parent.mkdir(parents=True, exist_ok=True)
389
+ _shutil.copy2(str(_pf), str(_dest))
390
+ print("[load_triposg] Applied pre-patched scripts from patches/triposg/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
 
392
+ if str(triposg_src) not in sys.path:
393
+ sys.path.insert(0, str(triposg_src))
 
 
 
394
 
395
  weights_path = snapshot_download("VAST-AI/TripoSG")
396
 
patches/triposg/scripts/image_process.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import os
3
+ from skimage.morphology import remove_small_objects
4
+ from skimage.measure import label
5
+ import numpy as np
6
+ from PIL import Image
7
+ import cv2
8
+ from torchvision import transforms
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torchvision.transforms.functional as TF
12
+
13
def find_bounding_box(gray_image):
    """Return (x, y, w, h) of the largest bright region in a grayscale mask.

    Thresholds at >1 to binarize, then takes the bounding rectangle of the
    largest external contour. Falls back to the full image extent when the
    mask is blank (no contours found).
    """
    _, binarized = cv2.threshold(gray_image, 1, 255, cv2.THRESH_BINARY)
    found_contours, _ = cv2.findContours(binarized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not found_contours:
        # Blank mask: treat the whole image as foreground.
        full_h, full_w = gray_image.shape[:2]
        return 0, 0, full_w, full_h
    biggest = max(found_contours, key=cv2.contourArea)
    return cv2.boundingRect(biggest)
22
+
23
def load_image(img_path, bg_color=None, rmbg_net=None, padding_ratio=0.1):
    """Load an image, segment the foreground, and return a padded CHW float tensor.

    The foreground alpha comes either from a usable alpha channel in the file
    or from the RMBG segmentation network; the foreground is composited over
    ``bg_color``, cropped to its bounding box, and padded by ``padding_ratio``.

    Args:
        img_path: path to the input image file.
        bg_color: background color; indexed as ``bg_color[0]`` and passed to
            ``torch.from_numpy`` — presumably a 3-element numpy float array in
            [0, 1], verify against callers (``prepare_image`` passes
            ``np.array([1.0, 1.0, 1.0])``).
        rmbg_net: optional background-removal network; when None, a
            full-foreground (all-ones) alpha is used instead of segmentation.
        padding_ratio: fraction of the larger crop dimension added as padding.

    Returns:
        A CUDA float tensor of shape (3, H, W) in [0, 1], or an error
        description string on invalid input (callers must check the type).
    """
    img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
    if img is None:
        # Errors are reported as strings, not exceptions (upstream convention).
        return f"invalid image path {img_path}"

    def is_valid_alpha(alpha, min_ratio = 0.01):
        # An alpha channel is "usable" only if its histogram has at least
        # min_ratio of the pixels in both the lowest and highest bins,
        # i.e. it actually separates background from foreground.
        bins = 20
        if isinstance(alpha, np.ndarray):
            hist = cv2.calcHist([alpha], [0], None, [bins], [0, 256])
        else:
            hist = torch.histc(alpha, bins=bins, min=0, max=1)
        min_hist_val = alpha.shape[0] * alpha.shape[1] * min_ratio
        return hist[0] >= min_hist_val and hist[-1] >= min_hist_val

    def rmbg(image: torch.Tensor) -> torch.Tensor:
        # Run the RMBG segmentation net on a normalized CHW image; the net's
        # first output is the predicted alpha map.
        image = TF.normalize(image, [0.5,0.5,0.5], [1.0,1.0,1.0]).unsqueeze(0)
        result=rmbg_net(image)
        return result[0][0]

    if len(img.shape) == 2:
        num_channels = 1
    else:
        num_channels = img.shape[2]

    # check if too large: cap the longer side at 2000 px
    height, width = img.shape[:2]
    if height > width:
        scale = 2000 / height
    else:
        scale = 2000 / width
    if scale < 1:
        new_size = (int(width * scale), int(height * scale))
        img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)

    # Normalize 16-bit (or other integer) images down to uint8 range.
    if img.dtype != 'uint8':
        img = (img * (255. / np.iinfo(img.dtype).max)).astype(np.uint8)

    rgb_image = None
    alpha = None

    if num_channels == 1:
        rgb_image = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif num_channels == 3:
        rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    elif num_channels == 4:
        rgb_image = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)

        b, g, r, alpha = cv2.split(img)
        if not is_valid_alpha(alpha):
            # Degenerate alpha channel — fall through to RMBG segmentation.
            alpha = None
        else:
            alpha_gpu = torch.from_numpy(alpha).unsqueeze(0).cuda().float() / 255.
    else:
        return f"invalid image: channels {num_channels}"

    rgb_image_gpu = torch.from_numpy(rgb_image).cuda().float().permute(2, 0, 1) / 255.
    if alpha is None:
        resize_transform = transforms.Resize((384, 384), antialias=True)
        rgb_image_resized = resize_transform(rgb_image_gpu)
        normalize_image = rgb_image_resized * 2 - 1

        # NOTE(review): normalize_image is recomputed below from the 1024px
        # resize; the 384px pass above appears unused downstream — confirm.
        mean_color = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1).cuda()
        resize_transform = transforms.Resize((1024, 1024), antialias=True)
        rgb_image_resized = resize_transform(rgb_image_gpu)
        max_value = rgb_image_resized.flatten().max()
        if max_value < 1e-3:
            return "invalid image: pure black image"
        normalize_image = rgb_image_resized / max_value - mean_color
        normalize_image = normalize_image.unsqueeze(0)
        resize_transform = transforms.Resize((rgb_image_gpu.shape[1], rgb_image_gpu.shape[2]), antialias=True)

        # seg from rmbg; when no net is provided, assume the whole image is
        # foreground (pre-applied guard for rmbg_net=None).
        if rmbg_net is None:
            alpha_gpu_rmbg = torch.ones(1, 1, rgb_image_resized.shape[1], rgb_image_resized.shape[2], device=rgb_image_resized.device)
        else:
            alpha_gpu_rmbg = rmbg(rgb_image_resized)
        alpha_gpu_rmbg = alpha_gpu_rmbg.squeeze(0)
        alpha_gpu_rmbg = resize_transform(alpha_gpu_rmbg)
        # Min-max normalize the predicted alpha to [0, 1].
        ma, mi = alpha_gpu_rmbg.max(), alpha_gpu_rmbg.min()
        alpha_gpu_rmbg = (alpha_gpu_rmbg - mi) / (ma - mi)

        alpha_gpu = alpha_gpu_rmbg

        alpha_gpu_tmp = alpha_gpu * 255
        alpha = alpha_gpu_tmp.to(torch.uint8).squeeze().cpu().numpy()

        # Binarize (Otsu) and drop connected components smaller than 200 px.
        _, alpha = cv2.threshold(alpha, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
        labeled_alpha = label(alpha)
        cleaned_alpha = remove_small_objects(labeled_alpha, min_size=200)
        cleaned_alpha = (cleaned_alpha > 0).astype(np.uint8)
        alpha = cleaned_alpha * 255
        alpha_gpu = torch.from_numpy(cleaned_alpha).cuda().float().unsqueeze(0)
        x, y, w, h = find_bounding_box(alpha)

    # If alpha is provided, the bounds of all foreground are used
    else:
        rows, cols = np.where(alpha > 0)
        if rows.size > 0 and cols.size > 0:
            x_min = np.min(cols)
            y_min = np.min(rows)
            x_max = np.max(cols)
            y_max = np.max(rows)

            width = x_max - x_min + 1
            height = y_max - y_min + 1
            x, y, w, h = x_min, y_min, width, height

    if np.all(alpha==0):
        # Blank alpha: treat entire image as foreground instead of raising
        h_full, w_full = alpha.shape[:2]
        alpha = np.full((h_full, w_full), 255, dtype=np.uint8)
        alpha_gpu = torch.ones(1, h_full, w_full, dtype=torch.float32, device=rgb_image_gpu.device)
        x, y, w, h = 0, 0, w_full, h_full

    # Composite foreground over the requested background color.
    bg_gray = bg_color[0]
    bg_color = torch.from_numpy(bg_color).float().cuda().repeat(alpha_gpu.shape[1], alpha_gpu.shape[2], 1).permute(2, 0, 1)
    rgb_image_gpu = rgb_image_gpu * alpha_gpu + bg_color * (1 - alpha_gpu)
    # Pad the crop so it becomes square with padding_ratio extra margin.
    padding_size = [0] * 6
    if w > h:
        padding_size[0] = int(w * padding_ratio)
        padding_size[2] = int(padding_size[0] + (w - h) / 2)
    else:
        padding_size[2] = int(h * padding_ratio)
        padding_size[0] = int(padding_size[2] + (h - w) / 2)
    padding_size[1] = padding_size[0]
    padding_size[3] = padding_size[2]
    padded_tensor = F.pad(rgb_image_gpu[:, y:(y+h), x:(x+w)], pad=tuple(padding_size), mode='constant', value=bg_gray)

    return padded_tensor
152
+
153
def prepare_image(image_path, bg_color, rmbg_net=None):
    """Load and preprocess an image file into a PIL RGB image.

    Returns None when image_path is not an existing file (matching the
    original implicit-None behavior).
    """
    if not os.path.isfile(image_path):
        return None
    tensor_chw = load_image(image_path, bg_color=bg_color, rmbg_net=rmbg_net)
    # CHW float tensor in [0, 1] -> HWC uint8 array -> PIL image.
    array_hwc = tensor_chw.permute(1, 2, 0).cpu().numpy()
    return Image.fromarray((array_hwc * 255).astype(np.uint8))
patches/triposg/scripts/inference_triposg.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+ from glob import glob
5
+ from typing import Any, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import trimesh
10
+ from huggingface_hub import snapshot_download
11
+ from PIL import Image
12
+
13
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
14
+
15
+ from triposg.pipelines.pipeline_triposg import TripoSGPipeline
16
+ from image_process import prepare_image
17
+ from briarmbg import BriaRMBG
18
+
19
+
20
@torch.no_grad()
def run_triposg(
    pipe: Any,
    image_input: Union[str, Image.Image],
    rmbg_net: Any,
    seed: int,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.0,
    faces: int = -1,
) -> trimesh.Scene:
    """Generate a 3D mesh from a single image with the TripoSG pipeline.

    The image is background-removed (via rmbg_net) and composited onto a
    white canvas, run through the diffusion pipeline with a seeded generator,
    and converted into a trimesh.Trimesh. When faces > 0 the mesh is
    decimated down to that face budget.
    """
    # Composite the foreground onto a white background before inference.
    white_bg = np.array([1.0, 1.0, 1.0])
    img_pil = prepare_image(image_input, bg_color=white_bg, rmbg_net=rmbg_net)

    rng = torch.Generator(device=pipe.device).manual_seed(seed)
    sample = pipe(
        image=img_pil,
        generator=rng,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).samples[0]

    # sample[0]: vertices, sample[1]: faces (per upstream pipeline output).
    mesh = trimesh.Trimesh(sample[0].astype(np.float32), np.ascontiguousarray(sample[1]))

    if faces > 0:
        mesh = simplify_mesh(mesh, faces)

    return mesh
45
+
46
+
47
def simplify_mesh(mesh: trimesh.Trimesh, n_faces):
    """Decimate *mesh* to at most ``n_faces`` faces via quadric edge collapse.

    Returns the mesh unchanged when it already has ``n_faces`` faces or fewer.

    The mesh is first rebuilt with ``process=True`` so duplicate/close
    vertices are merged (mirrors the old pymeshlab
    ``meshing_merge_close_vertices`` step this function replaced).
    """
    if mesh.faces.shape[0] > n_faces:
        mesh = trimesh.Trimesh(vertices=mesh.vertices, faces=mesh.faces, process=True)
        # trimesh >= 4 signature is simplify_quadric_decimation(percent=None,
        # face_count=None, ...): a bare positional argument binds to `percent`,
        # so an absolute face count must be passed as `face_count=`. Older
        # trimesh releases only accepted the count positionally — fall back.
        try:
            mesh = mesh.simplify_quadric_decimation(face_count=n_faces)
        except TypeError:
            mesh = mesh.simplify_quadric_decimation(n_faces)
    return mesh
52
+
53
+
54
if __name__ == "__main__":
    # CLI entry point: single image in, GLB mesh out. Requires a CUDA device.
    device = "cuda"
    dtype = torch.float16

    parser = argparse.ArgumentParser()
    parser.add_argument("--image-input", type=str, required=True)
    parser.add_argument("--output-path", type=str, default="./output.glb")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num-inference-steps", type=int, default=50)
    parser.add_argument("--guidance-scale", type=float, default=7.0)
    # faces <= 0 disables mesh decimation (see run_triposg).
    parser.add_argument("--faces", type=int, default=-1)
    args = parser.parse_args()

    # download pretrained weights (no-op if already cached locally)
    triposg_weights_dir = "pretrained_weights/TripoSG"
    rmbg_weights_dir = "pretrained_weights/RMBG-1.4"
    snapshot_download(repo_id="VAST-AI/TripoSG", local_dir=triposg_weights_dir)
    snapshot_download(repo_id="briaai/RMBG-1.4", local_dir=rmbg_weights_dir)

    # init rmbg model for background removal
    rmbg_net = BriaRMBG.from_pretrained(rmbg_weights_dir).to(device)
    rmbg_net.eval()

    # init tripoSG pipeline (fp16 on GPU)
    pipe: TripoSGPipeline = TripoSGPipeline.from_pretrained(triposg_weights_dir).to(device, dtype)

    # run inference and export the resulting mesh
    run_triposg(
        pipe,
        image_input=args.image_input,
        rmbg_net=rmbg_net,
        seed=args.seed,
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        faces=args.faces,
    ).export(args.output_path)
    print(f"Mesh saved to {args.output_path}")
patches/triposg/triposg/inference_utils.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import scipy.ndimage
5
+ from skimage import measure
6
+ from einops import repeat
7
+ try:
8
+ from diso import DiffDMC
9
+ except Exception as _diso_err:
10
+ print(f'[TripoSG] diso unavailable ({_diso_err}), using flash fallback')
11
+ DiffDMC = None
12
+ import torch.nn.functional as F
13
+
14
+ from triposg.utils.typing import *
15
+
16
def generate_dense_grid_points_gpu(bbox_min: torch.Tensor,
                                   bbox_max: torch.Tensor,
                                   octree_depth: int,
                                   indexing: str = "ij"):
    """Sample a dense (2**octree_depth)^3 float16 grid of xyz points in the bbox.

    Returns:
        (points, grid_size, length): flattened [N, 3] point tensor on the
        bbox's device, per-axis cell counts as a 3-element list, and the
        bbox edge-length tensor (bbox_max - bbox_min).
    """
    extent = bbox_max - bbox_min
    cells_per_axis = int(2 ** octree_depth)
    dev = bbox_min.device

    # One evenly-spaced axis per dimension, then the full Cartesian grid.
    axes = [
        torch.linspace(bbox_min[axis], bbox_max[axis], cells_per_axis,
                       dtype=torch.float16, device=dev)
        for axis in range(3)
    ]
    mesh = torch.meshgrid(axes[0], axes[1], axes[2], indexing=indexing)
    points = torch.stack(mesh, dim=-1).view(-1, 3)

    return points, [cells_per_axis] * 3, extent
34
+
35
def find_mesh_grid_coordinates_fast_gpu(occupancy_grid, n_limits=-1):
    """Indices of occupied interior voxels that touch an unoccupied neighbor.

    An interior voxel (grid border excluded) is selected when its value is > 0
    and any of its 26 neighbors has a value < 0 — i.e. it sits on the surface.
    When n_limits != -1 and more voxels qualify, a random subsample (with
    replacement) of n_limits indices is returned. Output is an [N, 3] tensor
    of coordinates in the full grid's index space.
    """
    d0, d1, d2 = (s - 2 for s in occupancy_grid.shape)
    occupied = occupancy_grid[1:-1, 1:-1, 1:-1] > 0

    # OR together "neighbor < 0" across all 26 offsets around each voxel.
    any_empty_neighbor = torch.zeros_like(occupied)
    for off0 in range(3):
        for off1 in range(3):
            for off2 in range(3):
                if off0 == 1 and off1 == 1 and off2 == 1:
                    continue  # skip the voxel itself
                shifted = occupancy_grid[off0:off0 + d0,
                                         off1:off1 + d1,
                                         off2:off2 + d2]
                any_empty_neighbor |= shifted < 0

    # +1 converts interior-relative indices back to full-grid indices.
    coords = torch.nonzero(occupied & any_empty_neighbor, as_tuple=False) + 1

    if n_limits != -1 and coords.shape[0] > n_limits:
        print(f"core mesh coords {coords.shape[0]} is too large, limited to {n_limits}")
        pick = np.random.choice(coords.shape[0], n_limits, True)
        coords = coords[pick]

    return coords
75
+
76
def find_candidates_band(occupancy_grid: torch.Tensor, band_threshold: float, n_limits: int = -1) -> torch.Tensor:
    """Indices of interior voxels whose pseudo-SDF magnitude is below a threshold.

    The grid holds logits; they are squashed to (-1, 1) via sigmoid(x)*2 - 1
    and voxels with |value| < band_threshold are selected. Border voxels are
    excluded; returned indices are in the full grid's index space. When
    n_limits != -1 and more voxels qualify, a random subsample (with
    replacement) of n_limits indices is returned.
    """
    interior = occupancy_grid[1:-1, 1:-1, 1:-1]
    # logits -> pseudo-SDF in (-1, 1)
    pseudo_sdf = torch.sigmoid(interior) * 2 - 1

    # +1 converts interior-relative indices back to full-grid indices.
    candidates = torch.nonzero(pseudo_sdf.abs() < band_threshold, as_tuple=False) + 1

    if n_limits != -1 and candidates.shape[0] > n_limits:
        print(f"core mesh coords {candidates.shape[0]} is too large, limited to {n_limits}")
        chosen = np.random.choice(candidates.shape[0], n_limits, True)
        candidates = candidates[chosen]

    return candidates
103
+
104
def expand_edge_region_fast(edge_coords, grid_size):
    """Dilate edge voxel coordinates and lift them to the next (2x) resolution.

    Scatters the given [N, 3] coordinates into a dense float16 CUDA grid,
    dilates via max_pool3d (kernel 5 below 512^3, else 3 to bound memory),
    and expands every surviving low-res voxel into its 8 children at double
    resolution. Returns an [M, 3] int16-derived coordinate tensor in the
    high-res index space. Requires a CUDA device (grid is allocated on 'cuda').
    """
    expanded_tensor = torch.zeros(grid_size, grid_size, grid_size, device='cuda', dtype=torch.float16, requires_grad=False)
    expanded_tensor[edge_coords[:, 0], edge_coords[:, 1], edge_coords[:, 2]] = 1
    if grid_size < 512:
        kernel_size = 5
        pooled_tensor = torch.nn.functional.max_pool3d(expanded_tensor.unsqueeze(0).unsqueeze(0), kernel_size=kernel_size, stride=1, padding=2).squeeze()
    else:
        # Larger grids use a smaller dilation kernel.
        kernel_size = 3
        pooled_tensor = torch.nn.functional.max_pool3d(expanded_tensor.unsqueeze(0).unsqueeze(0), kernel_size=kernel_size, stride=1, padding=1).squeeze()
    expanded_coords_low_res = torch.nonzero(pooled_tensor, as_tuple=False).to(torch.int16)

    # Each low-res voxel (i, j, k) maps to the 8 high-res corners
    # (2i + {0,1}, 2j + {0,1}, 2k + {0,1}); the three cats enumerate them
    # in lockstep so row r of each stacked column belongs to the same corner.
    expanded_coords_high_res = torch.stack([
        torch.cat((expanded_coords_low_res[:, 0] * 2, expanded_coords_low_res[:, 0] * 2, expanded_coords_low_res[:, 0] * 2, expanded_coords_low_res[:, 0] * 2, expanded_coords_low_res[:, 0] * 2 + 1, expanded_coords_low_res[:, 0] * 2 + 1, expanded_coords_low_res[:, 0] * 2 + 1, expanded_coords_low_res[:, 0] * 2 + 1)),
        torch.cat((expanded_coords_low_res[:, 1] * 2, expanded_coords_low_res[:, 1] * 2, expanded_coords_low_res[:, 1] * 2+1, expanded_coords_low_res[:, 1] * 2 + 1, expanded_coords_low_res[:, 1] * 2, expanded_coords_low_res[:, 1] * 2, expanded_coords_low_res[:, 1] * 2 + 1, expanded_coords_low_res[:, 1] * 2 + 1)),
        torch.cat((expanded_coords_low_res[:, 2] * 2, expanded_coords_low_res[:, 2] * 2+1, expanded_coords_low_res[:, 2] * 2, expanded_coords_low_res[:, 2] * 2 + 1, expanded_coords_low_res[:, 2] * 2, expanded_coords_low_res[:, 2] * 2+1, expanded_coords_low_res[:, 2] * 2, expanded_coords_low_res[:, 2] * 2 + 1))
    ], dim=1)

    return expanded_coords_high_res
122
+
123
def zoom_block(block, scale_factor, order=3):
    """Resample a numpy block by scale_factor using spline interpolation.

    The block is cast to float32 first; `order` is the spline order
    (default cubic).
    """
    as_float = block.astype(np.float32)
    return scipy.ndimage.zoom(as_float, scale_factor, order=order)
126
+
127
def parallel_zoom(occupancy_grid, scale_factor):
    """Upsample a 3D tensor by scale_factor (F.interpolate default mode, nearest)."""
    batched = occupancy_grid[None, None]  # add batch + channel dims for interpolate
    zoomed = torch.nn.functional.interpolate(batched, scale_factor=scale_factor)
    return zoomed[0, 0]
130
+
131
+
132
@torch.no_grad()
def _hierarchical_extract_geometry_impl(geometric_func: Callable,
                                        device: torch.device,
                                        bounds: Union[Tuple[float], List[float], float] = (-1.25, -1.25, -1.25, 1.25, 1.25, 1.25),
                                        dense_octree_depth: int = 8,
                                        hierarchical_octree_depth: int = 9,
                                        ):
    """Coarse-to-fine SDF evaluation followed by marching-cubes extraction.

    Evaluates ``geometric_func`` on a dense (2**dense_octree_depth)^3 grid,
    then for each extra level up to ``hierarchical_octree_depth`` upsamples
    the logit grid 2x and re-queries only a dilated band of near-surface
    voxels. Finally runs skimage marching cubes at iso-level 0.

    Args:
        geometric_func: maps a [1, N, 3] query-point tensor to [1, N, 1] logits.
        device: device for the bbox tensors (grids themselves go through
            CUDA-backed helpers).
        bounds: (xmin, ymin, zmin, xmax, ymax, zmax), or a single float b
            meaning (-b, -b, -b, b, b, b). The band rescaling below uses
            abs(bounds[0]) — presumably assumes a cube centered at the
            origin; verify for asymmetric bounds.
        dense_octree_depth: depth of the initial dense grid.
        hierarchical_octree_depth: final refinement depth.

    Returns:
        A one-element list [(vertices, faces)]; (None, None) when marching
        cubes fails (e.g. no zero crossing).
    """
    if isinstance(bounds, float):
        bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]

    bbox_min = torch.tensor(bounds[0:3]).to(device)
    bbox_max = torch.tensor(bounds[3:6]).to(device)
    bbox_size = bbox_max - bbox_min

    xyz_samples, grid_size, length = generate_dense_grid_points_gpu(
        bbox_min=bbox_min,
        bbox_max=bbox_max,
        octree_depth=dense_octree_depth,
        indexing="ij"
    )

    print(f'step 1 query num: {xyz_samples.shape[0]}')
    grid_logits = geometric_func(xyz_samples.unsqueeze(0)).to(torch.float16).view(grid_size[0], grid_size[1], grid_size[2])
    # print(f'step 1 grid_logits shape: {grid_logits.shape}')
    for i in range(hierarchical_octree_depth - dense_octree_depth):
        curr_octree_depth = dense_octree_depth + i + 1
        # upsample the logit grid to the next resolution
        grid_size = 2**curr_octree_depth
        normalize_offset = grid_size / 2
        high_res_occupancy = parallel_zoom(grid_logits, 2)

        # Re-query only a dilated band of voxels near the current surface.
        band_threshold = 1.0
        edge_coords = find_candidates_band(grid_logits, band_threshold)
        expanded_coords = expand_edge_region_fast(edge_coords, grid_size=int(grid_size/2)).to(torch.float16)
        print(f'step {i+2} query num: {len(expanded_coords)}')
        # Voxel indices -> world coordinates in [-|bounds[0]|, |bounds[0]|].
        expanded_coords_norm = (expanded_coords - normalize_offset) * (abs(bounds[0]) / normalize_offset)

        all_logits = None

        all_logits = geometric_func(expanded_coords_norm.unsqueeze(0)).to(torch.float16)
        # Carry the query coordinates alongside their logits: [N, 3 + 1].
        all_logits = torch.cat([expanded_coords_norm, all_logits[0]], dim=1)
        # print("all logits shape = ", all_logits.shape)

        # World coordinates back to voxel indices; scatter refined logits
        # into the upsampled grid.
        indices = all_logits[..., :3]
        indices = indices * (normalize_offset / abs(bounds[0])) + normalize_offset
        indices = indices.type(torch.IntTensor)
        values = all_logits[:, 3]
        # breakpoint()
        high_res_occupancy[indices[:, 0], indices[:, 1], indices[:, 2]] = values
        grid_logits = high_res_occupancy
        torch.cuda.empty_cache()
    mesh_v_f = []
    try:
        print("final grids shape = ", grid_logits.shape)
        vertices, faces, normals, _ = measure.marching_cubes(grid_logits.float().cpu().numpy(), 0, method="lewiner")
        # Rescale unit-grid vertices back into the world-space bbox.
        vertices = vertices / (2**hierarchical_octree_depth) * bbox_size.cpu().numpy() + bbox_min.cpu().numpy()
        mesh_v_f = (vertices.astype(np.float32), np.ascontiguousarray(faces))
    except Exception as e:
        # Best-effort: extraction failure yields (None, None) rather than raising.
        print(e)
        torch.cuda.empty_cache()
        mesh_v_f = (None, None)

    return [mesh_v_f]
206
+
207
+
208
def hierarchical_extract_geometry(*args, **kwargs):
    """Dispatch geometry extraction by diso availability.

    When the diso import failed (DiffDMC is None) the flash fallback path is
    used; otherwise the hierarchical marching-cubes implementation runs.
    Arguments are forwarded unchanged in both cases.
    """
    extractor = flash_extract_geometry if DiffDMC is None else _hierarchical_extract_geometry_impl
    return extractor(*args, **kwargs)
214
+
215
+
216
def extract_near_surface_volume_fn(input_tensor: torch.Tensor, alpha: float):
    """Mask voxels whose sign differs from at least one of their 6 face neighbors.

    After shifting values by ``alpha`` (moving the isosurface), a voxel is
    marked (1) when any axis-aligned neighbor has a different sign — i.e. the
    surface passes between them. Values <= -9000 are treated as invalid and
    zeroed out of the result.

    Args:
        input_tensor: cubic 3D tensor of SDF-like values; shape [D, D, D]
            (presumably float16 per the original note — verify at call sites).
        alpha: isosurface offset added to every value.

    Returns:
        int32 tensor of shape [D, D, D]; 1 marks near-surface valid voxels.
    """
    device = input_tensor.device
    D = input_tensor.shape[0]
    signed_val = 0.0  # NOTE(review): unused; kept from upstream

    # add isosurface offset and exclude invalid value
    val = input_tensor + alpha
    valid_mask = val > -9000

    # obtain neighbors
    def get_neighbor(t, shift, axis):
        # Shift t by `shift` voxels along `axis`, replicating at the borders,
        # so each output voxel holds its neighbor's value.
        if shift == 0:
            return t.clone()

        pad_dims = [0, 0, 0, 0, 0, 0] # [x_front,x_back,y_front,y_back,z_front,z_back]

        if axis == 0: # x axis
            pad_idx = 0 if shift > 0 else 1
            pad_dims[pad_idx] = abs(shift)
        elif axis == 1: # y axis
            pad_idx = 2 if shift > 0 else 3
            pad_dims[pad_idx] = abs(shift)
        elif axis == 2: # z axis
            pad_idx = 4 if shift > 0 else 5
            pad_dims[pad_idx] = abs(shift)

        # Apply padding with replication at boundaries
        # (F.pad expects the pad spec innermost-dim first, hence the reversal).
        padded = F.pad(t.unsqueeze(0).unsqueeze(0), pad_dims[::-1], mode='replicate')

        # Create dynamic slicing indices
        slice_dims = [slice(None)] * 3
        if axis == 0: # x axis
            if shift > 0:
                slice_dims[0] = slice(shift, None)
            else:
                slice_dims[0] = slice(None, shift)
        elif axis == 1: # y axis
            if shift > 0:
                slice_dims[1] = slice(shift, None)
            else:
                slice_dims[1] = slice(None, shift)
        elif axis == 2: # z axis
            if shift > 0:
                slice_dims[2] = slice(shift, None)
            else:
                slice_dims[2] = slice(None, shift)

        # Apply slicing and restore dimensions
        padded = padded.squeeze(0).squeeze(0)
        sliced = padded[slice_dims]
        return sliced

    # Get neighbors in all directions
    left = get_neighbor(val, 1, axis=0) # x axis
    right = get_neighbor(val, -1, axis=0)
    back = get_neighbor(val, 1, axis=1) # y axis
    front = get_neighbor(val, -1, axis=1)
    down = get_neighbor(val, 1, axis=2) # z axis
    up = get_neighbor(val, -1, axis=2)

    # Handle invalid boundary values: fall back to the voxel's own value
    # so invalid neighbors never create a fake sign change.
    def safe_where(neighbor):
        return torch.where(neighbor > -9000, neighbor, val)

    left = safe_where(left)
    right = safe_where(right)
    back = safe_where(back)
    front = safe_where(front)
    down = safe_where(down)
    up = safe_where(up)

    # Calculate sign consistency (float32 to avoid half-precision sign issues)
    sign = torch.sign(val.to(torch.float32))
    neighbors_sign = torch.stack([
        torch.sign(left.to(torch.float32)),
        torch.sign(right.to(torch.float32)),
        torch.sign(back.to(torch.float32)),
        torch.sign(front.to(torch.float32)),
        torch.sign(down.to(torch.float32)),
        torch.sign(up.to(torch.float32))
    ], dim=0)

    # Check if all signs are consistent
    same_sign = torch.all(neighbors_sign == sign, dim=0)

    # Generate final mask: near-surface voxels, restricted to valid ones
    mask = (~same_sign).to(torch.int32)
    return mask * valid_mask.to(torch.int32)
311
+
312
+
313
def generate_dense_grid_points_2(
    bbox_min: np.ndarray,
    bbox_max: np.ndarray,
    octree_resolution: int,
    indexing: str = "ij",
):
    """Build a dense grid of query points spanning the bounding box.

    Args:
        bbox_min: per-axis lower bounds, shape [3].
        bbox_max: per-axis upper bounds, shape [3].
        octree_resolution: number of cells per axis (grid has N+1 samples).
        indexing: np.meshgrid indexing convention ("ij" or "xy").
    Returns:
        xyz: float32 sample coordinates, shape [N+1, N+1, N+1, 3].
        grid_size: [N+1, N+1, N+1] axis lengths.
        length: bbox extent per axis (bbox_max - bbox_min).
    """
    length = bbox_max - bbox_min
    points_per_axis = int(octree_resolution) + 1
    axes = [
        np.linspace(bbox_min[d], bbox_max[d], points_per_axis, dtype=np.float32)
        for d in range(3)
    ]
    mesh = np.meshgrid(*axes, indexing=indexing)
    xyz = np.stack(mesh, axis=-1)
    grid_size = [points_per_axis] * 3
    return xyz, grid_size, length
330
+
331
@torch.no_grad()
def flash_extract_geometry(
    latents: torch.FloatTensor,
    vae: Callable,
    bounds: Union[Tuple[float], List[float], float] = 1.01,
    num_chunks: int = 10000,
    mc_level: float = 0.0,
    octree_depth: int = 9,
    min_resolution: int = 63,
    mini_grid_num: int = 4,
    **kwargs,
):
    """Extract a mesh from VAE latents via coarse-to-fine SDF evaluation.

    Decodes the latent field on a coarse dense grid, then repeatedly doubles
    the resolution while only re-querying voxels near the current isosurface,
    and finally runs DiffDMC (differentiable dual marching cubes) on the
    refined logits volume.

    Args:
        latents: latent tokens to decode; shape presumably [1, P, C]
            (code squeezes dim 0 and repeats per batch) — TODO confirm.
        vae: project VAE; must expose .decoder (with set_topk) and
            .decode(latents, queries) returning an object with .sample.
        bounds: scalar half-extent or explicit [x0, y0, z0, x1, y1, z1] box.
        num_chunks: max query points decoded per VAE call.
        mc_level: isosurface level used for the near-surface voxel test.
        octree_depth: final grid resolution is 2 ** octree_depth.
        min_resolution: smallest resolution in the coarse-to-fine ladder.
        mini_grid_num: per-axis sub-grid count used to tile the coarse pass.
    Returns:
        [ (vertices float32 ndarray, faces ndarray) ] on success,
        [ (None, None) ] if mesh extraction fails.
    """
    geo_decoder = vae.decoder
    device = latents.device
    dtype = latents.dtype
    # resolution to depth
    octree_resolution = 2 ** octree_depth
    # Build the resolution ladder by halving down to min_resolution,
    # then reverse so it runs coarse -> fine.
    resolutions = []
    if octree_resolution < min_resolution:
        resolutions.append(octree_resolution)
    while octree_resolution >= min_resolution:
        resolutions.append(octree_resolution)
        octree_resolution = octree_resolution // 2
    resolutions.reverse()
    # Snap the coarsest level so it tiles evenly into mini_grid_num chunks
    # per axis, and make every finer level an exact power-of-two multiple.
    resolutions[0] = round(resolutions[0] / mini_grid_num) * mini_grid_num - 1
    for i, resolution in enumerate(resolutions[1:]):
        resolutions[i + 1] = resolutions[0] * 2 ** (i + 1)


    # 1. generate query points
    if isinstance(bounds, float):
        bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
    bbox_min = np.array(bounds[0:3])
    bbox_max = np.array(bounds[3:6])
    bbox_size = bbox_max - bbox_min

    xyz_samples, grid_size, length = generate_dense_grid_points_2(
        bbox_min=bbox_min,
        bbox_max=bbox_max,
        octree_resolution=resolutions[0],
        indexing="ij"
    )

    # All-ones 3x3x3 conv acts as a binary dilation operator on voxel masks.
    dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype)
    dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device))

    grid_size = np.array(grid_size)

    # 2. latents to 3d volume
    # Tile the coarse grid into mini_grid_num^3 contiguous sub-blocks so each
    # VAE call decodes one spatially-local chunk of queries.
    xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype)
    batch_size = latents.shape[0]
    mini_grid_size = xyz_samples.shape[0] // mini_grid_num
    xyz_samples = xyz_samples.view(
        mini_grid_num, mini_grid_size,
        mini_grid_num, mini_grid_size,
        mini_grid_num, mini_grid_size, 3
    ).permute(
        0, 2, 4, 1, 3, 5, 6
    ).reshape(
        -1, mini_grid_size * mini_grid_size * mini_grid_size, 3
    )
    batch_logits = []
    num_batchs = max(num_chunks // xyz_samples.shape[1], 1)
    for start in range(0, xyz_samples.shape[0], num_batchs):
        queries = xyz_samples[start: start + num_batchs, :]
        batch = queries.shape[0]
        # Broadcast the single latent set across the chunk batch dimension.
        batch_latents = repeat(latents.squeeze(0), "p c -> b p c", b=batch)
        # geo_decoder.set_topk(True)
        # NOTE(review): set_topk semantics come from the project decoder;
        # here top-k attention is explicitly disabled — confirm intent.
        geo_decoder.set_topk(False)
        logits = vae.decode(batch_latents, queries.to(dtype=batch_latents.dtype)).sample
        batch_logits.append(logits)
    # Undo the sub-block tiling to recover a dense [B, N, N, N] logits volume.
    grid_logits = torch.cat(batch_logits, dim=0).reshape(
        mini_grid_num, mini_grid_num, mini_grid_num,
        mini_grid_size, mini_grid_size,
        mini_grid_size
    ).permute(0, 3, 1, 4, 2, 5).contiguous().view(
        (batch_size, grid_size[0], grid_size[1], grid_size[2])
    )

    # 3. coarse-to-fine refinement: at each level, only re-decode voxels that
    # are near the isosurface of the previous (coarser) level.
    for octree_depth_now in resolutions[1:]:
        grid_size = np.array([octree_depth_now + 1] * 3)
        resolution = bbox_size / octree_depth_now
        next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device)
        # -10000 marks "not evaluated" cells; replaced by NaN at the end.
        next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device)
        curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level)
        # Also keep voxels whose logits are already close to the surface.
        curr_points += grid_logits.squeeze(0).abs() < 0.95

        # The final level skips the extra dilation pass below.
        if octree_depth_now == resolutions[-1]:
            expand_num = 0
        else:
            expand_num = 1
        for i in range(expand_num):
            curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0)
        curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0)
        (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0)

        # Seed the 2x-finer grid at the positions of the active coarse voxels,
        # then dilate so their fine-grid neighborhoods are also evaluated.
        next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1
        for i in range(2 - expand_num):
            next_index = dilate(next_index.unsqueeze(0)).squeeze(0)
        nidx = torch.where(next_index > 0)

        # Convert active voxel indices to world-space query coordinates.
        next_points = torch.stack(nidx, dim=1)
        next_points = (next_points * torch.tensor(resolution, dtype=torch.float32, device=device) +
                       torch.tensor(bbox_min, dtype=torch.float32, device=device))

        # Bucket the sparse queries into query_grid_num^3 spatial cells and
        # sort by cell so each decode call sees spatially-coherent points.
        query_grid_num = 6
        min_val = next_points.min(axis=0).values
        max_val = next_points.max(axis=0).values
        vol_queries_index = (next_points - min_val) / (max_val - min_val) * (query_grid_num - 0.001)
        index = torch.floor(vol_queries_index).long()
        index = index[..., 0] * (query_grid_num ** 2) + index[..., 1] * query_grid_num + index[..., 2]
        index = index.sort()
        next_points = next_points[index.indices].unsqueeze(0).contiguous()
        unique_values = torch.unique(index.values, return_counts=True)
        grid_logits = torch.zeros((next_points.shape[1]), dtype=latents.dtype, device=latents.device)
        input_grid = [[], []]
        logits_grid_list = []
        start_num = 0
        sum_num = 0
        # Greedily pack whole cells into decode batches of <= num_chunks
        # points (a single oversized cell still goes through on its own).
        for grid_index, count in zip(unique_values[0].cpu().tolist(), unique_values[1].cpu().tolist()):
            if sum_num + count < num_chunks or sum_num == 0:
                sum_num += count
                input_grid[0].append(grid_index)
                input_grid[1].append(count)
            else:
                # geo_decoder.set_topk(input_grid)
                geo_decoder.set_topk(False)
                logits_grid = vae.decode(latents, next_points[:, start_num:start_num + sum_num].to(dtype=latents.dtype)).sample
                start_num = start_num + sum_num
                logits_grid_list.append(logits_grid)
                input_grid = [[grid_index], [count]]
                sum_num = count
        # Flush the final partially-filled batch.
        if sum_num > 0:
            # geo_decoder.set_topk(input_grid)
            geo_decoder.set_topk(False)
            logits_grid = vae.decode(latents, next_points[:, start_num:start_num + sum_num].to(dtype=latents.dtype)).sample
            logits_grid_list.append(logits_grid)
        logits_grid = torch.cat(logits_grid_list, dim=1)
        # Scatter decoded logits back to their pre-sort order, then into the
        # dense fine grid; unevaluated cells keep the -10000 sentinel.
        grid_logits[index.indices] = logits_grid.squeeze(0).squeeze(-1)
        next_logits[nidx] = grid_logits
        grid_logits = next_logits.unsqueeze(0)

    # 4. mesh extraction: unevaluated cells become NaN so DiffDMC ignores them.
    grid_logits[grid_logits == -10000.] = float('nan')
    torch.cuda.empty_cache()
    mesh_v_f = []
    grid_logits = grid_logits[0]
    try:
        print("final grids shape = ", grid_logits.shape)
        dmc = DiffDMC(dtype=torch.float32).to(grid_logits.device)
        # DiffDMC expects an SDF-like field: negate and normalize the logits.
        sdf = -grid_logits / octree_resolution
        sdf = sdf.to(torch.float32).contiguous()
        vertices, faces = dmc(sdf, deform=None, return_quads=False, normalize=False)
        vertices = vertices.detach().cpu().numpy()
        # Reverse the face winding to flip the surface orientation.
        faces = faces.detach().cpu().numpy()[:, ::-1]
        # Map grid coordinates back into the world-space bounding box.
        vertices = vertices / (2 ** octree_depth) * bbox_size + bbox_min
        mesh_v_f = (vertices.astype(np.float32), np.ascontiguousarray(faces))
    except Exception as e:
        # Best-effort: on any extraction failure (including DiffDMC missing
        # or OOM) free GPU memory and report "no mesh" instead of raising.
        print(e)
        torch.cuda.empty_cache()
        mesh_v_f = (None, None)

    return [mesh_v_f]