lingbot-3d-ZERO

Running on Zero

App Files Files Community

dennny123 committed 19 days ago

Commit

afd3356

verified ·

1 Parent(s): bafdd0f

Remove teaser image, startup note, and run summary accordion

Browse files

Files changed (1) hide show

app.py +228 -23

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import contextlib
 import json
 import os
 import shutil
@@ -15,6 +17,7 @@ import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image, ImageDraw
 try:
     import spaces
@@ -55,6 +58,7 @@ DEFAULT_SCALE_FRAMES = 4
 DEFAULT_KEYFRAME_INTERVAL = 2
 DEFAULT_CONF_PERCENTILE = 50.0
 DEFAULT_CAMERA_ITERATIONS = 1
 IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
 SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
@@ -335,6 +339,14 @@ def _save_predictions_npz(predictions: dict[str, Any], output_path: Path) -> str
     return str(output_path)
 def _count_confident_points(vis_predictions: dict[str, Any], conf_percentile: float) -> tuple[int, float]:
     conf = vis_predictions.get("world_points_conf")
     if conf is None:
@@ -345,6 +357,179 @@ def _count_confident_points(vis_predictions: dict[str, Any], conf_percentile: fl
     return kept, float(threshold)
 def _zip_outputs(work_dir: Path, paths: list[Path], output_name: str) -> str:
     zip_path = work_dir / output_name
     with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
@@ -364,7 +549,7 @@ def _export_outputs(
     num_scale_frames: int,
     keyframe_interval: int,
     conf_percentile: float,
-) -> tuple[str, str, dict[str, Any]]:
     vis_predictions = _prepare_for_visualization(predictions, images_cpu)
     glb_path = work_dir / "lingbot-map-reconstruction.glb"
@@ -377,6 +562,11 @@ def _export_outputs(
     )
     scene.export(glb_path)
     preview_path = Path(_make_preview_strip(images_cpu, work_dir / "preview.png"))
     npz_path = Path(_save_predictions_npz(predictions, work_dir / "predictions.npz"))
@@ -390,6 +580,7 @@ def _export_outputs(
         "confidence_percentile": conf_percentile,
         "confidence_threshold": round(conf_threshold, 4),
         "points_kept_for_glb": points_kept,
         "input_summary": input_summary,
         "runtime_summary": runtime_summary,
     }
@@ -399,10 +590,10 @@ def _export_outputs(
     artifact_path = _zip_outputs(
         work_dir,
-        [glb_path, preview_path, npz_path, summary_path],
         output_name="lingbot-map-results.zip",
     )
-    return str(glb_path), artifact_path, summary
 def _format_status(summary: dict[str, Any]) -> str:
@@ -416,6 +607,7 @@ def _format_status(summary: dict[str, Any]) -> str:
         f"- Runtime: `{runtime['runtime_seconds']}s` on `{runtime['device']}`",
         f"- GLB confidence percentile: `{summary['confidence_percentile']}`",
         f"- Points kept for GLB: `{summary['points_kept_for_glb']}`",
     ]
     if runtime.get("peak_memory_gb") is not None:
         lines.append(f"- Peak GPU memory: `{runtime['peak_memory_gb']} GB`")
@@ -446,7 +638,7 @@ def reconstruct_scene(
         keyframe_interval=keyframe_interval,
     )
-    glb_path, artifact_path, summary = _export_outputs(
         work_dir=work_dir,
         image_paths=image_paths,
         predictions=predictions,
@@ -460,7 +652,7 @@ def reconstruct_scene(
     preview_path = str(work_dir / "preview.png")
     status = _format_status(summary)
-    return glb_path, preview_path, artifact_path, summary, status
 def _build_startup_markdown() -> str:
@@ -479,6 +671,25 @@ css = """
     object-fit: cover !important;
     border-radius: 8px !important;
 }
 footer {display: none !important;}
 """
@@ -488,23 +699,12 @@ _eager_load_default_model()
 with gr.Blocks(title="LingBot 3D") as demo:
     with gr.Column(elem_id="container"):
-        gr.Image(
-            value=str(ROOT / "assets" / "teaser.png"),
-            show_label=False,
-            interactive=False,
-            container=False,
-            elem_classes=["teaser"],
-        )
         gr.Markdown("# LingBot 3D")
         gr.Markdown(
             "Upload a short video clip and get back a navigable 3D scene. "
             "Powered by the LingBot-Map checkpoint, exported as a GLB plus a downloadable results bundle."
         )
-        startup_md = _build_startup_markdown()
-        if startup_md:
-            gr.Markdown(startup_md)
         with gr.Row():
             with gr.Column():
                 video_file = gr.Video(
@@ -514,12 +714,17 @@ with gr.Blocks(title="LingBot 3D") as demo:
                     height=380,
                 )
             with gr.Column():
-                model_preview = gr.Model3D(
-                    label="3D preview",
-                    display_mode="point_cloud",
-                    clear_color=[1.0, 1.0, 1.0, 1.0],
-                    height=380,
                 )
         run_button = gr.Button("Build 3D Scene", variant="primary")
         status_markdown = gr.Markdown()
@@ -544,8 +749,7 @@ with gr.Blocks(title="LingBot 3D") as demo:
             preview_image = gr.Image(label="Frame preview", interactive=False, height=200)
             artifact_file = gr.File(label="Download results bundle")
-        with gr.Accordion("Run summary", open=False):
-            summary_json = gr.JSON(label=None)
     run_button.click(
         fn=reconstruct_scene,
@@ -558,6 +762,7 @@ with gr.Blocks(title="LingBot 3D") as demo:
             conf_percentile,
         ],
         outputs=[
             model_preview,
             preview_image,
             artifact_file,

 import contextlib
+import colorsys
+import html
 import json
 import os
 import shutil
 import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image, ImageDraw
+from scipy.spatial.transform import Rotation
 try:
     import spaces
 DEFAULT_KEYFRAME_INTERVAL = 2
 DEFAULT_CONF_PERCENTILE = 50.0
 DEFAULT_CAMERA_ITERATIONS = 1
+MAX_VISER_POINTS = 25_000
 IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
 SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
     return str(output_path)
+def _empty_viser_preview(message: str) -> str:
+    """Return the placeholder HTML shown when no Viser preview is available.
+
+    The message is HTML-escaped before it is embedded, so it may contain
+    arbitrary text such as an exception message.
+    """
+    safe_message = html.escape(message)
+    return f"<div class='viser-empty'><div>{safe_message}</div></div>"
 def _count_confident_points(vis_predictions: dict[str, Any], conf_percentile: float) -> tuple[int, float]:
     conf = vis_predictions.get("world_points_conf")
     if conf is None:
     return kept, float(threshold)
+def _prepare_viser_point_cloud(
+    vis_predictions: dict[str, Any],
+    conf_percentile: float,
+    max_points: int = MAX_VISER_POINTS,
+) -> tuple[np.ndarray, np.ndarray, float]:
+    """Select and subsample the confident, coloured points for the Viser preview.
+
+    Reads ``world_points`` / ``world_points_conf`` from ``vis_predictions`` and
+    falls back to ``world_points_from_depth`` / ``depth_conf`` when the former
+    is absent. Colours are taken from ``vis_predictions["images"]``.
+
+    Args:
+        vis_predictions: Prediction dict; must contain ``images`` and one of
+            the two world-point entries.
+        conf_percentile: Percentile of the confidence values used as the keep
+            threshold; values ``<= 0`` disable the percentile cut.
+        max_points: Upper bound on the number of returned points. Larger
+            clouds are thinned with evenly spaced indices.
+
+    Returns:
+        ``(points, colors, threshold)``: float32 points reshaped to ``(N, 3)``,
+        matching uint8 colours, and the confidence cut-off that was applied.
+
+    Raises:
+        ValueError: If no world points are present, or if the number of
+            points, colours and confidence values disagree.
+    """
+    world_points = vis_predictions.get("world_points")
+    conf = vis_predictions.get("world_points_conf")
+    if world_points is None:
+        world_points = vis_predictions.get("world_points_from_depth")
+        conf = vis_predictions.get("depth_conf")
+    if world_points is None:
+        raise ValueError("Missing world point predictions.")
+    # Convert before reading ``.ndim`` so list-like inputs are accepted too.
+    images = np.asarray(vis_predictions["images"])
+    if images.ndim == 4 and images.shape[1] == 3:
+        # Channels-first -> channels-last so each row of the reshape is one pixel.
+        images = np.transpose(images, (0, 2, 3, 1))
+    points = np.asarray(world_points).reshape(-1, 3)
+    # NOTE(review): the * 255 scaling assumes float images in [0, 1] - confirm.
+    colors = (images.reshape(-1, 3) * 255).clip(0, 255).astype(np.uint8)
+    if colors.shape[0] != points.shape[0]:
+        raise ValueError(
+            f"Point/colour count mismatch: {points.shape[0]} points vs {colors.shape[0]} pixels."
+        )
+    if conf is None:
+        conf_flat = np.ones(points.shape[0], dtype=np.float32)
+        threshold = 0.0
+    else:
+        conf_flat = np.asarray(conf).reshape(-1)
+        if conf_flat.shape[0] != points.shape[0]:
+            raise ValueError(
+                f"Point/confidence count mismatch: {points.shape[0]} points vs {conf_flat.shape[0]} values."
+            )
+        # An empty array has no percentile; fall back to "no cut".
+        use_percentile = conf_percentile > 0 and conf_flat.size > 0
+        threshold = np.percentile(conf_flat, conf_percentile) if use_percentile else 0.0
+    # Drop NaN/inf coordinates as well: they would poison any extent or mean
+    # computed from the returned cloud.
+    mask = (conf_flat >= threshold) & (conf_flat > 1e-5) & np.isfinite(points).all(axis=1)
+    points = points[mask]
+    colors = colors[mask]
+    if points.shape[0] > max_points:
+        # Evenly spaced indices keep the subsample deterministic.
+        keep_indices = np.linspace(0, points.shape[0] - 1, num=max(max_points, 0), dtype=np.int64)
+        points = points[keep_indices]
+        colors = colors[keep_indices]
+    return points.astype(np.float32), colors, float(threshold)
+def _add_viser_cameras(
+    server: Any,
+    vis_predictions: dict[str, Any],
+    scene_extent: float,
+) -> list[np.ndarray]:
+    """Draw a wireframe frustum for every predicted camera and return their positions.
+
+    Args:
+        server: Object exposing ``scene.add_camera_frustum``.
+        vis_predictions: Prediction dict. Uses ``extrinsic`` (per-frame
+            world-to-camera matrices, either 3x4 or 4x4), ``intrinsic``
+            (per-frame matrices whose ``[1, 1]`` entry is used as the vertical
+            focal length) and ``images`` (only height and width are read).
+        scene_extent: Scene size; frustums are drawn at 5% of it, never
+            smaller than 0.05.
+
+    Returns:
+        The translation column of each inverted extrinsic (the camera centre
+        in world coordinates), one entry per extrinsic, or an empty list when
+        any of the three inputs is missing.
+    """
+    extrinsics = vis_predictions.get("extrinsic")
+    intrinsics = vis_predictions.get("intrinsic")
+    images = vis_predictions.get("images")
+    if extrinsics is None or intrinsics is None or images is None:
+        return []
+    extrinsics = np.asarray(extrinsics)
+    intrinsics = np.asarray(intrinsics)
+    images = np.asarray(images)
+    if images.ndim == 4 and images.shape[1] == 3:
+        _, _, image_height, image_width = images.shape
+    else:
+        _, image_height, image_width, _ = images.shape
+    # These values do not change between cameras, so compute them once.
+    num_cameras = len(extrinsics)
+    frustum_scale = max(scene_extent * 0.05, 0.05)
+    aspect = float(max(image_width / max(image_height, 1), 1e-3))
+    camera_positions: list[np.ndarray] = []
+    for idx, extrinsic in enumerate(extrinsics):
+        world_to_camera = np.eye(4, dtype=np.float32)
+        # Slicing accepts both 3x4 [R|t] and full 4x4 homogeneous matrices.
+        world_to_camera[:3, :4] = extrinsic[:3, :4]
+        camera_to_world = np.linalg.inv(world_to_camera)
+        position = camera_to_world[:3, 3].copy()
+        camera_positions.append(position)
+        # Clamp the focal length so a zero/negative value cannot break arctan2.
+        fy = float(max(intrinsics[idx][1, 1], 1e-6))
+        fov = float(np.clip(2 * np.arctan2(image_height / 2.0, fy), 0.1, np.pi - 0.1))
+        # SciPy returns scalar-last (x, y, z, w); reorder to scalar-first for ``wxyz``.
+        qx, qy, qz, qw = Rotation.from_matrix(camera_to_world[:3, :3]).as_quat()
+        # Spread hues over the sequence so frame order is visible in the preview.
+        color = tuple(
+            int(channel * 255)
+            for channel in colorsys.hsv_to_rgb(idx / max(num_cameras, 1), 0.65, 1.0)
+        )
+        server.scene.add_camera_frustum(
+            f"/cameras/camera_{idx:02d}",
+            fov=fov,
+            aspect=aspect,
+            scale=frustum_scale,
+            color=color,
+            wxyz=(float(qw), float(qx), float(qy), float(qz)),
+            position=tuple(float(x) for x in position),
+            variant="wireframe",
+        )
+    return camera_positions
+def _build_viser_preview(
+    vis_predictions: dict[str, Any],
+    output_path: Path,
+    conf_percentile: float,
+) -> tuple[str, str | None, int]:
+    """Build the static Viser scene and return it as embeddable iframe HTML.
+
+    Args:
+        vis_predictions: Prediction dict passed on to the point-cloud and
+            camera helpers.
+        output_path: File that receives the exported HTML document.
+        conf_percentile: Confidence percentile forwarded to
+            ``_prepare_viser_point_cloud``.
+
+    Returns:
+        ``(preview_html, saved_path, point_count)``. On success ``preview_html``
+        is an ``<iframe>`` embedding the scene via ``srcdoc``, ``saved_path`` is
+        ``str(output_path)`` and ``point_count`` is the number of points drawn.
+        When ``viser`` is not installed, no points survive filtering, or any
+        exception is raised, ``preview_html`` is a placeholder message,
+        ``saved_path`` is ``None`` and ``point_count`` is ``0``.
+    """
+    # Import lazily so the rest of the app works without the optional dependency.
+    try:
+        import viser
+    except ModuleNotFoundError:
+        return (
+            _empty_viser_preview("Static Viser preview is unavailable because `viser` is not installed."),
+            None,
+            0,
+        )
+    # Tracked outside the try block so ``finally`` can stop it on every path.
+    server = None
+    try:
+        points, colors, _ = _prepare_viser_point_cloud(vis_predictions, conf_percentile)
+        if points.shape[0] == 0:
+            return _empty_viser_preview("No confident points were available for the static Viser preview."), None, 0
+        # NOTE(review): port=0 presumably asks for any free port - confirm against the viser docs.
+        server = viser.ViserServer(port=0, verbose=False)
+        server.scene.set_up_direction("+z")
+        if hasattr(server.scene, "world_axes"):
+            server.scene.world_axes.visible = False
+        # Scene size is the diagonal of the 5th-95th percentile box, which
+        # ignores the most extreme points; floored to avoid a zero extent.
+        lower = np.percentile(points, 5, axis=0)
+        upper = np.percentile(points, 95, axis=0)
+        scene_extent = float(np.linalg.norm(upper - lower))
+        scene_extent = max(scene_extent, 1e-3)
+        scene_center = points.mean(axis=0)
+        server.scene.add_point_cloud(
+            "/reconstruction",
+            points=points,
+            colors=colors,
+            point_size=max(scene_extent * 0.0025, 0.003),
+        )
+        camera_positions = _add_viser_cameras(server, vis_predictions, scene_extent)
+        if camera_positions:
+            # Aim halfway between the point-cloud mean and the mean camera position.
+            camera_center = np.mean(np.asarray(camera_positions), axis=0)
+            scene_center = (scene_center + camera_center) / 2.0
+        # Initial viewpoint: offset from the look-at target by the scene extent
+        # in x and y and a smaller amount in z, with +z as up.
+        server.initial_camera.look_at = tuple(float(x) for x in scene_center)
+        server.initial_camera.position = tuple(
+            float(x)
+            for x in scene_center + np.array([scene_extent, scene_extent, max(scene_extent * 0.65, 0.25)])
+        )
+        server.initial_camera.up = (0.0, 0.0, 1.0)
+        html_doc = server.scene.as_html(dark_mode=True)
+        output_path.write_text(html_doc, encoding="utf-8")
+        # The whole document is inlined into ``srcdoc``; escaping with
+        # quote=True keeps it from terminating the double-quoted attribute.
+        iframe_html = (
+            "<iframe class='viser-frame' "
+            "sandbox='allow-scripts allow-same-origin allow-downloads' "
+            f"srcdoc=\"{html.escape(html_doc, quote=True)}\"></iframe>"
+        )
+        return iframe_html, str(output_path), int(points.shape[0])
+    except Exception as exc:
+        # Best effort: a failed preview must not fail the whole reconstruction,
+        # so the error is reported in the placeholder instead of being raised.
+        return (
+            _empty_viser_preview(f"Static Viser preview could not be created for this run: {exc}"),
+            None,
+            0,
+        )
+    finally:
+        # Stop the server on every path; errors while stopping are ignored.
+        if server is not None and hasattr(server, "stop"):
+            with contextlib.suppress(Exception):
+                server.stop()
 def _zip_outputs(work_dir: Path, paths: list[Path], output_name: str) -> str:
     zip_path = work_dir / output_name
     with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
     num_scale_frames: int,
     keyframe_interval: int,
     conf_percentile: float,
+) -> tuple[str, str, str, dict[str, Any]]:
     vis_predictions = _prepare_for_visualization(predictions, images_cpu)
     glb_path = work_dir / "lingbot-map-reconstruction.glb"
     )
     scene.export(glb_path)
+    viser_preview_html, viser_preview_path, viser_points = _build_viser_preview(
+        vis_predictions,
+        work_dir / "viser-preview.html",
+        conf_percentile=conf_percentile,
+    )
     preview_path = Path(_make_preview_strip(images_cpu, work_dir / "preview.png"))
     npz_path = Path(_save_predictions_npz(predictions, work_dir / "predictions.npz"))
         "confidence_percentile": conf_percentile,
         "confidence_threshold": round(conf_threshold, 4),
         "points_kept_for_glb": points_kept,
+        "points_used_for_viser_preview": viser_points,
         "input_summary": input_summary,
         "runtime_summary": runtime_summary,
     }
     artifact_path = _zip_outputs(
         work_dir,
+        [glb_path, preview_path, npz_path, summary_path, Path(viser_preview_path) if viser_preview_path else work_dir / "__missing__"],
         output_name="lingbot-map-results.zip",
     )
+    return str(glb_path), viser_preview_html, artifact_path, summary
 def _format_status(summary: dict[str, Any]) -> str:
         f"- Runtime: `{runtime['runtime_seconds']}s` on `{runtime['device']}`",
         f"- GLB confidence percentile: `{summary['confidence_percentile']}`",
         f"- Points kept for GLB: `{summary['points_kept_for_glb']}`",
+        f"- Points used for static Viser preview: `{summary['points_used_for_viser_preview']}`",
     ]
     if runtime.get("peak_memory_gb") is not None:
         lines.append(f"- Peak GPU memory: `{runtime['peak_memory_gb']} GB`")
         keyframe_interval=keyframe_interval,
     )
+    glb_path, viser_preview_html, artifact_path, summary = _export_outputs(
         work_dir=work_dir,
         image_paths=image_paths,
         predictions=predictions,
     preview_path = str(work_dir / "preview.png")
     status = _format_status(summary)
+    return viser_preview_html, glb_path, preview_path, artifact_path, summary, status
 def _build_startup_markdown() -> str:
     object-fit: cover !important;
     border-radius: 8px !important;
 }
+.viser-frame {
+    width: 100%;
+    height: 380px;
+    border: 1px solid #d7dce5;
+    border-radius: 12px;
+    background: #0f1720;
+}
+.viser-empty {
+    min-height: 380px;
+    border: 1px dashed #c9d1dd;
+    border-radius: 12px;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    padding: 24px;
+    text-align: center;
+    background: linear-gradient(180deg, #f8fafc 0%, #eef2f7 100%);
+    color: #334155;
+}
 footer {display: none !important;}
 """
 with gr.Blocks(title="LingBot 3D") as demo:
     with gr.Column(elem_id="container"):
         gr.Markdown("# LingBot 3D")
         gr.Markdown(
             "Upload a short video clip and get back a navigable 3D scene. "
             "Powered by the LingBot-Map checkpoint, exported as a GLB plus a downloadable results bundle."
         )
         with gr.Row():
             with gr.Column():
                 video_file = gr.Video(
                     height=380,
                 )
             with gr.Column():
+                gr.Markdown("### Static Viser Preview")
+                viser_preview = gr.HTML(
+                    value=_empty_viser_preview("Run a reconstruction to load the static Viser preview."),
                 )
+                with gr.Accordion("Fallback GLB preview", open=False):
+                    model_preview = gr.Model3D(
+                        label="GLB preview",
+                        display_mode="point_cloud",
+                        clear_color=[1.0, 1.0, 1.0, 1.0],
+                        height=380,
+                    )
         run_button = gr.Button("Build 3D Scene", variant="primary")
         status_markdown = gr.Markdown()
             preview_image = gr.Image(label="Frame preview", interactive=False, height=200)
             artifact_file = gr.File(label="Download results bundle")
+        summary_json = gr.JSON(visible=False)
     run_button.click(
         fn=reconstruct_scene,
             conf_percentile,
         ],
         outputs=[
+            viser_preview,
             model_preview,
             preview_image,
             artifact_file,