dennny123 committed on
Commit
3e5df04
·
verified ·
1 Parent(s): bd30ee0

Make Space video-only and fix preview export shape handling

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +13 -50
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: LingBot 3D
3
- short_description: Streaming 3D scene reconstruction from short clips.
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
 
1
  ---
2
  title: LingBot 3D
3
+ short_description: Streaming 3D scene reconstruction from short videos.
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
app.py CHANGED
@@ -7,7 +7,7 @@ import threading
7
  import time
8
  import zipfile
9
  from pathlib import Path
10
- from typing import Any, Iterable
11
 
12
  import cv2
13
  import gradio as gr
@@ -142,21 +142,6 @@ def _eager_load_default_model() -> None:
142
  STARTUP_NOTES.append(f"Startup preload failed: {exc}")
143
 
144
 
145
- def _copy_image_inputs(image_files: Iterable[Any], input_dir: Path, max_frames: int) -> list[str]:
146
- paths = sorted(filter(None, (_resolve_path(item) for item in image_files)), key=lambda value: Path(value).name)
147
- if not paths:
148
- return []
149
-
150
- copied = []
151
- for idx, src_path in enumerate(paths[:max_frames]):
152
- src = Path(src_path)
153
- suffix = src.suffix.lower() or ".png"
154
- dest = input_dir / f"{idx:06d}{suffix}"
155
- shutil.copy2(src, dest)
156
- copied.append(str(dest))
157
- return copied
158
-
159
-
160
  def _extract_video_frames(video_file: str, frames_dir: Path, fps: int, max_frames: int) -> tuple[list[str], dict[str, Any]]:
161
  cap = cv2.VideoCapture(video_file)
162
  if not cap.isOpened():
@@ -187,27 +172,18 @@ def _extract_video_frames(video_file: str, frames_dir: Path, fps: int, max_frame
187
  }
188
 
189
 
190
- def _prepare_inputs(image_files: list[Any], video_file: Any, fps: int, max_frames: int) -> tuple[torch.Tensor, list[str], Path, dict[str, Any]]:
191
  _cleanup_old_runs()
192
  work_dir = Path(tempfile.mkdtemp(prefix="lingbot-map-", dir=OUTPUT_ROOT))
193
  input_dir = work_dir / "inputs"
194
  input_dir.mkdir(parents=True, exist_ok=True)
195
 
196
- image_paths = _copy_image_inputs(image_files or [], input_dir, max_frames=max_frames)
197
- input_summary = {"input_mode": None}
198
-
199
- if image_paths:
200
- input_summary["input_mode"] = "images"
201
- input_summary["source_fps"] = None
202
- input_summary["sample_interval"] = None
203
- input_summary["original_frame_count"] = len(image_paths)
204
- else:
205
- video_path = _resolve_path(video_file)
206
- if not video_path:
207
- raise ValueError("Upload either ordered images or a video.")
208
- image_paths, video_summary = _extract_video_frames(video_path, input_dir, fps=fps, max_frames=max_frames)
209
- input_summary["input_mode"] = "video"
210
- input_summary.update(video_summary)
211
 
212
  if len(image_paths) < 2:
213
  raise ValueError("Provide at least 2 frames. The Space is tuned for short multi-frame reconstructions.")
@@ -326,7 +302,7 @@ def _run_inference(images: torch.Tensor, num_scale_frames: int, keyframe_interva
326
 
327
 
328
  def _make_preview_strip(images: torch.Tensor, output_path: Path) -> str:
329
- frames = images.detach().cpu()
330
  count = frames.shape[0]
331
  indices = sorted({int(round(i)) for i in np.linspace(0, count - 1, num=min(4, count))})
332
 
@@ -449,7 +425,6 @@ def _format_status(summary: dict[str, Any]) -> str:
449
 
450
 
451
  def reconstruct_scene(
452
- image_files: list[Any],
453
  video_file: Any,
454
  fps: int,
455
  max_frames: int,
@@ -462,12 +437,7 @@ def reconstruct_scene(
462
  keyframe_interval = max(1, int(keyframe_interval))
463
  conf_percentile = float(conf_percentile)
464
 
465
- images, image_paths, work_dir, input_summary = _prepare_inputs(
466
- image_files=image_files or [],
467
- video_file=video_file,
468
- fps=int(fps),
469
- max_frames=max_frames,
470
- )
471
 
472
  num_scale_frames = min(num_scale_frames, int(images.shape[0]))
473
  predictions, images_cpu, runtime_summary = _run_inference(
@@ -496,7 +466,7 @@ def reconstruct_scene(
496
  def _build_startup_markdown() -> str:
497
  if not STARTUP_NOTES:
498
  return (
499
- "Build a 3D scene from a short image sequence or video. "
500
  "This app uses the upstream LingBot-Map checkpoint and exports a navigable GLB scene plus a downloadable results bundle."
501
  )
502
  return "\n".join([f"- {note}" for note in STARTUP_NOTES])
@@ -535,7 +505,7 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
535
  """
536
  <div class="headline">
537
  <h1>LingBot 3D</h1>
538
- <p>Upload ordered images or a short video, sample a compact clip, and export a 3D scene you can inspect or download.</p>
539
  </div>
540
  """
541
  )
@@ -544,14 +514,8 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
544
 
545
  with gr.Row():
546
  with gr.Column(scale=1):
547
- image_files = gr.File(
548
- label="Ordered images",
549
- file_count="multiple",
550
- file_types=["image"],
551
- type="filepath",
552
- )
553
  video_file = gr.File(
554
- label="Or upload one video",
555
  file_types=["video"],
556
  type="filepath",
557
  )
@@ -578,7 +542,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
578
  run_button.click(
579
  fn=reconstruct_scene,
580
  inputs=[
581
- image_files,
582
  video_file,
583
  fps,
584
  max_frames,
 
7
  import time
8
  import zipfile
9
  from pathlib import Path
10
+ from typing import Any
11
 
12
  import cv2
13
  import gradio as gr
 
142
  STARTUP_NOTES.append(f"Startup preload failed: {exc}")
143
 
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  def _extract_video_frames(video_file: str, frames_dir: Path, fps: int, max_frames: int) -> tuple[list[str], dict[str, Any]]:
146
  cap = cv2.VideoCapture(video_file)
147
  if not cap.isOpened():
 
172
  }
173
 
174
 
175
+ def _prepare_inputs(video_file: Any, fps: int, max_frames: int) -> tuple[torch.Tensor, list[str], Path, dict[str, Any]]:
176
  _cleanup_old_runs()
177
  work_dir = Path(tempfile.mkdtemp(prefix="lingbot-map-", dir=OUTPUT_ROOT))
178
  input_dir = work_dir / "inputs"
179
  input_dir.mkdir(parents=True, exist_ok=True)
180
 
181
+ input_summary = {"input_mode": "video"}
182
+ video_path = _resolve_path(video_file)
183
+ if not video_path:
184
+ raise ValueError("Upload one short video.")
185
+ image_paths, video_summary = _extract_video_frames(video_path, input_dir, fps=fps, max_frames=max_frames)
186
+ input_summary.update(video_summary)
 
 
 
 
 
 
 
 
 
187
 
188
  if len(image_paths) < 2:
189
  raise ValueError("Provide at least 2 frames. The Space is tuned for short multi-frame reconstructions.")
 
302
 
303
 
304
  def _make_preview_strip(images: torch.Tensor, output_path: Path) -> str:
305
+ frames = _squeeze_single_batch("images", images.detach().cpu())
306
  count = frames.shape[0]
307
  indices = sorted({int(round(i)) for i in np.linspace(0, count - 1, num=min(4, count))})
308
 
 
425
 
426
 
427
  def reconstruct_scene(
 
428
  video_file: Any,
429
  fps: int,
430
  max_frames: int,
 
437
  keyframe_interval = max(1, int(keyframe_interval))
438
  conf_percentile = float(conf_percentile)
439
 
440
+ images, image_paths, work_dir, input_summary = _prepare_inputs(video_file=video_file, fps=int(fps), max_frames=max_frames)
 
 
 
 
 
441
 
442
  num_scale_frames = min(num_scale_frames, int(images.shape[0]))
443
  predictions, images_cpu, runtime_summary = _run_inference(
 
466
  def _build_startup_markdown() -> str:
467
  if not STARTUP_NOTES:
468
  return (
469
+ "Build a 3D scene from a short video. "
470
  "This app uses the upstream LingBot-Map checkpoint and exports a navigable GLB scene plus a downloadable results bundle."
471
  )
472
  return "\n".join([f"- {note}" for note in STARTUP_NOTES])
 
505
  """
506
  <div class="headline">
507
  <h1>LingBot 3D</h1>
508
+ <p>Upload one short video, sample a compact clip, and export a 3D scene you can inspect or download.</p>
509
  </div>
510
  """
511
  )
 
514
 
515
  with gr.Row():
516
  with gr.Column(scale=1):
 
 
 
 
 
 
517
  video_file = gr.File(
518
+ label="Upload one short video",
519
  file_types=["video"],
520
  type="filepath",
521
  )
 
542
  run_button.click(
543
  fn=reconstruct_scene,
544
  inputs=[
 
545
  video_file,
546
  fps,
547
  max_frames,