Spaces:
Running
Running
Make Space video-only and fix preview export shape handling
Browse files
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
title: LingBot 3D
|
| 3 |
-
short_description: Streaming 3D scene reconstruction from short videos or image sequences.
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
|
|
|
| 1 |
---
|
| 2 |
title: LingBot 3D
|
| 3 |
+
short_description: Streaming 3D scene reconstruction from short videos.
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
app.py
CHANGED
|
@@ -7,7 +7,7 @@ import threading
|
|
| 7 |
import time
|
| 8 |
import zipfile
|
| 9 |
from pathlib import Path
|
| 10 |
-
from typing import Any
|
| 11 |
|
| 12 |
import cv2
|
| 13 |
import gradio as gr
|
|
@@ -142,21 +142,6 @@ def _eager_load_default_model() -> None:
|
|
| 142 |
STARTUP_NOTES.append(f"Startup preload failed: {exc}")
|
| 143 |
|
| 144 |
|
| 145 |
-
def _copy_image_inputs(image_files: Iterable[Any], input_dir: Path, max_frames: int) -> list[str]:
|
| 146 |
-
paths = sorted(filter(None, (_resolve_path(item) for item in image_files)), key=lambda value: Path(value).name)
|
| 147 |
-
if not paths:
|
| 148 |
-
return []
|
| 149 |
-
|
| 150 |
-
copied = []
|
| 151 |
-
for idx, src_path in enumerate(paths[:max_frames]):
|
| 152 |
-
src = Path(src_path)
|
| 153 |
-
suffix = src.suffix.lower() or ".png"
|
| 154 |
-
dest = input_dir / f"{idx:06d}{suffix}"
|
| 155 |
-
shutil.copy2(src, dest)
|
| 156 |
-
copied.append(str(dest))
|
| 157 |
-
return copied
|
| 158 |
-
|
| 159 |
-
|
| 160 |
def _extract_video_frames(video_file: str, frames_dir: Path, fps: int, max_frames: int) -> tuple[list[str], dict[str, Any]]:
|
| 161 |
cap = cv2.VideoCapture(video_file)
|
| 162 |
if not cap.isOpened():
|
|
@@ -187,27 +172,18 @@ def _extract_video_frames(video_file: str, frames_dir: Path, fps: int, max_frame
|
|
| 187 |
}
|
| 188 |
|
| 189 |
|
| 190 |
-
def _prepare_inputs(
|
| 191 |
_cleanup_old_runs()
|
| 192 |
work_dir = Path(tempfile.mkdtemp(prefix="lingbot-map-", dir=OUTPUT_ROOT))
|
| 193 |
input_dir = work_dir / "inputs"
|
| 194 |
input_dir.mkdir(parents=True, exist_ok=True)
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
input_summary["sample_interval"] = None
|
| 203 |
-
input_summary["original_frame_count"] = len(image_paths)
|
| 204 |
-
else:
|
| 205 |
-
video_path = _resolve_path(video_file)
|
| 206 |
-
if not video_path:
|
| 207 |
-
raise ValueError("Upload either ordered images or a video.")
|
| 208 |
-
image_paths, video_summary = _extract_video_frames(video_path, input_dir, fps=fps, max_frames=max_frames)
|
| 209 |
-
input_summary["input_mode"] = "video"
|
| 210 |
-
input_summary.update(video_summary)
|
| 211 |
|
| 212 |
if len(image_paths) < 2:
|
| 213 |
raise ValueError("Provide at least 2 frames. The Space is tuned for short multi-frame reconstructions.")
|
|
@@ -326,7 +302,7 @@ def _run_inference(images: torch.Tensor, num_scale_frames: int, keyframe_interva
|
|
| 326 |
|
| 327 |
|
| 328 |
def _make_preview_strip(images: torch.Tensor, output_path: Path) -> str:
|
| 329 |
-
frames = images.detach().cpu()
|
| 330 |
count = frames.shape[0]
|
| 331 |
indices = sorted({int(round(i)) for i in np.linspace(0, count - 1, num=min(4, count))})
|
| 332 |
|
|
@@ -449,7 +425,6 @@ def _format_status(summary: dict[str, Any]) -> str:
|
|
| 449 |
|
| 450 |
|
| 451 |
def reconstruct_scene(
|
| 452 |
-
image_files: list[Any],
|
| 453 |
video_file: Any,
|
| 454 |
fps: int,
|
| 455 |
max_frames: int,
|
|
@@ -462,12 +437,7 @@ def reconstruct_scene(
|
|
| 462 |
keyframe_interval = max(1, int(keyframe_interval))
|
| 463 |
conf_percentile = float(conf_percentile)
|
| 464 |
|
| 465 |
-
images, image_paths, work_dir, input_summary = _prepare_inputs(
|
| 466 |
-
image_files=image_files or [],
|
| 467 |
-
video_file=video_file,
|
| 468 |
-
fps=int(fps),
|
| 469 |
-
max_frames=max_frames,
|
| 470 |
-
)
|
| 471 |
|
| 472 |
num_scale_frames = min(num_scale_frames, int(images.shape[0]))
|
| 473 |
predictions, images_cpu, runtime_summary = _run_inference(
|
|
@@ -496,7 +466,7 @@ def reconstruct_scene(
|
|
| 496 |
def _build_startup_markdown() -> str:
|
| 497 |
if not STARTUP_NOTES:
|
| 498 |
return (
|
| 499 |
-
"Build a 3D scene from a short video or ordered image sequence. "
|
| 500 |
"This app uses the upstream LingBot-Map checkpoint and exports a navigable GLB scene plus a downloadable results bundle."
|
| 501 |
)
|
| 502 |
return "\n".join([f"- {note}" for note in STARTUP_NOTES])
|
|
@@ -535,7 +505,7 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
|
|
| 535 |
"""
|
| 536 |
<div class="headline">
|
| 537 |
<h1>LingBot 3D</h1>
|
| 538 |
-
<p>Upload ordered images or one short video, sample a compact clip, and export a 3D scene you can inspect or download.</p>
|
| 539 |
</div>
|
| 540 |
"""
|
| 541 |
)
|
|
@@ -544,14 +514,8 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
|
|
| 544 |
|
| 545 |
with gr.Row():
|
| 546 |
with gr.Column(scale=1):
|
| 547 |
-
image_files = gr.File(
|
| 548 |
-
label="Ordered images",
|
| 549 |
-
file_count="multiple",
|
| 550 |
-
file_types=["image"],
|
| 551 |
-
type="filepath",
|
| 552 |
-
)
|
| 553 |
video_file = gr.File(
|
| 554 |
-
label="Short video (alternative to images)",
|
| 555 |
file_types=["video"],
|
| 556 |
type="filepath",
|
| 557 |
)
|
|
@@ -578,7 +542,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
|
|
| 578 |
run_button.click(
|
| 579 |
fn=reconstruct_scene,
|
| 580 |
inputs=[
|
| 581 |
-
image_files,
|
| 582 |
video_file,
|
| 583 |
fps,
|
| 584 |
max_frames,
|
|
|
|
| 7 |
import time
|
| 8 |
import zipfile
|
| 9 |
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
|
| 12 |
import cv2
|
| 13 |
import gradio as gr
|
|
|
|
| 142 |
STARTUP_NOTES.append(f"Startup preload failed: {exc}")
|
| 143 |
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
def _extract_video_frames(video_file: str, frames_dir: Path, fps: int, max_frames: int) -> tuple[list[str], dict[str, Any]]:
|
| 146 |
cap = cv2.VideoCapture(video_file)
|
| 147 |
if not cap.isOpened():
|
|
|
|
| 172 |
}
|
| 173 |
|
| 174 |
|
| 175 |
+
def _prepare_inputs(video_file: Any, fps: int, max_frames: int) -> tuple[torch.Tensor, list[str], Path, dict[str, Any]]:
|
| 176 |
_cleanup_old_runs()
|
| 177 |
work_dir = Path(tempfile.mkdtemp(prefix="lingbot-map-", dir=OUTPUT_ROOT))
|
| 178 |
input_dir = work_dir / "inputs"
|
| 179 |
input_dir.mkdir(parents=True, exist_ok=True)
|
| 180 |
|
| 181 |
+
input_summary = {"input_mode": "video"}
|
| 182 |
+
video_path = _resolve_path(video_file)
|
| 183 |
+
if not video_path:
|
| 184 |
+
raise ValueError("Upload one short video.")
|
| 185 |
+
image_paths, video_summary = _extract_video_frames(video_path, input_dir, fps=fps, max_frames=max_frames)
|
| 186 |
+
input_summary.update(video_summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
if len(image_paths) < 2:
|
| 189 |
raise ValueError("Provide at least 2 frames. The Space is tuned for short multi-frame reconstructions.")
|
|
|
|
| 302 |
|
| 303 |
|
| 304 |
def _make_preview_strip(images: torch.Tensor, output_path: Path) -> str:
|
| 305 |
+
frames = _squeeze_single_batch("images", images.detach().cpu())
|
| 306 |
count = frames.shape[0]
|
| 307 |
indices = sorted({int(round(i)) for i in np.linspace(0, count - 1, num=min(4, count))})
|
| 308 |
|
|
|
|
| 425 |
|
| 426 |
|
| 427 |
def reconstruct_scene(
|
|
|
|
| 428 |
video_file: Any,
|
| 429 |
fps: int,
|
| 430 |
max_frames: int,
|
|
|
|
| 437 |
keyframe_interval = max(1, int(keyframe_interval))
|
| 438 |
conf_percentile = float(conf_percentile)
|
| 439 |
|
| 440 |
+
images, image_paths, work_dir, input_summary = _prepare_inputs(video_file=video_file, fps=int(fps), max_frames=max_frames)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
|
| 442 |
num_scale_frames = min(num_scale_frames, int(images.shape[0]))
|
| 443 |
predictions, images_cpu, runtime_summary = _run_inference(
|
|
|
|
| 466 |
def _build_startup_markdown() -> str:
|
| 467 |
if not STARTUP_NOTES:
|
| 468 |
return (
|
| 469 |
+
"Build a 3D scene from a short video. "
|
| 470 |
"This app uses the upstream LingBot-Map checkpoint and exports a navigable GLB scene plus a downloadable results bundle."
|
| 471 |
)
|
| 472 |
return "\n".join([f"- {note}" for note in STARTUP_NOTES])
|
|
|
|
| 505 |
"""
|
| 506 |
<div class="headline">
|
| 507 |
<h1>LingBot 3D</h1>
|
| 508 |
+
<p>Upload one short video, sample a compact clip, and export a 3D scene you can inspect or download.</p>
|
| 509 |
</div>
|
| 510 |
"""
|
| 511 |
)
|
|
|
|
| 514 |
|
| 515 |
with gr.Row():
|
| 516 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
video_file = gr.File(
|
| 518 |
+
label="Upload one short video",
|
| 519 |
file_types=["video"],
|
| 520 |
type="filepath",
|
| 521 |
)
|
|
|
|
| 542 |
run_button.click(
|
| 543 |
fn=reconstruct_scene,
|
| 544 |
inputs=[
|
|
|
|
| 545 |
video_file,
|
| 546 |
fps,
|
| 547 |
max_frames,
|