Spaces:
Running
Running
Simplify Space to the default LingBot-Map checkpoint
Browse files
README.md
CHANGED
|
@@ -11,7 +11,7 @@ startup_duration_timeout: 1h
|
|
| 11 |
models:
|
| 12 |
- robbyant/lingbot-map
|
| 13 |
preload_from_hub:
|
| 14 |
-
- robbyant/lingbot-map lingbot-map.pt
|
| 15 |
---
|
| 16 |
|
| 17 |
# LingBot-Map ZeroGPU Demo
|
|
@@ -35,7 +35,7 @@ Gradio Space wrapper around `Robbyant/lingbot-map` tuned for Hugging Face ZeroGP
|
|
| 35 |
- short demos only
|
| 36 |
- default frame cap: 24 frames
|
| 37 |
- model preview is exported as GLB, not the local `viser` server
|
| 38 |
-
- the app
|
| 39 |
|
| 40 |
## Local Sanity Check
|
| 41 |
|
|
|
|
| 11 |
models:
|
| 12 |
- robbyant/lingbot-map
|
| 13 |
preload_from_hub:
|
| 14 |
+
- robbyant/lingbot-map lingbot-map.pt
|
| 15 |
---
|
| 16 |
|
| 17 |
# LingBot-Map ZeroGPU Demo
|
|
|
|
| 35 |
- short demos only
|
| 36 |
- default frame cap: 24 frames
|
| 37 |
- model preview is exported as GLB, not the local `viser` server
|
| 38 |
+
- the app uses the upstream default checkpoint `lingbot-map.pt`
|
| 39 |
|
| 40 |
## Local Sanity Check
|
| 41 |
|
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import contextlib
|
| 2 |
-
import gc
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
import shutil
|
|
@@ -45,16 +44,7 @@ OUTPUT_ROOT = ROOT / "app_output"
|
|
| 45 |
OUTPUT_ROOT.mkdir(exist_ok=True)
|
| 46 |
|
| 47 |
HF_MODEL_REPO = "robbyant/lingbot-map"
|
| 48 |
-
|
| 49 |
-
"balanced": "lingbot-map.pt",
|
| 50 |
-
"long": "lingbot-map-long.pt",
|
| 51 |
-
"stage1": "lingbot-map-stage1.pt",
|
| 52 |
-
}
|
| 53 |
-
MODEL_LABELS = {
|
| 54 |
-
"balanced": "Balanced",
|
| 55 |
-
"long": "Long",
|
| 56 |
-
"stage1": "Stage-1",
|
| 57 |
-
}
|
| 58 |
|
| 59 |
IMAGE_SIZE = 518
|
| 60 |
PATCH_SIZE = 14
|
|
@@ -68,7 +58,7 @@ DEFAULT_CAMERA_ITERATIONS = 1
|
|
| 68 |
IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
|
| 69 |
SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
|
| 70 |
|
| 71 |
-
MODEL_CACHE: dict[str,
|
| 72 |
MODEL_CACHE_LOCK = threading.Lock()
|
| 73 |
STARTUP_NOTES: list[str] = []
|
| 74 |
|
|
@@ -95,20 +85,14 @@ def _pick_runtime_device() -> torch.device:
|
|
| 95 |
return torch.device("cpu")
|
| 96 |
|
| 97 |
|
| 98 |
-
def _load_model_bundle(
|
| 99 |
with MODEL_CACHE_LOCK:
|
| 100 |
-
cached = MODEL_CACHE.get(
|
| 101 |
if cached is not None:
|
| 102 |
return cached
|
| 103 |
|
| 104 |
-
if MODEL_CACHE:
|
| 105 |
-
MODEL_CACHE.clear()
|
| 106 |
-
gc.collect()
|
| 107 |
-
if torch.cuda.is_available():
|
| 108 |
-
torch.cuda.empty_cache()
|
| 109 |
-
|
| 110 |
device = _pick_runtime_device()
|
| 111 |
-
weight_name =
|
| 112 |
weight_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=weight_name)
|
| 113 |
|
| 114 |
model = GCTStream(
|
|
@@ -142,7 +126,7 @@ def _load_model_bundle(model_variant: str) -> dict[str, Any]:
|
|
| 142 |
"missing_keys": len(missing),
|
| 143 |
"unexpected_keys": len(unexpected),
|
| 144 |
}
|
| 145 |
-
MODEL_CACHE[
|
| 146 |
return bundle
|
| 147 |
|
| 148 |
|
|
@@ -150,7 +134,7 @@ def _eager_load_default_model() -> None:
|
|
| 150 |
if not IS_SPACE_RUNTIME or SKIP_EAGER_MODEL_LOAD:
|
| 151 |
return
|
| 152 |
try:
|
| 153 |
-
bundle = _load_model_bundle(
|
| 154 |
STARTUP_NOTES.append(
|
| 155 |
f"Startup preload complete on `{bundle['device']}` with `{bundle['weight_name']}`."
|
| 156 |
)
|
|
@@ -287,15 +271,15 @@ def _prepare_for_visualization(predictions: dict[str, Any], images: torch.Tensor
|
|
| 287 |
return vis_predictions
|
| 288 |
|
| 289 |
|
| 290 |
-
def _estimate_gpu_duration(images: torch.Tensor,
|
| 291 |
frame_count = int(getattr(images, "shape", [DEFAULT_MAX_FRAMES])[0])
|
| 292 |
-
del
|
| 293 |
return min(180, max(60, 24 + frame_count * 4))
|
| 294 |
|
| 295 |
|
| 296 |
@spaces.GPU(duration=_estimate_gpu_duration)
|
| 297 |
-
def _run_inference(images: torch.Tensor,
|
| 298 |
-
bundle = _load_model_bundle(
|
| 299 |
model = bundle["model"]
|
| 300 |
device = bundle["device"]
|
| 301 |
dtype = bundle["dtype"]
|
|
@@ -401,7 +385,6 @@ def _export_outputs(
|
|
| 401 |
images_cpu: torch.Tensor,
|
| 402 |
input_summary: dict[str, Any],
|
| 403 |
runtime_summary: dict[str, Any],
|
| 404 |
-
model_variant: str,
|
| 405 |
num_scale_frames: int,
|
| 406 |
keyframe_interval: int,
|
| 407 |
conf_percentile: float,
|
|
@@ -423,8 +406,8 @@ def _export_outputs(
|
|
| 423 |
|
| 424 |
points_kept, conf_threshold = _count_confident_points(vis_predictions, conf_percentile)
|
| 425 |
summary = {
|
| 426 |
-
"model_variant":
|
| 427 |
-
"model_filename":
|
| 428 |
"frames_used": len(image_paths),
|
| 429 |
"num_scale_frames": num_scale_frames,
|
| 430 |
"keyframe_interval": keyframe_interval,
|
|
@@ -468,7 +451,6 @@ def _format_status(summary: dict[str, Any]) -> str:
|
|
| 468 |
def reconstruct_scene(
|
| 469 |
image_files: list[Any],
|
| 470 |
video_file: Any,
|
| 471 |
-
model_variant: str,
|
| 472 |
fps: int,
|
| 473 |
max_frames: int,
|
| 474 |
num_scale_frames: int,
|
|
@@ -490,7 +472,6 @@ def reconstruct_scene(
|
|
| 490 |
num_scale_frames = min(num_scale_frames, int(images.shape[0]))
|
| 491 |
predictions, images_cpu, runtime_summary = _run_inference(
|
| 492 |
images,
|
| 493 |
-
model_variant=model_variant,
|
| 494 |
num_scale_frames=num_scale_frames,
|
| 495 |
keyframe_interval=keyframe_interval,
|
| 496 |
)
|
|
@@ -502,7 +483,6 @@ def reconstruct_scene(
|
|
| 502 |
images_cpu=images_cpu,
|
| 503 |
input_summary=input_summary,
|
| 504 |
runtime_summary=runtime_summary,
|
| 505 |
-
model_variant=model_variant,
|
| 506 |
num_scale_frames=num_scale_frames,
|
| 507 |
keyframe_interval=keyframe_interval,
|
| 508 |
conf_percentile=conf_percentile,
|
|
@@ -575,11 +555,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
|
|
| 575 |
file_types=["video"],
|
| 576 |
type="filepath",
|
| 577 |
)
|
| 578 |
-
model_variant = gr.Dropdown(
|
| 579 |
-
choices=[("Balanced", "balanced"), ("Long", "long"), ("Stage-1", "stage1")],
|
| 580 |
-
value="balanced",
|
| 581 |
-
label="Checkpoint",
|
| 582 |
-
)
|
| 583 |
fps = gr.Slider(minimum=1, maximum=12, step=1, value=DEFAULT_FPS, label="Video sampling FPS")
|
| 584 |
max_frames = gr.Slider(minimum=2, maximum=MAX_FRAMES_HARD_LIMIT, step=1, value=DEFAULT_MAX_FRAMES, label="Max frames")
|
| 585 |
num_scale_frames = gr.Slider(minimum=1, maximum=8, step=1, value=DEFAULT_SCALE_FRAMES, label="Scale frames")
|
|
@@ -605,7 +580,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
|
|
| 605 |
inputs=[
|
| 606 |
image_files,
|
| 607 |
video_file,
|
| 608 |
-
model_variant,
|
| 609 |
fps,
|
| 610 |
max_frames,
|
| 611 |
num_scale_frames,
|
|
|
|
| 1 |
import contextlib
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import shutil
|
|
|
|
| 44 |
OUTPUT_ROOT.mkdir(exist_ok=True)
|
| 45 |
|
| 46 |
HF_MODEL_REPO = "robbyant/lingbot-map"
|
| 47 |
+
MODEL_FILENAME = "lingbot-map.pt"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
IMAGE_SIZE = 518
|
| 50 |
PATCH_SIZE = 14
|
|
|
|
| 58 |
IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
|
| 59 |
SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
|
| 60 |
|
| 61 |
+
MODEL_CACHE: dict[str, Any] = {}
|
| 62 |
MODEL_CACHE_LOCK = threading.Lock()
|
| 63 |
STARTUP_NOTES: list[str] = []
|
| 64 |
|
|
|
|
| 85 |
return torch.device("cpu")
|
| 86 |
|
| 87 |
|
| 88 |
+
def _load_model_bundle() -> dict[str, Any]:
|
| 89 |
with MODEL_CACHE_LOCK:
|
| 90 |
+
cached = MODEL_CACHE.get("default")
|
| 91 |
if cached is not None:
|
| 92 |
return cached
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
device = _pick_runtime_device()
|
| 95 |
+
weight_name = MODEL_FILENAME
|
| 96 |
weight_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=weight_name)
|
| 97 |
|
| 98 |
model = GCTStream(
|
|
|
|
| 126 |
"missing_keys": len(missing),
|
| 127 |
"unexpected_keys": len(unexpected),
|
| 128 |
}
|
| 129 |
+
MODEL_CACHE["default"] = bundle
|
| 130 |
return bundle
|
| 131 |
|
| 132 |
|
|
|
|
| 134 |
if not IS_SPACE_RUNTIME or SKIP_EAGER_MODEL_LOAD:
|
| 135 |
return
|
| 136 |
try:
|
| 137 |
+
bundle = _load_model_bundle()
|
| 138 |
STARTUP_NOTES.append(
|
| 139 |
f"Startup preload complete on `{bundle['device']}` with `{bundle['weight_name']}`."
|
| 140 |
)
|
|
|
|
| 271 |
return vis_predictions
|
| 272 |
|
| 273 |
|
| 274 |
+
def _estimate_gpu_duration(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> int:
|
| 275 |
frame_count = int(getattr(images, "shape", [DEFAULT_MAX_FRAMES])[0])
|
| 276 |
+
del num_scale_frames, keyframe_interval
|
| 277 |
return min(180, max(60, 24 + frame_count * 4))
|
| 278 |
|
| 279 |
|
| 280 |
@spaces.GPU(duration=_estimate_gpu_duration)
|
| 281 |
+
def _run_inference(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> tuple[dict[str, Any], torch.Tensor, dict[str, Any]]:
|
| 282 |
+
bundle = _load_model_bundle()
|
| 283 |
model = bundle["model"]
|
| 284 |
device = bundle["device"]
|
| 285 |
dtype = bundle["dtype"]
|
|
|
|
| 385 |
images_cpu: torch.Tensor,
|
| 386 |
input_summary: dict[str, Any],
|
| 387 |
runtime_summary: dict[str, Any],
|
|
|
|
| 388 |
num_scale_frames: int,
|
| 389 |
keyframe_interval: int,
|
| 390 |
conf_percentile: float,
|
|
|
|
| 406 |
|
| 407 |
points_kept, conf_threshold = _count_confident_points(vis_predictions, conf_percentile)
|
| 408 |
summary = {
|
| 409 |
+
"model_variant": "Default",
|
| 410 |
+
"model_filename": MODEL_FILENAME,
|
| 411 |
"frames_used": len(image_paths),
|
| 412 |
"num_scale_frames": num_scale_frames,
|
| 413 |
"keyframe_interval": keyframe_interval,
|
|
|
|
| 451 |
def reconstruct_scene(
|
| 452 |
image_files: list[Any],
|
| 453 |
video_file: Any,
|
|
|
|
| 454 |
fps: int,
|
| 455 |
max_frames: int,
|
| 456 |
num_scale_frames: int,
|
|
|
|
| 472 |
num_scale_frames = min(num_scale_frames, int(images.shape[0]))
|
| 473 |
predictions, images_cpu, runtime_summary = _run_inference(
|
| 474 |
images,
|
|
|
|
| 475 |
num_scale_frames=num_scale_frames,
|
| 476 |
keyframe_interval=keyframe_interval,
|
| 477 |
)
|
|
|
|
| 483 |
images_cpu=images_cpu,
|
| 484 |
input_summary=input_summary,
|
| 485 |
runtime_summary=runtime_summary,
|
|
|
|
| 486 |
num_scale_frames=num_scale_frames,
|
| 487 |
keyframe_interval=keyframe_interval,
|
| 488 |
conf_percentile=conf_percentile,
|
|
|
|
| 555 |
file_types=["video"],
|
| 556 |
type="filepath",
|
| 557 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
fps = gr.Slider(minimum=1, maximum=12, step=1, value=DEFAULT_FPS, label="Video sampling FPS")
|
| 559 |
max_frames = gr.Slider(minimum=2, maximum=MAX_FRAMES_HARD_LIMIT, step=1, value=DEFAULT_MAX_FRAMES, label="Max frames")
|
| 560 |
num_scale_frames = gr.Slider(minimum=1, maximum=8, step=1, value=DEFAULT_SCALE_FRAMES, label="Scale frames")
|
|
|
|
| 580 |
inputs=[
|
| 581 |
image_files,
|
| 582 |
video_file,
|
|
|
|
| 583 |
fps,
|
| 584 |
max_frames,
|
| 585 |
num_scale_frames,
|