dennny123 committed
Commit 657ca4b · verified · Parent: 4700ca8

Simplify Space to the default LingBot-Map checkpoint

Files changed (2):
  1. README.md +2 -2
  2. app.py +13 -39
README.md CHANGED

```diff
@@ -11,7 +11,7 @@ startup_duration_timeout: 1h
 models:
 - robbyant/lingbot-map
 preload_from_hub:
-- robbyant/lingbot-map lingbot-map.pt,lingbot-map-long.pt
+- robbyant/lingbot-map lingbot-map.pt
 ---
 
 # LingBot-Map ZeroGPU Demo
@@ -35,7 +35,7 @@ Gradio Space wrapper around `Robbyant/lingbot-map` tuned for Hugging Face ZeroGP
 - short demos only
 - default frame cap: 24 frames
 - model preview is exported as GLB, not the local `viser` server
-- the app is optimized for `lingbot-map.pt` and `lingbot-map-long.pt`
+- the app uses the upstream default checkpoint `lingbot-map.pt`
 
 ## Local Sanity Check
```
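On the Spaces side, `preload_from_hub` lists files to download into the image at build time (a repo id followed by the filenames to cache), so the narrower entry means only the default checkpoint is baked in and `lingbot-map-long.pt` is no longer fetched up front. As a sketch, the front matter after this commit reads (only the keys visible in the hunk are confirmed):

```yaml
startup_duration_timeout: 1h
models:
- robbyant/lingbot-map
preload_from_hub:
# repo id, then the specific files to cache at build time
- robbyant/lingbot-map lingbot-map.pt
```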
app.py CHANGED

```diff
@@ -1,5 +1,4 @@
 import contextlib
-import gc
 import json
 import os
 import shutil
@@ -45,16 +44,7 @@ OUTPUT_ROOT = ROOT / "app_output"
 OUTPUT_ROOT.mkdir(exist_ok=True)
 
 HF_MODEL_REPO = "robbyant/lingbot-map"
-MODEL_FILENAMES = {
-    "balanced": "lingbot-map.pt",
-    "long": "lingbot-map-long.pt",
-    "stage1": "lingbot-map-stage1.pt",
-}
-MODEL_LABELS = {
-    "balanced": "Balanced",
-    "long": "Long",
-    "stage1": "Stage-1",
-}
+MODEL_FILENAME = "lingbot-map.pt"
 
 IMAGE_SIZE = 518
 PATCH_SIZE = 14
@@ -68,7 +58,7 @@ DEFAULT_CAMERA_ITERATIONS = 1
 IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
 SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
 
-MODEL_CACHE: dict[str, dict[str, Any]] = {}
+MODEL_CACHE: dict[str, Any] = {}
 MODEL_CACHE_LOCK = threading.Lock()
 STARTUP_NOTES: list[str] = []
 
@@ -95,20 +85,14 @@ def _pick_runtime_device() -> torch.device:
     return torch.device("cpu")
 
 
-def _load_model_bundle(model_variant: str) -> dict[str, Any]:
+def _load_model_bundle() -> dict[str, Any]:
     with MODEL_CACHE_LOCK:
-        cached = MODEL_CACHE.get(model_variant)
+        cached = MODEL_CACHE.get("default")
         if cached is not None:
             return cached
 
-        if MODEL_CACHE:
-            MODEL_CACHE.clear()
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
         device = _pick_runtime_device()
-        weight_name = MODEL_FILENAMES[model_variant]
+        weight_name = MODEL_FILENAME
         weight_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=weight_name)
 
         model = GCTStream(
@@ -142,7 +126,7 @@ def _load_model_bundle(model_variant: str) -> dict[str, Any]:
             "missing_keys": len(missing),
             "unexpected_keys": len(unexpected),
         }
-        MODEL_CACHE[model_variant] = bundle
+        MODEL_CACHE["default"] = bundle
         return bundle
 
 
@@ -150,7 +134,7 @@ def _eager_load_default_model() -> None:
     if not IS_SPACE_RUNTIME or SKIP_EAGER_MODEL_LOAD:
         return
     try:
-        bundle = _load_model_bundle("balanced")
+        bundle = _load_model_bundle()
         STARTUP_NOTES.append(
             f"Startup preload complete on `{bundle['device']}` with `{bundle['weight_name']}`."
         )
```
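Taken together, these hunks collapse the variant-keyed cache, and the eviction path that cleared it (`MODEL_CACHE.clear()`, `gc.collect()`, `torch.cuda.empty_cache()`), into a lock-guarded singleton, which is also why the `gc` import goes away. In isolation the pattern looks like this minimal sketch, with a hypothetical `_load_weights` standing in for the real `hf_hub_download` + `GCTStream` setup:

```python
import threading
from typing import Any

MODEL_FILENAME = "lingbot-map.pt"
MODEL_CACHE: dict[str, Any] = {}
MODEL_CACHE_LOCK = threading.Lock()


def _load_weights(filename: str) -> dict[str, Any]:
    # Hypothetical stand-in for the download + model construction above.
    return {"weight_name": filename}


def _load_model_bundle() -> dict[str, Any]:
    # Everything happens under one lock: a caller either finds the cached
    # bundle or loads and stores it exactly once.
    with MODEL_CACHE_LOCK:
        cached = MODEL_CACHE.get("default")
        if cached is not None:
            return cached
        bundle = _load_weights(MODEL_FILENAME)
        MODEL_CACHE["default"] = bundle
        return bundle


# Repeated calls return the same object; the checkpoint is loaded once.
assert _load_model_bundle() is _load_model_bundle()
```

With a single checkpoint resident for the life of the process, there is nothing to evict, so the old clear-and-reload dance has no remaining purpose.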
```diff
@@ -287,15 +271,15 @@ def _prepare_for_visualization(predictions: dict[str, Any], images: torch.Tensor
     return vis_predictions
 
 
-def _estimate_gpu_duration(images: torch.Tensor, model_variant: str, num_scale_frames: int, keyframe_interval: int) -> int:
+def _estimate_gpu_duration(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> int:
     frame_count = int(getattr(images, "shape", [DEFAULT_MAX_FRAMES])[0])
-    del model_variant, num_scale_frames, keyframe_interval
+    del num_scale_frames, keyframe_interval
     return min(180, max(60, 24 + frame_count * 4))
 
 
 @spaces.GPU(duration=_estimate_gpu_duration)
-def _run_inference(images: torch.Tensor, model_variant: str, num_scale_frames: int, keyframe_interval: int) -> tuple[dict[str, Any], torch.Tensor, dict[str, Any]]:
-    bundle = _load_model_bundle(model_variant)
+def _run_inference(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> tuple[dict[str, Any], torch.Tensor, dict[str, Any]]:
+    bundle = _load_model_bundle()
     model = bundle["model"]
     device = bundle["device"]
     dtype = bundle["dtype"]
```
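Dropping `model_variant` leaves the ZeroGPU duration estimator a pure function of frame count: a 24-second base plus 4 seconds per frame, clamped to the 60-180 second range. A few spot checks of the formula from the hunk above:

```python
def estimate_gpu_duration(frame_count: int) -> int:
    # Same arithmetic as _estimate_gpu_duration in the hunk above.
    return min(180, max(60, 24 + frame_count * 4))


assert estimate_gpu_duration(2) == 60    # floor applies at 9 frames or fewer
assert estimate_gpu_duration(24) == 120  # the default frame cap
assert estimate_gpu_duration(64) == 180  # ceiling from 39 frames upward
```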
```diff
@@ -401,7 +385,6 @@ def _export_outputs(
     images_cpu: torch.Tensor,
     input_summary: dict[str, Any],
     runtime_summary: dict[str, Any],
-    model_variant: str,
     num_scale_frames: int,
     keyframe_interval: int,
     conf_percentile: float,
@@ -423,8 +406,8 @@
 
     points_kept, conf_threshold = _count_confident_points(vis_predictions, conf_percentile)
     summary = {
-        "model_variant": MODEL_LABELS[model_variant],
-        "model_filename": MODEL_FILENAMES[model_variant],
+        "model_variant": "Default",
+        "model_filename": MODEL_FILENAME,
         "frames_used": len(image_paths),
         "num_scale_frames": num_scale_frames,
         "keyframe_interval": keyframe_interval,
@@ -468,7 +451,6 @@ def _format_status(summary: dict[str, Any]) -> str:
 def reconstruct_scene(
     image_files: list[Any],
     video_file: Any,
-    model_variant: str,
     fps: int,
     max_frames: int,
     num_scale_frames: int,
@@ -490,7 +472,6 @@
     num_scale_frames = min(num_scale_frames, int(images.shape[0]))
     predictions, images_cpu, runtime_summary = _run_inference(
         images,
-        model_variant=model_variant,
         num_scale_frames=num_scale_frames,
         keyframe_interval=keyframe_interval,
     )
@@ -502,7 +483,6 @@
         images_cpu=images_cpu,
         input_summary=input_summary,
         runtime_summary=runtime_summary,
-        model_variant=model_variant,
         num_scale_frames=num_scale_frames,
         keyframe_interval=keyframe_interval,
         conf_percentile=conf_percentile,
@@ -575,11 +555,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
                 file_types=["video"],
                 type="filepath",
             )
-            model_variant = gr.Dropdown(
-                choices=[("Balanced", "balanced"), ("Long", "long"), ("Stage-1", "stage1")],
-                value="balanced",
-                label="Checkpoint",
-            )
             fps = gr.Slider(minimum=1, maximum=12, step=1, value=DEFAULT_FPS, label="Video sampling FPS")
             max_frames = gr.Slider(minimum=2, maximum=MAX_FRAMES_HARD_LIMIT, step=1, value=DEFAULT_MAX_FRAMES, label="Max frames")
             num_scale_frames = gr.Slider(minimum=1, maximum=8, step=1, value=DEFAULT_SCALE_FRAMES, label="Scale frames")
@@ -605,7 +580,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
         inputs=[
             image_files,
             video_file,
-            model_variant,
             fps,
             max_frames,
             num_scale_frames,
```
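Startup behavior is otherwise unchanged: the preload still runs only on the Space runtime (where `SPACE_ID` is set) and `LINGBOT_SPACE_SKIP_MODEL_LOAD=1` still opts out, so a local sanity check stays lazy by default. A minimal sketch of that gating, using a hypothetical `should_preload` helper (app.py inlines the same condition in `_eager_load_default_model`):

```python
import os

# Both switches appear verbatim in app.py above.
IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"


def should_preload() -> bool:
    # Preload exactly when running inside the Space and not opted out.
    return IS_SPACE_RUNTIME and not SKIP_EAGER_MODEL_LOAD
```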