dennny123 committed
Commit 657ca4b · verified · Parent: 4700ca8

Simplify Space to the default LingBot-Map checkpoint

Files changed (2):
  1. README.md +2 -2
  2. app.py +13 -39
README.md CHANGED

```diff
@@ -11,7 +11,7 @@ startup_duration_timeout: 1h
 models:
 - robbyant/lingbot-map
 preload_from_hub:
-- robbyant/lingbot-map lingbot-map.pt,lingbot-map-long.pt
+- robbyant/lingbot-map lingbot-map.pt
 ---
 
 # LingBot-Map ZeroGPU Demo
@@ -35,7 +35,7 @@ Gradio Space wrapper around `Robbyant/lingbot-map` tuned for Hugging Face ZeroGP
 - short demos only
 - default frame cap: 24 frames
 - model preview is exported as GLB, not the local `viser` server
-- the app is optimized for `lingbot-map.pt` and `lingbot-map-long.pt`
+- the app uses the upstream default checkpoint `lingbot-map.pt`
 
 ## Local Sanity Check
```
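On the Spaces side, `preload_from_hub` lists files to download into the image at build time (a repo id followed by the filenames to cache), so the narrower entry means only the default checkpoint is baked in and `lingbot-map-long.pt` is no longer fetched up front. As a sketch, the front matter after this commit reads (only the keys visible in the hunk are confirmed):

```yaml
startup_duration_timeout: 1h
models:
- robbyant/lingbot-map
preload_from_hub:
# repo id, then the specific files to cache at build time
- robbyant/lingbot-map lingbot-map.pt
```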
app.py CHANGED

```diff
@@ -1,5 +1,4 @@
 import contextlib
-import gc
 import json
 import os
 import shutil
@@ -45,16 +44,7 @@ OUTPUT_ROOT = ROOT / "app_output"
 OUTPUT_ROOT.mkdir(exist_ok=True)
 
 HF_MODEL_REPO = "robbyant/lingbot-map"
-MODEL_FILENAMES = {
-    "balanced": "lingbot-map.pt",
-    "long": "lingbot-map-long.pt",
-    "stage1": "lingbot-map-stage1.pt",
-}
-MODEL_LABELS = {
-    "balanced": "Balanced",
-    "long": "Long",
-    "stage1": "Stage-1",
-}
+MODEL_FILENAME = "lingbot-map.pt"
 
 IMAGE_SIZE = 518
 PATCH_SIZE = 14
@@ -68,7 +58,7 @@ DEFAULT_CAMERA_ITERATIONS = 1
 IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
 SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
 
-MODEL_CACHE: dict[str, dict[str, Any]] = {}
+MODEL_CACHE: dict[str, Any] = {}
 MODEL_CACHE_LOCK = threading.Lock()
 STARTUP_NOTES: list[str] = []
 
@@ -95,20 +85,14 @@ def _pick_runtime_device() -> torch.device:
     return torch.device("cpu")
 
 
-def _load_model_bundle(model_variant: str) -> dict[str, Any]:
+def _load_model_bundle() -> dict[str, Any]:
     with MODEL_CACHE_LOCK:
-        cached = MODEL_CACHE.get(model_variant)
+        cached = MODEL_CACHE.get("default")
         if cached is not None:
             return cached
 
-        if MODEL_CACHE:
-            MODEL_CACHE.clear()
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
         device = _pick_runtime_device()
-        weight_name = MODEL_FILENAMES[model_variant]
+        weight_name = MODEL_FILENAME
         weight_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=weight_name)
 
         model = GCTStream(
@@ -142,7 +126,7 @@ def _load_model_bundle(model_variant: str) -> dict[str, Any]:
             "missing_keys": len(missing),
             "unexpected_keys": len(unexpected),
         }
-        MODEL_CACHE[model_variant] = bundle
+        MODEL_CACHE["default"] = bundle
         return bundle
 
 
@@ -150,7 +134,7 @@ def _eager_load_default_model() -> None:
     if not IS_SPACE_RUNTIME or SKIP_EAGER_MODEL_LOAD:
         return
     try:
-        bundle = _load_model_bundle("balanced")
+        bundle = _load_model_bundle()
         STARTUP_NOTES.append(
             f"Startup preload complete on `{bundle['device']}` with `{bundle['weight_name']}`."
         )
```
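Taken together, these hunks collapse the variant-keyed cache, and the eviction path that cleared it (`MODEL_CACHE.clear()`, `gc.collect()`, `torch.cuda.empty_cache()`), into a lock-guarded singleton, which is also why the `gc` import goes away. In isolation the pattern looks like this minimal sketch, with a hypothetical `_load_weights` standing in for the real `hf_hub_download` + `GCTStream` setup:

```python
import threading
from typing import Any

MODEL_FILENAME = "lingbot-map.pt"
MODEL_CACHE: dict[str, Any] = {}
MODEL_CACHE_LOCK = threading.Lock()


def _load_weights(filename: str) -> dict[str, Any]:
    # Hypothetical stand-in for the download + model construction above.
    return {"weight_name": filename}


def _load_model_bundle() -> dict[str, Any]:
    # Everything happens under one lock: a caller either finds the cached
    # bundle or loads and stores it exactly once.
    with MODEL_CACHE_LOCK:
        cached = MODEL_CACHE.get("default")
        if cached is not None:
            return cached
        bundle = _load_weights(MODEL_FILENAME)
        MODEL_CACHE["default"] = bundle
        return bundle


# Repeated calls return the same object; the checkpoint is loaded once.
assert _load_model_bundle() is _load_model_bundle()
```

With a single checkpoint resident for the life of the process, there is nothing to evict, so the old clear-and-reload dance has no remaining purpose.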
```diff
@@ -287,15 +271,15 @@ def _prepare_for_visualization(predictions: dict[str, Any], images: torch.Tensor
     return vis_predictions
 
 
-def _estimate_gpu_duration(images: torch.Tensor, model_variant: str, num_scale_frames: int, keyframe_interval: int) -> int:
+def _estimate_gpu_duration(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> int:
     frame_count = int(getattr(images, "shape", [DEFAULT_MAX_FRAMES])[0])
-    del model_variant, num_scale_frames, keyframe_interval
+    del num_scale_frames, keyframe_interval
     return min(180, max(60, 24 + frame_count * 4))
 
 
 @spaces.GPU(duration=_estimate_gpu_duration)
-def _run_inference(images: torch.Tensor, model_variant: str, num_scale_frames: int, keyframe_interval: int) -> tuple[dict[str, Any], torch.Tensor, dict[str, Any]]:
-    bundle = _load_model_bundle(model_variant)
+def _run_inference(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> tuple[dict[str, Any], torch.Tensor, dict[str, Any]]:
+    bundle = _load_model_bundle()
     model = bundle["model"]
     device = bundle["device"]
     dtype = bundle["dtype"]
```
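Dropping `model_variant` leaves the ZeroGPU duration estimator a pure function of frame count: a 24-second base plus 4 seconds per frame, clamped to the 60-180 second range. A few spot checks of the formula from the hunk above:

```python
def estimate_gpu_duration(frame_count: int) -> int:
    # Same arithmetic as _estimate_gpu_duration in the hunk above.
    return min(180, max(60, 24 + frame_count * 4))


assert estimate_gpu_duration(2) == 60    # floor applies at 9 frames or fewer
assert estimate_gpu_duration(24) == 120  # the default frame cap
assert estimate_gpu_duration(64) == 180  # ceiling from 39 frames upward
```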
```diff
@@ -401,7 +385,6 @@ def _export_outputs(
     images_cpu: torch.Tensor,
     input_summary: dict[str, Any],
     runtime_summary: dict[str, Any],
-    model_variant: str,
     num_scale_frames: int,
     keyframe_interval: int,
     conf_percentile: float,
@@ -423,8 +406,8 @@
 
     points_kept, conf_threshold = _count_confident_points(vis_predictions, conf_percentile)
     summary = {
-        "model_variant": MODEL_LABELS[model_variant],
-        "model_filename": MODEL_FILENAMES[model_variant],
+        "model_variant": "Default",
+        "model_filename": MODEL_FILENAME,
         "frames_used": len(image_paths),
         "num_scale_frames": num_scale_frames,
         "keyframe_interval": keyframe_interval,
@@ -468,7 +451,6 @@ def _format_status(summary: dict[str, Any]) -> str:
 def reconstruct_scene(
     image_files: list[Any],
     video_file: Any,
-    model_variant: str,
     fps: int,
     max_frames: int,
     num_scale_frames: int,
@@ -490,7 +472,6 @@
     num_scale_frames = min(num_scale_frames, int(images.shape[0]))
     predictions, images_cpu, runtime_summary = _run_inference(
         images,
-        model_variant=model_variant,
         num_scale_frames=num_scale_frames,
         keyframe_interval=keyframe_interval,
     )
@@ -502,7 +483,6 @@
         images_cpu=images_cpu,
         input_summary=input_summary,
         runtime_summary=runtime_summary,
-        model_variant=model_variant,
         num_scale_frames=num_scale_frames,
         keyframe_interval=keyframe_interval,
         conf_percentile=conf_percentile,
@@ -575,11 +555,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
                 file_types=["video"],
                 type="filepath",
             )
-            model_variant = gr.Dropdown(
-                choices=[("Balanced", "balanced"), ("Long", "long"), ("Stage-1", "stage1")],
-                value="balanced",
-                label="Checkpoint",
-            )
             fps = gr.Slider(minimum=1, maximum=12, step=1, value=DEFAULT_FPS, label="Video sampling FPS")
             max_frames = gr.Slider(minimum=2, maximum=MAX_FRAMES_HARD_LIMIT, step=1, value=DEFAULT_MAX_FRAMES, label="Max frames")
             num_scale_frames = gr.Slider(minimum=1, maximum=8, step=1, value=DEFAULT_SCALE_FRAMES, label="Scale frames")
@@ -605,7 +580,6 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="green", secondary_hue=
         inputs=[
             image_files,
             video_file,
-            model_variant,
             fps,
             max_frames,
             num_scale_frames,
```
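Startup behavior is otherwise unchanged: the preload still runs only on the Space runtime (where `SPACE_ID` is set) and `LINGBOT_SPACE_SKIP_MODEL_LOAD=1` still opts out, so a local sanity check stays lazy by default. A minimal sketch of that gating, using a hypothetical `should_preload` helper (app.py inlines the same condition in `_eager_load_default_model`):

```python
import os

# Both switches appear verbatim in app.py above.
IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"


def should_preload() -> bool:
    # Preload exactly when running inside the Space and not opted out.
    return IS_SPACE_RUNTIME and not SKIP_EAGER_MODEL_LOAD
```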