dennny123 committed on
Commit
afd3356
·
verified ·
1 Parent(s): bafdd0f

Remove teaser image, startup note, and run summary accordion

Browse files
Files changed (1) hide show
  1. app.py +228 -23
app.py CHANGED
@@ -1,4 +1,6 @@
1
  import contextlib
 
 
2
  import json
3
  import os
4
  import shutil
@@ -15,6 +17,7 @@ import numpy as np
15
  import torch
16
  from huggingface_hub import hf_hub_download
17
  from PIL import Image, ImageDraw
 
18
 
19
  try:
20
  import spaces
@@ -55,6 +58,7 @@ DEFAULT_SCALE_FRAMES = 4
55
  DEFAULT_KEYFRAME_INTERVAL = 2
56
  DEFAULT_CONF_PERCENTILE = 50.0
57
  DEFAULT_CAMERA_ITERATIONS = 1
 
58
  IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
59
  SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
60
 
@@ -335,6 +339,14 @@ def _save_predictions_npz(predictions: dict[str, Any], output_path: Path) -> str
335
  return str(output_path)
336
 
337
 
 
 
 
 
 
 
 
 
338
  def _count_confident_points(vis_predictions: dict[str, Any], conf_percentile: float) -> tuple[int, float]:
339
  conf = vis_predictions.get("world_points_conf")
340
  if conf is None:
@@ -345,6 +357,179 @@ def _count_confident_points(vis_predictions: dict[str, Any], conf_percentile: fl
345
  return kept, float(threshold)
346
 
347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  def _zip_outputs(work_dir: Path, paths: list[Path], output_name: str) -> str:
349
  zip_path = work_dir / output_name
350
  with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
@@ -364,7 +549,7 @@ def _export_outputs(
364
  num_scale_frames: int,
365
  keyframe_interval: int,
366
  conf_percentile: float,
367
- ) -> tuple[str, str, dict[str, Any]]:
368
  vis_predictions = _prepare_for_visualization(predictions, images_cpu)
369
 
370
  glb_path = work_dir / "lingbot-map-reconstruction.glb"
@@ -377,6 +562,11 @@ def _export_outputs(
377
  )
378
  scene.export(glb_path)
379
 
 
 
 
 
 
380
  preview_path = Path(_make_preview_strip(images_cpu, work_dir / "preview.png"))
381
  npz_path = Path(_save_predictions_npz(predictions, work_dir / "predictions.npz"))
382
 
@@ -390,6 +580,7 @@ def _export_outputs(
390
  "confidence_percentile": conf_percentile,
391
  "confidence_threshold": round(conf_threshold, 4),
392
  "points_kept_for_glb": points_kept,
 
393
  "input_summary": input_summary,
394
  "runtime_summary": runtime_summary,
395
  }
@@ -399,10 +590,10 @@ def _export_outputs(
399
 
400
  artifact_path = _zip_outputs(
401
  work_dir,
402
- [glb_path, preview_path, npz_path, summary_path],
403
  output_name="lingbot-map-results.zip",
404
  )
405
- return str(glb_path), artifact_path, summary
406
 
407
 
408
  def _format_status(summary: dict[str, Any]) -> str:
@@ -416,6 +607,7 @@ def _format_status(summary: dict[str, Any]) -> str:
416
  f"- Runtime: `{runtime['runtime_seconds']}s` on `{runtime['device']}`",
417
  f"- GLB confidence percentile: `{summary['confidence_percentile']}`",
418
  f"- Points kept for GLB: `{summary['points_kept_for_glb']}`",
 
419
  ]
420
  if runtime.get("peak_memory_gb") is not None:
421
  lines.append(f"- Peak GPU memory: `{runtime['peak_memory_gb']} GB`")
@@ -446,7 +638,7 @@ def reconstruct_scene(
446
  keyframe_interval=keyframe_interval,
447
  )
448
 
449
- glb_path, artifact_path, summary = _export_outputs(
450
  work_dir=work_dir,
451
  image_paths=image_paths,
452
  predictions=predictions,
@@ -460,7 +652,7 @@ def reconstruct_scene(
460
 
461
  preview_path = str(work_dir / "preview.png")
462
  status = _format_status(summary)
463
- return glb_path, preview_path, artifact_path, summary, status
464
 
465
 
466
  def _build_startup_markdown() -> str:
@@ -479,6 +671,25 @@ css = """
479
  object-fit: cover !important;
480
  border-radius: 8px !important;
481
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  footer {display: none !important;}
483
  """
484
 
@@ -488,23 +699,12 @@ _eager_load_default_model()
488
 
489
  with gr.Blocks(title="LingBot 3D") as demo:
490
  with gr.Column(elem_id="container"):
491
- gr.Image(
492
- value=str(ROOT / "assets" / "teaser.png"),
493
- show_label=False,
494
- interactive=False,
495
- container=False,
496
- elem_classes=["teaser"],
497
- )
498
  gr.Markdown("# LingBot 3D")
499
  gr.Markdown(
500
  "Upload a short video clip and get back a navigable 3D scene. "
501
  "Powered by the LingBot-Map checkpoint, exported as a GLB plus a downloadable results bundle."
502
  )
503
 
504
- startup_md = _build_startup_markdown()
505
- if startup_md:
506
- gr.Markdown(startup_md)
507
-
508
  with gr.Row():
509
  with gr.Column():
510
  video_file = gr.Video(
@@ -514,12 +714,17 @@ with gr.Blocks(title="LingBot 3D") as demo:
514
  height=380,
515
  )
516
  with gr.Column():
517
- model_preview = gr.Model3D(
518
- label="3D preview",
519
- display_mode="point_cloud",
520
- clear_color=[1.0, 1.0, 1.0, 1.0],
521
- height=380,
522
  )
 
 
 
 
 
 
 
523
 
524
  run_button = gr.Button("Build 3D Scene", variant="primary")
525
  status_markdown = gr.Markdown()
@@ -544,8 +749,7 @@ with gr.Blocks(title="LingBot 3D") as demo:
544
  preview_image = gr.Image(label="Frame preview", interactive=False, height=200)
545
  artifact_file = gr.File(label="Download results bundle")
546
 
547
- with gr.Accordion("Run summary", open=False):
548
- summary_json = gr.JSON(label=None)
549
 
550
  run_button.click(
551
  fn=reconstruct_scene,
@@ -558,6 +762,7 @@ with gr.Blocks(title="LingBot 3D") as demo:
558
  conf_percentile,
559
  ],
560
  outputs=[
 
561
  model_preview,
562
  preview_image,
563
  artifact_file,
 
1
  import contextlib
2
+ import colorsys
3
+ import html
4
  import json
5
  import os
6
  import shutil
 
17
  import torch
18
  from huggingface_hub import hf_hub_download
19
  from PIL import Image, ImageDraw
20
+ from scipy.spatial.transform import Rotation
21
 
22
  try:
23
  import spaces
 
58
  DEFAULT_KEYFRAME_INTERVAL = 2
59
  DEFAULT_CONF_PERCENTILE = 50.0
60
  DEFAULT_CAMERA_ITERATIONS = 1
61
+ MAX_VISER_POINTS = 25_000
62
  IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
63
  SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"
64
 
 
339
  return str(output_path)
340
 
341
 
342
def _empty_viser_preview(message: str) -> str:
    """Return the placeholder HTML shown when no Viser preview can be displayed.

    Args:
        message: Plain-text explanation to show; it is HTML-escaped before
            being embedded, so it may safely contain markup characters.

    Returns:
        A ``<div class='viser-empty'>`` fragment wrapping the escaped message.
    """
    safe_message = html.escape(message)
    return f"<div class='viser-empty'><div>{safe_message}</div></div>"
348
+
349
+
350
  def _count_confident_points(vis_predictions: dict[str, Any], conf_percentile: float) -> tuple[int, float]:
351
  conf = vis_predictions.get("world_points_conf")
352
  if conf is None:
 
357
  return kept, float(threshold)
358
 
359
 
360
def _prepare_viser_point_cloud(
    vis_predictions: dict[str, Any],
    conf_percentile: float,
    max_points: int = MAX_VISER_POINTS,
) -> tuple[np.ndarray, np.ndarray, float]:
    """Select, colour and subsample the points shown in the static Viser preview.

    Points are read from ``world_points`` (confidence from
    ``world_points_conf``); when that key is absent, ``world_points_from_depth``
    and ``depth_conf`` are used instead. Colours come from ``images``, which is
    rescaled by 255 and clipped to ``uint8``.
    NOTE(review): the *255 rescale presumes image values in [0, 1] - confirm
    against ``_prepare_for_visualization``.

    Args:
        vis_predictions: Prediction dictionary containing the keys above.
        conf_percentile: Percentile of the confidence values used as the
            keep-threshold; values ``<= 0`` disable the percentile threshold.
        max_points: Upper bound on the number of returned points. Larger
            clouds are reduced by taking evenly spaced indices.

    Returns:
        ``(points, colors, threshold)`` where ``points`` is ``float32`` with
        shape ``(K, 3)``, ``colors`` is ``uint8`` with shape ``(K, 3)`` and
        ``threshold`` is the confidence threshold that was applied.

    Raises:
        ValueError: If no world points are present, or if the number of
            colours or confidence values does not match the number of points.
    """
    world_points = vis_predictions.get("world_points")
    conf = vis_predictions.get("world_points_conf")
    if world_points is None:
        world_points = vis_predictions.get("world_points_from_depth")
        conf = vis_predictions.get("depth_conf")
    if world_points is None:
        raise ValueError("Missing world point predictions.")

    # Convert before touching ``.ndim`` so list-like inputs are accepted too.
    images = np.asarray(vis_predictions["images"])
    if images.ndim == 4 and images.shape[1] == 3:
        # Channels-first layout: move the channel axis last so that each
        # flattened row below is one RGB triple.
        images = np.transpose(images, (0, 2, 3, 1))

    points = np.asarray(world_points).reshape(-1, 3)
    colors = (images.reshape(-1, 3) * 255).clip(0, 255).astype(np.uint8)
    if colors.shape[0] != points.shape[0]:
        raise ValueError(
            f"Image pixel count ({colors.shape[0]}) does not match "
            f"world point count ({points.shape[0]})."
        )

    if conf is None:
        conf_flat = np.ones(points.shape[0], dtype=np.float32)
        threshold = 0.0
    else:
        conf_flat = np.asarray(conf).reshape(-1)
        if conf_flat.shape[0] != points.shape[0]:
            raise ValueError(
                f"Confidence count ({conf_flat.shape[0]}) does not match "
                f"world point count ({points.shape[0]})."
            )
        # An empty confidence array has no percentile; fall back to 0.0.
        use_percentile = conf_percentile > 0 and conf_flat.size > 0
        threshold = np.percentile(conf_flat, conf_percentile) if use_percentile else 0.0

    # Drop low-confidence points and any point with a NaN/inf coordinate:
    # non-finite coordinates would poison the extent and camera placement
    # computed from this cloud downstream.
    mask = (conf_flat >= threshold) & (conf_flat > 1e-5) & np.isfinite(points).all(axis=1)
    points = points[mask]
    colors = colors[mask]

    if points.shape[0] == 0:
        return points.astype(np.float32), colors, float(threshold)

    if points.shape[0] > max_points:
        # Deterministic, evenly spaced subsampling keeps the preview light.
        keep_indices = np.linspace(0, points.shape[0] - 1, num=max_points, dtype=np.int64)
        points = points[keep_indices]
        colors = colors[keep_indices]

    return points.astype(np.float32), colors, float(threshold)
400
+
401
+
402
def _add_viser_cameras(
    server: Any,
    vis_predictions: dict[str, Any],
    scene_extent: float,
) -> list[np.ndarray]:
    """Add one wireframe camera frustum per predicted pose to the Viser scene.

    Each entry of ``vis_predictions["extrinsic"]`` is treated as a
    world-to-camera transform and inverted to obtain the camera pose in world
    space. Both ``3x4`` and homogeneous ``4x4`` matrices are accepted; only the
    top three rows are used.

    Args:
        server: Object exposing ``server.scene.add_camera_frustum(...)``.
        vis_predictions: Prediction dictionary with ``extrinsic``,
            ``intrinsic`` and ``images`` entries. If any of the three is
            missing, no frustum is added.
        scene_extent: Scene size used to scale the frustums (5% of the
            extent, but never smaller than 0.05).

    Returns:
        The world-space position of every camera that was added, in input
        order; an empty list when required predictions are missing.
    """
    extrinsics = vis_predictions.get("extrinsic")
    intrinsics = vis_predictions.get("intrinsic")
    images = vis_predictions.get("images")
    if extrinsics is None or intrinsics is None or images is None:
        return []

    extrinsics = np.asarray(extrinsics)
    intrinsics = np.asarray(intrinsics)
    images = np.asarray(images)

    # Images may be channels-first (N, 3, H, W) or channels-last (N, H, W, 3).
    if images.ndim == 4 and images.shape[1] == 3:
        _, _, image_height, image_width = images.shape
    else:
        _, image_height, image_width, _ = images.shape

    # These values are identical for every camera, so compute them once.
    aspect = float(max(image_width / max(image_height, 1), 1e-3))
    frustum_scale = max(scene_extent * 0.05, 0.05)
    hue_denominator = max(len(extrinsics), 1)

    camera_positions: list[np.ndarray] = []

    for idx, world_to_camera_rows in enumerate(extrinsics):
        world_to_camera = np.eye(4, dtype=np.float32)
        # Slice so that 4x4 homogeneous inputs work as well as 3x4 ones.
        world_to_camera[:3, :4] = world_to_camera_rows[:3, :4]
        camera_to_world = np.linalg.inv(world_to_camera)
        camera_positions.append(camera_to_world[:3, 3].copy())

        # Vertical field of view from the focal length in pixels, clamped
        # away from the degenerate 0 and pi extremes.
        intrinsic = intrinsics[idx]
        fy = float(max(intrinsic[1, 1], 1e-6))
        fov = float(np.clip(2 * np.arctan2(image_height / 2.0, fy), 0.1, np.pi - 0.1))

        # SciPy returns quaternions scalar-last (x, y, z, w); reorder to the
        # (w, x, y, z) layout passed as ``wxyz``.
        quat_xyzw = Rotation.from_matrix(camera_to_world[:3, :3]).as_quat()
        wxyz = (
            float(quat_xyzw[3]),
            float(quat_xyzw[0]),
            float(quat_xyzw[1]),
            float(quat_xyzw[2]),
        )
        # Spread hues over the sequence so frame order is visible.
        color = tuple(
            int(channel * 255)
            for channel in colorsys.hsv_to_rgb(idx / hue_denominator, 0.65, 1.0)
        )

        server.scene.add_camera_frustum(
            f"/cameras/camera_{idx:02d}",
            fov=fov,
            aspect=aspect,
            scale=frustum_scale,
            color=color,
            wxyz=wxyz,
            position=tuple(float(x) for x in camera_to_world[:3, 3]),
            variant="wireframe",
        )

    return camera_positions
460
+
461
+
462
def _build_viser_preview(
    vis_predictions: dict[str, Any],
    output_path: Path,
    conf_percentile: float,
) -> tuple[str, str | None, int]:
    """Render the reconstruction as a self-contained static Viser HTML preview.

    This is best-effort: every failure path returns placeholder HTML instead
    of raising, so a preview problem does not abort the caller.

    Args:
        vis_predictions: Prediction dictionary passed on to
            ``_prepare_viser_point_cloud`` and ``_add_viser_cameras``.
        output_path: File the generated HTML document is written to.
        conf_percentile: Confidence percentile used to filter the points.

    Returns:
        ``(preview_html, html_path, point_count)``. On success ``preview_html``
        is an ``<iframe>`` embedding the document via ``srcdoc``, ``html_path``
        is ``str(output_path)`` and ``point_count`` is the number of points
        shown. On failure ``preview_html`` is a placeholder message,
        ``html_path`` is ``None`` and ``point_count`` is ``0``.
    """
    try:
        import viser
    except ImportError:
        # ImportError also covers an installed-but-broken ``viser`` (for
        # example a missing transitive dependency), which should degrade to
        # the placeholder just like an absent package.
        return (
            _empty_viser_preview(
                "Static Viser preview is unavailable because `viser` is not installed or could not be imported."
            ),
            None,
            0,
        )

    server = None
    try:
        points, colors, _ = _prepare_viser_point_cloud(vis_predictions, conf_percentile)
        if points.shape[0] == 0:
            return _empty_viser_preview("No confident points were available for the static Viser preview."), None, 0

        # Port 0 is passed so no fixed port is requested; the server is only
        # needed long enough to serialise the scene to HTML.
        server = viser.ViserServer(port=0, verbose=False)
        server.scene.set_up_direction("+z")

        if hasattr(server.scene, "world_axes"):
            server.scene.world_axes.visible = False

        # Use the 5th-95th percentile box so outliers do not inflate the
        # extent that drives point size, frustum size and camera distance.
        lower = np.percentile(points, 5, axis=0)
        upper = np.percentile(points, 95, axis=0)
        scene_extent = float(np.linalg.norm(upper - lower))
        scene_extent = max(scene_extent, 1e-3)
        scene_center = points.mean(axis=0)

        server.scene.add_point_cloud(
            "/reconstruction",
            points=points,
            colors=colors,
            point_size=max(scene_extent * 0.0025, 0.003),
        )

        camera_positions = _add_viser_cameras(server, vis_predictions, scene_extent)
        if camera_positions:
            # Aim halfway between the point cloud and the camera path so both
            # are in view.
            camera_center = np.mean(np.asarray(camera_positions), axis=0)
            scene_center = (scene_center + camera_center) / 2.0

        server.initial_camera.look_at = tuple(float(x) for x in scene_center)
        server.initial_camera.position = tuple(
            float(x)
            for x in scene_center + np.array([scene_extent, scene_extent, max(scene_extent * 0.65, 0.25)])
        )
        server.initial_camera.up = (0.0, 0.0, 1.0)

        html_doc = server.scene.as_html(dark_mode=True)
        output_path.write_text(html_doc, encoding="utf-8")
        # The document is inlined through ``srcdoc``, so it must be escaped
        # for use inside a double-quoted attribute value.
        iframe_html = (
            "<iframe class='viser-frame' "
            "sandbox='allow-scripts allow-same-origin allow-downloads' "
            f"srcdoc=\"{html.escape(html_doc, quote=True)}\"></iframe>"
        )
        return iframe_html, str(output_path), int(points.shape[0])
    except Exception as exc:
        # Deliberate best-effort boundary: report the problem in the UI
        # placeholder rather than failing the caller.
        return (
            _empty_viser_preview(f"Static Viser preview could not be created for this run: {exc}"),
            None,
            0,
        )
    finally:
        # Always shut the temporary server down; failures while stopping are
        # intentionally ignored.
        if server is not None and hasattr(server, "stop"):
            with contextlib.suppress(Exception):
                server.stop()
531
+
532
+
533
  def _zip_outputs(work_dir: Path, paths: list[Path], output_name: str) -> str:
534
  zip_path = work_dir / output_name
535
  with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
 
549
  num_scale_frames: int,
550
  keyframe_interval: int,
551
  conf_percentile: float,
552
+ ) -> tuple[str, str, str, dict[str, Any]]:
553
  vis_predictions = _prepare_for_visualization(predictions, images_cpu)
554
 
555
  glb_path = work_dir / "lingbot-map-reconstruction.glb"
 
562
  )
563
  scene.export(glb_path)
564
 
565
+ viser_preview_html, viser_preview_path, viser_points = _build_viser_preview(
566
+ vis_predictions,
567
+ work_dir / "viser-preview.html",
568
+ conf_percentile=conf_percentile,
569
+ )
570
  preview_path = Path(_make_preview_strip(images_cpu, work_dir / "preview.png"))
571
  npz_path = Path(_save_predictions_npz(predictions, work_dir / "predictions.npz"))
572
 
 
580
  "confidence_percentile": conf_percentile,
581
  "confidence_threshold": round(conf_threshold, 4),
582
  "points_kept_for_glb": points_kept,
583
+ "points_used_for_viser_preview": viser_points,
584
  "input_summary": input_summary,
585
  "runtime_summary": runtime_summary,
586
  }
 
590
 
591
  artifact_path = _zip_outputs(
592
  work_dir,
593
+ [glb_path, preview_path, npz_path, summary_path, Path(viser_preview_path) if viser_preview_path else work_dir / "__missing__"],
594
  output_name="lingbot-map-results.zip",
595
  )
596
+ return str(glb_path), viser_preview_html, artifact_path, summary
597
 
598
 
599
  def _format_status(summary: dict[str, Any]) -> str:
 
607
  f"- Runtime: `{runtime['runtime_seconds']}s` on `{runtime['device']}`",
608
  f"- GLB confidence percentile: `{summary['confidence_percentile']}`",
609
  f"- Points kept for GLB: `{summary['points_kept_for_glb']}`",
610
+ f"- Points used for static Viser preview: `{summary['points_used_for_viser_preview']}`",
611
  ]
612
  if runtime.get("peak_memory_gb") is not None:
613
  lines.append(f"- Peak GPU memory: `{runtime['peak_memory_gb']} GB`")
 
638
  keyframe_interval=keyframe_interval,
639
  )
640
 
641
+ glb_path, viser_preview_html, artifact_path, summary = _export_outputs(
642
  work_dir=work_dir,
643
  image_paths=image_paths,
644
  predictions=predictions,
 
652
 
653
  preview_path = str(work_dir / "preview.png")
654
  status = _format_status(summary)
655
+ return viser_preview_html, glb_path, preview_path, artifact_path, summary, status
656
 
657
 
658
  def _build_startup_markdown() -> str:
 
671
  object-fit: cover !important;
672
  border-radius: 8px !important;
673
  }
674
+ .viser-frame {
675
+ width: 100%;
676
+ height: 380px;
677
+ border: 1px solid #d7dce5;
678
+ border-radius: 12px;
679
+ background: #0f1720;
680
+ }
681
+ .viser-empty {
682
+ min-height: 380px;
683
+ border: 1px dashed #c9d1dd;
684
+ border-radius: 12px;
685
+ display: flex;
686
+ align-items: center;
687
+ justify-content: center;
688
+ padding: 24px;
689
+ text-align: center;
690
+ background: linear-gradient(180deg, #f8fafc 0%, #eef2f7 100%);
691
+ color: #334155;
692
+ }
693
  footer {display: none !important;}
694
  """
695
 
 
699
 
700
  with gr.Blocks(title="LingBot 3D") as demo:
701
  with gr.Column(elem_id="container"):
 
 
 
 
 
 
 
702
  gr.Markdown("# LingBot 3D")
703
  gr.Markdown(
704
  "Upload a short video clip and get back a navigable 3D scene. "
705
  "Powered by the LingBot-Map checkpoint, exported as a GLB plus a downloadable results bundle."
706
  )
707
 
 
 
 
 
708
  with gr.Row():
709
  with gr.Column():
710
  video_file = gr.Video(
 
714
  height=380,
715
  )
716
  with gr.Column():
717
+ gr.Markdown("### Static Viser Preview")
718
+ viser_preview = gr.HTML(
719
+ value=_empty_viser_preview("Run a reconstruction to load the static Viser preview."),
 
 
720
  )
721
+ with gr.Accordion("Fallback GLB preview", open=False):
722
+ model_preview = gr.Model3D(
723
+ label="GLB preview",
724
+ display_mode="point_cloud",
725
+ clear_color=[1.0, 1.0, 1.0, 1.0],
726
+ height=380,
727
+ )
728
 
729
  run_button = gr.Button("Build 3D Scene", variant="primary")
730
  status_markdown = gr.Markdown()
 
749
  preview_image = gr.Image(label="Frame preview", interactive=False, height=200)
750
  artifact_file = gr.File(label="Download results bundle")
751
 
752
+ summary_json = gr.JSON(visible=False)
 
753
 
754
  run_button.click(
755
  fn=reconstruct_scene,
 
762
  conf_percentile,
763
  ],
764
  outputs=[
765
+ viser_preview,
766
  model_preview,
767
  preview_image,
768
  artifact_file,