lingbot-3d-ZERO

Running on Zero

App Files Files Community

lingbot-3d-ZERO / app.py

Fifthoply

Update app.py

020b103 verified 8 days ago

raw

history blame contribute delete

26.8 kB

	import contextlib
	import colorsys
	import html
	import json
	import os
	import shutil
	import tempfile
	import threading
	import time
	import zipfile
	from pathlib import Path
	from typing import Any

	import cv2
	import gradio as gr
	import numpy as np
	import torch
	from huggingface_hub import hf_hub_download
	from PIL import Image, ImageDraw
	from scipy.spatial.transform import Rotation

	try:
	    import spaces
	except ImportError:
	    # The `spaces` package is not importable here; this shim keeps
	    # `@spaces.GPU` usable as a no-op decorator so the module still loads.
	    class _SpacesShim:
	        """No-op stand-in that exposes the `GPU` decorator used in this file."""

	        @staticmethod
	        def GPU(*decorator_args, **decorator_kwargs):
	            """Return the decorated function unchanged.

	            Works for both spellings: bare (`@spaces.GPU`) and parameterised
	            (`@spaces.GPU(duration=420)`). The star-args are required: without
	            them the keyword-only call form raises TypeError.
	            """
	            is_bare_use = (
	                len(decorator_args) == 1
	                and callable(decorator_args[0])
	                and not decorator_kwargs
	            )
	            if is_bare_use:
	                return decorator_args[0]

	            def _wrap(func):
	                return func

	            return _wrap

	    spaces = _SpacesShim()

	from lingbot_map.models.gct_stream import GCTStream
	from lingbot_map.utils.geometry import closed_form_inverse_se3_general
	from lingbot_map.utils.load_fn import load_and_preprocess_images
	from lingbot_map.utils.pose_enc import pose_encoding_to_extri_intri
	from lingbot_map.vis.glb_export import predictions_to_glb


	# Directory containing this file; run outputs live in a sibling folder that
	# is created at import time.
	ROOT = Path(__file__).resolve().parent
	OUTPUT_ROOT = ROOT / "app_output"
	OUTPUT_ROOT.mkdir(exist_ok=True)

	# Hub repository and checkpoint filename passed to `hf_hub_download`.
	HF_MODEL_REPO = "robbyant/lingbot-map"
	MODEL_FILENAME = "lingbot-map.pt"

	# IMAGE_SIZE / PATCH_SIZE are passed both to the model constructor and to
	# the image preprocessing call, so the two stay in agreement.
	IMAGE_SIZE = 518
	PATCH_SIZE = 14
	# Initial values for the UI sliders.
	DEFAULT_FPS = 8
	DEFAULT_MAX_FRAMES = 24
	# Upper clamp applied to the requested frame count in `reconstruct_scene`.
	MAX_FRAMES_HARD_LIMIT = 24
	DEFAULT_SCALE_FRAMES = 4
	DEFAULT_KEYFRAME_INTERVAL = 2
	DEFAULT_CONF_PERCENTILE = 50.0
	DEFAULT_CAMERA_ITERATIONS = 1
	# Default cap on the number of points kept for the static Viser preview.
	MAX_VISER_POINTS = 25_000
	# True when the SPACE_ID environment variable is set to a non-empty value.
	IS_SPACE_RUNTIME = bool(os.getenv("SPACE_ID"))
	# Set LINGBOT_SPACE_SKIP_MODEL_LOAD=1 to skip the startup preload.
	SKIP_EAGER_MODEL_LOAD = os.getenv("LINGBOT_SPACE_SKIP_MODEL_LOAD") == "1"

	# Process-wide model cache (single "default" entry) and the lock guarding it.
	MODEL_CACHE: dict[str, Any] = {}
	MODEL_CACHE_LOCK = threading.Lock()
	# Human-readable notes recorded by the startup preload.
	STARTUP_NOTES: list[str] = []


	def _resolve_path(file_obj: Any) -> str:
	if file_obj is None:
	return ""
	if isinstance(file_obj, str):
	return file_obj
	return getattr(file_obj, "name", "")


	def _cleanup_old_runs(keep_last: int = 8) -> None:
	    """Delete all but the `keep_last` most recently modified run directories.

	    Only directories directly under `OUTPUT_ROOT` are considered. Removal is
	    best-effort: errors while deleting a directory are ignored.

	    Args:
	        keep_last: Number of newest directories to keep. Zero or a negative
	            value keeps nothing.
	    """
	    run_dirs = sorted(
	        (entry for entry in OUTPUT_ROOT.iterdir() if entry.is_dir()),
	        key=lambda entry: entry.stat().st_mtime,
	    )
	    # `run_dirs[:-0]` is an empty slice and a negative count would slice from
	    # the wrong end, so non-positive values are handled explicitly.
	    stale_dirs = run_dirs[:-keep_last] if keep_last > 0 else run_dirs
	    for stale_dir in stale_dirs:
	        shutil.rmtree(stale_dir, ignore_errors=True)


	def _pick_runtime_device() -> torch.device:
	try:
	torch.empty(1, device="cuda")
	return torch.device("cuda")
	except Exception:
	return torch.device("cpu")


	def _load_model_bundle() -> dict[str, Any]:
	    """Return the cached model bundle, building it on first use.

	    The bundle is a dict holding the model, its device and inference dtype,
	    the checkpoint filename and local path, and the number of missing and
	    unexpected state-dict keys reported by the non-strict load.

	    The cache lock is held for the whole load so that concurrent callers
	    cannot download and instantiate the model twice.
	    """
	    with MODEL_CACHE_LOCK:
	        cached = MODEL_CACHE.get("default")
	        if cached is not None:
	            return cached

	        device = _pick_runtime_device()
	        weight_name = MODEL_FILENAME
	        weight_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=weight_name)

	        model = GCTStream(
	            img_size=IMAGE_SIZE,
	            patch_size=PATCH_SIZE,
	            enable_3d_rope=True,
	            max_frame_num=1024,
	            kv_cache_sliding_window=64,
	            kv_cache_scale_frames=8,
	            kv_cache_cross_frame_special=True,
	            kv_cache_include_scale_frames=True,
	            use_sdpa=True,
	            camera_num_iterations=DEFAULT_CAMERA_ITERATIONS,
	        )

	        # SECURITY: `weights_only=False` lets the checkpoint unpickle arbitrary
	        # objects, i.e. execute code from the downloaded file. Keep
	        # HF_MODEL_REPO pointed at a trusted repository; consider
	        # `weights_only=True` if the checkpoint is a plain tensor state dict.
	        checkpoint = torch.load(weight_path, map_location="cpu", weights_only=False)
	        # Accept either a wrapped checkpoint ({"model": state_dict}) or a bare
	        # state dict.
	        state_dict = checkpoint.get("model", checkpoint)
	        missing, unexpected = model.load_state_dict(state_dict, strict=False)

	        model = model.to(device).eval()
	        inference_dtype = torch.bfloat16 if device.type == "cuda" else torch.float32
	        # On CUDA only the aggregator sub-module (when present) is cast to the
	        # reduced-precision dtype.
	        if device.type == "cuda" and getattr(model, "aggregator", None) is not None:
	            model.aggregator = model.aggregator.to(dtype=inference_dtype)

	        bundle = {
	            "model": model,
	            "device": device,
	            "dtype": inference_dtype,
	            "weight_name": weight_name,
	            "weight_path": str(weight_path),
	            "missing_keys": len(missing),
	            "unexpected_keys": len(unexpected),
	        }
	        MODEL_CACHE["default"] = bundle
	        return bundle


	def _eager_load_default_model() -> None:
	    """Preload the model at startup and record the outcome in STARTUP_NOTES.

	    Does nothing unless running as a Space and the skip flag is unset. A
	    failed preload is recorded as a note rather than raised.
	    """
	    should_preload = IS_SPACE_RUNTIME and not SKIP_EAGER_MODEL_LOAD
	    if not should_preload:
	        return

	    try:
	        bundle = _load_model_bundle()
	        note = f"Startup preload complete on `{bundle['device']}` with `{bundle['weight_name']}`."
	    except Exception as exc:
	        note = f"Startup preload failed: {exc}"
	    STARTUP_NOTES.append(note)


	def _extract_video_frames(video_file: str, frames_dir: Path, fps: int, max_frames: int) -> tuple[list[str], dict[str, Any]]:
	    """Sample frames from a video and write them as numbered JPEG files.

	    Every `interval`-th decoded frame is saved, where `interval` is the
	    source frame rate divided by the requested `fps` (rounded, at least 1).
	    Reading stops at the end of the video or once `max_frames` frames have
	    been saved.

	    Args:
	        video_file: Path of the video to read.
	        frames_dir: Existing directory that receives `000000.jpg`, ...
	        fps: Requested sampling rate; values below 1 are treated as 1.
	        max_frames: Maximum number of frames to save.

	    Returns:
	        The saved file paths, and a summary dict with `source_fps`,
	        `sample_interval` and `original_frame_count`.

	    Raises:
	        ValueError: If the video cannot be opened.
	    """
	    capture = cv2.VideoCapture(video_file)
	    try:
	        if not capture.isOpened():
	            raise ValueError("Could not open the uploaded video.")

	        # Fall back to 30 fps when the container reports no frame rate.
	        source_fps = capture.get(cv2.CAP_PROP_FPS) or 30.0
	        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
	        interval = max(1, round(source_fps / max(fps, 1)))

	        saved_paths: list[str] = []
	        frame_idx = 0
	        while len(saved_paths) < max_frames:
	            ok, frame = capture.read()
	            if not ok:
	                break
	            if frame_idx % interval == 0:
	                output_path = frames_dir / f"{len(saved_paths):06d}.jpg"
	                # Only list frames that were actually written, so later stages
	                # never receive a path to a file that does not exist.
	                if cv2.imwrite(str(output_path), frame):
	                    saved_paths.append(str(output_path))
	            frame_idx += 1
	    finally:
	        # Release the capture on every exit path, including errors.
	        capture.release()

	    return saved_paths, {
	        "source_fps": round(source_fps, 2),
	        "sample_interval": interval,
	        "original_frame_count": total_frames,
	    }


	def _prepare_inputs(video_file: Any, fps: int, max_frames: int) -> tuple[torch.Tensor, list[str], Path, dict[str, Any]]:
	    """Create a run directory, extract frames and preprocess them.

	    Returns the preprocessed image tensor, the extracted frame paths, the
	    run's working directory and an input summary dict.

	    Raises:
	        ValueError: If no video was supplied or fewer than two frames were
	            extracted.
	    """
	    _cleanup_old_runs()
	    work_dir = Path(tempfile.mkdtemp(prefix="lingbot-map-", dir=OUTPUT_ROOT))
	    frames_dir = work_dir / "inputs"
	    frames_dir.mkdir(parents=True, exist_ok=True)

	    video_path = _resolve_path(video_file)
	    if not video_path:
	        raise ValueError("Upload one short video.")

	    image_paths, video_summary = _extract_video_frames(
	        video_path,
	        frames_dir,
	        fps=fps,
	        max_frames=max_frames,
	    )
	    input_summary = {"input_mode": "video", **video_summary}

	    if len(image_paths) < 2:
	        raise ValueError("Provide at least 2 frames. The Space is tuned for short multi-frame reconstructions.")

	    images = load_and_preprocess_images(
	        image_paths,
	        mode="crop",
	        image_size=IMAGE_SIZE,
	        patch_size=PATCH_SIZE,
	    )
	    return images, image_paths, work_dir, input_summary


	def _squeeze_single_batch(key: str, value: torch.Tensor) -> torch.Tensor:
	batched_dims = {
	"pose_enc": 3,
	"depth": 5,
	"depth_conf": 4,
	"world_points": 5,
	"world_points_conf": 4,
	"extrinsic": 4,
	"intrinsic": 4,
	"images": 5,
	}
	expected_ndim = batched_dims.get(key)
	if expected_ndim is None or value.ndim != expected_ndim or value.shape[0] != 1:
	return value
	return value[0]


	def _postprocess_predictions(predictions: dict[str, Any], images: torch.Tensor) -> tuple[dict[str, Any], torch.Tensor]:
	    """Derive camera matrices from the pose encoding and move results to CPU.

	    The pose encoding is decoded into extrinsic/intrinsic matrices; the
	    extrinsic is padded to a 4x4 transform, inverted, and its top 3x4 part
	    is stored under `"extrinsic"`. The `"pose_enc_list"` and `"images"`
	    entries are dropped, and every tensor entry is detached, moved to CPU
	    and stripped of a unit batch axis. `predictions` is modified in place.

	    Returns:
	        The updated predictions dict and a CPU copy of `images`.
	    """
	    extrinsic_3x4, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:])

	    # Pad [R|t] with a (0, 0, 0, 1) row so the transform can be inverted.
	    leading_shape = extrinsic_3x4.shape[:-2]
	    homogeneous = torch.zeros(
	        (*leading_shape, 4, 4),
	        device=extrinsic_3x4.device,
	        dtype=extrinsic_3x4.dtype,
	    )
	    homogeneous[..., :3, :4] = extrinsic_3x4
	    homogeneous[..., 3, 3] = 1.0
	    inverted = closed_form_inverse_se3_general(homogeneous)

	    predictions["extrinsic"] = inverted[..., :3, :4]
	    predictions["intrinsic"] = intrinsic
	    for transient_key in ("pose_enc_list", "images"):
	        predictions.pop(transient_key, None)

	    tensor_keys = [name for name, entry in predictions.items() if isinstance(entry, torch.Tensor)]
	    for name in tensor_keys:
	        predictions[name] = _squeeze_single_batch(name, predictions[name].detach().to("cpu"))

	    images_cpu = images.detach().to("cpu")
	    if torch.cuda.is_available():
	        torch.cuda.synchronize()
	    return predictions, images_cpu


	def _prepare_for_visualization(predictions: dict[str, Any], images: torch.Tensor) -> dict[str, Any]:
	    """Build a NumPy copy of the predictions for the exporters.

	    Tensor entries lose a unit batch axis and become NumPy arrays; other
	    entries are passed through. The input frames are added under `"images"`.
	    """

	    def _as_numpy(name, tensor):
	        return _squeeze_single_batch(name, tensor).detach().cpu().numpy()

	    vis_predictions = {
	        name: _as_numpy(name, entry) if isinstance(entry, torch.Tensor) else entry
	        for name, entry in predictions.items()
	    }
	    vis_predictions["images"] = _as_numpy("images", images)
	    return vis_predictions


	def _estimate_gpu_duration(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> int:
	    """Estimate a GPU time budget from the first dimension of `images`.

	    The estimate is `24 + 4 * count`, clamped to the range 60..180. The
	    other two parameters are accepted but deliberately ignored.
	    """
	    del num_scale_frames, keyframe_interval
	    shape = getattr(images, "shape", [DEFAULT_MAX_FRAMES])
	    frame_count = int(shape[0])
	    budget = 24 + frame_count * 4
	    return max(60, min(180, budget))


	@spaces.GPU(duration=420)
	def _run_inference(images: torch.Tensor, num_scale_frames: int, keyframe_interval: int) -> tuple[dict[str, Any], torch.Tensor, dict[str, Any]]:
	    """Run streaming inference on the frames and post-process the result.

	    Returns the post-processed predictions, the CPU image tensor, and a
	    runtime summary dict (timing, device, dtype, checkpoint details and,
	    on CUDA, peak allocated memory in GB).
	    """
	    bundle = _load_model_bundle()
	    model, device, dtype = bundle["model"], bundle["device"], bundle["dtype"]
	    on_cuda = device.type == "cuda"

	    if on_cuda:
	        # Start from a clean allocator state so the peak statistic below
	        # reflects this run only.
	        torch.cuda.empty_cache()
	        torch.cuda.reset_peak_memory_stats()

	    images = images.to(device)
	    if on_cuda:
	        autocast_context = torch.amp.autocast("cuda", dtype=dtype)
	    else:
	        autocast_context = contextlib.nullcontext()

	    started_at = time.time()
	    with torch.no_grad(), autocast_context:
	        predictions = model.inference_streaming(
	            images,
	            num_scale_frames=num_scale_frames,
	            keyframe_interval=keyframe_interval,
	            output_device=torch.device("cpu"),
	        )
	    inference_seconds = time.time() - started_at

	    # Post-processing uses the frames returned by the model, so the device
	    # copy can be dropped before the cache is cleared.
	    images_for_post = predictions["images"]
	    del images
	    if on_cuda:
	        torch.cuda.empty_cache()

	    predictions, images_cpu = _postprocess_predictions(predictions, images_for_post)

	    peak_memory_gb = round(torch.cuda.max_memory_allocated() / 1e9, 2) if on_cuda else None
	    runtime_summary = {
	        "runtime_seconds": round(inference_seconds, 2),
	        "device": str(device),
	        "dtype": str(dtype),
	        "weight_name": bundle["weight_name"],
	        "weight_path": bundle["weight_path"],
	        "missing_keys": bundle["missing_keys"],
	        "unexpected_keys": bundle["unexpected_keys"],
	        "peak_memory_gb": peak_memory_gb,
	    }
	    return predictions, images_cpu, runtime_summary


	def _make_preview_strip(images: torch.Tensor, output_path: Path) -> str:
	    """Save a banner image showing up to four evenly spaced input frames.

	    Args:
	        images: Frame tensor; a unit batch axis is removed first. Frames
	            are treated as channel-first with values scaled by 255
	            (assumes a 0..1 range — TODO confirm against the preprocessing).
	        output_path: Destination image file.

	    Returns:
	        `output_path` as a string.
	    """
	    frames = _squeeze_single_batch("images", images.detach().cpu())
	    count = frames.shape[0]
	    # The set removes duplicate indices when there are fewer than four frames.
	    indices = sorted({int(round(i)) for i in np.linspace(0, count - 1, num=min(4, count))})

	    tiles = []
	    for idx in indices:
	        rgb = (frames[idx].permute(1, 2, 0).numpy() * 255).clip(0, 255).astype(np.uint8)
	        tiles.append(Image.fromarray(rgb).resize((320, 220)))

	    banner = Image.new("RGB", (320 * len(tiles), 260), color=(245, 240, 228))
	    draw = ImageDraw.Draw(banner)
	    # A plain "|" separator: "\|" is an invalid escape sequence and would
	    # render a literal backslash in the banner.
	    draw.text((18, 14), f"LingBot-Map preview | {count} frames", fill=(31, 41, 55))
	    draw.text((18, 38), "ZeroGPU demo export", fill=(87, 96, 110))

	    # Tiles sit side by side below the 72-pixel header area.
	    x_offset = 0
	    for tile in tiles:
	        banner.paste(tile, (x_offset, 72))
	        x_offset += tile.width

	    banner.save(output_path)
	    return str(output_path)


	def _save_predictions_npz(predictions: dict[str, Any], output_path: Path) -> str:
	arrays = {}
	for key, value in predictions.items():
	if isinstance(value, torch.Tensor):
	arrays[key] = value.detach().cpu().numpy()
	np.savez_compressed(output_path, **arrays)
	return str(output_path)


	def _empty_viser_preview(message: str) -> str:
	return (
	"<div class='viser-empty'>"
	f"<div>{html.escape(message)}</div>"
	"</div>"
	)


	def _count_confident_points(vis_predictions: dict[str, Any], conf_percentile: float) -> tuple[int, float]:
	conf = vis_predictions.get("world_points_conf")
	if conf is None:
	return 0, 0.0
	conf_flat = conf.reshape(-1)
	threshold = np.percentile(conf_flat, conf_percentile) if conf_percentile > 0 else 0.0
	kept = int(((conf_flat >= threshold) & (conf_flat > 1e-5)).sum())
	return kept, float(threshold)


	def _prepare_viser_point_cloud(
	    vis_predictions: dict[str, Any],
	    conf_percentile: float,
	    max_points: int = MAX_VISER_POINTS,
	) -> tuple[np.ndarray, np.ndarray, float]:
	    """Select, colour and subsample the points shown in the Viser preview.

	    Points come from `"world_points"` (with `"world_points_conf"`), falling
	    back to `"world_points_from_depth"` (with `"depth_conf"`). Colours are
	    taken from `"images"`, scaled by 255 and clipped to uint8. Points below
	    the confidence percentile, or with confidence not above 1e-5, are
	    dropped; the rest are subsampled evenly by index to at most
	    `max_points`.

	    Returns:
	        float32 points of shape (N, 3), uint8 colours of shape (N, 3), and
	        the confidence threshold used.

	    Raises:
	        ValueError: If neither point source is present.
	    """
	    world_points = vis_predictions.get("world_points")
	    conf = vis_predictions.get("world_points_conf")
	    if world_points is None:
	        world_points = vis_predictions.get("world_points_from_depth")
	        conf = vis_predictions.get("depth_conf")
	        if world_points is None:
	            raise ValueError("Missing world point predictions.")

	    images = vis_predictions["images"]
	    is_channel_first = images.ndim == 4 and images.shape[1] == 3
	    if is_channel_first:
	        # Move channels last so colours flatten in the same order as points.
	        images = np.transpose(images, (0, 2, 3, 1))

	    points = np.asarray(world_points).reshape(-1, 3)
	    scaled_colors = np.asarray(images).reshape(-1, 3) * 255
	    colors = scaled_colors.clip(0, 255).astype(np.uint8)

	    if conf is not None:
	        conf_flat = np.asarray(conf).reshape(-1)
	        threshold = np.percentile(conf_flat, conf_percentile) if conf_percentile > 0 else 0.0
	    else:
	        # Without confidences every point counts as fully confident.
	        conf_flat = np.ones(points.shape[0], dtype=np.float32)
	        threshold = 0.0

	    keep_mask = (conf_flat >= threshold) & (conf_flat > 1e-5)
	    points, colors = points[keep_mask], colors[keep_mask]

	    if points.shape[0] == 0:
	        return points.astype(np.float32), colors, float(threshold)

	    if points.shape[0] > max_points:
	        picks = np.linspace(0, points.shape[0] - 1, num=max_points, dtype=np.int64)
	        points, colors = points[picks], colors[picks]

	    return points.astype(np.float32), colors, float(threshold)


	def _add_viser_cameras(
	server: Any,
	vis_predictions: dict[str, Any],
	scene_extent: float,
	) -> list[np.ndarray]:
	extrinsics = vis_predictions.get("extrinsic")
	intrinsics = vis_predictions.get("intrinsic")
	images = vis_predictions.get("images")
	if extrinsics is None or intrinsics is None or images is None:
	return []

	extrinsics = np.asarray(extrinsics)
	intrinsics = np.asarray(intrinsics)
	images = np.asarray(images)

	if images.ndim == 4 and images.shape[1] == 3:
	_, _, image_height, image_width = images.shape
	else:
	_, image_height, image_width, _ = images.shape

	camera_positions: list[np.ndarray] = []
	frustum_scale = max(scene_extent * 0.05, 0.05)

	for idx, world_to_camera_3x4 in enumerate(extrinsics):
	world_to_camera = np.eye(4, dtype=np.float32)
	world_to_camera[:3, :4] = world_to_camera_3x4
	camera_to_world = np.linalg.inv(world_to_camera)
	camera_positions.append(camera_to_world[:3, 3].copy())

	intrinsic = intrinsics[idx]
	fy = float(max(intrinsic[1, 1], 1e-6))
	fov = float(np.clip(2 * np.arctan2(image_height / 2.0, fy), 0.1, np.pi - 0.1))
	aspect = float(max(image_width / max(image_height, 1), 1e-3))

	quat_xyzw = Rotation.from_matrix(camera_to_world[:3, :3]).as_quat()
	wxyz = (
	float(quat_xyzw[3]),
	float(quat_xyzw[0]),
	float(quat_xyzw[1]),
	float(quat_xyzw[2]),
	)
	color = tuple(
	int(channel * 255)
	for channel in colorsys.hsv_to_rgb(idx / max(len(extrinsics), 1), 0.65, 1.0)
	)

	server.scene.add_camera_frustum(
	f"/cameras/camera_{idx:02d}",
	fov=fov,
	aspect=aspect,
	scale=frustum_scale,
	color=color,
	wxyz=wxyz,
	position=tuple(float(x) for x in camera_to_world[:3, 3]),
	variant="wireframe",
	)

	return camera_positions


	def _build_viser_preview(
	    vis_predictions: dict[str, Any],
	    output_path: Path,
	    conf_percentile: float,
	) -> tuple[str, str | None, int]:
	    """Render a static Viser scene to HTML and wrap it in an iframe.

	    Best-effort: a missing `viser` package, an empty point cloud, or any
	    error while building the scene yields placeholder HTML instead of an
	    exception.

	    Args:
	        vis_predictions: NumPy predictions including `"images"`.
	        output_path: File that receives the standalone HTML document.
	        conf_percentile: Confidence percentile used to filter points.

	    Returns:
	        The HTML to embed, the path of the written HTML file (`None` when no
	        preview was produced), and the number of points shown.
	    """
	    try:
	        import viser
	    except ModuleNotFoundError:
	        return (
	            _empty_viser_preview("Static Viser preview is unavailable because `viser` is not installed."),
	            None,
	            0,
	        )

	    server = None
	    try:
	        points, colors, _ = _prepare_viser_point_cloud(vis_predictions, conf_percentile)
	        if points.shape[0] == 0:
	            return _empty_viser_preview("No confident points were available for the static Viser preview."), None, 0

	        server = viser.ViserServer(port=0, verbose=False)
	        server.scene.set_up_direction("+z")

	        if hasattr(server.scene, "world_axes"):
	            server.scene.world_axes.visible = False

	        # Use the 5th..95th percentile box so outliers do not inflate the
	        # scene size used for point, frustum and camera placement.
	        lower = np.percentile(points, 5, axis=0)
	        upper = np.percentile(points, 95, axis=0)
	        scene_extent = float(np.linalg.norm(upper - lower))
	        scene_extent = max(scene_extent, 1e-3)
	        scene_center = points.mean(axis=0)

	        server.scene.add_point_cloud(
	            "/reconstruction",
	            points=points,
	            colors=colors,
	            point_size=max(scene_extent * 0.0025, 0.003),
	        )

	        camera_positions = _add_viser_cameras(server, vis_predictions, scene_extent)
	        if camera_positions:
	            # Aim between the point cloud and the camera path.
	            camera_center = np.mean(np.asarray(camera_positions), axis=0)
	            scene_center = (scene_center + camera_center) / 2.0

	        server.initial_camera.look_at = tuple(float(x) for x in scene_center)
	        server.initial_camera.position = tuple(
	            float(x)
	            for x in scene_center + np.array([scene_extent, scene_extent, max(scene_extent * 0.65, 0.25)])
	        )
	        server.initial_camera.up = (0.0, 0.0, 1.0)

	        html_doc = server.scene.as_html(dark_mode=True)
	        output_path.write_text(html_doc, encoding="utf-8")
	        # The document is embedded through `srcdoc`, so it must be escaped
	        # as an attribute value.
	        iframe_html = (
	            "<iframe class='viser-frame' "
	            "sandbox='allow-scripts allow-same-origin allow-downloads' "
	            f"srcdoc=\"{html.escape(html_doc, quote=True)}\"></iframe>"
	        )
	        return iframe_html, str(output_path), int(points.shape[0])
	    except Exception as exc:
	        # Deliberately broad: the preview is optional and must not fail the run.
	        return (
	            _empty_viser_preview(f"Static Viser preview could not be created for this run: {exc}"),
	            None,
	            0,
	        )
	    finally:
	        # Always shut the temporary server down, ignoring stop errors.
	        if server is not None and hasattr(server, "stop"):
	            with contextlib.suppress(Exception):
	                server.stop()

	def _zip_outputs(work_dir: Path, paths: list[Path], output_name: str) -> str:
	zip_path = work_dir / output_name
	with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zip_file:
	for path in paths:
	if path.exists():
	zip_file.write(path, arcname=path.name)
	return str(zip_path)


	def _export_outputs(
	    work_dir: Path,
	    image_paths: list[str],
	    predictions: dict[str, Any],
	    images_cpu: torch.Tensor,
	    input_summary: dict[str, Any],
	    runtime_summary: dict[str, Any],
	    num_scale_frames: int,
	    keyframe_interval: int,
	    conf_percentile: float,
	) -> tuple[str, str, str, dict[str, Any]]:
	    """Write every artifact of a run into `work_dir` and zip them.

	    Produces the GLB scene, the static Viser preview, the frame preview
	    strip, the predictions `.npz`, a `summary.json`, and a zip bundle of
	    those files.

	    Returns:
	        The GLB path, the Viser preview HTML, the zip bundle path and the
	        summary dict.
	    """
	    vis_predictions = _prepare_for_visualization(predictions, images_cpu)

	    glb_path = work_dir / "lingbot-map-reconstruction.glb"
	    predictions_to_glb(
	        vis_predictions,
	        conf_thres=conf_percentile,
	        show_cam=True,
	        target_dir=str(work_dir),
	        mask_sky=False,
	    ).export(glb_path)

	    viser_preview_html, viser_preview_path, viser_points = _build_viser_preview(
	        vis_predictions,
	        work_dir / "viser-preview.html",
	        conf_percentile=conf_percentile,
	    )
	    preview_path = Path(_make_preview_strip(images_cpu, work_dir / "preview.png"))
	    npz_path = Path(_save_predictions_npz(predictions, work_dir / "predictions.npz"))

	    points_kept, conf_threshold = _count_confident_points(vis_predictions, conf_percentile)
	    summary = {
	        "model_variant": "Default",
	        "model_filename": MODEL_FILENAME,
	        "frames_used": len(image_paths),
	        "num_scale_frames": num_scale_frames,
	        "keyframe_interval": keyframe_interval,
	        "confidence_percentile": conf_percentile,
	        "confidence_threshold": round(conf_threshold, 4),
	        "points_kept_for_glb": points_kept,
	        "points_used_for_viser_preview": viser_points,
	        "input_summary": input_summary,
	        "runtime_summary": runtime_summary,
	    }

	    summary_path = work_dir / "summary.json"
	    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")

	    # With no Viser file, a path that never exists stands in for it; the
	    # zip step skips members that are not on disk.
	    if viser_preview_path:
	        viser_member = Path(viser_preview_path)
	    else:
	        viser_member = work_dir / "__missing__"

	    artifact_path = _zip_outputs(
	        work_dir,
	        [glb_path, preview_path, npz_path, summary_path, viser_member],
	        output_name="lingbot-map-results.zip",
	    )
	    return str(glb_path), viser_preview_html, artifact_path, summary


	def _format_status(summary: dict[str, Any]) -> str:
	runtime = summary["runtime_summary"]
	input_summary = summary["input_summary"]
	lines = [
	"## Run Complete",
	f"- Model: `{summary['model_filename']}`",
	f"- Frames used: `{summary['frames_used']}`",
	f"- Input mode: `{input_summary['input_mode']}`",
	f"- Runtime: `{runtime['runtime_seconds']}s` on `{runtime['device']}`",
	f"- GLB confidence percentile: `{summary['confidence_percentile']}`",
	f"- Points kept for GLB: `{summary['points_kept_for_glb']}`",
	f"- Points used for static Viser preview: `{summary['points_used_for_viser_preview']}`",
	]
	if runtime.get("peak_memory_gb") is not None:
	lines.append(f"- Peak GPU memory: `{runtime['peak_memory_gb']} GB`")
	if input_summary.get("sample_interval"):
	lines.append(f"- Video sample interval: `every {input_summary['sample_interval']} frame(s)`")
	return "\n".join(lines)


	def reconstruct_scene(
	    video_file: Any,
	    fps: int,
	    max_frames: int,
	    num_scale_frames: int,
	    keyframe_interval: int,
	    conf_percentile: float,
	):
	    """Gradio callback: turn an uploaded video into the reconstruction outputs.

	    Slider values are coerced and clamped, frames are extracted and run
	    through the model, and all artifacts are exported.

	    Returns:
	        In the order of the click handler's outputs: Viser preview HTML,
	        GLB path, preview image path, results bundle path, summary dict and
	        the Markdown status text.
	    """
	    # The UI delivers numbers that may be floats; coerce and clamp them.
	    max_frames = max(2, min(int(max_frames), MAX_FRAMES_HARD_LIMIT))
	    num_scale_frames = max(1, int(num_scale_frames))
	    keyframe_interval = max(1, int(keyframe_interval))
	    conf_percentile = float(conf_percentile)

	    images, image_paths, work_dir, input_summary = _prepare_inputs(
	        video_file=video_file,
	        fps=int(fps),
	        max_frames=max_frames,
	    )

	    # Never ask for more scale frames than the first dimension of `images`.
	    available_frames = int(images.shape[0])
	    num_scale_frames = min(num_scale_frames, available_frames)

	    predictions, images_cpu, runtime_summary = _run_inference(
	        images,
	        num_scale_frames=num_scale_frames,
	        keyframe_interval=keyframe_interval,
	    )

	    glb_path, viser_preview_html, artifact_path, summary = _export_outputs(
	        work_dir=work_dir,
	        image_paths=image_paths,
	        predictions=predictions,
	        images_cpu=images_cpu,
	        input_summary=input_summary,
	        runtime_summary=runtime_summary,
	        num_scale_frames=num_scale_frames,
	        keyframe_interval=keyframe_interval,
	        conf_percentile=conf_percentile,
	    )

	    return (
	        viser_preview_html,
	        glb_path,
	        str(work_dir / "preview.png"),
	        artifact_path,
	        summary,
	        _format_status(summary),
	    )


	def _build_startup_markdown() -> str:
	    """Format the recorded startup notes as a Markdown bullet list.

	    Returns an empty string when no notes were recorded.
	    """
	    bullets = [f"- {note}" for note in STARTUP_NOTES]
	    return "\n".join(bullets) if bullets else ""


	# Stylesheet passed to `demo.launch`. It styles the `#container` column, the
	# `.viser-frame` iframe and `.viser-empty` placeholder emitted by the preview
	# helpers, and hides the page footer.
	css = """
	#container {
	max-width: 1200px;
	margin: 0 auto;
	}
	.teaser img {
	max-height: 260px !important;
	object-fit: cover !important;
	border-radius: 8px !important;
	}
	.viser-frame {
	width: 100%;
	height: 380px;
	border: 1px solid #d7dce5;
	border-radius: 12px;
	background: #0f1720;
	}
	.viser-empty {
	min-height: 380px;
	border: 1px dashed #c9d1dd;
	border-radius: 12px;
	display: flex;
	align-items: center;
	justify-content: center;
	padding: 24px;
	text-align: center;
	background: linear-gradient(180deg, #f8fafc 0%, #eef2f7 100%);
	color: #334155;
	}
	footer {display: none !important;}
	"""


	# Import-time preload; a no-op unless running as a Space with the skip flag
	# unset. Failures are recorded in STARTUP_NOTES instead of being raised.
	_eager_load_default_model()


	# Gradio UI definition.
	# NOTE(review): the nesting of the `with` blocks below is not recoverable
	# from this copy of the file (indentation was lost); confirm the intended
	# layout against the deployed Space before re-indenting.
	with gr.Blocks(title="LingBot 3D") as demo:
	with gr.Column(elem_id="container"):
	gr.Markdown("# LingBot 3D")
	gr.Markdown(
	"Upload a short video clip and get back a navigable 3D scene. "
	"Powered by the LingBot-Map checkpoint, exported as a GLB plus a downloadable results bundle."
	)

	# Input video next to the preview widgets.
	with gr.Row():
	with gr.Column():
	video_file = gr.Video(
	label="Input video",
	sources=["upload"],
	format="mp4",
	height=380,
	)
	with gr.Column():
	gr.Markdown("### Static Viser Preview")
	viser_preview = gr.HTML(
	value=_empty_viser_preview("Run a reconstruction to load the static Viser preview."),
	)
	with gr.Accordion("Fallback GLB preview", open=False):
	model_preview = gr.Model3D(
	label="GLB preview",
	display_mode="point_cloud",
	clear_color=[1.0, 1.0, 1.0, 1.0],
	height=380,
	)

	run_button = gr.Button("Build 3D Scene", variant="primary")
	status_markdown = gr.Markdown()

	# Slider defaults and limits come from the module-level constants.
	with gr.Accordion("Sampling & reconstruction settings", open=False):
	with gr.Row():
	fps = gr.Slider(minimum=1, maximum=12, step=1, value=DEFAULT_FPS, label="Sampling FPS")
	max_frames = gr.Slider(minimum=2, maximum=MAX_FRAMES_HARD_LIMIT, step=1, value=DEFAULT_MAX_FRAMES, label="Max frames")
	with gr.Row():
	num_scale_frames = gr.Slider(minimum=1, maximum=8, step=1, value=DEFAULT_SCALE_FRAMES, label="Scale frames")
	keyframe_interval = gr.Slider(minimum=1, maximum=8, step=1, value=DEFAULT_KEYFRAME_INTERVAL, label="Keyframe interval")
	conf_percentile = gr.Slider(
	minimum=0,
	maximum=90,
	step=5,
	value=DEFAULT_CONF_PERCENTILE,
	label="GLB confidence percentile",
	info="Higher = fewer, more confident points",
	)

	with gr.Row():
	preview_image = gr.Image(label="Frame preview", interactive=False, height=200)
	artifact_file = gr.File(label="Download results bundle")

	# Receives the summary dict but is not shown in the page.
	summary_json = gr.JSON(visible=False)

	# The `outputs` order must match the tuple returned by `reconstruct_scene`.
	run_button.click(
	fn=reconstruct_scene,
	inputs=[
	video_file,
	fps,
	max_frames,
	num_scale_frames,
	keyframe_interval,
	conf_percentile,
	],
	outputs=[
	viser_preview,
	model_preview,
	preview_image,
	artifact_file,
	summary_json,
	status_markdown,
	],
	show_progress="full",
	)

	# Enable the request queue with a default concurrency limit of 1 per event.
	demo.queue(default_concurrency_limit=1)


	# Launch the app with the custom stylesheet when run as a script.
	# NOTE(review): whether `launch` accepts `css` depends on the installed
	# Gradio version — confirm against the Space's pinned version.
	if __name__ == "__main__":
	demo.launch(css=css)