zeyuren2002 committed 9 days ago

Commit

ea3c0ad

verified ·

1 Parent(s): 87a49e9

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Depth-Anything-3/da3_streaming/loop_utils/__init__.py +15 -0
Depth-Anything-3/da3_streaming/loop_utils/alignment_torch.py +395 -0
Depth-Anything-3/da3_streaming/loop_utils/alignment_triton.py +543 -0
Depth-Anything-3/da3_streaming/loop_utils/config_utils.py +66 -0
Depth-Anything-3/da3_streaming/loop_utils/logging_utils.py +32 -0
Depth-Anything-3/da3_streaming/loop_utils/loop_detector.py +391 -0
Depth-Anything-3/da3_streaming/loop_utils/loop_refinement.py +268 -0
Depth-Anything-3/da3_streaming/loop_utils/sim3loop.py +399 -0
Depth-Anything-3/da3_streaming/loop_utils/sim3utils.py +1261 -0
Depth-Anything-3/da3_streaming/scripts/download_weights.sh +20 -0
Depth-Anything-3/docs/API.md +465 -0
Depth-Anything-3/docs/BENCHMARK.md +484 -0
Depth-Anything-3/docs/CLI.md +654 -0
Depth-Anything-3/docs/funcs/ref_view_strategy.md +183 -0
Depth-Anything-3/notebooks/da3.ipynb +0 -0
Depth-Anything-3/src/depth_anything_3/api.py +446 -0
Depth-Anything-3/src/depth_anything_3/app/css_and_html.py +594 -0
Depth-Anything-3/src/depth_anything_3/app/gradio_app.py +724 -0
Depth-Anything-3/src/depth_anything_3/app/modules/__init__.py +43 -0
Depth-Anything-3/src/depth_anything_3/app/modules/event_handlers.py +619 -0
Depth-Anything-3/src/depth_anything_3/app/modules/file_handlers.py +304 -0
Depth-Anything-3/src/depth_anything_3/app/modules/model_inference.py +260 -0
Depth-Anything-3/src/depth_anything_3/app/modules/ui_components.py +477 -0
Depth-Anything-3/src/depth_anything_3/app/modules/utils.py +207 -0
Depth-Anything-3/src/depth_anything_3/app/modules/visualization.py +434 -0
Depth-Anything-3/src/depth_anything_3/bench/__init__.py +45 -0
Depth-Anything-3/src/depth_anything_3/bench/configs/eval_bench.yaml +98 -0
Depth-Anything-3/src/depth_anything_3/bench/dataset.py +136 -0
Depth-Anything-3/src/depth_anything_3/bench/datasets/__init__.py +21 -0
Depth-Anything-3/src/depth_anything_3/bench/datasets/dtu.py +681 -0
Depth-Anything-3/src/depth_anything_3/bench/datasets/dtu64.py +182 -0
Depth-Anything-3/src/depth_anything_3/bench/datasets/eth3d.py +594 -0
Depth-Anything-3/src/depth_anything_3/bench/datasets/hiroom.py +440 -0
Depth-Anything-3/src/depth_anything_3/bench/datasets/scannetpp.py +591 -0
Depth-Anything-3/src/depth_anything_3/bench/datasets/sevenscenes.py +449 -0
Depth-Anything-3/src/depth_anything_3/bench/evaluator.py +752 -0
Depth-Anything-3/src/depth_anything_3/bench/print_metrics.py +618 -0
Depth-Anything-3/src/depth_anything_3/bench/registries.py +85 -0
Depth-Anything-3/src/depth_anything_3/bench/utils.py +525 -0
Depth-Anything-3/src/depth_anything_3/cfg.py +144 -0
Depth-Anything-3/src/depth_anything_3/cli.py +803 -0
Depth-Anything-3/src/depth_anything_3/configs/da3-base.yaml +45 -0
Depth-Anything-3/src/depth_anything_3/configs/da3-giant.yaml +71 -0
Depth-Anything-3/src/depth_anything_3/configs/da3-large.yaml +45 -0
Depth-Anything-3/src/depth_anything_3/configs/da3-small.yaml +45 -0
Depth-Anything-3/src/depth_anything_3/configs/da3metric-large.yaml +28 -0
Depth-Anything-3/src/depth_anything_3/configs/da3mono-large.yaml +28 -0
Depth-Anything-3/src/depth_anything_3/configs/da3nested-giant-large.yaml +10 -0
Depth-Anything-3/src/depth_anything_3/model/__init__.py +20 -0
Depth-Anything-3/src/depth_anything_3/model/cam_dec.py +45 -0

Depth-Anything-3/da3_streaming/loop_utils/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)

Depth-Anything-3/da3_streaming/loop_utils/alignment_torch.py ADDED Viewed

	@@ -0,0 +1,395 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import numpy as np
+import torch
+def weighted_estimate_se3_torch(source_points, target_points, weights):
+    """Compute weighted centroids and cross-covariance for a rigid (SE3) fit.
+
+    The numpy inputs are copied to the CUDA device as float32 and the weights
+    are normalised to sum to one before use.
+
+    Returns:
+        ``(scale, mu_src, mu_tgt, H)`` where ``scale`` is always ``1.0``,
+        ``mu_src`` / ``mu_tgt`` are the weighted centroids and ``H`` is the
+        weighted cross-covariance of the centred source and target points
+        (3x3 for (n, 3) inputs). When the weights sum to less than 1e-6 the
+        centroids and ``H`` are zero-filled and ``scale`` is still ``1.0``.
+    """
+    src_gpu = torch.from_numpy(source_points).cuda().float()
+    tgt_gpu = torch.from_numpy(target_points).cuda().float()
+    w_gpu = torch.from_numpy(weights).cuda().float()
+    weight_sum = torch.sum(w_gpu)
+    if weight_sum < 1e-6:
+        # Degenerate weights: nothing meaningful can be estimated.
+        mu_zero_src = np.zeros(3, dtype=np.float32)
+        mu_zero_tgt = np.zeros(3, dtype=np.float32)
+        return 1.0, mu_zero_src, mu_zero_tgt, np.zeros((3, 3), dtype=np.float32)
+    w_norm = w_gpu / weight_sum
+    w_column = w_norm[:, None]
+    mu_src = torch.sum(w_column * src_gpu, dim=0)
+    mu_tgt = torch.sum(w_column * tgt_gpu, dim=0)
+    # Scaling both sides by sqrt(w) makes the plain product a w-weighted sum.
+    sqrt_w_column = torch.sqrt(w_norm)[:, None]
+    src_scaled = (src_gpu - mu_src) * sqrt_w_column
+    tgt_scaled = (tgt_gpu - mu_tgt) * sqrt_w_column
+    H = src_scaled.T @ tgt_scaled
+    return 1.0, mu_src.cpu().numpy(), mu_tgt.cpu().numpy(), H.cpu().numpy()
+def weighted_estimate_sim3_torch(source_points, target_points, weights):
+    """Compute scale, weighted centroids and cross-covariance for a Sim3 fit.
+
+    The numpy inputs are copied to the CUDA device as float32 and the weights
+    are normalised to sum to one. The scale is the ratio of the weighted RMS
+    spread of the target to that of the source, and the source is pre-scaled
+    by it before the cross-covariance is formed.
+
+    Returns:
+        ``(s, mu_src, mu_tgt, H)`` as numpy values. When the weights sum to
+        less than 1e-6 the scale is ``-1.0`` (a failure sentinel) and the
+        remaining entries are zero-filled.
+    """
+    src_gpu = torch.from_numpy(source_points).cuda().float()
+    tgt_gpu = torch.from_numpy(target_points).cuda().float()
+    w_gpu = torch.from_numpy(weights).cuda().float()
+    weight_sum = torch.sum(w_gpu)
+    if weight_sum < 1e-6:
+        mu_zero_src = np.zeros(3, dtype=np.float32)
+        mu_zero_tgt = np.zeros(3, dtype=np.float32)
+        return -1.0, mu_zero_src, mu_zero_tgt, np.zeros((3, 3), dtype=np.float32)
+    w_norm = w_gpu / weight_sum
+    w_column = w_norm[:, None]
+    mu_src = torch.sum(w_column * src_gpu, dim=0)
+    mu_tgt = torch.sum(w_column * tgt_gpu, dim=0)
+    src_centered = src_gpu - mu_src
+    tgt_centered = tgt_gpu - mu_tgt
+    # Weighted RMS distance from the centroid of each cloud.
+    spread_src = torch.sqrt(torch.sum(w_norm * torch.sum(src_centered**2, dim=1)))
+    spread_tgt = torch.sqrt(torch.sum(w_norm * torch.sum(tgt_centered**2, dim=1)))
+    s = spread_tgt / spread_src
+    sqrt_w_column = torch.sqrt(w_norm)[:, None]
+    src_scaled = (s * src_centered) * sqrt_w_column
+    tgt_scaled = tgt_centered * sqrt_w_column
+    H = src_scaled.T @ tgt_scaled
+    return s.cpu().numpy(), mu_src.cpu().numpy(), mu_tgt.cpu().numpy(), H.cpu().numpy()
+def weighted_estimate_sim3_numba_torch(source_points, target_points, weights, align_method="sim3"):
+    """Estimate scale, rotation and translation mapping source onto target.
+
+    Args:
+        source_points, target_points: numpy point arrays forwarded to the
+            per-method estimator.
+        weights: per-point numpy weights.
+        align_method: ``"sim3"`` estimates a scale; ``"se3"`` and
+            ``"scale+se3"`` use a fixed scale of 1.0.
+
+    Returns:
+        ``(s, R, t)`` with ``R`` a float32 3x3 rotation and ``t`` a float32
+        translation such that ``tgt ~= s * R @ src + t``.
+
+    Raises:
+        ValueError: if ``align_method`` is not recognised, or if the estimator
+            signals (via a negative scale) that the total weight is too small.
+    """
+    if align_method == "sim3":
+        s, mu_src, mu_tgt, H = weighted_estimate_sim3_torch(source_points, target_points, weights)
+    elif align_method in ("se3", "scale+se3"):
+        s, mu_src, mu_tgt, H = weighted_estimate_se3_torch(source_points, target_points, weights)
+    else:
+        # Previously fell through and failed later with an UnboundLocalError on `s`.
+        raise ValueError(f"Unsupported align_method: {align_method!r}")
+    if s < 0:
+        raise ValueError("Total weight too small for meaningful estimation")
+    H_torch = torch.from_numpy(H).cuda().float()
+    U, _, Vt = torch.linalg.svd(H_torch)
+    U = U.cpu().numpy()
+    Vt = Vt.cpu().numpy()
+    R = Vt.T @ U.T
+    if np.linalg.det(R) < 0:
+        # Flip the last right-singular vector so R is a proper rotation, not a reflection.
+        Vt[2, :] *= -1
+        R = Vt.T @ U.T
+    mu_src = mu_src.astype(np.float32)
+    mu_tgt = mu_tgt.astype(np.float32)
+    R = R.astype(np.float32)
+    if align_method == "sim3":
+        t = mu_tgt - s * R @ mu_src
+    else:
+        t = mu_tgt - R @ mu_src
+    return s, R, t.astype(np.float32)
+def huber_loss_torch(r, delta):
+    """Evaluate the Huber loss element-wise on the CUDA device.
+
+    Values with ``|r| <= delta`` contribute ``0.5 * r**2``; larger ones
+    contribute ``delta * (|r| - 0.5 * delta)``. ``r`` is a numpy array and the
+    result is returned as a float32 numpy array.
+    """
+    values = torch.from_numpy(r).cuda().float()
+    threshold = torch.tensor(delta, device="cuda", dtype=torch.float32)
+    magnitude = torch.abs(values)
+    quadratic_branch = 0.5 * values**2
+    linear_branch = threshold * (magnitude - 0.5 * threshold)
+    loss = torch.where(magnitude <= threshold, quadratic_branch, linear_branch)
+    return loss.cpu().numpy()
+def compute_residuals_torch(tgt, transformed):
+    """Return the per-row Euclidean distance between two numpy point arrays.
+
+    Both arrays are copied to the CUDA device as float32; the result is a
+    numpy array with one distance per row.
+    """
+    target_gpu = torch.from_numpy(tgt).cuda().float()
+    moved_gpu = torch.from_numpy(transformed).cuda().float()
+    squared_diff = (target_gpu - moved_gpu) ** 2
+    distances = torch.sqrt(torch.sum(squared_diff, dim=1))
+    return distances.cpu().numpy()
+def compute_huber_weights_torch(residuals, delta):
+    """Return Huber re-weighting factors for a numpy array of residuals.
+
+    Residuals not exceeding ``delta`` get weight 1; larger ones get
+    ``delta / residual``. Computed on the CUDA device in float32.
+    """
+    res_gpu = torch.from_numpy(residuals).cuda().float()
+    threshold = torch.tensor(delta, device="cuda", dtype=torch.float32)
+    unit = torch.ones_like(res_gpu)
+    # The quotient is only selected where res_gpu > threshold, so a zero
+    # residual never contributes its (infinite) quotient to the output.
+    factors = torch.where(res_gpu > threshold, threshold / res_gpu, unit)
+    return factors.cpu().numpy()
+def apply_transformation_torch(src, s, R, t):
+    """Apply ``s * (src @ R.T) + t`` on the CUDA device.
+
+    ``src``, ``R`` and ``t`` are numpy arrays and ``s`` is a scalar; all are
+    converted to float32 on the GPU and the result comes back as numpy.
+    """
+    points_gpu = torch.from_numpy(src).cuda().float()
+    rotation_gpu = torch.from_numpy(R).cuda().float()
+    translation_gpu = torch.from_numpy(t).cuda().float()
+    scale_gpu = torch.tensor(s, device="cuda", dtype=torch.float32)
+    rotated = points_gpu @ rotation_gpu.T
+    moved = scale_gpu * rotated + translation_gpu
+    return moved.cpu().numpy()
+def robust_weighted_estimate_sim3_torch(
+    src, tgt, init_weights, delta=0.1, max_iters=20, tol=1e-9, align_method="sim3"
+):
+    """Robustly align ``src`` onto ``tgt`` by iteratively re-weighted estimation.
+
+    Starting from a weighted closed-form estimate, each iteration transforms
+    the source, measures residuals, down-weights large residuals with Huber
+    weights (threshold ``delta``) and re-estimates the transform.
+
+    Args:
+        src, tgt: numpy point arrays (cast to float32).
+        init_weights: per-point numpy weights (cast to float32).
+        delta: Huber threshold on the residual.
+        max_iters: maximum number of re-weighting iterations.
+        tol: tolerance on parameter change and on relative error change.
+        align_method: forwarded to ``weighted_estimate_sim3_numba_torch``.
+
+    Returns:
+        ``(s, R, t)``. When a convergence test fires, the estimate from before
+        that iteration's update is returned (the update is not applied).
+    """
+    src = src.astype(np.float32)
+    tgt = tgt.astype(np.float32)
+    init_weights = init_weights.astype(np.float32)
+    s, R, t = weighted_estimate_sim3_numba_torch(src, tgt, init_weights, align_method=align_method)
+    prev_error = float("inf")
+    # Loop-invariant rotation threshold (0.1 degree), hoisted out of the loop.
+    max_rot_step = np.radians(0.1)
+    # `iteration` rather than `iter`, so the builtin is not shadowed.
+    for iteration in range(max_iters):
+        transformed = apply_transformation_torch(src, s, R, t)
+        residuals = compute_residuals_torch(tgt, transformed)
+        print(f"Iter {iteration}: Mean residual = {np.mean(residuals):.6f}")
+        huber_weights = compute_huber_weights_torch(residuals, delta)
+        combined_weights = init_weights * huber_weights
+        # Small epsilon keeps the normalisation finite if every weight is zero.
+        combined_weights /= np.sum(combined_weights) + 1e-12
+        s_new, R_new, t_new = weighted_estimate_sim3_numba_torch(
+            src, tgt, combined_weights, align_method=align_method
+        )
+        param_change = np.abs(s_new - s) + np.linalg.norm(t_new - t)
+        # Angle of the relative rotation; the clamp guards arccos against round-off.
+        rot_angle = np.arccos(min(1.0, max(-1.0, (np.trace(R_new @ R.T) - 1) / 2)))
+        current_error = np.sum(huber_loss_torch(residuals, delta) * init_weights)
+        if (param_change < tol and rot_angle < max_rot_step) or (
+            abs(prev_error - current_error) < tol * prev_error
+        ):
+            print(f"Converged at iteration {iteration}")
+            break
+        s, R, t = s_new, R_new, t_new
+        prev_error = current_error
+    return s, R, t
+def apply_sim3_direct_torch(point_maps, s, R, t, device=None):
+    """Apply a Sim3 transform ``s * (p @ R.T) + t`` to a batch of point maps.
+
+    Args:
+        point_maps: (b, h, w, 3) numpy array or torch tensor.
+        s: scalar (Python, numpy or 0-d tensor) or (b,) array/tensor.
+        R: (3, 3) or (b, 3, 3) array/tensor.
+        t: (3,) or (b, 3) array/tensor.
+        device: optional device to run on; defaults to the device of
+            ``point_maps`` (CPU for numpy input).
+
+    Returns:
+        The transformed points with shape (b, h, w, 3): a numpy array when
+        ``point_maps`` is numpy, otherwise a tensor.
+    """
+    input_is_numpy = isinstance(point_maps, np.ndarray)
+    if input_is_numpy:
+        point_maps_torch = torch.from_numpy(point_maps).float()
+    else:
+        point_maps_torch = point_maps
+    if device is not None:
+        point_maps_torch = point_maps_torch.to(device)
+    # as_tensor accepts Python/numpy scalars, ndarrays and tensors alike, so a
+    # plain float scale no longer breaks the tensor path, and all parameters
+    # end up on the same dtype/device as the points.
+    common = {"dtype": point_maps_torch.dtype, "device": point_maps_torch.device}
+    R_torch = torch.as_tensor(R, **common)
+    t_torch = torch.as_tensor(t, **common)
+    s_torch = torch.as_tensor(s, **common)
+    b, h, w, _ = point_maps_torch.shape
+    points_flat = point_maps_torch.reshape(b, -1, 3)  # (b, h*w, 3)
+    # Broadcast shared (unbatched) parameters across the batch.
+    if R_torch.dim() == 2:
+        R_torch = R_torch.unsqueeze(0).expand(b, 3, 3)  # (b, 3, 3)
+    if t_torch.dim() == 1:
+        t_torch = t_torch.unsqueeze(0).expand(b, 3)  # (b, 3)
+    if s_torch.dim() == 0:
+        s_torch = s_torch.unsqueeze(0).expand(b)  # (b,)
+    rotated_flat = torch.bmm(points_flat, R_torch.transpose(1, 2))  # (b, h*w, 3)
+    transformed_flat = s_torch[:, None, None] * rotated_flat + t_torch[:, None, :]
+    transformed = transformed_flat.reshape(b, h, w, 3)
+    if input_is_numpy:
+        return transformed.cpu().numpy()
+    return transformed
+def depth_to_point_cloud_optimized_torch(depth, intrinsics, extrinsics, device=None):
+    """Back-project depth maps to world-space point maps.
+
+    Args:
+        depth: (N, H, W) numpy array or tensor of depths.
+        intrinsics: (N, 3, 3) camera matrices.
+        extrinsics: (N, 3, 4) or (N, 4, 4) world-to-camera matrices; only the
+            top 3x4 part is used and the result is inverted to camera-to-world.
+        device: optional device to run on; defaults to the device of ``depth``.
+
+    Returns:
+        (N, H, W, 3) world coordinates: numpy when ``depth`` is numpy,
+        otherwise a tensor.
+    """
+    input_is_numpy = isinstance(depth, np.ndarray)
+    if input_is_numpy:
+        depth_tensor = torch.from_numpy(depth).float()
+        intrinsics_tensor = torch.from_numpy(intrinsics).float()
+        extrinsics_tensor = torch.from_numpy(extrinsics).float()
+    else:
+        depth_tensor = depth
+        intrinsics_tensor = intrinsics
+        extrinsics_tensor = extrinsics
+    if device is not None:
+        depth_tensor = depth_tensor.to(device)
+    N, H, W = depth_tensor.shape
+    device = depth_tensor.device
+    dtype = depth_tensor.dtype
+    # Keep every operand on the depth map's device/dtype so that e.g. float64
+    # tensor input does not hit a dtype mismatch against float32 helpers.
+    intrinsics_tensor = intrinsics_tensor.to(device=device, dtype=dtype)
+    extrinsics_tensor = extrinsics_tensor.to(device=device, dtype=dtype)
+    u = torch.arange(W, device=device, dtype=dtype).view(1, 1, W)
+    v = torch.arange(H, device=device, dtype=dtype).view(1, H, 1)
+    u_expanded = u.expand(N, H, W)
+    v_expanded = v.expand(N, H, W)
+    ones = torch.ones((N, H, W), device=device, dtype=dtype)
+    pixel_coords = torch.stack([u_expanded, v_expanded, ones], dim=-1)  # [N, H, W, 3]
+    intrinsics_inv = torch.inverse(intrinsics_tensor)  # [N, 3, 3]
+    camera_coords = torch.einsum("nij,nhwj->nhwi", intrinsics_inv, pixel_coords)
+    camera_coords = camera_coords * depth_tensor.unsqueeze(-1)  # [N, H, W, 3]
+    camera_coords_homo = torch.cat(
+        [camera_coords, torch.ones((N, H, W, 1), device=device, dtype=dtype)], dim=-1
+    )
+    extrinsics_4x4 = torch.zeros(N, 4, 4, device=device, dtype=dtype)
+    # Slicing the source lets both (N, 3, 4) and (N, 4, 4) extrinsics through.
+    extrinsics_4x4[:, :3, :4] = extrinsics_tensor[:, :3, :4]
+    extrinsics_4x4[:, 3, 3] = 1.0
+    c2w = torch.inverse(extrinsics_4x4)  # [N, 4, 4]
+    world_coords_homo = torch.einsum("nij,nhwj->nhwi", c2w, camera_coords_homo)
+    point_cloud_world = world_coords_homo[..., :3]  # [N, H, W, 3]
+    if input_is_numpy:
+        return point_cloud_world.cpu().numpy()
+    return point_cloud_world
+def warmup_torch():
+    """Run each PyTorch alignment helper once on random data.
+
+    Every helper is called inside its own try/except so that one failure is
+    reported on stdout without preventing the remaining helpers from running.
+    """
+    print("\nWarming up PyTorch alignment...")
+    src = np.random.randn(100000, 3).astype(np.float32)
+    tgt = np.random.randn(100000, 3).astype(np.float32)
+    weights = np.ones(100000, dtype=np.float32)
+    residuals = np.abs(np.random.randn(100000).astype(np.float32))
+    R = np.eye(3, dtype=np.float32)
+    t = np.zeros(3, dtype=np.float32)
+    s = np.float32(1.0)
+    delta = np.float32(1.0)
+    # (helper name, zero-argument call) pairs, executed in this order.
+    warmup_calls = (
+        ("weighted_estimate_sim3_torch", lambda: weighted_estimate_sim3_torch(src, tgt, weights)),
+        ("weighted_estimate_se3_torch", lambda: weighted_estimate_se3_torch(src, tgt, weights)),
+        ("huber_loss_torch", lambda: huber_loss_torch(residuals, delta)),
+        ("compute_huber_weights_torch", lambda: compute_huber_weights_torch(residuals, delta)),
+        ("compute_residuals_torch", lambda: compute_residuals_torch(tgt, src)),
+        ("apply_transformation_torch", lambda: apply_transformation_torch(src, s, R, t)),
+    )
+    for helper_name, invoke in warmup_calls:
+        try:
+            _ = invoke()
+            print(f" - {helper_name} warmed up.")
+        except Exception as e:
+            print(f" ! Failed to warm up {helper_name}:", e)
+    print("PyTorch warm-up complete.\n")
+def print_gpu_memory():
+    """Print allocated and reserved CUDA memory in GiB; do nothing without CUDA."""
+    if not torch.cuda.is_available():
+        return
+    bytes_per_gib = 1024**3
+    allocated = torch.cuda.memory_allocated() / bytes_per_gib
+    cached = torch.cuda.memory_reserved() / bytes_per_gib
+    print(f"GPU Memory Allocated: {allocated:.2f} GB, Cached: {cached:.2f} GB")
+# Demo: recover a known similarity transform from a noisy synthetic point cloud.
+if __name__ == "__main__":
+    warmup_torch()
+    n_points = 7_500_000
+    src = np.random.randn(n_points, 3).astype(np.float32)
+    # Ground-truth transform: rotation about the z axis (cos ~ 0.866, sin = 0.5),
+    # a translation and a scale of 1.2.
+    true_R = np.array([[0.866, -0.5, 0], [0.5, 0.866, 0], [0, 0, 1]], dtype=np.float32)
+    true_t = np.array([1.0, 2.0, 0.5], dtype=np.float32)
+    true_s = 1.2
+    tgt = true_s * (src @ true_R.T) + true_t
+    # Add Gaussian noise with standard deviation 0.01 to the target points.
+    tgt += 0.01 * np.random.randn(*tgt.shape).astype(np.float32)
+    weights = np.ones(n_points, dtype=np.float32)
+    print_gpu_memory()
+    s, R, t = robust_weighted_estimate_sim3_torch(
+        src, tgt, weights, delta=0.1, max_iters=5, align_method="sim3"
+    )
+    # The printed estimates can be compared by eye against true_s / true_R / true_t.
+    print(f"\nEstimated scale: {s:.6f}")
+    print(f"Estimated rotation:\n{R}")
+    print(f"Estimated translation: {t}")
+    print_gpu_memory()

Depth-Anything-3/da3_streaming/loop_utils/alignment_triton.py ADDED Viewed

	@@ -0,0 +1,543 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+# Triton kernel: for each point, compute transformed = s * (R @ p) + t and the
+# Euclidean distance between the transformed point and the matching target.
+# Each program instance handles BLOCK_SIZE consecutive points; points are
+# addressed as ptr + 3 * index + axis (x, y, z interleaved).
+@triton.jit
+def apply_transformation_residual_kernel(
+    src_ptr,  # [n, 3]
+    tgt_ptr,  # [n, 3]
+    transformed_ptr,  # [n, 3]
+    residuals_ptr,  # [n]
+    s,
+    R00,
+    R01,
+    R02,
+    R10,
+    R11,
+    R12,
+    R20,
+    R21,
+    R22,
+    t0,
+    t1,
+    t2,
+    n_points,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    # Lanes past the end of the array are masked out of every load and store.
+    mask = offsets < n_points
+    src_x = tl.load(src_ptr + offsets * 3 + 0, mask=mask)
+    src_y = tl.load(src_ptr + offsets * 3 + 1, mask=mask)
+    src_z = tl.load(src_ptr + offsets * 3 + 2, mask=mask)
+    tgt_x = tl.load(tgt_ptr + offsets * 3 + 0, mask=mask)
+    tgt_y = tl.load(tgt_ptr + offsets * 3 + 1, mask=mask)
+    tgt_z = tl.load(tgt_ptr + offsets * 3 + 2, mask=mask)
+    # transformed = s * (R @ p) + t
+    transformed_x = s * (R00 * src_x + R01 * src_y + R02 * src_z) + t0
+    transformed_y = s * (R10 * src_x + R11 * src_y + R12 * src_z) + t1
+    transformed_z = s * (R20 * src_x + R21 * src_y + R22 * src_z) + t2
+    tl.store(transformed_ptr + offsets * 3 + 0, transformed_x, mask=mask)
+    tl.store(transformed_ptr + offsets * 3 + 1, transformed_y, mask=mask)
+    tl.store(transformed_ptr + offsets * 3 + 2, transformed_z, mask=mask)
+    # Residual = Euclidean distance between target and transformed point.
+    dx = tgt_x - transformed_x
+    dy = tgt_y - transformed_y
+    dz = tgt_z - transformed_z
+    residual = tl.sqrt(dx * dx + dy * dy + dz * dz)
+    tl.store(residuals_ptr + offsets, residual, mask=mask)
+# Triton kernel: accumulate the 3x3 weighted cross-covariance
+# H = sum_i w_i * (src_i - mu_src) (tgt_i - mu_tgt)^T into H_ptr (row-major,
+# 9 elements, expected to be zero-initialised by the caller). Each program
+# instance reduces BLOCK_SIZE points and adds its partial sums atomically.
+@triton.jit
+def weighted_covariance_kernel(
+    src_ptr,  # [n, 3]
+    tgt_ptr,  # [n, 3]
+    weights_ptr,  # [n]
+    mu_src0,
+    mu_src1,
+    mu_src2,
+    mu_tgt0,
+    mu_tgt1,
+    mu_tgt2,
+    H_ptr,  # [3, 3]
+    n_points,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_points
+    # other=0.0 is required: tl.sum below reduces over every lane, and a masked
+    # load without `other` leaves out-of-range lanes undefined. A zero weight
+    # and zero coordinates make those lanes contribute exactly nothing.
+    w = tl.load(weights_ptr + offsets, mask=mask, other=0.0)
+    src_x = tl.load(src_ptr + offsets * 3 + 0, mask=mask, other=0.0)
+    src_y = tl.load(src_ptr + offsets * 3 + 1, mask=mask, other=0.0)
+    src_z = tl.load(src_ptr + offsets * 3 + 2, mask=mask, other=0.0)
+    tgt_x = tl.load(tgt_ptr + offsets * 3 + 0, mask=mask, other=0.0)
+    tgt_y = tl.load(tgt_ptr + offsets * 3 + 1, mask=mask, other=0.0)
+    tgt_z = tl.load(tgt_ptr + offsets * 3 + 2, mask=mask, other=0.0)
+    src_centered_x = src_x - mu_src0
+    src_centered_y = src_y - mu_src1
+    src_centered_z = src_z - mu_src2
+    tgt_centered_x = tgt_x - mu_tgt0
+    tgt_centered_y = tgt_y - mu_tgt1
+    tgt_centered_z = tgt_z - mu_tgt2
+    # Scaling both sides by sqrt(w) makes each product carry a factor of w.
+    sqrt_w = tl.sqrt(w)
+    weighted_src_x = src_centered_x * sqrt_w
+    weighted_src_y = src_centered_y * sqrt_w
+    weighted_src_z = src_centered_z * sqrt_w
+    weighted_tgt_x = tgt_centered_x * sqrt_w
+    weighted_tgt_y = tgt_centered_y * sqrt_w
+    weighted_tgt_z = tgt_centered_z * sqrt_w
+    h00 = weighted_src_x * weighted_tgt_x
+    h01 = weighted_src_x * weighted_tgt_y
+    h02 = weighted_src_x * weighted_tgt_z
+    h10 = weighted_src_y * weighted_tgt_x
+    h11 = weighted_src_y * weighted_tgt_y
+    h12 = weighted_src_y * weighted_tgt_z
+    h20 = weighted_src_z * weighted_tgt_x
+    h21 = weighted_src_z * weighted_tgt_y
+    h22 = weighted_src_z * weighted_tgt_z
+    # Per-block partial sums are merged across program instances atomically.
+    tl.atomic_add(H_ptr + 0, tl.sum(h00, axis=0))
+    tl.atomic_add(H_ptr + 1, tl.sum(h01, axis=0))
+    tl.atomic_add(H_ptr + 2, tl.sum(h02, axis=0))
+    tl.atomic_add(H_ptr + 3, tl.sum(h10, axis=0))
+    tl.atomic_add(H_ptr + 4, tl.sum(h11, axis=0))
+    tl.atomic_add(H_ptr + 5, tl.sum(h12, axis=0))
+    tl.atomic_add(H_ptr + 6, tl.sum(h20, axis=0))
+    tl.atomic_add(H_ptr + 7, tl.sum(h21, axis=0))
+    tl.atomic_add(H_ptr + 8, tl.sum(h22, axis=0))
+# Triton kernel: element-wise Huber re-weighting. A residual r greater than
+# delta gets weight delta / r; every other residual gets weight 1.0.
+@triton.jit
+def compute_huber_weights_kernel(
+    residuals_ptr,
+    weights_ptr,
+    delta,
+    n_points,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    # Out-of-range lanes are never written back (the store is masked).
+    mask = offsets < n_points
+    r = tl.load(residuals_ptr + offsets, mask=mask)
+    weight = tl.where(r > delta, delta / r, 1.0)
+    tl.store(weights_ptr + offsets, weight, mask=mask)
+# Triton kernel: accumulate sum(w*x), sum(w*y), sum(w*z) and sum(w) into the
+# 4-element buffer at mean_ptr (expected to be zero-initialised by the
+# caller). Each program instance reduces BLOCK_SIZE points and adds its
+# partial sums atomically.
+@triton.jit
+def weighted_mean_kernel(
+    points_ptr,  # [n, 3]
+    weights_ptr,  # [n]
+    mean_ptr,  # [sum(w*x), sum(w*y), sum(w*z), sum(w)]
+    n_points,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_points
+    # other=0.0 is required: tl.sum below reduces over every lane, and a masked
+    # load without `other` leaves out-of-range lanes undefined.
+    w = tl.load(weights_ptr + offsets, mask=mask, other=0.0)
+    x = tl.load(points_ptr + offsets * 3 + 0, mask=mask, other=0.0)
+    y = tl.load(points_ptr + offsets * 3 + 1, mask=mask, other=0.0)
+    z = tl.load(points_ptr + offsets * 3 + 2, mask=mask, other=0.0)
+    wx = w * x
+    wy = w * y
+    wz = w * z
+    tl.atomic_add(mean_ptr + 0, tl.sum(wx, axis=0))
+    tl.atomic_add(mean_ptr + 1, tl.sum(wy, axis=0))
+    tl.atomic_add(mean_ptr + 2, tl.sum(wz, axis=0))
+    tl.atomic_add(mean_ptr + 3, tl.sum(w, axis=0))
+def apply_transformation_residual_triton(src, tgt, s, R, t):
+    """Transform ``src`` by ``s * R @ p + t`` and measure its distance to ``tgt``.
+
+    Args:
+        src, tgt: point tensors on the GPU, one 3-vector per row.
+        s: scale (anything ``float()`` accepts, e.g. a 0-d tensor).
+        R: tensor with 9 elements (row-major 3x3 rotation).
+        t: tensor with 3 elements (translation).
+
+    Returns:
+        ``(transformed, residuals)``: the transformed points (same shape and
+        dtype as ``src``) and the per-point Euclidean residual to ``tgt``.
+    """
+    # The kernel addresses element (i, k) as ptr + 3 * i + k, so both inputs
+    # must be densely packed; .contiguous() is a no-op when they already are.
+    src = src.contiguous()
+    tgt = tgt.contiguous()
+    n_points = src.shape[0]
+    transformed = torch.empty_like(src)
+    residuals = torch.empty(n_points, device=src.device, dtype=src.dtype)
+    BLOCK_SIZE = 256
+    grid = (triton.cdiv(n_points, BLOCK_SIZE),)
+    # One tolist() per tensor costs a single device-to-host copy instead of a
+    # separate synchronising float(tensor[i]) for each of the 12 scalars.
+    r00, r01, r02, r10, r11, r12, r20, r21, r22 = R.reshape(-1).tolist()
+    t0, t1, t2 = t.reshape(-1).tolist()
+    apply_transformation_residual_kernel[grid](
+        src,
+        tgt,
+        transformed,
+        residuals,
+        float(s),
+        r00,
+        r01,
+        r02,
+        r10,
+        r11,
+        r12,
+        r20,
+        r21,
+        r22,
+        t0,
+        t1,
+        t2,
+        n_points,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return transformed, residuals
+def compute_weighted_mean_triton(points, weights):
+    """Return the weighted mean of ``points`` and the total weight.
+
+    The kernel accumulates ``[sum(w*x), sum(w*y), sum(w*z), sum(w)]``; the
+    mean is the first three entries divided by the fourth. If the total
+    weight is not greater than 1e-12 a zero vector is returned instead.
+    """
+    count = points.shape[0]
+    block = 256
+    launch_grid = (triton.cdiv(count, block),)
+    # Layout: [sum(w*x), sum(w*y), sum(w*z), sum(w)]
+    accumulator = torch.zeros(4, device=points.device, dtype=points.dtype)
+    weighted_mean_kernel[launch_grid](points, weights, accumulator, count, BLOCK_SIZE=block)
+    total_weight = accumulator[3]
+    if total_weight > 1e-12:
+        return accumulator[:3] / total_weight, total_weight
+    return torch.zeros(3, device=points.device, dtype=points.dtype), total_weight
+def compute_weighted_covariance_triton(src, tgt, weights, mu_src, mu_tgt):
+    """Return the 3x3 weighted cross-covariance of centred ``src`` and ``tgt``.
+
+    Args:
+        src, tgt: point tensors on the GPU, one 3-vector per row.
+        weights: per-point weight tensor.
+        mu_src, mu_tgt: 3-element tensors subtracted from ``src`` / ``tgt``
+            before the products are accumulated.
+
+    Returns:
+        A (3, 3) tensor on the device of ``src``.
+    """
+    # The kernel addresses element (i, k) as ptr + 3 * i + k, so the inputs
+    # must be densely packed; .contiguous() is a no-op when they already are.
+    src = src.contiguous()
+    tgt = tgt.contiguous()
+    weights = weights.contiguous()
+    n_points = src.shape[0]
+    H = torch.zeros(9, device=src.device, dtype=src.dtype)
+    BLOCK_SIZE = 256
+    grid = (triton.cdiv(n_points, BLOCK_SIZE),)
+    # One tolist() per centroid instead of three synchronising float() reads.
+    mu_src0, mu_src1, mu_src2 = mu_src.reshape(-1).tolist()
+    mu_tgt0, mu_tgt1, mu_tgt2 = mu_tgt.reshape(-1).tolist()
+    weighted_covariance_kernel[grid](
+        src,
+        tgt,
+        weights,
+        mu_src0,
+        mu_src1,
+        mu_src2,
+        mu_tgt0,
+        mu_tgt1,
+        mu_tgt2,
+        H,
+        n_points,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return H.reshape(3, 3)
+def compute_huber_weights_triton(residuals, delta):
+    """Launch the Huber-weight kernel and return one weight per residual.
+
+    The output tensor has the same shape, dtype and device as ``residuals``.
+    """
+    count = residuals.shape[0]
+    block = 256
+    launch_grid = (triton.cdiv(count, block),)
+    out = torch.empty_like(residuals)
+    compute_huber_weights_kernel[launch_grid](
+        residuals, out, float(delta), count, BLOCK_SIZE=block
+    )
+    return out
+def weighted_estimate_se3_triton(source_points, target_points, weights):
+    """Compute weighted centroids and cross-covariance for a rigid (SE3) fit.
+
+    Same contract as the PyTorch variant, but the reductions run in Triton
+    kernels. Returns ``(1.0, mu_src, mu_tgt, H)`` as numpy values; when the
+    weights sum to less than 1e-6 the centroids and ``H`` are zero-filled.
+    """
+    src_gpu = torch.from_numpy(source_points).cuda().float()
+    tgt_gpu = torch.from_numpy(target_points).cuda().float()
+    w_gpu = torch.from_numpy(weights).cuda().float()
+    weight_sum = torch.sum(w_gpu)
+    if weight_sum < 1e-6:
+        mu_zero_src = np.zeros(3, dtype=np.float32)
+        mu_zero_tgt = np.zeros(3, dtype=np.float32)
+        return 1.0, mu_zero_src, mu_zero_tgt, np.zeros((3, 3), dtype=np.float32)
+    w_norm = w_gpu / weight_sum
+    mu_src, _ = compute_weighted_mean_triton(src_gpu, w_norm)
+    mu_tgt, _ = compute_weighted_mean_triton(tgt_gpu, w_norm)
+    # Centring happens inside the covariance kernel via the centroids.
+    H = compute_weighted_covariance_triton(src_gpu, tgt_gpu, w_norm, mu_src, mu_tgt)
+    return 1.0, mu_src.cpu().numpy(), mu_tgt.cpu().numpy(), H.cpu().numpy()
+def weighted_estimate_sim3_triton(source_points, target_points, weights):
+    """Compute scale, weighted centroids and cross-covariance for a Sim3 fit.
+
+    The scale is the ratio of the weighted RMS spread of the target to that
+    of the source. The source is centred and pre-scaled here, so the
+    covariance kernel is called with zero centroids. Returns
+    ``(s, mu_src, mu_tgt, H)`` as numpy values; when the weights sum to less
+    than 1e-6 the scale is the sentinel ``-1.0`` and the rest is zero-filled.
+    """
+    src_gpu = torch.from_numpy(source_points).cuda().float()
+    tgt_gpu = torch.from_numpy(target_points).cuda().float()
+    w_gpu = torch.from_numpy(weights).cuda().float()
+    weight_sum = torch.sum(w_gpu)
+    if weight_sum < 1e-6:
+        mu_zero_src = np.zeros(3, dtype=np.float32)
+        mu_zero_tgt = np.zeros(3, dtype=np.float32)
+        return -1.0, mu_zero_src, mu_zero_tgt, np.zeros((3, 3), dtype=np.float32)
+    w_norm = w_gpu / weight_sum
+    mu_src, _ = compute_weighted_mean_triton(src_gpu, w_norm)
+    mu_tgt, _ = compute_weighted_mean_triton(tgt_gpu, w_norm)
+    src_centered = src_gpu - mu_src
+    tgt_centered = tgt_gpu - mu_tgt
+    spread_src = torch.sqrt(torch.sum(w_norm * torch.sum(src_centered**2, dim=1)))
+    spread_tgt = torch.sqrt(torch.sum(w_norm * torch.sum(tgt_centered**2, dim=1)))
+    s = spread_tgt / spread_src
+    scaled_src = s * src_centered
+    # Both clouds are already centred, hence the zero centroids.
+    no_shift_src = torch.zeros_like(mu_src)
+    no_shift_tgt = torch.zeros_like(mu_tgt)
+    H = compute_weighted_covariance_triton(
+        scaled_src, tgt_centered, w_norm, no_shift_src, no_shift_tgt
+    )
+    return s.cpu().numpy(), mu_src.cpu().numpy(), mu_tgt.cpu().numpy(), H.cpu().numpy()
+def weighted_estimate_sim3_numba_triton(
+    source_points, target_points, weights, align_method="sim3"
+):
+    """Estimate scale, rotation and translation mapping source onto target.
+
+    Args:
+        source_points, target_points: numpy point arrays forwarded to the
+            per-method Triton estimator.
+        weights: per-point numpy weights.
+        align_method: ``"sim3"`` estimates a scale; ``"se3"`` and
+            ``"scale+se3"`` use a fixed scale of 1.0.
+
+    Returns:
+        ``(s, R, t)`` with ``R`` a float32 3x3 rotation and ``t`` a float32
+        translation such that ``tgt ~= s * R @ src + t``.
+
+    Raises:
+        ValueError: if ``align_method`` is not recognised, or if the estimator
+            signals (via a negative scale) that the total weight is too small.
+    """
+    if align_method == "sim3":
+        s, mu_src, mu_tgt, H = weighted_estimate_sim3_triton(source_points, target_points, weights)
+    elif align_method in ("se3", "scale+se3"):
+        s, mu_src, mu_tgt, H = weighted_estimate_se3_triton(source_points, target_points, weights)
+    else:
+        # Previously fell through and failed later with an UnboundLocalError on `s`.
+        raise ValueError(f"Unsupported align_method: {align_method!r}")
+    if s < 0:
+        raise ValueError("Total weight too small for meaningful estimation")
+    H_torch = torch.from_numpy(H).cuda().float()
+    U, _, Vt = torch.linalg.svd(H_torch)
+    U = U.cpu().numpy()
+    Vt = Vt.cpu().numpy()
+    R = Vt.T @ U.T
+    if np.linalg.det(R) < 0:
+        # Flip the last right-singular vector so R is a proper rotation, not a reflection.
+        Vt[2, :] *= -1
+        R = Vt.T @ U.T
+    mu_src = mu_src.astype(np.float32)
+    mu_tgt = mu_tgt.astype(np.float32)
+    R = R.astype(np.float32)
+    if align_method == "sim3":
+        t = mu_tgt - s * R @ mu_src
+    else:
+        t = mu_tgt - R @ mu_src
+    return s, R, t.astype(np.float32)
+def robust_weighted_estimate_sim3_triton(
+    src, tgt, init_weights, delta=0.1, max_iters=20, tol=1e-9, align_method="sim3"
+):
+    """Robustly align ``src`` onto ``tgt`` by iteratively re-weighted estimation.
+
+    Starting from a weighted closed-form estimate, each iteration transforms
+    the source and measures residuals with a Triton kernel, down-weights large
+    residuals with Huber weights (threshold ``delta``) and re-estimates.
+
+    Args:
+        src, tgt: numpy point arrays (cast to float32).
+        init_weights: per-point numpy weights (cast to float32).
+        delta: Huber threshold on the residual.
+        max_iters: maximum number of re-weighting iterations.
+        tol: tolerance on parameter change and on relative error change.
+        align_method: forwarded to ``weighted_estimate_sim3_numba_triton``.
+
+    Returns:
+        ``(s, R, t)``. When a convergence test fires, the estimate from before
+        that iteration's update is returned (the update is not applied).
+    """
+    src = src.astype(np.float32)
+    tgt = tgt.astype(np.float32)
+    init_weights = init_weights.astype(np.float32)
+    src_torch = torch.from_numpy(src).cuda().float()
+    tgt_torch = torch.from_numpy(tgt).cuda().float()
+    init_weights_torch = torch.from_numpy(init_weights).cuda().float()
+    s, R, t = weighted_estimate_sim3_numba_triton(
+        src, tgt, init_weights, align_method=align_method
+    )
+    R_torch = torch.from_numpy(R).cuda().float()
+    t_torch = torch.from_numpy(t).cuda().float()
+    s_torch = torch.tensor(s, device="cuda", dtype=torch.float32)
+    prev_error = float("inf")
+    # Loop-invariant rotation threshold (0.1 degree), hoisted out of the loop.
+    max_rot_step = np.radians(0.1)
+    # `iteration` rather than `iter`, so the builtin is not shadowed.
+    for iteration in range(max_iters):
+        transformed, residuals = apply_transformation_residual_triton(
+            src_torch, tgt_torch, s_torch, R_torch, t_torch
+        )
+        mean_residual = torch.mean(residuals).cpu().numpy()
+        print(f"Iter {iteration}: Mean residual = {mean_residual:.6f}")
+        huber_weights = compute_huber_weights_triton(residuals, delta)
+        combined_weights = init_weights_torch * huber_weights
+        combined_weights_sum = torch.sum(combined_weights)
+        if combined_weights_sum > 1e-12:
+            combined_weights /= combined_weights_sum
+        else:
+            # Every point was rejected: fall back to the normalised initial weights.
+            combined_weights = init_weights_torch / torch.sum(init_weights_torch)
+        combined_weights_np = combined_weights.cpu().numpy()
+        s_new, R_new, t_new = weighted_estimate_sim3_numba_triton(
+            src, tgt, combined_weights_np, align_method=align_method
+        )
+        param_change = np.abs(s_new - s) + np.linalg.norm(t_new - t)
+        # Angle of the relative rotation; the clamp guards arccos against round-off.
+        rot_angle = np.arccos(min(1.0, max(-1.0, (np.trace(R_new @ R.T) - 1) / 2)))
+        residuals_np = residuals.cpu().numpy()
+        huber_loss_values = np.where(
+            residuals_np <= delta, 0.5 * residuals_np**2, delta * (residuals_np - 0.5 * delta)
+        )
+        current_error = np.sum(huber_loss_values * init_weights)
+        if (param_change < tol and rot_angle < max_rot_step) or (
+            abs(prev_error - current_error) < tol * prev_error
+        ):
+            print(f"Converged at iteration {iteration}")
+            break
+        s, R, t = s_new, R_new, t_new
+        s_torch = torch.tensor(s, device="cuda", dtype=torch.float32)
+        R_torch = torch.from_numpy(R).cuda().float()
+        t_torch = torch.from_numpy(t).cuda().float()
+        prev_error = current_error
+    return s, R, t
+def warmup_triton():
+    """Run each Triton wrapper once on random data.
+
+    Every wrapper is called inside its own try/except so that one failure is
+    reported on stdout without preventing the remaining wrappers from running.
+    """
+    print("\nWarming up Triton functions...")
+    n_points = 10000
+    src = np.random.randn(n_points, 3).astype(np.float32)
+    tgt = np.random.randn(n_points, 3).astype(np.float32)
+    weights = np.ones(n_points, dtype=np.float32)
+    src_torch = torch.from_numpy(src).cuda().float()
+    tgt_torch = torch.from_numpy(tgt).cuda().float()
+    weights_torch = torch.from_numpy(weights).cuda().float()
+    delta = np.float32(0.1)
+    R_torch = torch.from_numpy(np.eye(3, dtype=np.float32)).cuda().float()
+    t_torch = torch.from_numpy(np.zeros(3, dtype=np.float32)).cuda().float()
+    s_torch = torch.tensor(np.float32(1.0), device="cuda", dtype=torch.float32)
+
+    def run_transform():
+        apply_transformation_residual_triton(src_torch, tgt_torch, s_torch, R_torch, t_torch)
+
+    def run_mean():
+        compute_weighted_mean_triton(src_torch, weights_torch)
+
+    def run_covariance():
+        mu_src, _ = compute_weighted_mean_triton(src_torch, weights_torch)
+        mu_tgt, _ = compute_weighted_mean_triton(tgt_torch, weights_torch)
+        compute_weighted_covariance_triton(src_torch, tgt_torch, weights_torch, mu_src, mu_tgt)
+
+    def run_huber():
+        residuals = torch.abs(torch.randn(n_points, device="cuda", dtype=torch.float32))
+        compute_huber_weights_triton(residuals, delta)
+
+    # (wrapper name, zero-argument call) pairs, executed in this order.
+    warmup_calls = (
+        ("apply_transformation_residual_triton", run_transform),
+        ("compute_weighted_mean_triton", run_mean),
+        ("compute_weighted_covariance_triton", run_covariance),
+        ("compute_huber_weights_triton", run_huber),
+    )
+    for wrapper_name, invoke in warmup_calls:
+        try:
+            invoke()
+            print(f" - {wrapper_name} warmed up.")
+        except Exception as e:
+            print(f" ! Failed to warm up {wrapper_name}: {e}")
+    print("Triton warm-up complete.\n")
def print_gpu_memory():
    """Print the CUDA memory PyTorch has allocated and reserved, in units of 1024**3 bytes.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    cached = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"GPU Memory Allocated: {allocated:.2f} GB, Cached: {cached:.2f} GB")
if __name__ == "__main__":
    # Demo run: estimate a known similarity transform from noisy synthetic points
    # and print the result together with GPU memory usage before and after.
    warmup_triton()

    n_points = 7_500_000
    src = np.random.randn(n_points, 3).astype(np.float32)

    # Ground truth: rotation of about 30 degrees around z, a translation and a scale.
    true_R = np.array([[0.866, -0.5, 0], [0.5, 0.866, 0], [0, 0, 1]], dtype=np.float32)
    true_t = np.array([1.0, 2.0, 0.5], dtype=np.float32)
    true_s = 1.2

    tgt = true_s * (src @ true_R.T) + true_t
    noise = np.random.randn(*tgt.shape).astype(np.float32)
    tgt += 0.01 * noise
    weights = np.ones(n_points, dtype=np.float32)

    print_gpu_memory()
    est_s, est_R, est_t = robust_weighted_estimate_sim3_triton(
        src, tgt, weights, delta=0.1, max_iters=5, align_method="sim3"
    )
    print(f"\nEstimated scale: {est_s:.6f}")
    print(f"Estimated rotation:\n{est_R}")
    print(f"Estimated translation: {est_t}")
    print_gpu_memory()

Depth-Anything-3/da3_streaming/loop_utils/config_utils.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import yaml
def load_config(path, default_path=None):
    """
    Load a YAML config file and merge it over its base config.

    The base is chosen as follows:
      * if the file has an ``inherit_from`` entry, that path is loaded first
        (recursively, with the same ``default_path``); the path is opened as
        written, i.e. relative paths resolve against the working directory;
      * otherwise, if ``default_path`` is given, that file is the base;
      * otherwise the base is an empty dict.

    Args:
        path (str): path to config file.
        default_path (str, optional): base config used when the file at ``path``
            has no ``inherit_from`` entry. Defaults to None.
    Returns:
        cfg (dict): the base config updated recursively with the entries of ``path``.
    """
    # load configuration from per scene/dataset cfg.
    with open(path) as f:
        # An empty YAML document loads as None; treat it as an empty config.
        cfg_special = yaml.full_load(f) or {}
    inherit_from = cfg_special.get("inherit_from")
    if inherit_from is not None:
        cfg = load_config(inherit_from, default_path)
    elif default_path is not None:
        with open(default_path) as f:
            cfg = yaml.full_load(f) or {}
    else:
        cfg = dict()
    # merge per dataset cfg. and main cfg.
    update_recursive(cfg, cfg_special)
    return cfg
def update_recursive(dict1, dict2):
    """
    Recursively merge ``dict2`` into ``dict1`` in place; entries of ``dict2`` win.

    Nested dicts are merged key by key (and copied level by level, so ``dict1``
    never shares a nested dict object with ``dict2``). Any other value is
    assigned as-is. When ``dict2`` holds a dict under a key where ``dict1``
    holds a non-dict value, that value is replaced by the merged dict.

    Args:
        dict1 (dict): first dictionary to be updated.
        dict2 (dict): second dictionary which entries should be used.
    Returns:
        dict: ``dict1`` itself, after the update.
    """
    for k, v in dict2.items():
        if isinstance(v, dict):
            target = dict1.get(k)
            if not isinstance(target, dict):
                # Key is missing, or a scalar is being overridden by a section:
                # start from a fresh dict instead of recursing into a non-dict.
                target = dict1[k] = dict()
            update_recursive(target, v)
        else:
            dict1[k] = v
    return dict1
Depth-Anything-3/da3_streaming/loop_utils/logging_utils.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import rich
# Rich style used for the prefix of each known log tag.
_log_styles = {
    "DA3-Streaming": "bold green",
}


def get_style(tag):
    """Return the style registered for ``tag``, or "bold blue" when there is none."""
    return _log_styles.get(tag, "bold blue")
def Log(*args, tag="DA3-Streaming"):
    """Print ``args`` with ``rich.print``, preceded by ``tag:`` wrapped in the tag's style markup."""
    tag_style = get_style(tag)
    prefix = f"[{tag_style}]{tag}:[/{tag_style}]"
    rich.print(prefix, *args)

Depth-Anything-3/da3_streaming/loop_utils/loop_detector.py ADDED Viewed

	@@ -0,0 +1,391 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import argparse
+import os
+import sys
+from pathlib import Path
+import faiss
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torch import nn
+from tqdm import tqdm
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+SALAD_ROOT = os.path.join(CURRENT_DIR, "salad")
+if SALAD_ROOT not in sys.path:
+    sys.path.insert(0, SALAD_ROOT)
+from loop_utils.salad.models import helper
class VPRModel(nn.Module):
    """Visual Place Recognition model: a backbone followed by an aggregator.

    Both sub-modules are created through ``helper.get_backbone`` and
    ``helper.get_aggregator``.

    Args:
        backbone_arch: backbone name handed to ``helper.get_backbone``.
        backbone_config: backbone configuration handed to ``helper.get_backbone``;
            ``None`` (the default) stands for an empty dict.
        agg_arch: aggregator name handed to ``helper.get_aggregator``.
        agg_config: aggregator configuration handed to ``helper.get_aggregator``;
            ``None`` (the default) stands for an empty dict.
    """

    def __init__(
        self,
        # ---- Backbone
        backbone_arch="resnet50",
        backbone_config=None,
        # ---- Aggregator
        agg_arch="ConvAP",
        agg_config=None,
    ):
        super().__init__()
        # A fresh dict per instance: a ``{}`` default would be one object shared
        # by every model created without an explicit config.
        backbone_config = {} if backbone_config is None else backbone_config
        agg_config = {} if agg_config is None else agg_config

        # Backbone
        self.encoder_arch = backbone_arch
        self.backbone_config = backbone_config
        # Aggregator
        self.agg_arch = agg_arch
        self.agg_config = agg_config
        # ----------------------------------
        # get the backbone and the aggregator
        self.backbone = helper.get_backbone(backbone_arch, backbone_config)
        self.aggregator = helper.get_aggregator(agg_arch, agg_config)

    def forward(self, x):
        """Run ``x`` through the backbone and then the aggregator."""
        x = self.backbone(x)
        x = self.aggregator(x)
        return x
class LoopDetector:
    """Loop detector class for detecting loop closures in image sequences

    One descriptor is extracted per image with the SALAD model, every descriptor
    is searched against all others in an inner-product FAISS index, and pairs
    that are similar enough and far enough apart in the sequence are reported.
    """

    # Lower-case file suffixes that ``get_image_paths`` treats as images.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")
    # Pairs whose frame indices differ by this much or less are never reported.
    MIN_FRAME_GAP = 10

    def __init__(self, image_dir, output="loop_closures.txt", config=None):
        """Initialize the loop detector

        Args:
            image_dir: Directory path containing images
            output: Output file path
            config: Nested mapping; the entries read here are
                ``config["Weights"]["SALAD"]`` (model checkpoint path) and, under
                ``config["Loop"]["SALAD"]``:
                ``image_size`` (image resize dimensions [height width]),
                ``batch_size`` (batch size for processing),
                ``similarity_threshold`` (similarity threshold for loop closure),
                ``top_k`` (number of nearest neighbors to check for each image),
                ``use_nms`` (whether to use Non-Maximum Suppression filtering),
                ``nms_threshold`` (minimum frame difference between loop pairs).

        Raises:
            TypeError: if ``config`` is None.
        """
        if config is None:
            # Same exception type as subscripting None, but with a usable message.
            raise TypeError(
                "LoopDetector needs a config providing config['Weights']['SALAD'] "
                "and config['Loop']['SALAD']"
            )
        salad_cfg = config["Loop"]["SALAD"]
        self.config = config
        self.image_dir = image_dir
        self.ckpt_path = config["Weights"]["SALAD"]
        self.image_size = salad_cfg["image_size"]
        self.batch_size = salad_cfg["batch_size"]
        self.similarity_threshold = salad_cfg["similarity_threshold"]
        self.top_k = salad_cfg["top_k"]
        self.use_nms = salad_cfg["use_nms"]
        self.nms_threshold = salad_cfg["nms_threshold"]
        self.output = output
        # Filled in lazily by the pipeline steps below.
        self.model = None
        self.device = None
        self.image_paths = None
        self.descriptors = None
        self.loop_closures = None

    def _input_transform(self, image_size=None):
        """Create image transformation function

        The transform converts to a tensor and normalizes with fixed mean/std;
        when ``image_size`` is given, a bilinear resize is applied first.
        """
        MEAN = [0.485, 0.456, 0.406]
        STD = [0.229, 0.224, 0.225]
        steps = [T.ToTensor(), T.Normalize(mean=MEAN, std=STD)]
        if image_size:
            steps.insert(0, T.Resize(image_size, interpolation=T.InterpolationMode.BILINEAR))
        return T.Compose(steps)

    def load_model(self):
        """Load model

        Builds the DINOv2 + SALAD ``VPRModel``, loads the checkpoint at
        ``self.ckpt_path``, switches to eval mode and moves the model to CUDA
        when available (CPU otherwise).

        Returns:
            tuple: ``(model, device)``, also stored on ``self``.
        """
        model = VPRModel(
            backbone_arch="dinov2_vitb14",
            backbone_config={
                "num_trainable_blocks": 4,
                "return_token": True,
                "norm_layer": True,
            },
            agg_arch="SALAD",
            agg_config={
                "num_channels": 768,
                "num_clusters": 64,
                "cluster_dim": 128,
                "token_dim": 256,
            },
        )
        # Load onto the CPU first so a checkpoint saved from a GPU also loads
        # on a CPU-only host; the model is moved to the target device below.
        state_dict = torch.load(self.ckpt_path, map_location="cpu")
        model.load_state_dict(state_dict)
        model = model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        print(f"Model loaded: {self.ckpt_path}")
        self.model = model
        self.device = device
        return model, device

    def get_image_paths(self):
        """Get paths of all image files in directory

        Only files directly inside ``image_dir`` are considered. Suffixes are
        matched case-insensitively and each file is listed once, sorted by
        path. A missing directory yields an empty list.
        """
        directory = Path(self.image_dir)
        candidates = directory.iterdir() if directory.is_dir() else ()
        image_paths = sorted(
            p for p in candidates if p.is_file() and p.suffix.lower() in self.IMAGE_EXTENSIONS
        )
        self.image_paths = image_paths
        return image_paths

    def extract_descriptors(self):
        """Extract image feature descriptors

        Loads the model and lists the images first if that has not happened
        yet. An image that cannot be read or transformed is reported and
        replaced by an all-zero tensor, so that frame indices stay aligned
        with ``image_paths``.

        Returns:
            Tensor with one descriptor row per image (also kept in
            ``self.descriptors``).
        """
        if self.model is None or self.device is None:
            self.load_model()
        if self.image_paths is None:
            self.get_image_paths()
        transform = self._input_transform(self.image_size)
        if self.image_size is None:
            placeholder_shape = (3, 224, 224)
        else:
            placeholder_shape = (3, self.image_size[0], self.image_size[1])
        # Autocast has to target the device the model actually runs on.
        autocast_device = torch.device(self.device).type
        descriptors = []
        for start in tqdm(
            range(0, len(self.image_paths), self.batch_size), desc="Extracting features"
        ):
            batch_imgs = []
            for path in self.image_paths[start : start + self.batch_size]:
                try:
                    # The context manager closes the file handle right away.
                    with Image.open(path) as raw:
                        img = transform(raw.convert("RGB"))
                except Exception as e:
                    print(f"Error processing image {path}: {e}")
                    img = torch.zeros(*placeholder_shape)
                batch_imgs.append(img)
            batch_tensor = torch.stack(batch_imgs).to(self.device)
            with torch.no_grad():
                with torch.autocast(device_type=autocast_device, dtype=torch.float16):
                    batch_descriptors = self.model(batch_tensor).cpu()
            descriptors.append(batch_descriptors)
        self.descriptors = torch.cat(descriptors)
        return self.descriptors

    def _apply_nms_filter(self, loop_closures, nms_threshold):
        """Apply Non-Maximum Suppression (NMS) filtering to loop pairs

        Pairs are visited from most to least similar. A pair is kept unless one
        of its frames was suppressed by a previously kept pair; keeping a pair
        suppresses the frames within ``nms_threshold`` of either of its ends.
        Expects ``(idx1, idx2, similarity)`` tuples with ``idx1 < idx2``.
        """
        if not loop_closures or nms_threshold <= 0:
            return loop_closures
        sorted_loops = sorted(loop_closures, key=lambda x: x[2], reverse=True)
        max_frame = max(max(idx1, idx2) for idx1, idx2, _ in loop_closures)
        filtered_loops = []
        suppressed = set()
        for idx1, idx2, sim in sorted_loops:
            if idx1 in suppressed or idx2 in suppressed:
                continue
            filtered_loops.append((idx1, idx2, sim))
            # Window around idx1, clipped so it stays below idx2 ...
            start1 = max(0, idx1 - nms_threshold)
            end1 = min(idx1 + nms_threshold + 1, idx2)
            suppressed.update(range(start1, end1))
            # ... and window around idx2, clipped so it stays above idx1.
            start2 = max(idx1 + 1, idx2 - nms_threshold)
            end2 = min(idx2 + nms_threshold + 1, max_frame + 1)
            suppressed.update(range(start2, end2))
        return filtered_loops

    def _ensure_decending_order(self, tuples_list):
        """Rewrite every ``(a, b, score)`` so that the larger index comes first."""
        return [(max(a, b), min(a, b), score) for a, b, score in tuples_list]

    def _collect_candidate_pairs(self, similarities, indices):
        """Turn nearest-neighbour results into unique candidate loop pairs.

        Args:
            similarities: per-query rows of similarity scores.
            indices: per-query rows of neighbour indices, aligned with
                ``similarities``; row ``i`` belongs to image ``i``.

        Returns:
            list of ``(low_idx, high_idx, similarity)`` tuples holding plain
            Python ints/floats, one per unordered pair, carrying the highest
            similarity seen for that pair.
        """
        best = {}
        for i, (row_sims, row_ids) in enumerate(zip(similarities, indices)):
            # Every column is inspected: the query itself is rejected by the
            # frame-gap test wherever it appears, so it does not have to be
            # the first hit.
            for sim, neighbor in zip(row_sims, row_ids):
                neighbor = int(neighbor)
                sim = float(sim)
                if neighbor < 0:
                    # FAISS reports -1 when fewer neighbours exist than requested.
                    continue
                if sim <= self.similarity_threshold:
                    continue
                if abs(i - neighbor) <= self.MIN_FRAME_GAP:
                    continue
                pair = (i, neighbor) if i < neighbor else (neighbor, i)
                if sim > best.get(pair, float("-inf")):
                    best[pair] = sim
        return [(low, high, sim) for (low, high), sim in best.items()]

    def find_loop_closures(self):
        """Find loop closures

        Extracts descriptors first if needed, searches the ``top_k + 1`` nearest
        neighbours of every descriptor, keeps pairs above the similarity
        threshold and more than ``MIN_FRAME_GAP`` frames apart, optionally
        applies NMS, and stores the result as ``(larger_idx, smaller_idx,
        similarity)`` tuples sorted by decreasing similarity.
        """
        if self.descriptors is None:
            self.extract_descriptors()
        # FAISS operates on contiguous float32 arrays.
        # NOTE(review): the inner product is a cosine similarity only if the
        # descriptors are L2-normalized - presumably the model output is; confirm.
        vectors = self.descriptors.detach().cpu().float().contiguous().numpy()
        faiss_index = faiss.IndexFlatIP(vectors.shape[1])
        faiss_index.add(vectors)
        similarities, indices = faiss_index.search(
            vectors, self.top_k + 1
        )  # +1 because self is most similar
        loop_closures = self._collect_candidate_pairs(similarities, indices)
        # Ties are broken by index so the output order is reproducible.
        loop_closures.sort(key=lambda x: (-x[2], x[0], x[1]))
        if self.use_nms and self.nms_threshold > 0:
            loop_closures = self._apply_nms_filter(loop_closures, self.nms_threshold)
        self.loop_closures = self._ensure_decending_order(loop_closures)
        return self.loop_closures

    def save_results(self):
        """Save loop detection results to file

        Runs detection first if it has not run yet, writes the pairs and the
        indexed image list to ``self.output`` and prints a short summary.
        """
        if self.loop_closures is None:
            self.find_loop_closures()
        with open(self.output, "w") as f:
            f.write("# Loop Detection Results (index1, index2, similarity)\n")
            if self.use_nms:
                f.write(f"# NMS filtering applied, threshold: {self.nms_threshold}\n")
            f.write("\n# Loop pairs:\n")
            for i, j, sim in self.loop_closures:
                f.write(f"{i}, {j}, {sim:.4f}\n")
            f.write("\n# Image path list:\n")
            # image_paths is still None when descriptors were supplied directly.
            for i, path in enumerate(self.image_paths or []):
                f.write(f"# {i}: {path}\n")
        print(f"Found {len(self.loop_closures)} loop pairs, results saved to {self.output}")
        if self.use_nms:
            print(f"NMS filtering applied, threshold: {self.nms_threshold}")
        if self.loop_closures:
            print("\nTop 10 loop pairs:")
            for idx1, idx2, sim in self.loop_closures[:10]:
                print(f"{idx1}, {idx2}, similarity: {sim:.4f}")

    def get_loop_list(self):
        """Return the detected loops as ``(idx1, idx2)`` pairs, running detection if needed."""
        if self.loop_closures is None:
            self.find_loop_closures()
        return [(idx1, idx2) for idx1, idx2, _ in self.loop_closures]

    def run(self):
        """Run complete loop detection pipeline

        Returns:
            The list of loop closures, or None when ``image_dir`` contains no images.
        """
        print("Loading model...")
        if self.model is None:
            self.load_model()
        self.get_image_paths()
        if not self.image_paths:
            print(f"No image files found in {self.image_dir}")
            return
        print(f"Found {len(self.image_paths)} image files")
        self.extract_descriptors()
        self.find_loop_closures()
        self.save_results()
        return self.loop_closures
def main():
    """Command-line entry point: parse the options, build the config and run detection.

    ``LoopDetector`` takes its settings from a nested config mapping, so the
    parsed options are packed into ``config["Weights"]["SALAD"]`` and
    ``config["Loop"]["SALAD"]`` instead of being passed as keyword arguments
    (which its constructor does not accept).
    """
    parser = argparse.ArgumentParser(description="Loop detection using SALAD model")
    parser.add_argument(
        "--image_dir",
        type=str,
        default="/media/deng/Data/KITTIdataset/data_odometry_color/dataset/sequences/00/image_2",
        help="Directory path containing images",
    )
    parser.add_argument(
        "--ckpt_path", type=str, default="./weights/dino_salad.ckpt", help="Model checkpoint path"
    )
    parser.add_argument(
        "--image_size",
        nargs=2,
        type=int,
        default=[336, 336],
        help="Image resize dimensions [height width]",
    )
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size for processing")
    parser.add_argument(
        "--similarity_threshold",
        type=float,
        default=0.7,
        help="Similarity threshold for loop closure",
    )
    parser.add_argument(
        "--top_k", type=int, default=5, help="Number of nearest neighbors to check for each image"
    )
    parser.add_argument("--output", type=str, default="loop_closures.txt", help="Output file path")
    parser.add_argument(
        "--use_nms",
        action="store_true",
        default=True,
        help="Whether to use Non-Maximum Suppression (NMS) filtering",
    )
    # --use_nms defaults to True, so a separate switch is needed to turn NMS off.
    parser.add_argument(
        "--no_nms",
        dest="use_nms",
        action="store_false",
        help="Disable Non-Maximum Suppression (NMS) filtering",
    )
    parser.add_argument(
        "--nms_threshold",
        type=int,
        default=25,
        help="NMS threshold for minimum frame difference between loop pairs",
    )
    args = parser.parse_args()
    config = {
        "Weights": {"SALAD": args.ckpt_path},
        "Loop": {
            "SALAD": {
                "image_size": args.image_size,
                "batch_size": args.batch_size,
                "similarity_threshold": args.similarity_threshold,
                "top_k": args.top_k,
                "use_nms": args.use_nms,
                "nms_threshold": args.nms_threshold,
            }
        },
    }
    detector = LoopDetector(image_dir=args.image_dir, output=args.output, config=config)
    detector.run()


if __name__ == "__main__":
    main()

Depth-Anything-3/da3_streaming/loop_utils/loop_refinement.py ADDED Viewed

	@@ -0,0 +1,268 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import numba as nb
+import numpy as np
+import pypose as pp
+import sim3solve
+import torch
+from einops import parse_shape, rearrange
+from scipy.spatial.transform import Rotation as R
def make_pypose_Sim3(rot, t, s):
    """Build a ``pp.Sim3`` from a rotation matrix, a translation and a scale.

    The values are concatenated as ``[t, q, s]``, where ``q`` is the quaternion
    returned by ``Rotation.as_quat()`` for ``rot``.
    """
    quat = R.from_matrix(rot).as_quat()
    scale = np.array(s).reshape((1,))
    return pp.Sim3(np.concatenate([t, quat, scale]))
def SE3_to_Sim3(x: pp.SE3):
    """Turn SE3 data into Sim3 data by appending a column of ones (unit scale) on the last axis."""
    unit_scale = torch.ones_like(x.data[..., :1])
    return pp.Sim3(torch.cat((x.data, unit_scale), dim=-1))
@nb.njit(cache=True)
def _format(es):
    """Convert the edge list to an (N, 2) int64 array without its first entry.

    The first entry is dropped because ``reduce_edges`` always starts its list
    with a ``(-1, -1)`` placeholder.
    """
    edges = np.asarray(es, dtype=np.int64).reshape((-1, 2))
    return edges[1:]
@nb.njit(cache=True)
def reduce_edges(flow_mag, ii, jj, max_num_edges, nms):
    """Greedily pick edges ``(i, j)`` in order of increasing ``flow_mag``.

    An edge is skipped when ``j - i < 30``, when its magnitude is ``>= 1000``
    (i.e. infinite), or when it was suppressed by an edge picked earlier.
    Picking ``(i, j)`` suppresses ``(i + d, j)`` for every ``d`` in
    ``[-nms, nms]``. At most ``max_num_edges`` edges are picked.

    Returns:
        (N, 2) int64 array of the picked edges.
    """
    # Placeholder first entry (presumably so numba can type the list);
    # ``_format`` removes it again.
    es = [(-1, -1)]
    if ii.size == 0:
        return _format(es)
    Ni, Nj = (ii.max() + 1), (jj.max() + 1)
    ignore_lookup = np.zeros((Ni, Nj), dtype=nb.bool_)
    order = np.argsort(flow_mag)
    for idx in order:  # edge index
        # ``es`` still contains the placeholder, hence ">" rather than ">=".
        if len(es) > max_num_edges:
            break
        i = ii[idx]
        j = jj[idx]
        if (j - i) < 30 or flow_mag[idx] >= 1000 or ignore_lookup[i, j]:
            continue
        es.append((i, j))
        for i1 in range(i - nms, i + nms + 1):
            if 0 <= i1 < Ni:
                ignore_lookup[i1, j] = True
    return _format(es)
@nb.njit(cache=True)
def umeyama_alignment(x: np.ndarray, y: np.ndarray):
    """
    The following function was copied from:
    https://github.com/MichaelGrupp/evo/blob/3067541b350528fe46375423e5bc3a7c42c06c63/evo/core/geometry.py#L35
    Computes the least squares solution parameters of an Sim(m) matrix
    that minimizes the distance between a set of registered points,
    i.e. r, t, c such that y is approximated by c * r @ x + t.
    Umeyama, Shinji: Least-squares estimation of transformation parameters
                     between two point patterns. IEEE PAMI, 1991
    :param x: mxn matrix of points, m = dimension, n = nr. of data points
    :param y: mxn matrix of points, m = dimension, n = nr. of data points
    :return: r, t, c - rotation matrix, translation vector and scale factor;
             (None, None, None) when the covariance rank is degenerate
    """
    # dim = dimension, num = nr. of data points
    dim, num = x.shape
    # means, eq. 34 and 35
    mu_x = x.sum(axis=1) / num
    mu_y = y.sum(axis=1) / num
    # variance, eq. 36
    # the mean gets a trailing axis so that it is subtracted from every column
    var_x = 1.0 / num * (np.linalg.norm(x - mu_x[:, np.newaxis]) ** 2)
    # covariance matrix, eq. 38
    acc = np.zeros((dim, dim))
    for k in range(num):
        acc += np.outer((y[:, k] - mu_y), (x[:, k] - mu_x))
    cov_xy = np.multiply(1.0 / num, acc)
    # SVD (text betw. eq. 38 and 39)
    u, d, v = np.linalg.svd(cov_xy)
    if np.count_nonzero(d > np.finfo(d.dtype).eps) < dim - 1:
        return None, None, None  # Degenerate covariance rank, Umeyama alignment is not possible
    # S matrix, eq. 43
    sign = np.eye(dim)
    if np.linalg.det(u) * np.linalg.det(v) < 0.0:
        # Ensure a RHS coordinate system (Kabsch algorithm).
        sign[dim - 1, dim - 1] = -1
    # rotation, eq. 40
    r = u.dot(sign).dot(v)
    # scale & translation, eq. 42 and 41
    c = 1 / var_x * np.trace(np.diag(d).dot(sign))
    t = mu_y - np.multiply(c, r.dot(mu_x))
    return r, t, c
@nb.njit(cache=True)
def ransac_umeyama(src_points, dst_points, iterations=1, threshold=0.1):
    """RANSAC around ``umeyama_alignment`` using minimal samples of three points.

    Each iteration fits a transform to three random correspondences, counts the
    points whose transformed position lies within ``threshold`` of its target,
    and - if that count beats the best so far - refits on all of those inliers.

    Returns:
        best_R, best_t, best_s, best_inliers; the first three stay None when no
        iteration produced a hypothesis with at least one inlier.
    """
    best_inliers = 0
    best_R = None
    best_t = None
    best_s = None
    for _ in range(iterations):
        # Minimal sample: three distinct correspondences.
        picks = np.random.choice(src_points.shape[0], 3, replace=False)
        src_sample = src_points[picks]
        dst_sample = dst_points[picks]
        rot, t, s = umeyama_alignment(src_sample.T, dst_sample.T)
        if t is None:
            # Degenerate sample, no hypothesis to score.
            continue
        # Score the hypothesis on every point.
        moved = (src_points @ (rot * s).T) + t
        # Euclidean error per point (not ideal because depends on scene scale)
        errors = np.sum((moved - dst_points) ** 2, axis=1) ** 0.5
        inlier_mask = errors < threshold
        inliers = np.sum(inlier_mask)
        if inliers > best_inliers:
            best_inliers = inliers
            # Refit on the full inlier set of the winning hypothesis.
            best_R, best_t, best_s = umeyama_alignment(
                src_points[inlier_mask].T, dst_points[inlier_mask].T
            )
    return best_R, best_t, best_s, best_inliers
def batch_jacobian(func, x):
    """Jacobians of ``func`` with respect to its second and third inputs.

    ``func`` is called with the three tensors in ``x`` and its output is summed
    over dim 0 before differentiating. The Jacobian for the first input is
    discarded; the other two are stacked and their axes reordered from
    ``N O B I`` to ``N B O I``.
    """

    def _summed(*inputs):
        return func(*inputs).sum(dim=0)

    jacobians = torch.autograd.functional.jacobian(_summed, x, vectorize=True)
    _, jac_second, jac_third = jacobians
    stacked = torch.stack((jac_second, jac_third))
    return rearrange(stacked, "N O B I -> N B O I", N=2)
def _residual(C, Gi, Gj):
    """Return ``Log(C @ Exp(Gi) @ Exp(Gj)^-1)`` as a plain tensor.

    All three inputs must be two-dimensional with the same first dimension.
    """
    assert parse_shape(C, "N _") == parse_shape(Gi, "N _") == parse_shape(Gj, "N _")
    pose_i = pp.Exp(Gi)
    pose_j_inv = pp.Exp(Gj).Inv()
    error = C @ pose_i @ pose_j_inv
    return error.Log().tensor()
def residual(Ginv, input_poses, dSloop, ii, jj, jacobian=False):
    """Residuals of the sequential and loop constraints for the estimate ``Ginv``.

    Sequential constraints link every pose ``k >= 1`` to pose ``k - 1`` and take
    their relative transform from ``input_poses``; loop constraints link
    ``ii`` to ``jj`` with the relative transforms ``dSloop``.

    Args:
        Ginv: current estimate, indexed per pose and passed through ``pp.Exp``.
        input_poses: poses with a last dimension of 7.
        dSloop: relative transforms of the loop constraints.
        ii, jj: pose indices of the loop constraints.
        jacobian: also return the Jacobians when True.

    Returns:
        The residual tensor, or ``(residual, (J_i, J_j, iii, jjj))`` when
        ``jacobian`` is True, where ``iii``/``jjj`` are the pose indices of all
        constraints (sequential ones first).
    """
    # prep
    device = Ginv.device
    assert parse_shape(input_poses, "_ d") == dict(d=7)
    pred_inv_poses = SE3_to_Sim3(input_poses).Inv()

    # sequential constraints: pose k against its predecessor k - 1
    num_poses, _ = pred_inv_poses.shape
    cur = torch.arange(1, num_poses, device=device)
    prev = cur - 1
    dSij = pred_inv_poses[prev] @ pred_inv_poses[cur].Inv()

    # sequential constraints first, loop constraints after them
    constants = torch.cat((dSij, dSloop), dim=0)
    iii = torch.cat((cur, ii))
    jjj = torch.cat((prev, jj))
    Gi, Gj = Ginv[iii], Ginv[jjj]

    resid = _residual(constants, Gi, Gj)
    if jacobian:
        J_Ginv_i, J_Ginv_j = batch_jacobian(_residual, (constants, Gi, Gj))
        return resid, (J_Ginv_i, J_Ginv_j, iii, jjj)
    return resid
def perform_updates(
    input_poses, dSloop, ii_loop, jj_loop, iters=30, ep=0.0, lmbda=1e-6, fix_opt_window=False
):
    """Run the Levenberg Marquardt algorithm

    Each iteration linearizes the residuals, asks ``sim3solve.solve_system`` for
    an update and accepts it only if the mean squared residual decreases. The
    damping ``lmbda`` is halved after an accepted step and doubled after a
    rejected one.

    Args:
        input_poses: initial poses; cloned, the caller's tensor is not modified.
        dSloop: relative transforms of the loop constraints.
        ii_loop, jj_loop: pose indices of the loop constraints.
        iters: maximum number of iterations.
        ep: forwarded unchanged to ``sim3solve.solve_system``.
        lmbda: initial damping value.
        fix_opt_window: when True, ``freen`` is one past the largest loop index,
            otherwise -1; the value is forwarded to ``sim3solve.solve_system``.

    Returns:
        ``pp.Exp(Ginv).Inv()`` for the final estimate ``Ginv``.
    """
    input_poses = input_poses.clone()
    if fix_opt_window:
        freen = torch.cat((ii_loop, jj_loop)).max().item() + 1
    else:
        freen = -1
    Ginv = SE3_to_Sim3(input_poses).Inv().Log()
    residual_history = []
    for itr in range(iters):
        resid, (J_Ginv_i, J_Ginv_j, iii, jjj) = residual(
            Ginv, input_poses, dSloop, ii_loop, jj_loop, jacobian=True
        )
        # Computed once and reused for the log line, the step test and the stop test.
        current_cost = resid.square().mean().item()
        residual_history.append(current_cost)
        print(f"resid: {current_cost}")
        (delta_pose,) = sim3solve.solve_system(
            J_Ginv_i, J_Ginv_j, iii, jjj, resid, ep, lmbda, freen
        )
        assert Ginv.shape == delta_pose.shape
        Ginv_tmp = Ginv + delta_pose
        new_resid = residual(Ginv_tmp, input_poses, dSloop, ii_loop, jj_loop)
        if new_resid.square().mean() < current_cost:
            Ginv = Ginv_tmp
            lmbda /= 2
        else:
            lmbda *= 2
        # Stop once the cost is small and improved by less than 1.5x over the
        # last four iterations. An exactly-zero cost counts as converged and
        # must not reach the division.
        if (
            (current_cost < 1e-5)
            and (itr >= 4)
            and (current_cost == 0.0 or (residual_history[-5] / current_cost) < 1.5)
        ):
            break
    return pp.Exp(Ginv).Inv()
def pose_refinement(pred_poses, loop_poses, loop_ii, loop_jj):
    """Optimize the poses against the loop constraints and re-anchor the result.

    After ``perform_updates`` (30 iterations) the estimate is multiplied by the
    transform that maps its pose at index ``loop_ii.max() + 1`` onto the
    corresponding input pose; the poses before that index are returned.
    """
    optimized = perform_updates(pred_poses, loop_poses, loop_ii, loop_jj, iters=30)
    safe_i = loop_ii.max().item() + 1
    reference = SE3_to_Sim3(pred_poses.cpu())
    correction = reference[[safe_i]] * optimized[[safe_i]].Inv()
    realigned = correction * optimized
    return realigned[:safe_i]

Depth-Anything-3/da3_streaming/loop_utils/sim3loop.py ADDED Viewed

	@@ -0,0 +1,399 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import time
+from typing import List, Tuple
+import numpy as np
+import pypose as pp
+import torch
+from fastloop.solve_python import solve_system_py
+from scipy.spatial.transform import Rotation as R
+cpp_version = False
+try:
+    import sim3solve
+    cpp_version = True
+except Exception:
+    print("Sim3solve of C++ Version failed, Will using Python Version.")
+class Sim3LoopOptimizer:
+    """
+    Loop closure optimizer for sequences of Sim3 transformations
+    Input:
+    - sequential_transforms: List[Tuple[float, np.ndarray, np.ndarray]]
+      Each element is (s, R, t), where s is scalar scale, R is [3,3] rotation matrix,
+      t is [3,] translation vector
+    - loop_constraints: List[Tuple[int, int, Tuple[float, np.ndarray, np.ndarray]]]
+      Each element is (i, j, (s, R, t)), representing a loop closure constraint
+      from frame i to frame j
+    Output:
+    - Optimized sequential_transforms
+    """
+    def __init__(self, config, device="cpu"):
+        self.device = device
+        self.config = config
+        self.solve_system_version = self.config["Loop"]["SIM3_Optimizer"][
+            "lang_version"
+        ]  # choose between 'python' and 'cpp'
+        if not cpp_version:
+            self.solve_system_version = "python"
+    def numpy_to_pypose_sim3(self, s: float, R_mat: np.ndarray, t_vec: np.ndarray) -> pp.Sim3:
+        """Convert numpy s,R,t to pypose Sim3"""
+        q = R.from_matrix(R_mat).as_quat()  # [x,y,z,w]
+        # pypose requires [t, q, s] format
+        data = np.concatenate([t_vec, q, np.array([s])])
+        return pp.Sim3(torch.from_numpy(data).float().to(self.device))
+    def pypose_sim3_to_numpy(self, sim3: pp.Sim3) -> Tuple[float, np.ndarray, np.ndarray]:
+        """Convert pypose Sim3 to numpy s,R,t"""
+        data = sim3.data.cpu().numpy()
+        t = data[:3]
+        q = data[3:7]  # [x,y,z,w]
+        s = data[7]
+        R_mat = R.from_quat(q).as_matrix()
+        return s, R_mat, t
+    def sequential_to_absolute_poses(
+        self, sequential_transforms: List[Tuple[float, np.ndarray, np.ndarray]]
+    ) -> torch.Tensor:
+        """
+        Convert sequential relative transforms to absolute pose sequence
+        S_01, S_12, S_23, ... -> T_0, T_1, T_2, T_3, ...
+        Where T_i is the transform from world coordinate to frame i
+        """
+        len(sequential_transforms) + 1
+        poses = []
+        identity = pp.Sim3(
+            torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], device=self.device)
+        )
+        poses.append(identity)
+        current_pose = identity
+        for s, R_mat, t_vec in sequential_transforms:
+            rel_transform = self.numpy_to_pypose_sim3(s, R_mat, t_vec)
+            current_pose = current_pose @ rel_transform
+            poses.append(current_pose)
+        return torch.stack(poses)
+    def absolute_to_sequential_transforms(
+        self, absolute_poses: pp.Sim3
+    ) -> List[Tuple[float, np.ndarray, np.ndarray]]:
+        """
+        Convert absolute pose sequence back to sequential relative transforms
+        T_0, T_1, T_2, ... -> S_01, S_12, S_23, ...
+        """
+        sequential_transforms = []
+        n = absolute_poses.shape[0]
+        for i in range(n - 1):
+            rel_transform = absolute_poses[i].Inv() @ absolute_poses[i + 1]
+            s, R_mat, t_vec = self.pypose_sim3_to_numpy(rel_transform)
+            sequential_transforms.append((s, R_mat, t_vec))
+        return sequential_transforms
+    def SE3_to_Sim3(self, x: torch.Tensor) -> pp.Sim3:
+        """Convert SE3 to Sim3 (add unit scale)"""
+        ones = torch.ones_like(x[..., :1])
+        out = torch.cat((x, ones), dim=-1)
+        return pp.Sim3(out)
+    def build_loop_constraints(
+        self, loop_constraints: List[Tuple[int, int, Tuple[float, np.ndarray, np.ndarray]]]
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Build loop closure constraints"""
+        if not loop_constraints:
+            return (
+                torch.empty(0, 8, device=self.device),
+                torch.empty(0, dtype=torch.long),
+                torch.empty(0, dtype=torch.long),
+            )
+        loop_transforms = []
+        ii_loop = []
+        jj_loop = []
+        for i, j, (s, R_mat, t_vec) in loop_constraints:
+            loop_sim3 = self.numpy_to_pypose_sim3(s, R_mat, t_vec)
+            loop_transforms.append(loop_sim3.data)
+            ii_loop.append(i)
+            jj_loop.append(j)
+        dSloop = pp.Sim3(torch.stack(loop_transforms))
+        ii_loop = torch.tensor(ii_loop, dtype=torch.long, device=self.device)
+        jj_loop = torch.tensor(jj_loop, dtype=torch.long, device=self.device)
+        return dSloop, ii_loop, jj_loop
+    def residual(self, Ginv, input_poses, dSloop, ii, jj, jacobian=False):
+        """Compute residuals (modified from original code)
+        Each residual is Log(C @ Exp(Gi) @ Exp(Gj)^-1). Two groups of constraints
+        are concatenated in this order: sequential constraints between consecutive
+        poses (derived from input_poses), then the loop constraints in dSloop.
+        Args:
+            Ginv: Per-pose parameters, indexed by pose and passed through pp.Exp.
+            input_poses: Pose data; wrapped as pp.Sim3 and inverted here.
+            dSloop: Loop constraint transforms; only .shape[0] and .data are read.
+            ii, jj: Index tensors selecting the Ginv rows of each loop constraint.
+            jacobian: If True, also return the Jacobians w.r.t. Ginv[iii] and Ginv[jjj].
+        Returns:
+            resid when jacobian is False, otherwise
+            (resid, (J_Ginv_i, J_Ginv_j, iii, jjj)).
+        """
+        def _residual(C, Gi, Gj):
+            # C is the constraint constant; Gi and Gj are the two endpoint parameters.
+            out = C @ pp.Exp(Gi) @ pp.Exp(Gj).Inv()
+            return out.Log().tensor()
+        pred_inv_poses = pp.Sim3(input_poses).Inv()
+        n, _ = pred_inv_poses.shape
+        if n > 1:
+            # Sequential constraints pair every pose k (1..n-1) with pose k-1.
+            kk = torch.arange(1, n, device=self.device)
+            ll = kk - 1
+            Ti = pred_inv_poses[kk]
+            Tj = pred_inv_poses[ll]
+            dSij = Tj @ Ti.Inv()
+        else:
+            # With a single pose there are no sequential constraints.
+            kk = torch.empty(0, dtype=torch.long, device=self.device)
+            ll = torch.empty(0, dtype=torch.long, device=self.device)
+            dSij = pp.Sim3(torch.empty(0, 8, device=self.device))
+        # Sequential constants come first, then the loop constants (when present).
+        constants = (
+            torch.cat((dSij.data, dSloop.data), dim=0) if dSloop.shape[0] > 0 else dSij.data
+        )
+        if constants.shape[0] > 0:
+            constants = pp.Sim3(constants)
+            # Index tensors follow the same order as the constants above.
+            iii = torch.cat((kk, ii))
+            jjj = torch.cat((ll, jj))
+            resid = _residual(constants, Ginv[iii], Ginv[jjj])
+        else:
+            iii = torch.empty(0, dtype=torch.long, device=self.device)
+            jjj = torch.empty(0, dtype=torch.long, device=self.device)
+            resid = torch.empty(0, device=self.device)
+        if not jacobian:
+            return resid
+        if constants.shape[0] > 0:
+            def batch_jacobian(func, x):
+                # Summing over the batch dim lets one jacobian call cover all items.
+                def _func_sum(*x):
+                    return func(*x).sum(dim=0)
+                # The Jacobian w.r.t. the first input (the constants) is discarded.
+                _, b, c = torch.autograd.functional.jacobian(_func_sum, x, vectorize=True)
+                from einops import rearrange
+                # Move the batch axis ahead of the output axis for both Jacobians.
+                return rearrange(torch.stack((b, c)), "N O B I -> N B O I", N=2)
+            J_Ginv_i, J_Ginv_j = batch_jacobian(_residual, (constants, Ginv[iii], Ginv[jjj]))
+        else:
+            J_Ginv_i = torch.empty(0, device=self.device)
+            J_Ginv_j = torch.empty(0, device=self.device)
+        return resid, (J_Ginv_i, J_Ginv_j, iii, jjj)
+    def optimize(
+        self,
+        sequential_transforms: List[Tuple[float, np.ndarray, np.ndarray]],
+        loop_constraints: List[Tuple[int, int, Tuple[float, np.ndarray, np.ndarray]]],
+        max_iterations: int = None,
+        lambda_init: float = None,
+    ) -> List[Tuple[float, np.ndarray, np.ndarray]]:
+        """
+        Main optimization function
+        Args:
+            sequential_transforms: Input sequence of transforms
+            loop_constraints: List of loop closure constraints
+            max_iterations: Maximum iterations
+            lambda_init: Initial lambda for L-M algorithm
+        Returns:
+            Optimized sequence of transforms
+        """
+        if max_iterations is None:
+            max_iterations = self.config["Loop"]["SIM3_Optimizer"]["max_iterations"]
+        if lambda_init is None:
+            lambda_init = eval(self.config["Loop"]["SIM3_Optimizer"]["lambda_init"])
+        input_poses = self.sequential_to_absolute_poses(sequential_transforms)
+        dSloop, ii_loop, jj_loop = self.build_loop_constraints(loop_constraints)
+        if len(loop_constraints) == 0:
+            print("Warning: No loop constraints provided, returning original transforms")
+            return sequential_transforms
+        Ginv = pp.Sim3(input_poses).Inv().Log()
+        lmbda = lambda_init
+        residual_history = []
+        print(
+            f"Starting optimization with {len(sequential_transforms)} poses \
+                and {len(loop_constraints)} loop constraints"
+        )
+        # L-M loop
+        for itr in range(max_iterations):
+            resid, (J_Ginv_i, J_Ginv_j, iii, jjj) = self.residual(
+                Ginv, input_poses, dSloop, ii_loop, jj_loop, jacobian=True
+            )
+            if resid.numel() == 0:
+                print("No residuals to optimize")
+                break
+            current_cost = resid.square().mean().item()
+            residual_history.append(current_cost)
+            try:  # Solve linear system
+                begin_time = time.time()
+                if self.solve_system_version == "cpp":
+                    (delta_pose,) = sim3solve.solve_system(
+                        J_Ginv_i, J_Ginv_j, iii, jjj, resid, 0.0, lmbda, -1
+                    )
+                elif self.solve_system_version == "python":
+                    delta_pose = solve_system_py(
+                        J_Ginv_i, J_Ginv_j, iii, jjj, resid, 0.0, lmbda, -1
+                    )
+                else:
+                    print("Solver version has not been chosen! ('python' or 'cpp')")
+                end_time = time.time()
+            except Exception as e:
+                print(f"Solver failed at iteration {itr}: {e}")
+                break
+            Ginv_tmp = Ginv + delta_pose
+            new_resid = self.residual(Ginv_tmp, input_poses, dSloop, ii_loop, jj_loop)
+            new_cost = new_resid.square().mean().item() if new_resid.numel() > 0 else float("inf")
+            # L-M
+            if new_cost < current_cost:
+                Ginv = Ginv_tmp
+                lmbda /= 2
+                print(
+                    f"Iteration {itr}: cost {current_cost:.14f} -> {new_cost:.14f} (accepted)",
+                    end=" | ",
+                )
+            else:
+                lmbda *= 2
+                print(
+                    f"Iteration {itr}: cost {current_cost:.14f} -> {new_cost:.14f} (rej)     ",
+                    end=" | ",
+                )  # more readible to accepted
+            print(
+                f"Time of solver ({self.solve_system_version}): \
+                    {(end_time - begin_time)*1000:.4f} ms"
+            )
+            if (current_cost < 1e-5) and (itr >= 4):
+                if len(residual_history) >= 5:
+                    improvement_ratio = residual_history[-5] / residual_history[-1]
+                    if improvement_ratio < 1.5:
+                        print(f"Converged at iteration {itr}")
+                        break
+        optimized_absolute_poses = pp.Exp(Ginv).Inv()
+        optimized_sequential = self.absolute_to_sequential_transforms(optimized_absolute_poses)
+        print(
+            f"Optimization completed. Final cost: \
+                {residual_history[-1] if residual_history else 'N/A'}"
+        )
+        return optimized_sequential
+# ======== TEST CODE ========
+def create_ring_transforms(num_poses=6, radius=5.0, rot_noise_deg=2.0):
+    """Generate a ring of Sim3 transforms with rotation, adding slight rotational noise"""
+    transforms = []
+    angle_step = 2 * np.pi / num_poses
+    for i in range(num_poses):
+        angle = angle_step
+        # Main rotation (around Z-axis)
+        R_z = R.from_euler("z", angle, degrees=False)
+        # Add slight rotational noise (Gaussian noise in degrees)
+        noise_angles_deg = np.random.normal(loc=0.0, scale=rot_noise_deg, size=3)
+        R_noise = R.from_euler("xyz", noise_angles_deg, degrees=True)
+        # Combine rotations
+        R_mat = (R_noise * R_z).as_matrix()
+        # Translation: simulate a circular trajectory
+        t = np.array([radius * np.sin(angle), radius * (1 - np.cos(angle)), 0.0])
+        s = np.random.uniform(0.8, 1.2)
+        transforms.append((s, R_mat, t))
+    return transforms
+def example_usage():
+    optimizer = Sim3LoopOptimizer(solve_system_version="cpp")
+    # Build rotating ring
+    sequential_transforms = create_ring_transforms(num_poses=20, radius=3.0)
+    # Add loop closure constraint: from frame 5 back to frame 0
+    loop_constraints = [
+        (20, 0, (1.0, np.eye(3), np.zeros(3)))  # Temporary unit loop for simulation
+    ]
+    # Trajectory before/after optimization
+    input_abs_poses = optimizer.sequential_to_absolute_poses(sequential_transforms)
+    optimized_transforms = optimizer.optimize(sequential_transforms, loop_constraints)
+    optimized_abs_poses = optimizer.sequential_to_absolute_poses(optimized_transforms)
+    def extract_xyz(pose_tensor):
+        poses = pose_tensor.cpu().numpy()
+        return poses[:, 0], poses[:, 1], poses[:, 2]
+    x0, y0, z0 = extract_xyz(input_abs_poses)
+    x1, y1, z1 = extract_xyz(optimized_abs_poses)
+    # Visualize trajectory
+    import matplotlib
+    import matplotlib.pyplot as plt
+    matplotlib.use("Agg")
+    plt.figure(figsize=(8, 6))
+    plt.plot(x0, y0, "o--", label="Before Optimization")
+    plt.plot(x1, y1, "o-", label="After Optimization")
+    for i, j, _ in loop_constraints:
+        plt.plot([x0[i], x0[j]], [y0[i], y0[j]], "r--", label="Loop (Before)" if i == 5 else "")
+        plt.plot([x1[i], x1[j]], [y1[i], y1[j]], "g-", label="Loop (After)" if i == 5 else "")
+    plt.gca().set_aspect("equal")
+    plt.title("Sim3 Loop Closure Optimization (Rotating Ring)")
+    plt.xlabel("x")
+    plt.ylabel("y")
+    plt.legend()
+    plt.grid(True)
+    plt.axis("equal")
+    plt.show()
+    return optimized_transforms
+# Run the ring-trajectory demo when this module is executed as a script.
+if __name__ == "__main__":
+    example_usage()

Depth-Anything-3/da3_streaming/loop_utils/sim3utils.py ADDED Viewed

	@@ -0,0 +1,1261 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long)
+import bisect
+import glob
+import os
+import numpy as np
+import trimesh
+from loop_utils.alignment_torch import robust_weighted_estimate_sim3_torch
+from loop_utils.alignment_triton import robust_weighted_estimate_sim3_triton
+from numba import njit
+from sklearn.linear_model import LinearRegression, RANSACRegressor
+def accumulate_sim3_transforms(transforms):
+    """
+    Accumulate adjacent SIM(3) transforms into transforms
+    from the initial frame to each subsequent frame.
+    Args:
+    transforms: list, each element is a tuple (R, s, t)
+        R: 3x3 rotation matrix (np.array)
+        s: scale factor (scalar)
+        t: 3x1 translation vector (np.array)
+    Returns:
+    Cumulative transforms list, each element is (R_cum, s_cum, t_cum)
+        representing the transform from frame 0 to frame k
+    """
+    if not transforms:
+        return []
+    cumulative_transforms = [transforms[0]]
+    for i in range(1, len(transforms)):
+        s_cum_prev, R_cum_prev, t_cum_prev = cumulative_transforms[i - 1]
+        s_next, R_next, t_next = transforms[i]
+        R_cum_new = R_cum_prev @ R_next
+        s_cum_new = s_cum_prev * s_next
+        t_cum_new = s_cum_prev * (R_cum_prev @ t_next) + t_cum_prev
+        cumulative_transforms.append((s_cum_new, R_cum_new, t_cum_new))
+    return cumulative_transforms
+def estimate_sim3(source_points, target_points):
+    mu_src = np.mean(source_points, axis=0)
+    mu_tgt = np.mean(target_points, axis=0)
+    src_centered = source_points - mu_src
+    tgt_centered = target_points - mu_tgt
+    scale_src = np.sqrt((src_centered**2).sum(axis=1).mean())
+    scale_tgt = np.sqrt((tgt_centered**2).sum(axis=1).mean())
+    s = scale_tgt / scale_src
+    src_scaled = src_centered * s
+    H = src_scaled.T @ tgt_centered
+    U, _, Vt = np.linalg.svd(H)
+    R = Vt.T @ U.T
+    if np.linalg.det(R) < 0:
+        Vt[2, :] *= -1
+        R = Vt.T @ U.T
+    t = mu_tgt - s * R @ mu_src
+    return s, R, t
+def align_point_maps(point_map1, conf1, point_map2, conf2, conf_threshold):
+    """point_map2 -> point_map1"""
+    b1, _, _, _ = point_map1.shape
+    b2, _, _, _ = point_map2.shape
+    b = min(b1, b2)
+    aligned_points1 = []
+    aligned_points2 = []
+    for i in range(b):
+        mask1 = conf1[i] > conf_threshold
+        mask2 = conf2[i] > conf_threshold
+        valid_mask = mask1 & mask2
+        idx = np.where(valid_mask)
+        if len(idx[0]) == 0:
+            continue
+        pts1 = point_map1[i][idx]
+        pts2 = point_map2[i][idx]
+        aligned_points1.append(pts1)
+        aligned_points2.append(pts2)
+    if len(aligned_points1) == 0:
+        raise ValueError("No matching point pairs were found!")
+    all_pts1 = np.concatenate(aligned_points1, axis=0)
+    all_pts2 = np.concatenate(aligned_points2, axis=0)
+    print(f"The number of corresponding points matched: {all_pts1.shape[0]}")
+    s, R, t = estimate_sim3(all_pts2, all_pts1)
+    mean_error = compute_alignment_error(
+        point_map1, conf1, point_map2, conf2, conf_threshold, s, R, t
+    )
+    print(f"Mean error: {mean_error}")
+    return s, R, t
+def apply_sim3(points, s, R, t):
+    return (s * (R @ points.T)).T + t
+def apply_sim3_direct(point_maps, s, R, t):
+    # point_maps: (b, h, w, 3) -> (b, h, w, 3, 1)
+    point_maps_expanded = point_maps[..., np.newaxis]  # (b, h, w, 3, 1)
+    # R: (3, 3) -> (b, h, w, 3, 1) = (3, 3) @ (3, 1)
+    rotated = np.matmul(R, point_maps_expanded)  # (b, h, w, 3, 1)
+    rotated = rotated.squeeze(-1)  # (b, h, w, 3)
+    transformed = s * rotated + t  # (b, h, w, 3)
+    return transformed
+def compute_alignment_error(point_map1, conf1, point_map2, conf2, conf_threshold, s, R, t):
+    """
+    Compute the average point alignment error (using only original inputs)
+    Args:
+    point_map1: target point map (b, h, w, 3)
+    conf1: target confidence map (b, h, w)
+    point_map2: source point map (b, h, w, 3)
+    conf2: source confidence map (b, h, w)
+    conf_threshold: confidence threshold
+    s, R, t: transformation parameters
+    """
+    b1, h1, w1, _ = point_map1.shape
+    b2, h2, w2, _ = point_map2.shape
+    b = min(b1, b2)
+    h = min(h1, h2)
+    w = min(w1, w2)
+    target_points = []
+    source_points = []
+    for i in range(b):
+        mask1 = conf1[i, :h, :w] > conf_threshold
+        mask2 = conf2[i, :h, :w] > conf_threshold
+        valid_mask = mask1 & mask2
+        idx = np.where(valid_mask)
+        if len(idx[0]) == 0:
+            continue
+        t_pts = point_map1[i, :h, :w][idx]
+        s_pts = point_map2[i, :h, :w][idx]
+        target_points.append(t_pts)
+        source_points.append(s_pts)
+    if len(target_points) == 0:
+        print("Warning: No matching point pairs found for error calculation")
+        return np.nan
+    all_target = np.concatenate(target_points, axis=0)
+    all_source = np.concatenate(source_points, axis=0)
+    transformed = (s * (R @ all_source.T)).T + t
+    errors = np.linalg.norm(transformed - all_target, axis=1)
+    mean_error = np.mean(errors)
+    std_error = np.std(errors)
+    median_error = np.median(errors)
+    max_error = np.max(errors)
+    print(
+        f"Alignment error statistics [using {len(errors)} points]: "
+        f"mean={mean_error:.4f}, std={std_error:.4f}, "
+        f"median={median_error:.4f}, max={max_error:.4f}"
+    )
+    return mean_error
+def save_confident_pointcloud(
+    points, colors, confs, output_path, conf_threshold, sample_ratio=1.0
+):
+    """
+    Filter points based on confidence threshold
+    and save as PLY file, with optional random sampling ratio.
+    Args:
+    - points: np.ndarray, shape (H, W, 3) or (N, 3)
+    - colors: np.ndarray, shape (H, W, 3) or (N, 3)
+    - confs: np.ndarray, shape (H, W) or (N,)
+    - output_path: str, output PLY file path
+    - conf_threshold: float, confidence threshold for point filtering
+    - sample_ratio: float, sampling ratio (0 < sample_ratio <= 1.0)
+    """
+    points = points.reshape(-1, 3).astype(np.float32, copy=False)
+    colors = colors.reshape(-1, 3).astype(np.uint8, copy=False)
+    confs = confs.reshape(-1).astype(np.float32, copy=False)
+    conf_mask = (confs >= conf_threshold) & (confs > 1e-5)
+    points = points[conf_mask]
+    colors = colors[conf_mask]
+    if 0 < sample_ratio < 1.0 and len(points) > 0:
+        num_samples = int(len(points) * sample_ratio)
+        indices = np.random.choice(len(points), num_samples, replace=False)
+        points = points[indices]
+        colors = colors[indices]
+    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+    print(f"shape of sampled point: {points.shape}")
+    trimesh.PointCloud(points, colors=colors).export(output_path)
+    print(f"Saved point cloud with {len(points)} points to {output_path}")
+def save_confident_pointcloud_batch(
+    points, colors, confs, output_path, conf_threshold, sample_ratio=1.0, batch_size=1000000
+):
+    """
+    - points: np.ndarray,  (b, H, W, 3) / (N, 3)
+    - colors: np.ndarray,  (b, H, W, 3) / (N, 3)
+    - confs: np.ndarray,  (b, H, W) / (N,)
+    - output_path: str
+    - conf_threshold: float,
+    - sample_ratio: float (0 < sample_ratio <= 1.0)
+    - batch_size: int
+    """
+    if points.ndim == 2:
+        b = 1
+        points = points[np.newaxis, ...]
+        colors = colors[np.newaxis, ...]
+        confs = confs[np.newaxis, ...]
+    elif points.ndim == 4:
+        b = points.shape[0]
+    else:
+        raise ValueError("Unsupported points dimension. Must be 2 (N,3) or 4 (b,H,W,3)")
+    total_valid = 0
+    for i in range(b):
+        cfs = confs[i].reshape(-1)
+        total_valid += np.count_nonzero((cfs >= conf_threshold) & (cfs > 1e-5))
+    num_samples = int(total_valid * sample_ratio) if sample_ratio < 1.0 else total_valid
+    if num_samples == 0:
+        save_ply(np.zeros((0, 3), dtype=np.float32), np.zeros((0, 3), dtype=np.uint8), output_path)
+        return
+    if sample_ratio == 1.0:
+        with open(output_path, "wb") as f:
+            write_ply_header(f, num_samples)
+            for i in range(b):
+                pts = points[i].reshape(-1, 3).astype(np.float32)
+                cls = colors[i].reshape(-1, 3).astype(np.uint8)
+                cfs = confs[i].reshape(-1).astype(np.float32)
+                mask = (cfs >= conf_threshold) & (cfs > 1e-5)
+                valid_pts = pts[mask]
+                valid_cls = cls[mask]
+                for j in range(0, len(valid_pts), batch_size):
+                    batch_pts = valid_pts[j : j + batch_size]
+                    batch_cls = valid_cls[j : j + batch_size]
+                    write_ply_batch(f, batch_pts, batch_cls)
+    else:
+        reservoir_pts = np.zeros((num_samples, 3), dtype=np.float32)
+        reservoir_clr = np.zeros((num_samples, 3), dtype=np.uint8)
+        count = 0
+        for i in range(b):
+            pts = points[i].reshape(-1, 3).astype(np.float32)
+            cls = colors[i].reshape(-1, 3).astype(np.uint8)
+            cfs = confs[i].reshape(-1).astype(np.float32)
+            mask = (cfs >= conf_threshold) & (cfs > 1e-5)
+            valid_pts = pts[mask]
+            valid_cls = cls[mask]
+            n_valid = len(valid_pts)
+            if count < num_samples:
+                fill_count = min(num_samples - count, n_valid)
+                reservoir_pts[count : count + fill_count] = valid_pts[:fill_count]
+                reservoir_clr[count : count + fill_count] = valid_cls[:fill_count]
+                count += fill_count
+                if fill_count < n_valid:
+                    remaining_pts = valid_pts[fill_count:]
+                    remaining_cls = valid_cls[fill_count:]
+                    count, reservoir_pts, reservoir_clr = optimized_vectorized_reservoir_sampling(
+                        remaining_pts, remaining_cls, count, reservoir_pts, reservoir_clr
+                    )
+            else:
+                count, reservoir_pts, reservoir_clr = optimized_vectorized_reservoir_sampling(
+                    valid_pts, valid_cls, count, reservoir_pts, reservoir_clr
+                )
+        save_ply(reservoir_pts, reservoir_clr, output_path)
+"""The following function is deprecated."""
+# def vectorized_reservoir_sampling(new_pts, new_cls, current_count, reservoir_pts, reservoir_clr):
+#     """
+#     - new_pts:  (M, 3)
+#     - new_cls:  (M, 3)
+#     - current_count
+#     - reservoir_pts:  (K, 3)
+#     - reservoir_clr:  (K, 3)
+#     """
+#     k = len(reservoir_pts)
+#     n_new = len(new_pts)
+#     rand_indices = np.random.randint(0, current_count + n_new, size=n_new)
+#     replace_mask = rand_indices < k
+#     replace_indices = rand_indices[replace_mask]
+#     replace_pts = new_pts[replace_mask]
+#     replace_cls = new_cls[replace_mask]
+#     reservoir_pts[replace_indices] = replace_pts
+#     reservoir_clr[replace_indices] = replace_cls
+#     return current_count + n_new, reservoir_pts, reservoir_clr
+"""
+    The deprecated `vectorized_reservoir_sampling` above does not sample uniformly:
+    it draws the slot of every point in a batch from the same range
+    [0, current_count + n_new) instead of from a range that grows with each
+    point's own position in the stream.
+    This leads to inconsistent density in the downsampled point clouds.
+    The `optimized_vectorized_reservoir_sampling` function below fixes this bug.
+    Special thanks to @Horace89 for the detailed analysis and code assistance.
+    See https://github.com/DengKaiCQ/VGGT-Long/issues/28 for details.
+"""
+def optimized_vectorized_reservoir_sampling(
+    new_points: np.ndarray,
+    new_colors: np.ndarray,
+    current_count: int,
+    reservoir_points: np.ndarray,
+    reservoir_colors: np.ndarray,
+) -> tuple[int, np.ndarray, np.ndarray]:
+    """
+    Optimized vectorized reservoir sampling with batch probability calculations.
+    This maintains mathematical correctness while improving performance through
+    vectorized operations where possible.
+    Args:
+        new_points: New point coordinates to consider, shape (M, 3)
+        new_colors: New point colors to consider, shape (M, 3)
+        current_count: Number of elements seen so far
+        reservoir_points: Current reservoir of sampled points, shape (K, 3)
+        reservoir_colors: Current reservoir of sampled colors, shape (K, 3)
+    Returns:
+        Tuple of (updated_count, updated_reservoir_points, updated_reservoir_colors)
+    """
+    random_gen = np.random
+    reservoir_size = len(reservoir_points)
+    num_new_points = len(new_points)
+    if num_new_points == 0:
+        return current_count, reservoir_points, reservoir_colors
+    # Calculate sequential indices for each new point
+    point_indices = np.arange(current_count + 1, current_count + num_new_points + 1)
+    # Generate random numbers for each point
+    random_values = random_gen.randint(0, point_indices, size=num_new_points)
+    # Determine which points should replace reservoir elements
+    replacement_mask = random_values < reservoir_size
+    replacement_positions = random_values[replacement_mask]
+    # Apply replacements
+    if np.any(replacement_mask):
+        points_to_replace = new_points[replacement_mask]
+        colors_to_replace = new_colors[replacement_mask]
+        reservoir_points[replacement_positions] = points_to_replace
+        reservoir_colors[replacement_positions] = colors_to_replace
+    return current_count + num_new_points, reservoir_points, reservoir_colors
+def write_ply_header(f, num_vertices):
+    header = [
+        "ply",
+        "format binary_little_endian 1.0",
+        f"element vertex {num_vertices}",
+        "property float x",
+        "property float y",
+        "property float z",
+        "property uchar red",
+        "property uchar green",
+        "property uchar blue",
+        "end_header",
+    ]
+    f.write("\n".join(header).encode() + b"\n")
+def write_ply_batch(f, points, colors):
+    structured = np.zeros(
+        len(points),
+        dtype=[
+            ("x", np.float32),
+            ("y", np.float32),
+            ("z", np.float32),
+            ("red", np.uint8),
+            ("green", np.uint8),
+            ("blue", np.uint8),
+        ],
+    )
+    structured["x"] = points[:, 0]
+    structured["y"] = points[:, 1]
+    structured["z"] = points[:, 2]
+    structured["red"] = colors[:, 0]
+    structured["green"] = colors[:, 1]
+    structured["blue"] = colors[:, 2]
+    f.write(structured.tobytes())
+def save_ply(points, colors, filename):
+    with open(filename, "wb") as f:
+        write_ply_header(f, len(points))
+        write_ply_batch(f, points, colors)
+def find_chunk_index(chunks, idx):
+    """
+    Find the 0-based chunk index that contains the given index idx.
+    chunks: List of (begin_idx, end_idx).
+    idx: The index to search for.
+    Returns the 0-based chunk index.
+    """
+    starts = [chunk[0] for chunk in chunks]
+    pos = bisect.bisect_right(starts, idx) - 1  # Find position of idx in starts
+    if pos < 0 or pos >= len(chunks):
+        raise ValueError(f"Index {idx} not found in any chunk")
+    chunk_begin, chunk_end = chunks[pos]
+    if idx < chunk_begin or idx > chunk_end:
+        raise ValueError(f"Index {idx} not found in any chunk")
+    return pos
+def get_frame_range(chunk, idx, half_window=10):
+    """
+    Calculate the frame range centered at idx with half_window
+    frames on each side within chunk boundaries.
+    If near boundaries, take 2 * half_window frames starting from the boundary.
+    chunk: (begin_idx, end_idx).
+    idx: Center index.
+    half_window: Number of frames to take on each side of center index.
+    Returns (start, end).
+    """
+    begin, end = chunk
+    window_size = 2 * half_window
+    if idx - half_window < begin:
+        start = begin
+        end_candidate = begin + window_size
+        end = min(end, end_candidate)
+    elif idx + half_window > end:
+        end_candidate = end
+        start_candidate = end - window_size
+        start = max(begin, start_candidate)
+    else:
+        start = idx - half_window
+        end = idx + half_window
+    return (start, end)
+def process_loop_list(chunk_index, loop_list, half_window=10):
+    """
+    Process loop_list and return chunk indices and frame ranges for each (idx1, idx2) pair.
+    chunk_index: List of (begin_idx, end_idx) tuples.
+    loop_list: List of (idx1, idx2) tuples.
+    half_window: Number of frames to take on each side of center index (default 10).
+    Returns list of (chunk_idx1, range1, chunk_idx2, range2) tuples where:
+      - chunk_idx1, chunk_idx2: Chunk indices (1-based).
+      - range1, range2: Frame range tuples (start, end).
+    """
+    results = []
+    for idx1, idx2 in loop_list:
+        try:
+            chunk_idx1_0based = find_chunk_index(chunk_index, idx1)
+            chunk1 = chunk_index[chunk_idx1_0based]
+            range1 = get_frame_range(chunk1, idx1, half_window)
+            chunk_idx2_0based = find_chunk_index(chunk_index, idx2)
+            chunk2 = chunk_index[chunk_idx2_0based]
+            range2 = get_frame_range(chunk2, idx2, half_window)
+            result = (chunk_idx1_0based, range1, chunk_idx2_0based, range2)
+            results.append(result)
+        except ValueError as e:
+            print(f"Skipping pair ({idx1}, {idx2}): {e}")
+    return results
+def compute_sim3_ab(S_a, S_b):
+    s_a, R_a, T_a = S_a
+    s_b, R_b, T_b = S_b
+    s_ab = s_b / s_a
+    R_ab = R_b @ R_a.T
+    T_ab = T_b - s_ab * (R_ab @ T_a)
+    return (s_ab, R_ab, T_ab)
+def merge_ply_files(input_dir, output_path):
+    """
+    Merge all PLY files in a directory into one file (without loading into memory)
+    Args:
+    - input_dir: Input directory containing multiple '{idx}_pcd.ply' files
+    - output_path: Output file path (e.g., 'combined.ply')
+    """
+    print("Merging PLY files...")
+    input_files = sorted(glob.glob(os.path.join(input_dir, "*_pcd.ply")))
+    if not input_files:
+        print("No PLY files found")
+        return
+    idx_file = 0
+    len(input_files)
+    total_vertices = 0
+    for file in input_files:  # Count total vertices
+        with open(file, "rb") as f:
+            for line in f:
+                if line.startswith(b"element vertex"):
+                    vertex_count = int(line.split()[-1])
+                    total_vertices += vertex_count
+                elif line.startswith(b"end_header"):
+                    break
+    with open(output_path, "wb") as out_f:
+        # Write new header
+        out_f.write(b"ply\n")
+        out_f.write(b"format binary_little_endian 1.0\n")
+        out_f.write(f"element vertex {total_vertices}\n".encode())
+        out_f.write(b"property float x\n")
+        out_f.write(b"property float y\n")
+        out_f.write(b"property float z\n")
+        out_f.write(b"property uchar red\n")
+        out_f.write(b"property uchar green\n")
+        out_f.write(b"property uchar blue\n")
+        out_f.write(b"end_header\n")
+        for file in input_files:
+            print(f"Processing {idx_file}/{len(input_files)}: {file}")
+            idx_file += 1
+            with open(file, "rb") as in_f:
+                # Skip the head
+                in_header = True
+                while in_header:
+                    line = in_f.readline()
+                    if line.startswith(b"end_header"):
+                        in_header = False
+                data = in_f.read()
+                out_f.write(data)
+    print(f"Merge completed! Total points: {total_vertices}")
+    print(f"Output file: {output_path}")
+def weighted_estimate_se3(source_points, target_points, weights):
+    """
+    source_points:  (Nx3)
+    target_points:  (Nx3)
+    :weights:  (N,) [0,1]
+    """
+    total_weight = np.sum(weights)
+    if total_weight < 1e-6:
+        raise ValueError("Total weight too small for meaningful estimation")
+    normalized_weights = weights / total_weight
+    mu_src = np.sum(normalized_weights[:, None] * source_points, axis=0)
+    mu_tgt = np.sum(normalized_weights[:, None] * target_points, axis=0)
+    src_centered = source_points - mu_src
+    tgt_centered = target_points - mu_tgt
+    weighted_src = src_centered * np.sqrt(normalized_weights)[:, None]
+    weighted_tgt = tgt_centered * np.sqrt(normalized_weights)[:, None]
+    H = weighted_src.T @ weighted_tgt
+    U, _, Vt = np.linalg.svd(H)
+    R = Vt.T @ U.T
+    if np.linalg.det(R) < 0:
+        Vt[2, :] *= -1
+        R = Vt.T @ U.T
+    t = mu_tgt - R @ mu_src
+    return 1.0, R, t
def weighted_estimate_sim3(source_points, target_points, weights):
    """
    Weighted similarity (scale + rotation + translation) fit.

    source_points:  (Nx3)
    target_points:  (Nx3)
    :weights:  (N,) [0,1]

    Returns ``(s, R, t)`` with ``target ~= s * R @ source + t``. The scale
    is the ratio of the weighted RMS spreads of the two centred clouds.
    Raises ValueError when the weights sum to less than 1e-6.
    """
    weight_sum = np.sum(weights)
    if weight_sum < 1e-6:
        raise ValueError("Total weight too small for meaningful estimation")
    w = weights / weight_sum
    w_col = w[:, None]
    root_w_col = np.sqrt(w)[:, None]

    mu_src = np.sum(w_col * source_points, axis=0)
    mu_tgt = np.sum(w_col * target_points, axis=0)
    src_c = source_points - mu_src
    tgt_c = target_points - mu_tgt

    # Weighted RMS distance of each cloud from its own centroid.
    spread_src = np.sqrt(np.sum(w * np.sum(src_c**2, axis=1)))
    spread_tgt = np.sqrt(np.sum(w * np.sum(tgt_c**2, axis=1)))
    s = spread_tgt / spread_src

    # The source cloud is rescaled before building the cross-covariance.
    src_scaled = (s * src_c) * root_w_col
    tgt_scaled = tgt_c * root_w_col
    H = src_scaled.T @ tgt_scaled

    U, _, Vt = np.linalg.svd(H)
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:
        # Avoid returning a reflection.
        Vt[2, :] *= -1
        R = Vt.T @ U.T
    t = mu_tgt - s * R @ mu_src
    return s, R, t
def huber_loss(r, delta):
    """Element-wise Huber loss: quadratic for |r| <= delta, linear beyond."""
    magnitude = np.abs(r)
    quadratic = 0.5 * r**2
    linear = delta * (magnitude - 0.5 * delta)
    return np.where(magnitude <= delta, quadratic, linear)
def robust_weighted_estimate_sim3(
    src, tgt, init_weights, delta=0.1, max_iters=20, tol=1e-9, align_method="sim3"
):
    """
    Iteratively re-weighted (Huber) alignment of ``src`` onto ``tgt``.

    src:  (Nx3)
    tgt:  (Nx3)
    init_weights:  (N,)
    delta: Huber threshold on the per-point residual norm.
    max_iters: maximum number of re-weighting iterations.
    tol: convergence tolerance on the parameter change and on the relative
        change of the Huber cost.
    align_method: "sim3" estimates scale; "se3" and "scale+se3" estimate a
        rigid transform only (returned scale is 1.0).

    Returns ``(s, R, t)`` with ``tgt ~= s * R @ src + t``.
    Raises ValueError for an unknown ``align_method``.
    """
    # Pick one estimator and use it for both the initial fit and every IRLS
    # step. "scale+se3" is treated as rigid throughout, matching
    # weighted_estimate_sim3_numba; previously the iterations silently
    # switched to the Sim3 estimator for that mode.
    if align_method == "sim3":
        estimate = weighted_estimate_sim3
    elif align_method in ("se3", "scale+se3"):
        estimate = weighted_estimate_se3
    else:
        raise ValueError(f"Unknown align_method: {align_method}")

    s, R, t = estimate(src, tgt, init_weights)
    prev_error = float("inf")
    for _ in range(max_iters):
        transformed = s * (src @ R.T) + t
        residuals = np.linalg.norm(tgt - transformed, axis=1)  # (N,), non-negative
        print(f"Residuals: {np.mean(residuals)}")

        # Huber weights: 1 inside the threshold, delta / r outside.
        huber_weights = np.ones_like(residuals)
        large_res_mask = residuals > delta
        huber_weights[large_res_mask] = delta / residuals[large_res_mask]

        combined_weights = init_weights * huber_weights
        combined_weights /= np.sum(combined_weights) + 1e-12

        s_new, R_new, t_new = estimate(src, tgt, combined_weights)

        param_change = np.abs(s_new - s) + np.linalg.norm(t_new - t)
        # Angle of the relative rotation; the clamp guards arccos against
        # round-off pushing the cosine outside [-1, 1].
        rot_angle = np.arccos(min(1.0, max(-1.0, (np.trace(R_new @ R.T) - 1) / 2)))
        current_error = np.sum(huber_loss(residuals, delta) * init_weights)
        if (param_change < tol and rot_angle < np.radians(0.1)) or (
            abs(prev_error - current_error) < tol * prev_error
        ):
            # On convergence the previous iterate is returned, not the new one.
            break
        s, R, t = s_new, R_new, t_new
        prev_error = current_error
    return s, R, t
+# ===== Speed Up Begin =====
@njit(cache=True)
def _weighted_estimate_se3_numba(source_points, target_points, weights):
    """
    JIT-compiled part of the weighted rigid fit.

    Returns ``(s, mu_src, mu_tgt, H)``: the scale slot (1.0 on success), the
    weighted centroids and the 3x3 weighted cross-covariance. The SVD step is
    done by the caller. When the weights sum to less than 1e-6 the scale
    slot is -1.0, which the caller treats as a failure sentinel.
    """
    # Ensure float32
    source_points = source_points.astype(np.float32)
    target_points = target_points.astype(np.float32)
    weights = weights.astype(np.float32)
    total_weight = np.sum(weights)
    if total_weight < 1e-6:
        # Negative scale marks "weights too small", same as the Sim3 variant.
        # Returning 1.0 here made the wrapper's `s < 0` check unreachable and
        # let a transform derived from an all-zero H through silently.
        return (
            -1.0,
            np.zeros(3, dtype=np.float32),
            np.zeros(3, dtype=np.float32),
            np.zeros((3, 3), dtype=np.float32),
        )
    normalized_weights = weights / total_weight
    mu_src = np.sum(normalized_weights[:, None] * source_points, axis=0)
    mu_tgt = np.sum(normalized_weights[:, None] * target_points, axis=0)
    src_centered = source_points - mu_src
    tgt_centered = target_points - mu_tgt
    weighted_src = src_centered * np.sqrt(normalized_weights)[:, None]
    weighted_tgt = tgt_centered * np.sqrt(normalized_weights)[:, None]
    H = weighted_src.T @ weighted_tgt
    return 1.0, mu_src, mu_tgt, H
@njit(cache=True)
def _weighted_estimate_sim3_numba(source_points, target_points, weights):
    """
    JIT-compiled part of the weighted similarity fit.

    Returns ``(s, mu_src, mu_tgt, H)``: the scale (ratio of weighted RMS
    spreads), the weighted centroids and the 3x3 weighted cross-covariance.
    A scale of -1.0 signals that the weights sum to less than 1e-6.
    """
    # Work in float32 throughout.
    src32 = source_points.astype(np.float32)
    tgt32 = target_points.astype(np.float32)
    w32 = weights.astype(np.float32)
    weight_sum = np.sum(w32)
    if weight_sum < 1e-6:
        return (
            -1.0,
            np.zeros(3, dtype=np.float32),
            np.zeros(3, dtype=np.float32),
            np.zeros((3, 3), dtype=np.float32),
        )
    w = w32 / weight_sum
    mu_src = np.sum(w[:, None] * src32, axis=0)
    mu_tgt = np.sum(w[:, None] * tgt32, axis=0)
    src_c = src32 - mu_src
    tgt_c = tgt32 - mu_tgt
    spread_src = np.sqrt(np.sum(w * np.sum(src_c**2, axis=1)))
    spread_tgt = np.sqrt(np.sum(w * np.sum(tgt_c**2, axis=1)))
    s = spread_tgt / spread_src
    src_scaled = (s * src_c) * np.sqrt(w)[:, None]
    tgt_scaled = tgt_c * np.sqrt(w)[:, None]
    H = src_scaled.T @ tgt_scaled
    return s, mu_src, mu_tgt, H
def weighted_estimate_sim3_numba(source_points, target_points, weights, align_method="sim3"):
    """
    Weighted alignment using the Numba kernels for the heavy reductions.

    align_method: "sim3" estimates scale; "se3" and "scale+se3" are rigid.

    Returns ``(s, R, t)`` with ``target ~= s * R @ source + t``.
    Raises ValueError for an unknown ``align_method`` or when the kernel
    reports (via a negative scale) that the total weight is too small.
    """
    if align_method == "sim3":
        kernel = _weighted_estimate_sim3_numba
    elif align_method in ("se3", "scale+se3"):
        kernel = _weighted_estimate_se3_numba
    else:
        # Previously fell through and failed later with UnboundLocalError.
        raise ValueError(f"Unknown align_method: {align_method}")
    s, mu_src, mu_tgt, H = kernel(source_points, target_points, weights)
    if s < 0:
        raise ValueError("Total weight too small for meaningful estimation")
    # float32 SVD (single cast; the original cast H twice)
    U, _, Vt = np.linalg.svd(H.astype(np.float32))
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:
        # Avoid returning a reflection.
        Vt[2, :] *= -1
        R = Vt.T @ U.T
    if align_method == "sim3":
        t = mu_tgt - s * R @ mu_src
    else:
        t = mu_tgt - R @ mu_src
    return s, R, t
@njit(cache=True)
def huber_loss_numba(r, delta):
    """Element-wise Huber loss of ``r`` with threshold ``delta``, as float32."""
    r32 = r.astype(np.float32)
    d = np.float32(delta)
    magnitude = np.abs(r32)
    loss = np.where(magnitude <= d, 0.5 * r32**2, d * (magnitude - 0.5 * d))
    return loss.astype(np.float32)
@njit(cache=True)
def compute_residuals_numba(tgt, transformed):
    """Return the per-row Euclidean distance between ``tgt`` and ``transformed`` (float32)."""
    n_rows = tgt.shape[0]
    out = np.empty(n_rows, dtype=np.float32)
    for k in range(n_rows):
        gap = tgt[k] - transformed[k]
        out[k] = np.sqrt(np.sum(gap**2))
    return out
@njit(cache=True)
def compute_huber_weights_numba(residuals, delta):
    """Huber IRLS weights: 1 where the residual is <= delta, delta / residual otherwise."""
    out = np.ones(residuals.shape, dtype=np.float32)
    n_items = residuals.shape[0]
    for k in range(n_items):
        value = residuals[k]
        if value > delta:
            out[k] = delta / value
    return out
@njit(cache=True)
def apply_transformation_numba(src, s, R, t):
    """Apply ``s * R @ p + t`` to every row ``p`` of ``src``; the result has src's shape and dtype."""
    out = np.empty_like(src)
    n_rows = src.shape[0]
    for k in range(n_rows):
        point = src[k]
        out[k] = s * (R @ point) + t
    return out
def robust_weighted_estimate_sim3_numba(
    src, tgt, init_weights, delta=0.1, max_iters=20, tol=1e-9, align_method="sim3"
):
    """
    Huber-reweighted alignment of ``src`` onto ``tgt`` using the Numba kernels.

    Inputs are cast to float32. Returns ``(s, R, t)`` with
    ``tgt ~= s * R @ src + t``. When a convergence test fires, the iterate
    from before the last re-estimation is returned.
    """
    src32 = src.astype(np.float32)
    tgt32 = tgt.astype(np.float32)
    base_weights = init_weights.astype(np.float32)

    s, R, t = weighted_estimate_sim3_numba(
        src32, tgt32, base_weights, align_method=align_method
    )
    last_cost = float("inf")
    small_angle = np.radians(0.1)
    for _ in range(max_iters):
        moved = apply_transformation_numba(src32, s, R, t)
        residuals = compute_residuals_numba(tgt32, moved)
        print(f"Residuals: {np.mean(residuals)}")

        # Down-weight points whose residual exceeds delta, then renormalise.
        irls_weights = base_weights * compute_huber_weights_numba(residuals, delta)
        irls_weights /= np.sum(irls_weights) + 1e-12

        s_next, R_next, t_next = weighted_estimate_sim3_numba(
            src32, tgt32, irls_weights, align_method=align_method
        )

        step = np.abs(s_next - s) + np.linalg.norm(t_next - t)
        # Clamp keeps arccos within its domain despite round-off.
        cos_angle = (np.trace(R_next @ R.T) - 1) / 2
        angle = np.arccos(min(1.0, max(-1.0, cos_angle)))
        cost = np.sum(huber_loss_numba(residuals, delta) * base_weights)

        params_settled = step < tol and angle < small_angle
        cost_settled = abs(last_cost - cost) < tol * last_cost
        if params_settled or cost_settled:
            break
        s, R, t = s_next, R_next, t_next
        last_cost = cost
    return s, R, t
def warmup_numba():
    """
    Call each Numba kernel once on dummy float32 data.

    Failures are reported on stdout and do not stop the remaining kernels
    from being tried.
    """
    print("\nWarming up Numba JIT-compiled functions...")
    n_points = 50000
    src = np.random.randn(n_points, 3).astype(np.float32)
    tgt = np.random.randn(n_points, 3).astype(np.float32)
    weights = np.ones(n_points, dtype=np.float32)
    residuals = np.abs(np.random.randn(n_points).astype(np.float32))
    R = np.eye(3, dtype=np.float32)
    t = np.zeros(3, dtype=np.float32)
    s = np.float32(1.0)
    delta = np.float32(1.0)

    # (label, thunk) pairs; the lambdas defer name lookup until the call so
    # a missing kernel is reported like any other warm-up failure.
    warmups = [
        ("_weighted_estimate_sim3_numba", lambda: _weighted_estimate_sim3_numba(src, tgt, weights)),
        ("_weighted_estimate_se3_numba", lambda: _weighted_estimate_se3_numba(src, tgt, weights)),
        ("huber_loss_numba", lambda: huber_loss_numba(residuals, delta)),
        ("compute_huber_weights_numba", lambda: compute_huber_weights_numba(residuals, delta)),
        ("compute_residuals_numba", lambda: compute_residuals_numba(tgt, src)),
        ("apply_transformation_numba", lambda: apply_transformation_numba(src, s, R, t)),
    ]
    for label, run in warmups:
        try:
            run()
            print(f" - {label} warmed up.")
        except Exception as exc:  # best-effort: report and move on
            print(f" ! Failed to warm up {label}:", exc)
    print("Numba warm-up complete.\n")
+# ===== Speed Up End =====
+# ===== Scale precompute begin =====
def compute_scale_ransac(
    depth1, depth2, conf1, conf2, conf_threshold_ratio=0.1, max_samples=10000
):
    """
    Estimate the scale ``depth1 ~= scale * depth2`` with a RANSAC line fit
    through the origin.

    Args:
        depth1: (n1, h, w)
        depth2: (n2, h, w)
        conf1: (n1, h, w)
        conf2: (n2, h, w)
        conf_threshold_ratio: fraction of each median confidence used as the
            confidence cut-off (the larger of the two, floored at 1e-6).
        max_samples: cap on the number of pixels passed to RANSAC.

    Returns:
        ``(scale, inlier_ratio)``. The scale falls back to 1.0 when fewer
        than 100 pixels are usable (inlier_ratio 0.0) or when the fitted
        scale is outside (0.1, 10.0).
    """
    d1 = depth1.reshape(-1)
    d2 = depth2.reshape(-1)
    c1 = conf1.reshape(-1)
    c2 = conf2.reshape(-1)

    conf_threshold = max(
        np.median(c1) * conf_threshold_ratio,
        np.median(c2) * conf_threshold_ratio,
        1e-6,
    )
    confident = (c1 > conf_threshold) & (c2 > conf_threshold)
    in_range = (d1 > 1e-3) & (d2 > 1e-3) & (d1 < 100) & (d2 < 100)
    valid_mask = confident & in_range

    n_valid = np.sum(valid_mask)
    if n_valid < 100:
        print(f"Warning: Only {n_valid} valid points, using default scale 1.0")
        return 1.0, 0.0

    y = d1[valid_mask]
    x = d2[valid_mask]
    if len(y) > max_samples:
        # Random subsample without replacement to bound the RANSAC cost.
        pick = np.random.choice(len(y), max_samples, replace=False)
        y = y[pick]
        x = x[pick]
    X = x.reshape(-1, 1)

    ransac = RANSACRegressor(
        estimator=LinearRegression(fit_intercept=False),
        max_trials=1000,
        min_samples=max(10, len(X) // 100),
        residual_threshold=0.1,
        random_state=42,
    )
    ransac.fit(X, y)

    scale_factor = ransac.estimator_.coef_[0]
    inlier_mask = ransac.inlier_mask_
    inlier_ratio = np.sum(inlier_mask) / len(inlier_mask)
    print(f"RANSAC scale: {scale_factor:.6f}, inlier ratio: {inlier_ratio:.4f}")

    if not (0.1 < scale_factor < 10.0):
        print(f"Warning: Unreasonable scale {scale_factor}, using 1.0")
        return 1.0, inlier_ratio
    return scale_factor, inlier_ratio
def compute_scale_weighted(
    depth1, depth2, conf1, conf2, conf_threshold_ratio=0.1, weight_power=2.0, robust_quantile=0.9
):
    """
    Estimate the scale ``depth1 / depth2`` from a confidence-weighted
    quantile of the per-pixel depth ratios.

    Args:
        depth1: (n1, h, w)
        depth2: (n2, h, w)
        conf1: (n1, h, w)
        conf2: (n2, h, w)
        conf_threshold_ratio: fraction of each median confidence used as the
            confidence cut-off (the larger of the two, floored at 1e-6).
        weight_power: exponent applied to ``conf1 * conf2``.
        robust_quantile: weighted quantile of the ratios that is returned.

    Returns:
        ``(scale, confidence_score)``. The scale falls back to 1.0 when
        fewer than 100 pixels are usable (score 0.0) or when the quantile is
        outside (0.1, 10.0). The score is one minus the normalised entropy
        of the weights.
    """
    d1 = depth1.reshape(-1)
    d2 = depth2.reshape(-1)
    c1 = conf1.reshape(-1)
    c2 = conf2.reshape(-1)

    conf_threshold = max(
        np.median(c1) * conf_threshold_ratio,
        np.median(c2) * conf_threshold_ratio,
        1e-6,
    )
    confident = (c1 > conf_threshold) & (c2 > conf_threshold)
    in_range = (d1 > 1e-3) & (d2 > 1e-3) & (d1 < 100) & (d2 < 100)
    valid_mask = confident & in_range

    n_valid = np.sum(valid_mask)
    if n_valid < 100:
        print(f"Warning: Only {n_valid} valid points, using default scale 1.0")
        return 1.0, 0.0

    weights = (c1[valid_mask] * c2[valid_mask]) ** weight_power
    weights = weights / (np.sum(weights) + 1e-8)
    ratios = d1[valid_mask] / (d2[valid_mask] + 1e-8)

    # Walk the ratios in ascending order while accumulating their weights.
    order = np.argsort(ratios)
    ratios_sorted = ratios[order]
    cumulative = np.cumsum(weights[order])
    n_ratios = len(ratios_sorted)

    median_pos = np.searchsorted(cumulative, 0.5)
    if median_pos < n_ratios:
        scale_median = ratios_sorted[median_pos]
    else:
        scale_median = 1.0

    quantile_pos = np.searchsorted(cumulative, robust_quantile)
    if quantile_pos < n_ratios:
        scale_quantile = ratios_sorted[quantile_pos]
    else:
        # Requested quantile lies past the accumulated mass: use the median.
        scale_quantile = scale_median

    weight_entropy = -np.sum(weights * np.log(weights + 1e-8))
    max_entropy = np.log(len(weights))
    if max_entropy > 0:
        confidence_score = 1.0 - (weight_entropy / max_entropy)
    else:
        confidence_score = 0.0
    print(f"Weighted scale: {scale_quantile:.6f}, confidence: {confidence_score:.4f}")

    if not (0.1 < scale_quantile < 10.0):
        print(f"Warning: Unreasonable scale {scale_quantile}, using 1.0")
        return 1.0, confidence_score
    return scale_quantile, confidence_score
def compute_chunk_scale_advanced(depth1, depth2, conf1, conf2, method="auto"):
    """
    Estimate the depth scale between two chunks with the chosen strategy.

    method: 'auto', 'ransac', 'weighted'
        'auto' runs both estimators; if both quality scores exceed 0.7 the
        two scales are averaged, otherwise the higher-scoring one is used.

    Returns ``(scale, quality, method_used)`` where ``method_used`` is
    'ransac', 'weighted' or 'average'.
    Raises ValueError for an unknown ``method`` (previously this returned
    None, which surfaced as an unpacking TypeError in the caller).
    """
    if method == "ransac":
        scale, score = compute_scale_ransac(depth1, depth2, conf1, conf2)
        return scale, score, "ransac"
    if method == "weighted":
        scale, score = compute_scale_weighted(depth1, depth2, conf1, conf2)
        return scale, score, "weighted"
    if method != "auto":
        raise ValueError(f"Unknown scale method: {method}")

    scale_ransac, ransac_quality = compute_scale_ransac(depth1, depth2, conf1, conf2)
    scale_weighted, weighted_quality = compute_scale_weighted(depth1, depth2, conf1, conf2)
    print(f"RANSAC quality: {ransac_quality:.4f}, Weighted quality: {weighted_quality:.4f}")
    if ransac_quality > 0.7 and weighted_quality > 0.7:
        # Both estimates look reliable: average them.
        final_scale = (scale_ransac + scale_weighted) / 2
        final_method = "average"
    elif ransac_quality > weighted_quality:
        final_scale = scale_ransac
        final_method = "ransac"
    else:
        final_scale = scale_weighted
        final_method = "weighted"
    final_quality = max(ransac_quality, weighted_quality)
    return final_scale, final_quality, final_method
def precompute_scale_chunks_with_depth(
    chunk1_depth, chunk1_conf, chunk2_depth, chunk2_conf, method="auto"
):
    """
    Compute and log the depth scale between two chunks.

    Args:
        chunk1_depth: (n1, h, w)
        chunk1_conf: (n1, h, w)
        chunk2_depth: (n2, h, w)
        chunk2_conf: (n2, h, w)
        method: 'auto', 'ransac', 'weighted'

    Returns:
        ``(scale_factor, quality_score, method_used)`` exactly as produced
        by ``compute_chunk_scale_advanced``.
    """
    # Note the argument order change: depths first, then confidences.
    result = compute_chunk_scale_advanced(
        chunk1_depth, chunk2_depth, chunk1_conf, chunk2_conf, method
    )
    scale_factor, quality_score, method_used = result
    print(f"Final scale: {scale_factor:.6f}, quality: {quality_score:.4f}, method: {method_used}")
    return scale_factor, quality_score, method_used
+# ===== Scale precompute end =====
def weighted_align_point_maps(
    point_map1, conf1, point_map2, conf2, conf_threshold, config, precompute_scale=None
):
    """
    point_map2 -> point_map1

    Collects pixels whose confidence exceeds ``conf_threshold`` in both maps
    (over the first ``min(b1, b2)`` frames), weights each pair by
    ``sqrt(conf1 * conf2)`` and runs the robust estimator selected by
    ``config["Model"]["align_lib"]`` ('numba', 'numpy', 'torch', 'triton').

    When ``precompute_scale`` is given (align method 'scale+se3'),
    ``point_map2`` is multiplied by it IN PLACE before matching and the
    returned scale is ``precompute_scale``.

    Returns ``(s, R, t)``.
    Raises ValueError when no pixel pair passes the threshold or when
    ``align_lib`` is unknown.
    """
    n_frames1, _, _, _ = point_map1.shape
    n_frames2, _, _, _ = point_map2.shape
    n_frames = min(n_frames1, n_frames2)

    if precompute_scale is not None:  # meaning we are using align method 'scale+se3'
        # NOTE(review): this mutates the caller's array, and the scaled array
        # is later passed to compute_alignment_error together with
        # s = precompute_scale. Confirm the scale is not applied twice there.
        point_map2 *= precompute_scale

    target_chunks = []
    source_chunks = []
    weight_chunks = []
    for k in range(n_frames):
        keep = np.where((conf1[k] > conf_threshold) & (conf2[k] > conf_threshold))
        if len(keep[0]) == 0:
            continue
        target_chunks.append(point_map1[k][keep])
        source_chunks.append(point_map2[k][keep])
        weight_chunks.append(np.sqrt(conf1[k][keep] * conf2[k][keep]))

    if not target_chunks:
        raise ValueError("No matching point pairs were found!")

    all_pts1 = np.concatenate(target_chunks, axis=0)
    all_pts2 = np.concatenate(source_chunks, axis=0)
    all_weights = np.concatenate(weight_chunks, axis=0)
    print(f"The number of corresponding points matched: {all_pts1.shape[0]}")

    # Resolve the backend first; only the selected estimator name is looked up.
    model_cfg = config["Model"]
    align_lib = model_cfg["align_lib"]
    if align_lib == "numba":
        estimator = robust_weighted_estimate_sim3_numba
    elif align_lib == "numpy":
        estimator = robust_weighted_estimate_sim3
    elif align_lib == "torch":
        estimator = robust_weighted_estimate_sim3_torch
    elif align_lib == "triton":
        estimator = robust_weighted_estimate_sim3_triton
    else:
        raise ValueError(f"Unknown align_lib: {config['Model']['align_lib']}")

    irls_cfg = model_cfg["IRLS"]
    # NOTE(review): eval() runs whatever string the config holds for "tol";
    # only safe while the config file is trusted.
    s, R, t = estimator(
        all_pts2,
        all_pts1,
        all_weights,
        delta=irls_cfg["delta"],
        max_iters=irls_cfg["max_iters"],
        tol=eval(irls_cfg["tol"]),
        align_method=model_cfg["align_method"],
    )

    if precompute_scale is not None:  # meaning we are using align method 'scale+se3'
        # we need this precompute_scale for loop align
        s = precompute_scale

    mean_error = compute_alignment_error(
        point_map1, conf1, point_map2, conf2, conf_threshold, s, R, t
    )
    print(f"Mean error: {mean_error}")
    return s, R, t

Depth-Anything-3/da3_streaming/scripts/download_weights.sh ADDED Viewed

	@@ -0,0 +1,20 @@

#!/bin/bash
# Download the SALAD and DA3NESTED-GIANT-LARGE-1.1 weights into ./weights.
# Abort on the first failing command so a failed `cd` or download cannot
# leave files in the wrong directory or partial results behind unnoticed.
set -euo pipefail
# -p: do not fail when the directory already exists (script can be re-run).
mkdir -p weights
cd ./weights
# SALAD (~ 340 MiB)
echo "Downloading SALAD weights (~ 340 MiB) ..."
SALAD_URL="https://github.com/serizba/salad/releases/download/v1.0.0/dino_salad.ckpt"
# -f: treat HTTP errors as failures instead of saving the error page as the checkpoint.
curl -fL "$SALAD_URL" -o "./dino_salad.ckpt"
# DA3NESTED-GIANT-LARGE-1.1
echo "Downloading DA3NESTED-GIANT-LARGE-1.1 weights and config (~ 6.76 GiB)..."
BASE_URL="https://huggingface.co/depth-anything/DA3NESTED-GIANT-LARGE-1.1/resolve/main"
# download config.json (~ 3.1 KiB)
curl -fL "$BASE_URL/config.json" -o "./config.json"
# download model.safetensors (~ 6.76 GiB)
curl -fL "$BASE_URL/model.safetensors" -o "./model.safetensors"

Depth-Anything-3/docs/API.md ADDED Viewed

	@@ -0,0 +1,465 @@

+# 📚 DepthAnything3 API Documentation
+## 📑 Table of Contents
+1. [📖 Overview](#overview)
+2. [💡 Usage Examples](#usage-examples)
+3. [🔧 Core API](#core-api)
+   - [DepthAnything3 Class](#depthanything3-class)
+   - [inference() Method](#inference-method)
+4. [⚙️ Parameters](#parameters)
+   - [Input Parameters](#input-parameters)
+   - [Pose Alignment Parameters](#pose-alignment-parameters)
+   - [Feature Export Parameters](#feature-export-parameters)
+   - [Rendering Parameters](#rendering-parameters)
+   - [Processing Parameters](#processing-parameters)
+   - [Export Parameters](#export-parameters)
+5. [📤 Export Formats](#export-formats)
+6. [↩️ Return Value](#return-value)
+## 📖 Overview
+This documentation provides a comprehensive API reference for DepthAnything3, including usage examples, parameter specifications, export formats, and advanced features. It covers both basic pose and depth estimation workflows and advanced pose-conditioned processing with multiple export capabilities.
+## 💡 Usage Examples
+Here are quick examples to get you started:
+### 🚀 Basic Depth Estimation
+```python
+from depth_anything_3.api import DepthAnything3
+# Initialize and run inference
+model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE").to("cuda")
+prediction = model.inference(["image1.jpg", "image2.jpg"])
+```
+### 📷 Pose-Conditioned Depth Estimation
+```python
+import numpy as np
+# With camera parameters for better consistency
+prediction = model.inference(
+    image=["image1.jpg", "image2.jpg"],
+    extrinsics=extrinsics_array,  # (N, 4, 4)
+    intrinsics=intrinsics_array   # (N, 3, 3)
+)
+```
+### 📤 Export Results
+```python
+# Export depth data and 3D visualization
+prediction = model.inference(
+    image=image_paths,
+    export_dir="./output",
+    export_format="mini_npz-glb"
+)
+```
+### 🔍 Feature Extraction
+```python
+# Export intermediate features from specific layers
+prediction = model.inference(
+    image=image_paths,
+    export_dir="./output",
+    export_format="feat_vis",
+    export_feat_layers=[0, 1, 2]  # Export features from layers 0, 1, 2
+)
+```
+### ✨ Advanced Export with Gaussian Splatting
+```python
+# Export multiple formats including Gaussian Splatting
+# Note: infer_gs=True requires da3-giant or da3nested-giant-large model
+model = DepthAnything3(model_name="da3-giant").to("cuda")
+prediction = model.inference(
+    image=image_paths,
+    extrinsics=extrinsics_array,
+    intrinsics=intrinsics_array,
+    export_dir="./output",
+    export_format="npz-glb-gs_ply-gs_video",
+    align_to_input_ext_scale=True,
+    infer_gs=True,  # Required for gs_ply and gs_video exports
+)
+```
+### 🎨 Advanced Export with Feature Visualization
+```python
+# Export with intermediate feature visualization
+prediction = model.inference(
+    image=image_paths,
+    export_dir="./output",
+    export_format="mini_npz-glb-depth_vis-feat_vis",
+    export_feat_layers=[0, 5, 10, 15, 20],
+    feat_vis_fps=30,
+)
+```
+### 📐 Using Ray-Based Pose Estimation
+```python
+# Use ray-based pose estimation instead of camera decoder
+prediction = model.inference(
+    image=image_paths,
+    export_dir="./output",
+    export_format="glb",
+    use_ray_pose=True,  # Enable ray-based pose estimation
+)
+```
+### 🎯 Reference View Selection
+```python
+# For multi-view inputs, automatically select the best reference view
+prediction = model.inference(
+    image=image_paths,
+    ref_view_strategy="saddle_balanced",  # Default: balanced selection
+)
+# For video sequences, use middle frame as reference
+prediction = model.inference(
+    image=video_frames,
+    ref_view_strategy="middle",  # Good for temporally ordered inputs
+)
+```
+## 🔧 Core API
+### 🔨 DepthAnything3 Class
+The main API class that provides depth estimation capabilities with optional pose conditioning.
+#### 🎯 Initialization
+```python
+from depth_anything_3 import DepthAnything3
+# Initialize the model with a model name
+model = DepthAnything3(model_name="da3-large")
+model = model.to("cuda")  # Move to GPU
+```
+**Parameters:**
+- `model_name` (str, default: "da3-large"): The name of the model preset to use.
+  - **Available models:**
+    - 🦾 `"da3-giant"` - 1.15B params, any-view model with GS support
+    - ⭐ `"da3-large"` - 0.35B params, any-view model (recommended for most use cases)
+    - 📦 `"da3-base"` - 0.12B params, any-view model
+    - 🪶 `"da3-small"` - 0.08B params, any-view model
+    - 👁️ `"da3mono-large"` - 0.35B params, monocular depth only
+    - 📏 `"da3metric-large"` - 0.35B params, metric depth with sky segmentation
+    - 🎯 `"da3nested-giant-large"` - 1.40B params, nested model with all features
+### 🚀 inference() Method
+The primary inference method that processes images and returns depth predictions.
+```python
+prediction = model.inference(
+    image=image_list,
+    extrinsics=extrinsics_array,      # Optional
+    intrinsics=intrinsics_array,      # Optional
+    align_to_input_ext_scale=True,   # Whether to align predicted poses to input scale
+    infer_gs=True,                   # Enable Gaussian branch for gs exports
+    use_ray_pose=False,              # Use ray-based pose estimation instead of camera decoder
+    ref_view_strategy="saddle_balanced",  # Reference view selection strategy
+    render_exts=render_extrinsics,    # Optional renders for gs_video
+    render_ixts=render_intrinsics,    # Optional renders for gs_video
+    render_hw=(height, width),        # Optional renders for gs_video
+    process_res=504,
+    process_res_method="upper_bound_resize",
+    export_dir="output_directory",    # Optional
+    export_format="mini_npz",
+    export_feat_layers=[],            # List of layer indices to export features from
+    conf_thresh_percentile=40.0,      # Confidence threshold percentile for depth map in GLB export
+    num_max_points=1_000_000,         # Maximum number of points to export in GLB export
+    show_cameras=True,                # Whether to show cameras in GLB export
+    feat_vis_fps=15,                  # Frames per second for feature visualization in feat_vis export
+    export_kwargs={}                  # Optional, additional arguments to export functions. export_format:key:val, see 'Parameters/Export Parameters' for details
+)
+```
+## ⚙️ Parameters
+### 📸 Input Parameters
+#### `image` (required)
+- **Type**: `List[Union[np.ndarray, Image.Image, str]]`
+- **Description**: List of input images. Can be numpy arrays, PIL Images, or file paths.
+- **Example**:
+  ```python
+  # From file paths
+  image = ["image1.jpg", "image2.jpg", "image3.jpg"]
+  # From numpy arrays
+  image = [np.array(img1), np.array(img2)]
+  # From PIL Images
+  image = [Image.open("image1.jpg"), Image.open("image2.jpg")]
+  ```
+#### `extrinsics` (optional)
+- **Type**: `Optional[np.ndarray]`
+- **Shape**: `(N, 4, 4)` where N is the number of input images
+- **Description**: Camera extrinsic matrices (world-to-camera transformation). When provided, enables pose-conditioned depth estimation mode.
+- **Note**: If not provided, the model operates in standard depth estimation mode.
+#### `intrinsics` (optional)
+- **Type**: `Optional[np.ndarray]`
+- **Shape**: `(N, 3, 3)` where N is the number of input images
+- **Description**: Camera intrinsic matrices containing focal length and principal point information. When provided, enables pose-conditioned depth estimation mode.
+### 🎯 Pose Alignment Parameters
+#### `align_to_input_ext_scale` (default: True)
+- **Type**: `bool`
+- **Description**: When True the predicted extrinsics are replaced with the input
+  ones and the depth maps are rescaled to match their metric scale. When False the
+  function returns the internally aligned poses computed via Umeyama alignment.
+#### `infer_gs` (default: False)
+- **Type**: `bool`
+- **Description**: Enable Gaussian Splatting branch for gaussian splatting exports. Required when using `gs_ply` or `gs_video` export formats.
+#### `use_ray_pose` (default: False)
+- **Type**: `bool`
+- **Description**: Use ray-based pose estimation instead of camera decoder for pose prediction. When True, the model uses ray prediction heads to estimate camera poses; when False, it uses the camera decoder approach.
+#### `ref_view_strategy` (default: "saddle_balanced")
+- **Type**: `str`
+- **Description**: Strategy for selecting the reference view from multiple input views. Options: `"first"`, `"middle"`, `"saddle_balanced"`, `"saddle_sim_range"`. Only applied when number of views ≥ 3. See [detailed documentation](funcs/ref_view_strategy.md) for strategy comparisons.
+- **Available strategies**:
+  - `"saddle_balanced"`: Selects view with balanced features across multiple metrics (recommended default)
+  - `"saddle_sim_range"`: Selects view with largest similarity range
+  - `"first"`: Always uses first view (not recommended, equivalent to no reordering for views < 3)
+  - `"middle"`: Uses middle view (recommended for video sequences)
+### 🔍 Feature Export Parameters
+#### `export_feat_layers` (default: [])
+- **Type**: `List[int]`
+- **Description**: List of layer indices to export intermediate features from. Features are stored in the `aux` dictionary of the Prediction object with keys like `feat_layer_0`, `feat_layer_1`, etc.
+### 🎥 Rendering Parameters
+These arguments are only used when exporting Gaussian-splatting videos (include
+`"gs_video"` in `export_format`). They describe an auxiliary camera trajectory
+with `M` views.
+#### `render_exts` (optional)
+- **Type**: `Optional[np.ndarray]`
+- **Shape**: `(M, 4, 4)`
+- **Description**: Camera extrinsics for the synthesized trajectory. If omitted,
+  the exporter falls back to the predicted poses.
+#### `render_ixts` (optional)
+- **Type**: `Optional[np.ndarray]`
+- **Shape**: `(M, 3, 3)`
+- **Description**: Camera intrinsics for each rendered frame. Leave `None` to
+  reuse the input intrinsics.
+#### `render_hw` (optional)
+- **Type**: `Optional[Tuple[int, int]]`
+- **Description**: Explicit output resolution `(height, width)` for the rendered
+  frames. Defaults to the input resolution when not provided.
+### ⚡ Processing Parameters
+#### `process_res` (default: 504)
+- **Type**: `int`
+- **Description**: Base resolution for processing. The model will resize images to this resolution for inference.
+#### `process_res_method` (default: "upper_bound_resize")
+- **Type**: `str`
+- **Description**: Method for resizing images to the target resolution.
+- **Options**:
+  - `"upper_bound_resize"`: Resize so that the specified dimension (504) becomes the longer side
+  - `"lower_bound_resize"`: Resize so that the specified dimension (504) becomes the shorter side
+- **Example**:
+  - Input: 1200×1600 → Output: 378×504 (with `process_res=504`, `process_res_method="upper_bound_resize"`)
+  - Input: 504×672 → Output: 504×672 (no change needed)
+### 📦 Export Parameters
+#### `export_dir` (optional)
+- **Type**: `Optional[str]`
+- **Description**: Directory path where exported files will be saved. If not provided, no files will be exported.
+#### `export_format` (default: "mini_npz")
+- **Type**: `str`
+- **Description**: Format for exporting results. Supports multiple formats separated by `-`.
+- **Example**: `"mini_npz-glb"` exports both mini_npz and glb formats.
+#### 🌐 GLB Export Parameters
+These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"glb"`.
+##### `conf_thresh_percentile` (default: 40.0)
+- **Type**: `float`
+- **Description**: Lower percentile for adaptive confidence threshold. Points below this confidence percentile will be filtered out from the point cloud.
+##### `num_max_points` (default: 1,000,000)
+- **Type**: `int`
+- **Description**: Maximum number of points in the exported point cloud. If the point cloud exceeds this limit, it will be downsampled.
+##### `show_cameras` (default: True)
+- **Type**: `bool`
+- **Description**: Whether to include camera wireframes in the exported GLB file for visualization.
+#### 🎨 Feature Visualization Parameters
+These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"feat_vis"`.
+##### `feat_vis_fps` (default: 15)
+- **Type**: `int`
+- **Description**: Frame rate for the output video when visualizing features across multiple images.
+#### ✨🎥 3DGS and 3DGS Video Parameters
+These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"gs_ply"` or `"gs_video"`.
+##### `export_kwargs` (default: `{}`)
+- **Type**: `dict[str, dict[str, Any]]`
+- **Description**: Per-format extra arguments passed to export functions, mainly for `"gs_ply"` and `"gs_video"`.
+  - Access pattern: `export_kwargs[export_format][key] = value`
+  - Example:
+    ```python
+    {
+        "gs_ply": {
+            "gs_views_interval": 1,
+        },
+        "gs_video": {
+            "trj_mode": "interpolate_smooth",
+            "chunk_size": 1,
+            "vis_depth": None,
+        },
+    }
+    ```
+## 📤 Export Formats
+The API supports multiple export formats for different use cases:
+### 📊 `mini_npz`
+- **Description**: Minimal NPZ format containing essential data
+- **Contents**: `depth`, `conf`, `exts`, `ixts`
+- **Use case**: Lightweight storage for depth data with camera parameters
+### 📦 `npz`
+- **Description**: Full NPZ format with comprehensive data
+- **Contents**: `depth`, `conf`, `exts`, `ixts`, `image`, etc.
+- **Use case**: Complete data export for advanced processing
+### 🌐 `glb`
+- **Description**: 3D visualization format with point cloud and camera poses
+- **Contents**:
+  - Point cloud with colors from original images
+  - Camera wireframes for visualization
+  - Confidence-based filtering and downsampling
+- **Use case**: 3D visualization, inspection, and analysis
+- **Features**:
+  - Automatic sky depth handling
+  - Confidence threshold filtering
+  - Background filtering (black/white)
+  - Scene scale normalization
+- **Parameters** (passed via `inference()` method directly):
+  - `conf_thresh_percentile` (float, default: 40.0): Lower percentile for adaptive confidence threshold. Points below this confidence percentile will be filtered out.
+  - `num_max_points` (int, default: 1,000,000): Maximum number of points in the exported point cloud. If exceeded, points will be downsampled.
+  - `show_cameras` (bool, default: True): Whether to include camera wireframes in the exported GLB file for visualization.
+### ✨ `gs_ply`
+- **Description**: Gaussian Splatting point cloud format
+- **Contents**: 3DGS data in PLY format. Compatible with standard 3DGS viewers such as [SuperSplat](https://superspl.at/editor) (recommended) and [SPARK](https://sparkjs.dev/viewer/).
+- **Use case**: Gaussian Splatting reconstruction
+- **Requirements**: Must set `infer_gs=True` when calling `inference()`. Only supported by `da3-giant` and `da3nested-giant-large` models.
+- **Additional configs**, provided via `export_kwargs` (see [Export Parameters](#export-parameters)):
+  - `gs_views_interval`: Export to 3DGS every N views, default: `1`.
+### 🎥 `gs_video`
+- **Description**: Rasterizes the 3DGS to render videos
+- **Contents**: A video of 3DGS-rasterized views using either provided viewpoints or a predefined camera trajectory.
+- **Use case**: Video rendering for Gaussian Splatting
+- **Requirements**: Must set `infer_gs=True` when calling `inference()`. Only supported by `da3-giant` and `da3nested-giant-large` models.
+- **Note**: Can optionally use `render_exts`, `render_ixts`, and `render_hw` parameters in `inference()` method to specify novel viewpoints.
+- **Additional configs**, provided via `export_kwargs` (see [Export Parameters](#export-parameters)):
+  - `extrinsics`: Optional world-to-camera poses for novel views. Falls back to the predicted poses of input views if not provided. (Alternatively, use `render_exts` parameter in `inference()`)
+  - `intrinsics`: Optional camera intrinsics for novel views. Falls back to the predicted intrinsics of input views if not provided. (Alternatively, use `render_ixts` parameter in `inference()`)
+  - `out_image_hw`: Optional output resolution `H x W`. Falls back to input resolution if not provided. (Alternatively, use `render_hw` parameter in `inference()`)
+  - `chunk_size`: Number of views rasterized per batch. Default: `8`.
+  - `trj_mode`: Predefined camera trajectory for novel-view rendering.
+  - `color_mode`: Same as `render_mode` in [gsplat](https://docs.gsplat.studio/main/apis/rasterization.html#gsplat.rasterization).
+  - `vis_depth`: How depth is combined with RGB. Default: `hcat` (horizontal concatenation).
+  - `enable_tqdm`: Whether to display a tqdm progress bar during rendering.
+  - `output_name`: File name of the rendered video.
+  - `video_quality`: Video quality to save. Default: `high`.
+    - `high`: High quality video (default)
+    - `medium`: Medium quality video (balance of storage space and quality)
+    - `low`: Low quality video (less storage space)
+### 🔍 `feat_vis`
+- **Description**: Feature visualization format
+- **Contents**: PCA-visualized intermediate features from specified layers
+- **Use case**: Model interpretability and feature analysis
+- **Note**: Requires `export_feat_layers` to be specified
+- **Parameters** (passed via `inference()` method directly):
+  - `feat_vis_fps` (int, default: 15): Frame rate for the output video when visualizing features across multiple images.
+### 🎨 `depth_vis`
+- **Description**: Depth visualization format
+- **Contents**: Color-coded depth maps alongside original images
+- **Use case**: Visual inspection of depth estimation quality
+### 🔗 Multiple Format Export
+You can export multiple formats simultaneously by separating them with `-`:
+```python
+# Export both mini_npz and glb formats
+export_format = "mini_npz-glb"
+# Export multiple formats
+export_format = "npz-glb-gs_ply"
+```
+## ↩️ Return Value
+The `inference()` method returns a `Prediction` object with the following attributes:
+### 📊 Core Outputs
+- **depth**: `np.ndarray` - Estimated depth maps with shape `(N, H, W)` where N is the number of images, H is height, and W is width.
+- **conf**: `np.ndarray` - Confidence maps with shape `(N, H, W)` indicating prediction reliability (optional, depends on model).
+### 📷 Camera Parameters
+- **extrinsics**: `np.ndarray` - Camera extrinsic matrices with shape `(N, 3, 4)` representing world-to-camera transformations. Only present if camera poses were estimated or provided as input.
+- **intrinsics**: `np.ndarray` - Camera intrinsic matrices with shape `(N, 3, 3)` containing focal length and principal point information. Only present if poses were estimated or provided as input.
+### 🎁 Additional Outputs
+- **processed_images**: `np.ndarray` - Preprocessed input images with shape `(N, H, W, 3)` in RGB format (0-255 uint8).
+- **aux**: `dict` - Auxiliary outputs including:
+  - `feat_layer_X`: Intermediate features from layer X (if `export_feat_layers` was specified)
+  - `gaussians`: 3D Gaussian Splats data (if `infer_gs=True`)
+### 💻 Usage Example
+```python
+prediction = model.inference(image=["img1.jpg", "img2.jpg"])
+# Access depth maps
+depth_maps = prediction.depth  # shape: (2, H, W)
+# Access confidence
+if hasattr(prediction, 'conf'):
+    confidence = prediction.conf
+# Access camera parameters (if available)
+if hasattr(prediction, 'extrinsics'):
+    camera_poses = prediction.extrinsics  # shape: (2, 3, 4)
+if hasattr(prediction, 'intrinsics'):
+    camera_intrinsics = prediction.intrinsics  # shape: (2, 3, 3)
+# Access intermediate features (if export_feat_layers was set)
+if hasattr(prediction, 'aux') and 'feat_layer_0' in prediction.aux:
+    features = prediction.aux['feat_layer_0']
+```

Depth-Anything-3/docs/BENCHMARK.md ADDED Viewed

	@@ -0,0 +1,484 @@

+# 📏 Visual Geometry Benchmark
+This document provides comprehensive instructions for running benchmark evaluation on Depth Anything 3.
+## ✨ Highlights
+- 🗂️ **Diverse and Challenging Datasets**: 5 datasets (ETH3D, 7Scenes, ScanNet++, HiRoom, DTU) ranging from objects to indoor and outdoor scenes. Some of the datasets are recalibrated for higher accuracy (see the [ScanNet++](#scannet) details). All preprocessed datasets are available at [depth-anything/DA3-BENCH](https://huggingface.co/datasets/depth-anything/DA3-BENCH).
+- 🔧 **Robust Evaluation Pipeline**: Standardized pipeline featuring RANSAC-based pose alignment for more reliable coordinate-system alignment, and TSDF fusion to directly reflect the 3D consistency of the predicted depth.
+- 📊 **Standardized Metrics**: Performance measured using established metrics: AUC for pose accuracy, F1-score and Chamfer Distance for reconstruction.
+---
+## 📑 Table of Contents
+- [🚀 Quick Start](#quick-start)
+- [📥 Dataset Download](#dataset-download)
+- [⚙️ Evaluation Pipeline](#evaluation-pipeline)
+- [🔧 Configuration](#configuration)
+- [📊 Metrics](#metrics)
+- [🗂️ Dataset Details](#dataset-details)
+- [💻 Command Reference](#command-reference)
+- [🔍 Troubleshooting](#troubleshooting)
+---
+## 🚀 Quick Start
+### 1. Download Benchmark Data
+> 💡 **Note:** Install the HuggingFace CLI first: `pip install -U "huggingface_hub[cli]"` (the quotes keep shells such as zsh from expanding the brackets)
+>
+> 🌐 **Mirror:** If download is slow, try: `export HF_ENDPOINT=https://hf-mirror.com`
+```bash
+cd da3_release
+# Create directory and download from HuggingFace
+mkdir -p workspace/benchmark_dataset
+hf download depth-anything/DA3-BENCH \
+    --local-dir workspace/benchmark_dataset \
+    --repo-type dataset
+# Extract all datasets
+cd workspace/benchmark_dataset
+for f in *.zip; do unzip -q "$f"; done
+```
+### 2. Run Evaluation
+```bash
+# Set model (default: depth-anything/DA3-GIANT)
+MODEL=depth-anything/DA3-GIANT
+# Full evaluation (all datasets, all modes)
+python -m depth_anything_3.bench.evaluator model.path=$MODEL
+# View results
+python -m depth_anything_3.bench.evaluator eval.print_only=true
+```
+---
+## 📥 Dataset Download
+All benchmark datasets are hosted on HuggingFace: **[depth-anything/DA3-BENCH](https://huggingface.co/datasets/depth-anything/DA3-BENCH)**
+| Dataset | File | Size | Description |
+|---------|------|------|-------------|
+| ETH3D | `eth3d.zip` | ~14.1 GB | High-resolution multi-view stereo (indoor/outdoor) |
+| ScanNet++ | `scannetpp.zip` | ~10.1 GB | High-quality RGB-D indoor scenes |
+| DTU-49 | `dtu.zip` | ~8.3 GB | Multi-view stereo benchmark (22 scenes × 49 views) |
+| 7Scenes | `7scenes.zip` | ~3.3 GB | RGB-D indoor localization |
+| DTU-64 | `dtu64.zip` | ~1.7 GB | DTU subset for pose evaluation (13 scenes × 64 views) |
+| HiRoom | `hiroom.zip` | ~0.7 GB | High-resolution indoor rooms |
+### Download Options
+**Option 1: Download All (Recommended)**
+```bash
+hf download depth-anything/DA3-BENCH \
+    --local-dir workspace/benchmark_dataset \
+    --repo-type dataset
+```
+**Option 2: Download Specific Dataset**
+```bash
+# Download only HiRoom
+hf download depth-anything/DA3-BENCH hiroom.zip \
+    --local-dir workspace/benchmark_dataset \
+    --repo-type dataset
+```
+**Option 3: Manual Download**
+Visit [https://huggingface.co/datasets/depth-anything/DA3-BENCH](https://huggingface.co/datasets/depth-anything/DA3-BENCH) and download the zip files manually.
+### Extract Datasets
+```bash
+cd workspace/benchmark_dataset
+# Extract all
+for f in *.zip; do unzip -q "$f"; done
+# Or extract specific dataset
+unzip hiroom.zip
+```
+### Expected Directory Structure
+After extraction, your directory should look like:
+```
+workspace/benchmark_dataset/
+├── eth3d/
+│   ├── courtyard/
+│   ├── electro/
+│   └── ...
+├── 7scenes/
+│   └── 7Scenes/
+│       ├── chess/
+│       └── ...
+├── scannetpp/
+│   ├── 09c1414f1b/
+│   └── ...
+├── hiroom/
+│   ├── data/
+│   ├── fused_pcd/
+│   └── selected_scene_list_val.txt
+├── dtu/
+│   ├── Rectified/
+│   ├── Cameras/
+│   ├── Points/
+│   ├── SampleSet/
+│   └── depth_raw/
+└── dtu64/
+    ├── Cameras/
+    ├── scan105/
+    └── ...
+```
+---
+## ⚙️ Evaluation Pipeline
+### Evaluation Modes
+| Mode | Description | Metrics |
+|------|-------------|---------|
+| `pose` | Camera pose estimation | AUC@3°, AUC@30° |
+| `recon_unposed` | 3D reconstruction with **predicted** poses | F-score, Overall |
+| `recon_posed` | 3D reconstruction with **GT** poses | F-score, Overall |
+### Basic Usage
+```bash
+cd da3_release
+MODEL=depth-anything/DA3-GIANT
+# Full evaluation (inference + evaluation + print results)
+python -m depth_anything_3.bench.evaluator model.path=$MODEL
+# Skip inference, only evaluate existing predictions
+python -m depth_anything_3.bench.evaluator eval.eval_only=true
+# Only print saved metrics
+python -m depth_anything_3.bench.evaluator eval.print_only=true
+```
+### Selective Evaluation
+```bash
+# Evaluate specific datasets
+python -m depth_anything_3.bench.evaluator model.path=$MODEL eval.datasets=[hiroom]
+# Evaluate specific modes
+python -m depth_anything_3.bench.evaluator model.path=$MODEL eval.modes=[pose,recon_unposed]
+# Combine dataset and mode selection
+python -m depth_anything_3.bench.evaluator model.path=$MODEL \
+    eval.datasets=[hiroom] \
+    eval.modes=[pose]
+```
+### 🖥️ Multi-GPU Inference
+The evaluator automatically distributes inference across available GPUs:
+```bash
+# Use 4 GPUs
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m depth_anything_3.bench.evaluator model.path=$MODEL
+# Use all available GPUs (default)
+python -m depth_anything_3.bench.evaluator model.path=$MODEL
+# Single GPU
+CUDA_VISIBLE_DEVICES=0 python -m depth_anything_3.bench.evaluator model.path=$MODEL
+```
+---
+## 🔧 Configuration
+### Config File
+Default config: `src/depth_anything_3/bench/configs/eval_bench.yaml`
+```yaml
+# Model path
+model:
+  path: depth-anything/DA3-GIANT
+# Workspace directory
+workspace:
+  work_dir: ./workspace/evaluation
+# Evaluation settings
+eval:
+  datasets: [eth3d, 7scenes, scannetpp, hiroom, dtu, dtu64]
+  modes: [pose, recon_unposed, recon_posed]
+  max_frames: 100      # Max frames per scene (-1 = no limit)
+  scenes: null         # Specific scenes (null = all)
+# Inference settings
+inference:
+  num_fusion_workers: 4
+  debug: false
+```
+### Output Structure
+```
+workspace/evaluation/
+├── model_results/              # Inference outputs
+│   ├── eth3d/
+│   │   └── {scene}/
+│   │       ├── unposed/       # Predictions for recon_unposed
+│   │       └── posed/         # Predictions for recon_posed
+│   ├── 7scenes/
+│   ├── scannetpp/
+│   ├── hiroom/
+│   ├── dtu/
+│   └── dtu64/
+└── metric_results/             # Evaluation metrics (JSON)
+    ├── eth3d_pose.json
+    ├── eth3d_recon_unposed.json
+    ├── eth3d_recon_posed.json
+    └── ...
+```
+---
+## 📊 Metrics
+### 🎯 Pose Estimation
+| Metric | Description |
+|--------|-------------|
+| **Auc3** | Area Under Curve at 3° angular error threshold |
+| **Auc30** | Area Under Curve at 30° angular error threshold |
+### 🏗️ 3D Reconstruction
+| Metric | Description | Note |
+|--------|-------------|------|
+| **F-score** | Harmonic mean of Precision and Recall | Higher is better |
+| **Overall** | (Accuracy + Completeness) / 2 | Lower is better (error in meters/mm) |
+> **Note:** DTU reports Overall in millimeters; other datasets report in meters.
+### Expected Results for DA3-GIANT
+If your setup is correct, you should get the following results when evaluating the **DA3-GIANT** model:
+```
+========================================================
+📊 SUMMARY
+========================================================
+🎯 POSE ESTIMATION
+---------------------------------------------------------------------------------------
+Metric         Avg         HiRoom      ETH3D       DTU-64      7Scenes     ScanNet++
+---------------------------------------------------------------------------------------
+Auc3           0.6705      0.8030      0.4872      0.9408      0.2744      0.8470
+Auc30          0.9436      0.9592      0.9153      0.9939      0.8668      0.9827
+🏗️  RECON_UNPOSED (Pred Pose)
+---------------------------------------------------------------------------------------
+Metric         Avg*        HiRoom      ETH3D       DTU         7Scenes     ScanNet++
+---------------------------------------------------------------------------------------
+F-score        0.7345      0.8629      0.7876      N/A         0.5043      0.7831
+Overall        0.1682      0.0457      0.4366      1.7927      0.1230      0.0676
+🏗️  RECON_POSED (GT Pose)
+---------------------------------------------------------------------------------------
+Metric         Avg*        HiRoom      ETH3D       DTU         7Scenes     ScanNet++
+---------------------------------------------------------------------------------------
+F-score        0.7978      0.9546      0.8685      N/A         0.5635      0.8045
+Overall        0.1408      0.0213      0.3679      1.7488      0.1092      0.0649
+* Avg F-score / Overall = average over HiRoom, ETH3D, 7Scenes, ScanNet++ (4 datasets)
+```
+---
+## 🗂️ Dataset Details
+### ETH3D
+High-resolution multi-view stereo benchmark with laser-scanned ground truth.
+- **Scenes:** 11 (courtyard, electro, kicker, pipes, relief, delivery_area, facade, office, playground, relief_2, terrains)
+- **Resolution:** Variable (high-res DSLR images)
+- **GT:** Laser-scanned meshes + depth maps
+> **⚠️ Image Filtering:** Some images with unusual camera rotations are filtered out for stable evaluation. See `ETH3D_FILTER_KEYS` in `constants.py`.
+### 7Scenes
+RGB-D dataset for camera relocalization.
+- **Scenes:** 7 (chess, fire, heads, office, pumpkin, redkitchen, stairs)
+- **Resolution:** 640×480
+- **GT:** Poses from KinectFusion, meshes from TSDF fusion
+### ScanNet++
+High-quality indoor RGB-D dataset with dense annotations.
+- **Scenes:** 20 validation scenes
+- **Resolution:** 768×1024 (after undistortion)
+- **GT:** High-quality meshes from FARO scanner
+> **⚠️ Camera Pose Re-calibration:** The default ScanNet++ poses are often inaccurate due to motion blur and textureless frames from iPhone captures. We re-ran COLMAP with the following improvements:
+> - **Frame filtering:** Removed blurry images during frame extraction
+> - **Fisheye calibration:** Jointly calibrated fisheye camera for wider FOV and better accuracy
+> - **Exhaustive matching:** Used COLMAP's exhaustive matcher and mapper for reliable poses (takes several days per scene but necessary for quality)
+> - All processed scenes are available at [haotongl/scannetpp_zipnerf](https://huggingface.co/datasets/haotongl/scannetpp_zipnerf)
+### HiRoom
+Indoor room scenes with high-resolution RGB-D data.
+- **Scenes:** 24 validation scenes
+- **GT:** Fused point clouds
+### DTU-49 (Reconstruction Only)
+Multi-view stereo benchmark following MVSNet evaluation protocol.
+- **Scenes:** 22 evaluation scenes
+- **Views:** 49 images per scene
+- **GT:** Laser-scanned point clouds with observation masks
+- **Metrics:** Overall only (accuracy + completeness in mm)
+### DTU-64 (Pose Only)
+DTU subset for pose estimation evaluation.
+- **Scenes:** 13 scenes
+- **Views:** 64 images per scene
+- **Metrics:** AUC@3°, AUC@30°
+> **Why two DTU settings?**
+> - **DTU-64** (pose): More views = more challenging pose estimation
+> - **DTU-49** (recon): Standard MVSNet protocol for fair comparison with MVS methods
+---
+## 💻 Command Reference
+```
+python -m depth_anything_3.bench.evaluator [OPTIONS] [KEY=VALUE ...]
+Configuration:
+  --config PATH                      Config YAML file (default: bench/configs/eval_bench.yaml)
+Config Overrides (using dotlist notation):
+  model.path=VALUE                   Model path or HuggingFace ID
+  workspace.work_dir=VALUE           Working directory for outputs
+  eval.datasets=[dataset1,dataset2]  Datasets to evaluate (eth3d,7scenes,scannetpp,hiroom,dtu,dtu64)
+  eval.modes=[mode1,mode2]           Evaluation modes (pose,recon_unposed,recon_posed)
+  eval.scenes=[scene1,scene2]        Specific scenes to evaluate (null=all)
+  eval.max_frames=VALUE              Max frames per scene (-1=no limit, default: 100)
+  eval.ref_view_strategy=VALUE       Reference view strategy (default: first)
+  eval.eval_only=VALUE               Only run evaluation (skip inference) (true/false)
+  eval.print_only=VALUE              Only print saved metrics (true/false)
+  inference.num_fusion_workers=VALUE Number of parallel workers (default: 4)
+  inference.debug=VALUE              Enable debug mode (true/false)
+Special Flags:
+  --help, -h                         Show this help message
+Multi-GPU:
+  Use CUDA_VISIBLE_DEVICES to specify GPUs (auto-detected and distributed)
+```
+### Examples
+```bash
+MODEL=depth-anything/DA3-GIANT
+# Full evaluation
+python -m depth_anything_3.bench.evaluator model.path=$MODEL
+# Quick test on HiRoom only
+python -m depth_anything_3.bench.evaluator \
+    model.path=$MODEL \
+    eval.datasets=[hiroom] \
+    eval.modes=[pose]
+# Pose-only evaluation (all 5 pose datasets)
+python -m depth_anything_3.bench.evaluator \
+    model.path=$MODEL \
+    eval.datasets=[eth3d,7scenes,scannetpp,hiroom,dtu64] \
+    eval.modes=[pose]
+# Recon-only evaluation (all 5 recon datasets)
+python -m depth_anything_3.bench.evaluator \
+    model.path=$MODEL \
+    eval.datasets=[eth3d,7scenes,scannetpp,hiroom,dtu] \
+    eval.modes=[recon_unposed,recon_posed]
+# Debug specific scenes
+python -m depth_anything_3.bench.evaluator \
+    model.path=$MODEL \
+    eval.datasets=[eth3d] \
+    eval.scenes=[courtyard] \
+    inference.debug=true
+# Re-evaluate without re-running inference
+python -m depth_anything_3.bench.evaluator eval.eval_only=true
+# Just view results
+python -m depth_anything_3.bench.evaluator eval.print_only=true
+```
+---
+## 🔍 Troubleshooting
+### Data Path Issues
+Ensure dataset paths in `src/depth_anything_3/utils/constants.py` are correct:
+```python
+# Default paths (relative to project root)
+ETH3D_EVAL_DATA_ROOT = "workspace/benchmark_dataset/eth3d"
+SEVENSCENES_EVAL_DATA_ROOT = "workspace/benchmark_dataset/7scenes"
+SCANNETPP_EVAL_DATA_ROOT = "workspace/benchmark_dataset/scannetpp"
+HIROOM_EVAL_DATA_ROOT = "workspace/benchmark_dataset/hiroom/data"
+DTU_EVAL_DATA_ROOT = "workspace/benchmark_dataset/dtu"
+DTU64_EVAL_DATA_ROOT = "workspace/benchmark_dataset/dtu64"
+```
+---
+## 📝 Citation
+If you find this benchmark useful, please cite:
+```
+@article{depthanything3,
+  title={Depth Anything 3: Recovering the visual space from any views},
+  author={Haotong Lin and Sili Chen and Jun Hao Liew and Donny Y. Chen and Zhenyu Li and Guang Shi and Jiashi Feng and Bingyi Kang},
+  journal={arXiv preprint arXiv:2511.10647},
+  year={2025}
+}
+```
+Please also cite the original dataset papers for each benchmark you use.
+---
+## 📄 License
+The benchmark datasets are provided for research purposes only. Users must follow the original licenses of each dataset:
+- **ETH3D:** [https://www.eth3d.net/](https://www.eth3d.net/)
+- **7Scenes:** [Microsoft Research](https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/)
+- **ScanNet++:** [http://www.scan-net.org/](http://www.scan-net.org/)
+- **DTU:** [https://roboimagedata.compute.dtu.dk/](https://roboimagedata.compute.dtu.dk/)
+- **HiRoom:** [SVLightVerse](https://jerrypiglet.github.io/SVLightVerse/)

Depth-Anything-3/docs/CLI.md ADDED Viewed

	@@ -0,0 +1,654 @@

+# 🚀 Depth Anything 3 Command Line Interface
+## 📋 Table of Contents
+- [📖 Overview](#overview)
+- [⚡ Quick Start](#quick-start)
+- [📚 Command Reference](#command-reference)
+  - [🤖 auto - Auto Mode](#auto---auto-mode)
+  - [🖼️ image - Single Image Processing](#image---single-image-processing)
+  - [🗂️ images - Image Directory Processing](#images---image-directory-processing)
+  - [🎬 video - Video Processing](#video---video-processing)
+  - [📐 colmap - COLMAP Dataset Processing](#colmap---colmap-dataset-processing)
+  - [🔧 backend - Backend Service](#backend---backend-service)
+  - [🎨 gradio - Gradio Application](#gradio---gradio-application)
+  - [🖼️ gallery - Gallery Server](#gallery---gallery-server)
+- [⚙️ Parameter Details](#parameter-details)
+- [💡 Usage Examples](#usage-examples)
+## 📖 Overview
+The Depth Anything 3 CLI provides a comprehensive command-line toolkit supporting image depth estimation, video processing, COLMAP dataset handling, and web applications.
+The backend service keeps the model cached on the GPU so that it does not need to be reloaded for each command.
+## ⚡ Quick Start
+The CLI can run fully offline or connect to the backend for cached weights and task scheduling:
+```bash
+# 🔧 Start backend service (optional, keeps model resident in GPU memory)
+da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE
+# 🚀 Use auto mode to process input
+da3 auto path/to/input --export-dir ./workspace/scene001
+# ♻️ Reuse backend for next job
+da3 auto path/to/video.mp4 \
+    --export-dir ./workspace/scene002 \
+    --use-backend \
+    --backend-url http://localhost:8008
+```
+Each export directory contains `scene.glb`, `scene.jpg`, and optional extras such as `depth_vis/` or `gs_video/` depending on the requested format.
+## 📚 Command Reference
+### 🤖 auto - Auto Mode
+Automatically detect input type and dispatch to the appropriate handler.
+**Usage:**
+```bash
+da3 auto INPUT_PATH [OPTIONS]
+```
+**Input Type Detection:**
+- 🖼️ Single image file (.jpg, .png, .jpeg, .webp, .bmp, .tiff, .tif)
+- 📁 Image directory
+- 🎬 Video file (.mp4, .avi, .mov, .mkv, .flv, .wmv, .webm, .m4v)
+- 📐 COLMAP directory (containing `images/` and `sparse/` subdirectories)
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `INPUT_PATH` | str | Required | Input path (image, directory, video, or COLMAP) |
+| `--model-dir` | str | Default model | Model directory path |
+| `--export-dir` | str | `debug` | Export directory |
+| `--export-format` | str | `glb` | Export format (supports `mini_npz`, `glb`, `feat_vis`, etc., can be combined with hyphens) |
+| `--device` | str | `cuda` | Device to use |
+| `--use-backend` | bool | `False` | Use backend service for inference |
+| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
+| `--process-res` | int | `504` | Processing resolution |
+| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
+| `--export-feat` | str | `""` | Export features from specified layers, comma-separated (e.g., `"0,1,2"`) |
+| `--auto-cleanup` | bool | `False` | Automatically clean export directory without confirmation |
+| `--fps` | float | `1.0` | [Video] Frame sampling FPS |
+| `--sparse-subdir` | str | `""` | [COLMAP] Sparse reconstruction subdirectory (e.g., `"0"` for `sparse/0/`) |
+| `--align-to-input-ext-scale` | bool | `True` | [COLMAP] Align prediction to input extrinsics scale |
+| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
+| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy: `first`, `middle`, `saddle_balanced`, `saddle_sim_range`. See [docs](funcs/ref_view_strategy.md) |
+| `--conf-thresh-percentile` | float | `40.0` | [GLB] Lower percentile for adaptive confidence threshold |
+| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points in the point cloud |
+| `--show-cameras` | bool | `True` | [GLB] Show camera wireframes in the exported scene |
+| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Frame rate for output video |
+**Examples:**
+```bash
+# 🖼️ Auto-process an image
+da3 auto path/to/image.jpg --export-dir ./output
+# 🎬 Auto-process a video
+da3 auto path/to/video.mp4 --fps 2.0 --export-dir ./output
+# 🔧 Use backend service
+da3 auto path/to/input \
+    --export-format mini_npz-glb \
+    --use-backend \
+    --backend-url http://localhost:8008 \
+    --export-dir ./output
+```
+---
+### 🖼️ image - Single Image Processing
+Process a single image for camera pose and depth estimation.
+**Usage:**
+```bash
+da3 image IMAGE_PATH [OPTIONS]
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `IMAGE_PATH` | str | Required | Input image file path |
+| `--model-dir` | str | Default model | Model directory path |
+| `--export-dir` | str | `debug` | Export directory |
+| `--export-format` | str | `glb` | Export format |
+| `--device` | str | `cuda` | Device to use |
+| `--use-backend` | bool | `False` | Use backend service for inference |
+| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
+| `--process-res` | int | `504` | Processing resolution |
+| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
+| `--export-feat` | str | `""` | Export feature layer indices (comma-separated) |
+| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
+| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
+| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
+| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
+| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
+| `--show-cameras` | bool | `True` | [GLB] Show cameras |
+| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
+**Examples:**
+```bash
+# ✨ Basic usage
+da3 image path/to/image.png --export-dir ./output
+# ⚡ With backend acceleration
+da3 image path/to/image.png \
+    --use-backend \
+    --backend-url http://localhost:8008 \
+    --export-dir ./output
+# 🔍 Export feature visualization
+da3 image image.jpg \
+    --export-format feat_vis \
+    --export-feat "9,19,29,39" \
+    --export-dir ./results
+```
+---
+### 🗂️ images - Image Directory Processing
+Process a directory of images for batch depth estimation.
+**Usage:**
+```bash
+da3 images IMAGES_DIR [OPTIONS]
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `IMAGES_DIR` | str | Required | Directory path containing images |
+| `--image-extensions` | str | `png,jpg,jpeg` | Image file extensions to process (comma-separated) |
+| `--model-dir` | str | Default model | Model directory path |
+| `--export-dir` | str | `debug` | Export directory |
+| `--export-format` | str | `glb` | Export format |
+| `--device` | str | `cuda` | Device to use |
+| `--use-backend` | bool | `False` | Use backend service for inference |
+| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
+| `--process-res` | int | `504` | Processing resolution |
+| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
+| `--export-feat` | str | `""` | Export feature layer indices |
+| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
+| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
+| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
+| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
+| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
+| `--show-cameras` | bool | `True` | [GLB] Show cameras |
+| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
+**Examples:**
+```bash
+# 📁 Process directory (defaults to png/jpg/jpeg)
+da3 images ./image_folder --export-dir ./output
+# 🎯 Custom extensions
+da3 images ./dataset --image-extensions "png,jpg,webp" --export-dir ./output
+# 🔧 Use backend service
+da3 images ./dataset \
+    --use-backend \
+    --backend-url http://localhost:8008 \
+    --export-dir ./output
+```
+---
+### 🎬 video - Video Processing
+Process video by extracting frames for depth estimation.
+**Usage:**
+```bash
+da3 video VIDEO_PATH [OPTIONS]
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `VIDEO_PATH` | str | Required | Input video file path |
+| `--fps` | float | `1.0` | Frame extraction sampling FPS |
+| `--model-dir` | str | Default model | Model directory path |
+| `--export-dir` | str | `debug` | Export directory |
+| `--export-format` | str | `glb` | Export format |
+| `--device` | str | `cuda` | Device to use |
+| `--use-backend` | bool | `False` | Use backend service for inference |
+| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
+| `--process-res` | int | `504` | Processing resolution |
+| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
+| `--export-feat` | str | `""` | Export feature layer indices |
+| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
+| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
+| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
+| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
+| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
+| `--show-cameras` | bool | `True` | [GLB] Show cameras |
+| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
+**Examples:**
+```bash
+# 🎬 Basic video processing
+da3 video path/to/video.mp4 --export-dir ./output
+# ⚙️ Control frame sampling and resolution
+da3 video path/to/video.mp4 \
+    --fps 2.0 \
+    --process-res 1024 \
+    --export-dir ./output
+# 🔧 Use backend service
+da3 video path/to/video.mp4 \
+    --use-backend \
+    --backend-url http://localhost:8008 \
+    --export-dir ./output
+```
+---
+### 📐 colmap - COLMAP Dataset Processing
+Run pose-conditioned depth estimation on COLMAP data.
+**Usage:**
+```bash
+da3 colmap COLMAP_DIR [OPTIONS]
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `COLMAP_DIR` | str | Required | COLMAP directory containing `images/` and `sparse/` subdirectories |
+| `--sparse-subdir` | str | `""` | Sparse reconstruction subdirectory (e.g., `"0"` for `sparse/0/`) |
+| `--align-to-input-ext-scale` | bool | `True` | Align prediction to input extrinsics scale |
+| `--model-dir` | str | Default model | Model directory path |
+| `--export-dir` | str | `debug` | Export directory |
+| `--export-format` | str | `glb` | Export format |
+| `--device` | str | `cuda` | Device to use |
+| `--use-backend` | bool | `False` | Use backend service for inference |
+| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
+| `--process-res` | int | `504` | Processing resolution |
+| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
+| `--export-feat` | str | `""` | Export feature layer indices |
+| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
+| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
+| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
+| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
+| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
+| `--show-cameras` | bool | `True` | [GLB] Show cameras |
+| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
+**Examples:**
+```bash
+# 📐 Process COLMAP dataset
+da3 colmap ./colmap_dataset --export-dir ./output
+# 🎯 Use specific sparse subdirectory and align scale
+da3 colmap ./colmap_dataset \
+    --sparse-subdir 0 \
+    --align-to-input-ext-scale \
+    --export-dir ./output
+# 🔧 Use backend service
+da3 colmap ./colmap_dataset \
+    --use-backend \
+    --backend-url http://localhost:8008 \
+    --export-dir ./output
+```
+---
+### 🔧 backend - Backend Service
+Start model backend service with integrated gallery.
+**Usage:**
+```bash
+da3 backend [OPTIONS]
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `--model-dir` | str | Default model | Model directory path |
+| `--device` | str | `cuda` | Device to use |
+| `--host` | str | `127.0.0.1` | Host address to bind to |
+| `--port` | int | `8008` | Port number to bind to |
+| `--gallery-dir` | str | Default gallery dir | Gallery directory path (optional) |
+**Features:**
+- 🎯 Keeps model resident in GPU memory
+- 🔌 Provides REST inference API
+- 📊 Integrated dashboard and status monitoring
+- 🖼️ Optional gallery browser (if `--gallery-dir` is provided)
+**Available Endpoints:**
+- 🏠 `/` - Home page
+- 📊 `/dashboard` - Dashboard
+- ✅ `/status` - API status
+- 🖼️ `/gallery/` - Gallery browser (if enabled)
+**Examples:**
+```bash
+# 🚀 Basic backend service
+da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE
+# 🖼️ Backend with gallery
+da3 backend \
+    --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
+    --device cuda \
+    --host 0.0.0.0 \
+    --port 8008 \
+    --gallery-dir ./workspace
+# 💻 Use CPU
+da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE --device cpu
+```
+---
+### 🎨 gradio - Gradio Application
+Launch Depth Anything 3 Gradio interactive web application.
+**Usage:**
+```bash
+da3 gradio [OPTIONS]
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `--model-dir` | str | Required | Model directory path |
+| `--workspace-dir` | str | Required | Workspace directory path |
+| `--gallery-dir` | str | Required | Gallery directory path |
+| `--host` | str | `127.0.0.1` | Host address to bind to |
+| `--port` | int | `7860` | Port number to bind to |
+| `--share` | bool | `False` | Create a public link |
+| `--debug` | bool | `False` | Enable debug mode |
+| `--cache-examples` | bool | `False` | Pre-cache all example scenes at startup |
+| `--cache-gs-tag` | str | `""` | Tag to match scene names for high-res+3DGS caching |
+**Examples:**
+```bash
+# 🎨 Basic Gradio application
+da3 gradio \
+    --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
+    --workspace-dir ./workspace \
+    --gallery-dir ./gallery
+# 🌐 Enable sharing and debug
+da3 gradio \
+    --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
+    --workspace-dir ./workspace \
+    --gallery-dir ./gallery \
+    --share \
+    --debug
+# ⚡ Pre-cache examples
+da3 gradio \
+    --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
+    --workspace-dir ./workspace \
+    --gallery-dir ./gallery \
+    --cache-examples \
+    --cache-gs-tag "dl3dv"
+```
+---
+### 🖼️ gallery - Gallery Server
+Launch standalone Depth Anything 3 Gallery server.
+**Usage:**
+```bash
+da3 gallery [OPTIONS]
+```
+**Parameters:**
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `--gallery-dir` | str | Default gallery dir | Gallery root directory |
+| `--host` | str | `127.0.0.1` | Host address to bind to |
+| `--port` | int | `8007` | Port number to bind to |
+| `--open-browser` | bool | `False` | Open browser after launch |
+**Note:**
+The gallery expects each scene folder to contain at least `scene.glb` and `scene.jpg`, with optional subfolders such as `depth_vis/` or `gs_video/`.
+**Examples:**
+```bash
+# 🖼️ Basic gallery server
+da3 gallery --gallery-dir ./workspace
+# 🌐 Custom host and port
+da3 gallery \
+    --gallery-dir ./workspace \
+    --host 0.0.0.0 \
+    --port 8007
+# 🚀 Auto-open browser
+da3 gallery --gallery-dir ./workspace --open-browser
+```
+---
+## ⚙️ Parameter Details
+### 🔧 Common Parameters
+- **`--export-dir`**: Output directory, defaults to `debug`
+- **`--export-format`**: Export format, supports combining multiple formats with hyphens:
+  - 📦 `mini_npz`: Compressed NumPy format
+  - 🎨 `glb`: glTF binary format (3D scene)
+  - 🔍 `feat_vis`: Feature visualization
+  - Example: `mini_npz-glb` exports both formats
+- **`--process-res`** / **`--process-res-method`**: Control preprocessing resolution strategy
+  - `process-res`: Target resolution (default 504)
+  - `process-res-method`: Resize method (default `upper_bound_resize`)
+- **`--auto-cleanup`**: Remove existing export directory without confirmation
+- **`--use-backend`** / **`--backend-url`**: Reuse running backend service
+  - ⚡ Reduces model loading time
+  - 🌐 Supports distributed processing
+- **`--export-feat`**: Layer indices for exporting intermediate features (comma-separated)
+  - Example: `"9,19,29,39"`
+### 🎨 GLB Export Parameters
+- **`--conf-thresh-percentile`**: Lower percentile for adaptive confidence threshold (default 40.0)
+  - Used to filter low-confidence points
+- **`--num-max-points`**: Maximum number of points in point cloud (default 1,000,000)
+  - Controls output file size and performance
+- **`--show-cameras`**: Show camera wireframes in exported scene (default True)
+### 🔍 Feature Visualization Parameters
+- **`--feat-vis-fps`**: Frame rate for feature visualization output video (default 15)
+### 🎬 Video-Specific Parameters
+- **`--fps`**: Video frame extraction sampling rate (default 1.0 FPS)
+  - Higher values extract more frames
+### 📐 COLMAP-Specific Parameters
+- **`--sparse-subdir`**: Sparse reconstruction subdirectory
+  - Empty string uses `sparse/` directory
+  - `"0"` uses `sparse/0/` directory
+- **`--align-to-input-ext-scale`**: Align prediction to input extrinsics scale (default True)
+  - Ensures depth estimation is consistent with COLMAP scale
+---
+## 💡 Usage Examples
+### 1️⃣ Basic Workflow
+```bash
+# 🔧 Start backend service
+da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE --host 0.0.0.0 --port 8008
+# 🖼️ Process single image
+da3 image image.jpg --export-dir ./output1 --use-backend
+# 🎬 Process video
+da3 video video.mp4 --fps 2.0 --export-dir ./output2 --use-backend
+# 📐 Process COLMAP dataset
+da3 colmap ./colmap_data --export-dir ./output3 --use-backend
+```
+### 2️⃣ Using Auto Mode
+```bash
+# 🤖 Auto-detect and process
+da3 auto ./unknown_input --export-dir ./output
+# ⚡ With backend acceleration
+da3 auto ./unknown_input \
+    --use-backend \
+    --backend-url http://localhost:8008 \
+    --export-dir ./output
+```
+### 3️⃣ Multi-Format Export
+```bash
+# 📦 Export both NPZ and GLB formats
+da3 auto assets/examples/SOH \
+    --export-format mini_npz-glb \
+    --export-dir ./workspace/soh
+# 🔍 Export feature visualization
+da3 image image.jpg \
+    --export-format feat_vis \
+    --export-feat "9,19,29,39" \
+    --export-dir ./results
+```
+### 4️⃣ Advanced Configuration
+```bash
+# ⚙️ Custom resolution and point cloud density
+da3 image image.jpg \
+    --process-res 1024 \
+    --num-max-points 2000000 \
+    --conf-thresh-percentile 30.0 \
+    --export-dir ./output
+# 📐 COLMAP advanced options
+da3 colmap ./colmap_data \
+    --sparse-subdir 0 \
+    --align-to-input-ext-scale \
+    --process-res 756 \
+    --export-dir ./output
+```
+### 5️⃣ Batch Processing Workflow
+```bash
+# 🔧 Start backend
+da3 backend \
+    --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
+    --device cuda \
+    --host 0.0.0.0 \
+    --port 8008 \
+    --gallery-dir ./workspace
+# 🔄 Batch process multiple scenes
+for scene in scene1 scene2 scene3; do
+    da3 auto ./data/$scene \
+        --export-dir ./workspace/$scene \
+        --use-backend \
+        --auto-cleanup
+done
+# 🖼️ Launch gallery to view results
+da3 gallery --gallery-dir ./workspace --open-browser
+```
+### 6️⃣ Web Applications
+```bash
+# 🎨 Launch Gradio application
+da3 gradio \
+    --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
+    --workspace-dir workspace/gradio \
+    --gallery-dir ./gallery \
+    --host 0.0.0.0 \
+    --port 7860 \
+    --share
+```
+### 7️⃣ Transformer Feature Visualization
+```bash
+# 🔍 Export Transformer features
+# 📦 Combined with numerical output
+da3 auto video.mp4 \
+    --export-format glb-feat_vis \
+    --export-feat "11,21,31" \
+    --export-dir ./debug \
+    --use-backend
+```
+---
+## 📝 Notes
+1. **🔧 Backend Service**: Recommended for processing multiple tasks to improve efficiency
+2. **💾 GPU Memory**: Be mindful of GPU memory usage when processing high-resolution inputs
+3. **📁 Export Directory**: Use `--auto-cleanup` to avoid manual confirmation for deletion
+4. **🔀 Format Combination**: Multiple export formats can be combined with hyphens (e.g., `mini_npz-glb-feat_vis`)
+5. **📐 COLMAP Data**: Ensure COLMAP directory structure is correct (contains `images/` and `sparse/` subdirectories)
+---
+## ❓ Getting Help
+View detailed help for any command:
+```bash
+# 📖 View main help
+da3 --help
+# 🔍 View specific command help
+da3 auto --help
+da3 image --help
+da3 backend --help
+```

Depth-Anything-3/docs/funcs/ref_view_strategy.md ADDED Viewed

	@@ -0,0 +1,183 @@

+# 📐 Reference View Selection Strategy
+## 📖 Overview
+Reference view selection is a component in multi-view depth estimation. When processing multiple input views, the model needs to determine which view should serve as the primary reference frame for depth prediction, defining the world coordinate system.
+Different reference views lead to different reconstruction results. This is a known consideration in multi-view geometry and was analyzed in [PI3](https://arxiv.org/abs/2507.13347). The choice of reference view can affect the quality and consistency of depth predictions across the scene.
+## 🚀 Our Simple Solution: Automatic Reference View Selection
+DA3 provides a simple approach to address this through **automatic reference view selection** based on **class tokens**. Instead of relying on heuristics or manual selection, the model analyzes the class token features from all input views and intelligently selects the most suitable reference frame.
+---
+## 🎨 Available Strategies
+### 1. ⚖️ `saddle_balanced` (Recommended, Default)
+**Philosophy:**
+Select a view that achieves balance across multiple feature metrics. This strategy looks for a "middle ground" view that is neither too similar nor too different from other views, making it a stable reference point.
+**How it works:**
+1. Extracts and normalizes class tokens from all views
+2. Computes three complementary metrics for each view:
+   - **Similarity score**: Average cosine similarity with other views
+   - **Feature norm**: L2 norm of the original features
+   - **Feature variance**: Variance across feature dimensions
+3. Normalizes each metric to [0, 1] range
+4. Selects the view closest to 0.5 (the midpoint of the normalized range) across all three metrics
+### 2. 🎢 `saddle_sim_range`
+**Philosophy:**
+Select a view with the largest similarity range to other views. This identifies "saddle point" views that are highly similar to some views but dissimilar to others, making them information-rich anchor points.
+**How it works:**
+1. Computes pairwise cosine similarity between all views
+2. For each view, calculates the range (max - min) of similarities to other views
+3. Selects the view with the maximum similarity range
+---
+### 3. 1️⃣ `first` (Not Recommended)
+**Philosophy:**
+Always use the first view in the input sequence as the reference.
+**How it works:**
+Simply returns index 0.
+**When to use:**
+- ⛔ **Not recommended** in general
+- 🔧 Only use when you have manually pre-sorted your views and know the first view is optimal
+- 🐛 Debugging or baseline comparisons
+---
+### 4. ⏸️ `middle`
+**Philosophy:**
+Select the view in the middle of the input sequence.
+**How it works:**
+Returns the view at index `S // 2` where S is the number of views.
+**When to use:**
+- ⏱️ **Only recommended when input images are temporally ordered**
+- 🎬 Video sequences (e.g., **DA3-LONG** setting)
+- 📹 Sequential captures where the middle frame likely has the most stable viewpoint
+**Specific use case: DA3-LONG** 🎬
+In video-based depth estimation scenarios (like DA3-LONG), where inputs are consecutive frames, `middle` is often the **optimal choice** because it has maximum overlap with all other frames.
+## 💻 Usage
+### 🐍 Python API
+```python
+from depth_anything_3 import DepthAnything3
+model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
+# Use default (saddle_balanced)
+prediction = model.inference(
+    images,
+    ref_view_strategy="saddle_balanced"
+)
+# For video sequences, consider using middle
+prediction = model.inference(
+    video_frames,
+    ref_view_strategy="middle"  # Good for temporal sequences
+)
+# For complex scenes with wide baselines
+prediction = model.inference(
+    images,
+    ref_view_strategy="saddle_sim_range"
+)
+```
+### 🖥️ Command Line Interface
+```bash
+# Default (saddle_balanced)
+da3 auto input/ --export-dir output/
+# Explicitly specify strategy
+da3 auto input/ --ref-view-strategy saddle_balanced
+# For video processing
+da3 video input.mp4 --ref-view-strategy middle
+# For wide-baseline multi-view
+da3 images captures/ --ref-view-strategy saddle_sim_range
+```
+---
+### 🎯 When Selection Is Applied
+Reference view selection is applied when:
+- 3️⃣ Number of views S ≥ 3
+---
+## 💡 Recommendations
+### 📋 Quick Guide
+| Scenario | Recommended Strategy | Rationale |
+|----------|---------------------|-----------|
+| **Default / Unknown** | `saddle_balanced` | Robust, balanced, works well across diverse scenarios |
+| **Video frames** | `middle` | Temporal coherence, stable middle frame |
+| **Wide-baseline multi-view** | `saddle_sim_range` | Maximizes information coverage |
+| **Pre-sorted inputs** | `first` | Use only if you've manually optimized ordering |
+| **One or two views** | `first` | Automatically used (no reordering is performed for S ≤ 2) |
+### ✨ Best Practices
+1. 🎯 **Start with defaults**: `saddle_balanced` works well in most cases
+2. 🎬 **Consider your input type**: Use `middle` for videos, `saddle_balanced` for photos
+3. 🔬 **Experiment if needed**: Try different strategies if results are suboptimal
+4. 📊 **Monitor performance**: Check `glb` quality and consistency across views.
+---
+## 🔧 Technical Details
+### 🎚️ Selection Threshold
+The reference view selection is only triggered when:
+```python
+num_views >= 3  # At least 3 views required
+```
+For 1-2 views, no reordering is performed (equivalent to using `first`).
+### ⚙️ Implementation
+The selection happens at layer `alt_start - 1` in the vision transformer, before the first global attention layer. This ensures the selected reference view influences the entire depth prediction pipeline.
+---
+## ❓ FAQ
+**Q: 🤔 Why is this feature provided?**
+A: The model can handle any view order, but this feature provides automatic optimization for reference view selection, which can help improve depth prediction quality in multi-view scenarios.
+**Q: ⏱️ Does this add computational cost?**
+A: The overhead is totally negligible.
+**Q: 🎮 Can I manually specify which view to use as reference?**
+A: Not directly through this parameter. You can pre-sort your input images to place your preferred reference view first and use `ref_view_strategy="first"`.
+**Q: ⚙️ What happens if I don't specify this parameter?**
+A: The default `saddle_balanced` strategy is used automatically.
+**Q: 📊 Is this feature used in the DA3 paper benchmarks?**
+A: No, the paper used `first` as the default strategy for all multi-view experiments. The current default has been updated to `saddle_balanced` for better robustness.

Depth-Anything-3/notebooks/da3.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

Depth-Anything-3/src/depth_anything_3/api.py ADDED Viewed

	@@ -0,0 +1,446 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Depth Anything 3 API module.
+This module provides the main API for Depth Anything 3, including model loading,
+inference, and export capabilities. It supports both single and nested model architectures.
+"""
+from __future__ import annotations
+import time
+from typing import Optional, Sequence
+import numpy as np
+import torch
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+from PIL import Image
+from depth_anything_3.cfg import create_object, load_config
+from depth_anything_3.registry import MODEL_REGISTRY
+from depth_anything_3.specs import Prediction
+from depth_anything_3.utils.export import export
+from depth_anything_3.utils.geometry import affine_inverse
+from depth_anything_3.utils.io.input_processor import InputProcessor
+from depth_anything_3.utils.io.output_processor import OutputProcessor
+from depth_anything_3.utils.logger import logger
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+# Import-time side effect: cuDNN benchmark mode is switched off for the whole process.
+torch.backends.cudnn.benchmark = False
+# logger.info("CUDNN Benchmark Disabled")
+# File-name constants. They are not referenced in the visible part of this module;
+# presumably the checkpoint/config file names used when loading a model — TODO confirm.
+SAFETENSORS_NAME = "model.safetensors"
+CONFIG_NAME = "config.json"
+class DepthAnything3(nn.Module, PyTorchModelHubMixin):
+    """
+    Depth Anything 3 main API class.
+    This class provides a high-level interface for depth estimation using Depth Anything 3.
+    It supports both single and nested model architectures with metric scaling capabilities.
+    Features:
+    - Hugging Face Hub integration via PyTorchModelHubMixin
+    - Support for multiple model presets (vitb, vitg, nested variants)
+    - Automatic mixed precision inference
+    - Export capabilities for various formats (GLB, PLY, NPZ, etc.)
+    - Camera pose estimation and metric depth scaling
+    Usage:
+        # Load from Hugging Face Hub
+        model = DepthAnything3.from_pretrained("huggingface/model-name")
+        # Or create with a specific model preset name
+        model = DepthAnything3(model_name="da3-giant")
+        # Run inference
+        prediction = model.inference(images, export_dir="output", export_format="glb")
+    """
+    _commit_hash: str | None = None  # Set by mixin when loading from Hub
+    def __init__(self, model_name: str = "da3-large", **kwargs):
+        """
+        Initialize DepthAnything3 with specified preset.
+        Args:
+        model_name: The name of the model preset to use.
+                    Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
+        **kwargs: Additional keyword arguments (currently unused).
+        """
+        super().__init__()
+        self.model_name = model_name
+        # Build the underlying network
+        self.config = load_config(MODEL_REGISTRY[self.model_name])
+        self.model = create_object(self.config)
+        self.model.eval()
+        # Initialize processors
+        self.input_processor = InputProcessor()
+        self.output_processor = OutputProcessor()
+        # Device management (set by user)
+        self.device = None
+    @torch.inference_mode()
+    def forward(
+        self,
+        image: torch.Tensor,
+        extrinsics: torch.Tensor | None = None,
+        intrinsics: torch.Tensor | None = None,
+        export_feat_layers: list[int] | None = None,
+        infer_gs: bool = False,
+        use_ray_pose: bool = False,
+        ref_view_strategy: str = "saddle_balanced",
+    ) -> dict[str, torch.Tensor]:
+        """
+        Forward pass through the model.
+        Args:
+            image: Input batch with shape ``(B, N, 3, H, W)`` on the model device.
+            extrinsics: Optional camera extrinsics with shape ``(B, N, 4, 4)``.
+            intrinsics: Optional camera intrinsics with shape ``(B, N, 3, 3)``.
+            export_feat_layers: Layer indices to return intermediate features for.
+            infer_gs: Enable Gaussian Splatting branch.
+            use_ray_pose: Use ray-based pose estimation instead of camera decoder.
+            ref_view_strategy: Strategy for selecting reference view from multiple views.
+        Returns:
+            Dictionary containing model predictions
+        """
+        # Determine optimal autocast dtype
+        autocast_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        with torch.no_grad():
+            with torch.autocast(device_type=image.device.type, dtype=autocast_dtype):
+                return self.model(
+                    image, extrinsics, intrinsics, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
+                )
+    def inference(
+        self,
+        image: list[np.ndarray | Image.Image | str],
+        extrinsics: np.ndarray | None = None,
+        intrinsics: np.ndarray | None = None,
+        align_to_input_ext_scale: bool = True,
+        infer_gs: bool = False,
+        use_ray_pose: bool = False,
+        ref_view_strategy: str = "saddle_balanced",
+        render_exts: np.ndarray | None = None,
+        render_ixts: np.ndarray | None = None,
+        render_hw: tuple[int, int] | None = None,
+        process_res: int = 504,
+        process_res_method: str = "upper_bound_resize",
+        export_dir: str | None = None,
+        export_format: str = "mini_npz",
+        export_feat_layers: Sequence[int] | None = None,
+        # GLB export parameters
+        conf_thresh_percentile: float = 40.0,
+        num_max_points: int = 1_000_000,
+        show_cameras: bool = True,
+        # Feat_vis export parameters
+        feat_vis_fps: int = 15,
+        # Other export parameters, e.g., gs_ply, gs_video
+        export_kwargs: Optional[dict] = {},
+    ) -> Prediction:
+        """
+        Run inference on input images.
+        Args:
+            image: List of input images (numpy arrays, PIL Images, or file paths)
+            extrinsics: Camera extrinsics (N, 4, 4)
+            intrinsics: Camera intrinsics (N, 3, 3)
+            align_to_input_ext_scale: whether to align the input pose scale to the prediction
+            infer_gs: Enable the 3D Gaussian branch (needed for `gs_ply`/`gs_video` exports)
+            use_ray_pose: Use ray-based pose estimation instead of camera decoder (default: False)
+            ref_view_strategy: Strategy for selecting reference view from multiple views.
+                Options: "first", "middle", "saddle_balanced", "saddle_sim_range".
+                Default: "saddle_balanced". For single view input (S ≤ 2), no reordering is performed.
+            render_exts: Optional render extrinsics for Gaussian video export
+            render_ixts: Optional render intrinsics for Gaussian video export
+            render_hw: Optional render resolution for Gaussian video export
+            process_res: Processing resolution
+            process_res_method: Resize method for processing
+            export_dir: Directory to export results
+            export_format: Export format (mini_npz, npz, glb, ply, gs, gs_video)
+            export_feat_layers: Layer indices to export intermediate features from
+            conf_thresh_percentile: [GLB] Lower percentile for adaptive confidence threshold (default: 40.0) # noqa: E501
+            num_max_points: [GLB] Maximum number of points in the point cloud (default: 1,000,000)
+            show_cameras: [GLB] Show camera wireframes in the exported scene (default: True)
+            feat_vis_fps: [FEAT_VIS] Frame rate for output video (default: 15)
+            export_kwargs: additional arguments to export functions.
+        Returns:
+            Prediction object containing depth maps and camera parameters
+        """
+        if "gs" in export_format:
+            assert infer_gs, "must set `infer_gs=True` to perform gs-related export."
+        if "colmap" in export_format:
+            assert isinstance(image[0], str), "`image` must be image paths for COLMAP export."
+        # Preprocess images
+        imgs_cpu, extrinsics, intrinsics = self._preprocess_inputs(
+            image, extrinsics, intrinsics, process_res, process_res_method
+        )
+        # Prepare tensors for model
+        imgs, ex_t, in_t = self._prepare_model_inputs(imgs_cpu, extrinsics, intrinsics)
+        # Normalize extrinsics
+        ex_t_norm = self._normalize_extrinsics(ex_t.clone() if ex_t is not None else None)
+        # Run model forward pass
+        export_feat_layers = list(export_feat_layers) if export_feat_layers is not None else []
+        raw_output = self._run_model_forward(
+            imgs, ex_t_norm, in_t, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
+        )
+        # Convert raw output to prediction
+        prediction = self._convert_to_prediction(raw_output)
+        # Align prediction to extrinsincs
+        prediction = self._align_to_input_extrinsics_intrinsics(
+            extrinsics, intrinsics, prediction, align_to_input_ext_scale
+        )
+        # Add processed images for visualization
+        prediction = self._add_processed_images(prediction, imgs_cpu)
+        # Export if requested
+        if export_dir is not None:
+            if "gs" in export_format:
+                if infer_gs and "gs_video" not in export_format:
+                    export_format = f"{export_format}-gs_video"
+                if "gs_video" in export_format:
+                    if "gs_video" not in export_kwargs:
+                        export_kwargs["gs_video"] = {}
+                    export_kwargs["gs_video"].update(
+                        {
+                            "extrinsics": render_exts,
+                            "intrinsics": render_ixts,
+                            "out_image_hw": render_hw,
+                        }
+                    )
+            # Add GLB export parameters
+            if "glb" in export_format:
+                if "glb" not in export_kwargs:
+                    export_kwargs["glb"] = {}
+                export_kwargs["glb"].update(
+                    {
+                        "conf_thresh_percentile": conf_thresh_percentile,
+                        "num_max_points": num_max_points,
+                        "show_cameras": show_cameras,
+                    }
+                )
+            # Add Feat_vis export parameters
+            if "feat_vis" in export_format:
+                if "feat_vis" not in export_kwargs:
+                    export_kwargs["feat_vis"] = {}
+                export_kwargs["feat_vis"].update(
+                    {
+                        "fps": feat_vis_fps,
+                    }
+                )
+            # Add COLMAP export parameters
+            if "colmap" in export_format:
+                if "colmap" not in export_kwargs:
+                    export_kwargs["colmap"] = {}
+                export_kwargs["colmap"].update(
+                    {
+                        "image_paths": image,
+                        "conf_thresh_percentile": conf_thresh_percentile,
+                        "process_res_method": process_res_method,
+                    }
+                )
+            self._export_results(prediction, export_format, export_dir, **export_kwargs)
+        return prediction
+    def _preprocess_inputs(
+        self,
+        image: list[np.ndarray | Image.Image | str],
+        extrinsics: np.ndarray | None = None,
+        intrinsics: np.ndarray | None = None,
+        process_res: int = 504,
+        process_res_method: str = "upper_bound_resize",
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """Run ``self.input_processor`` on the raw inputs and log the elapsed time.
+        Args:
+            image: Input images (arrays, PIL images or paths), forwarded unchanged.
+            extrinsics: Optional camera extrinsics; a copy is handed to the processor.
+            intrinsics: Optional camera intrinsics; a copy is handed to the processor.
+            process_res: Processing resolution forwarded to the processor.
+            process_res_method: Resize strategy name forwarded to the processor.
+        Returns:
+            The ``(images, extrinsics, intrinsics)`` triple returned by the processor.
+        """
+        start_time = time.time()
+        imgs_cpu, extrinsics, intrinsics = self.input_processor(
+            image,
+            # Copies are passed so the processor never works on the caller's arrays.
+            extrinsics.copy() if extrinsics is not None else None,
+            intrinsics.copy() if intrinsics is not None else None,
+            process_res,
+            process_res_method,
+        )
+        end_time = time.time()
+        # Single pre-formatted message, like the other timing logs in this class;
+        # extra positional arguments would be treated as %-format args by a
+        # stdlib-style logger and fail when the record is emitted.
+        logger.info(
+            f"Processed Images Done taking {end_time - start_time} seconds. "
+            f"Shape: {imgs_cpu.shape}"
+        )
+        return imgs_cpu, extrinsics, intrinsics
+    def _prepare_model_inputs(
+        self,
+        imgs_cpu: torch.Tensor,
+        extrinsics: torch.Tensor | None,
+        intrinsics: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """Move the preprocessed tensors to the model device as float tensors.
+        Each tensor that is present gets a new leading axis of size one (``[None]``)
+        and is cast with ``.float()``; camera tensors that are ``None`` stay ``None``.
+        Returns:
+            ``(images, extrinsics, intrinsics)`` on the device reported by
+            ``self._get_model_device()``.
+        """
+        target = self._get_model_device()
+        def _batched_on_device(tensor: torch.Tensor | None) -> torch.Tensor | None:
+            # Optional camera tensors pass through untouched when absent.
+            if tensor is None:
+                return None
+            return tensor.to(target, non_blocking=True)[None].float()
+        # Images are mandatory, so they are converted without the None check.
+        batched_imgs = imgs_cpu.to(target, non_blocking=True)[None].float()
+        batched_ext = _batched_on_device(extrinsics)
+        batched_int = _batched_on_device(intrinsics)
+        return batched_imgs, batched_ext, batched_int
+    def _normalize_extrinsics(self, ex_t: torch.Tensor | None) -> torch.Tensor | None:
+        """Re-reference the extrinsics to the first view and rescale their translations.
+        All extrinsics are right-multiplied by ``affine_inverse`` of the first entry
+        along dim 1. The translation column is then divided by the median norm of the
+        inverted poses' translations, with that median clamped to at least ``0.1``.
+        NOTE(review): the ``c2ws`` naming in the original suggests the inputs are
+        world-to-camera matrices; confirm against the caller.
+        Returns:
+            The normalized extrinsics, or ``None`` when ``ex_t`` is ``None``.
+        """
+        if ex_t is None:
+            return None
+        first_view_inverse = affine_inverse(ex_t[:, :1])
+        normalized = ex_t @ first_view_inverse
+        inverted_poses = affine_inverse(normalized)
+        centre_dists = inverted_poses[..., :3, 3].norm(dim=-1)
+        # The lower bound keeps near-coincident cameras from blowing up the scale.
+        scale = torch.clamp(torch.median(centre_dists), min=1e-1)
+        rescaled_translation = normalized[..., :3, 3] / scale
+        normalized[..., :3, 3] = rescaled_translation
+        return normalized
+    def _align_to_input_extrinsics_intrinsics(
+        self,
+        extrinsics: torch.Tensor | None,
+        intrinsics: torch.Tensor | None,
+        prediction: Prediction,
+        align_to_input_ext_scale: bool = True,
+        ransac_view_thresh: int = 10,
+    ) -> Prediction:
+        """Align the predicted cameras (and optionally depth) to the input cameras.
+        Args:
+            extrinsics: Input extrinsics; when ``None`` the prediction is returned as is.
+            intrinsics: Input intrinsics; when given they overwrite the predicted ones.
+            prediction: Prediction object, updated in place and returned.
+            align_to_input_ext_scale: If true, adopt the input extrinsics (first three
+                rows) and divide the depth by the estimated scale; otherwise store the
+                aligned extrinsics returned by ``align_poses_umeyama``.
+            ransac_view_thresh: Minimum ``len(extrinsics)`` at which RANSAC is enabled.
+        Returns:
+            The same ``prediction`` instance.
+        """
+        if extrinsics is None:
+            return prediction
+        # The signature allows intrinsics to be missing; keep the predicted ones in
+        # that case instead of failing on ``None.numpy()``.
+        if intrinsics is not None:
+            prediction.intrinsics = intrinsics.numpy()
+        _, _, scale, aligned_extrinsics = align_poses_umeyama(
+            prediction.extrinsics,
+            extrinsics.numpy(),
+            ransac=len(extrinsics) >= ransac_view_thresh,
+            return_aligned=True,
+            # Fixed seed so repeated runs give the same alignment.
+            random_state=42,
+        )
+        if align_to_input_ext_scale:
+            prediction.extrinsics = extrinsics[..., :3, :].numpy()
+            prediction.depth /= scale
+        else:
+            prediction.extrinsics = aligned_extrinsics
+        return prediction
+    def _run_model_forward(
+        self,
+        imgs: torch.Tensor,
+        ex_t: torch.Tensor | None,
+        in_t: torch.Tensor | None,
+        export_feat_layers: Sequence[int] | None = None,
+        infer_gs: bool = False,
+        use_ray_pose: bool = False,
+        ref_view_strategy: str = "saddle_balanced",
+    ) -> dict[str, torch.Tensor]:
+        """Call ``self.forward`` with the prepared inputs and log its wall-clock time.
+        When ``imgs`` lives on a CUDA device, the device is synchronized right before
+        the timer starts and right after the forward call returns.
+        Returns:
+            Whatever ``self.forward`` returns.
+        """
+        run_device = imgs.device
+        def _sync_if_cuda() -> None:
+            if run_device.type == "cuda":
+                torch.cuda.synchronize(run_device)
+        _sync_if_cuda()
+        t0 = time.time()
+        # The model receives a plain list (or None) of layer indices.
+        layer_ids = None if export_feat_layers is None else list(export_feat_layers)
+        raw = self.forward(
+            imgs,
+            ex_t,
+            in_t,
+            layer_ids,
+            infer_gs,
+            use_ray_pose,
+            ref_view_strategy,
+        )
+        _sync_if_cuda()
+        elapsed = time.time() - t0
+        logger.info(f"Model Forward Pass Done. Time: {elapsed} seconds")
+        return raw
+    def _convert_to_prediction(self, raw_output: dict[str, torch.Tensor]) -> Prediction:
+        """Pass the raw model output through ``self.output_processor`` and log the time taken."""
+        t0 = time.time()
+        prediction = self.output_processor(raw_output)
+        elapsed = time.time() - t0
+        logger.info(f"Conversion to Prediction Done. Time: {elapsed} seconds")
+        return prediction
+    def _add_processed_images(self, prediction: Prediction, imgs_cpu: torch.Tensor) -> Prediction:
+        """Attach de-normalized uint8 images to ``prediction.processed_images``.
+        The tensor is permuted from (N, 3, H, W) to (N, H, W, 3), the ImageNet
+        mean/std normalization is undone, values are clipped to [0, 1] and scaled
+        to 0-255 ``uint8``. The same ``prediction`` object is returned.
+        """
+        imagenet_mean = np.array([0.485, 0.456, 0.406])
+        imagenet_std = np.array([0.229, 0.224, 0.225])
+        # Channels-last layout so the per-channel statistics broadcast over the last axis.
+        channels_last = imgs_cpu.permute(0, 2, 3, 1).cpu().numpy()  # (N, H, W, 3)
+        restored = np.clip(channels_last * imagenet_std + imagenet_mean, 0, 1)
+        prediction.processed_images = (restored * 255).astype(np.uint8)
+        return prediction
+    def _export_results(
+        self, prediction: Prediction, export_format: str, export_dir: str, **kwargs
+    ) -> None:
+        """Delegate to ``export`` with the given format and directory, then log the duration.
+        Extra keyword arguments are forwarded to ``export`` unchanged.
+        """
+        t0 = time.time()
+        export(prediction, export_format, export_dir, **kwargs)
+        elapsed = time.time() - t0
+        logger.info(f"Export Results Done. Time: {elapsed} seconds")
+    def _get_model_device(self) -> torch.device:
+        """
+        Return the device the model lives on, caching it in ``self.device``.
+        A previously cached ``self.device`` is returned directly. Otherwise the
+        device of the first parameter is used, falling back to the first buffer
+        when the model has no parameters.
+        Returns:
+            Device of the first parameter or buffer found
+        Raises:
+            ValueError: If the model has neither parameters nor buffers
+        """
+        if self.device is not None:
+            return self.device
+        # Parameters are checked first; buffers are only queried when there are none.
+        for tensor_source in (self.parameters, self.buffers):
+            first_tensor = next(iter(tensor_source()), None)
+            if first_tensor is not None:
+                self.device = first_tensor.device
+                return first_tensor.device
+        raise ValueError("No tensor found in model")

Depth-Anything-3/src/depth_anything_3/app/css_and_html.py ADDED Viewed

	@@ -0,0 +1,594 @@

+# flake8: noqa: E501
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CSS and HTML content for the Depth Anything 3 Gradio application.
+This module contains all the CSS styles and HTML content blocks
+used in the Gradio interface.
+"""
+# CSS styles for the Gradio interface (gradio_app.py passes this string to
+# gr.Blocks via its `css` argument).
+# Contents, in order: Font Awesome import and `.fa-color-*` icon colour helpers,
+# the `.link-btn` base style, dark and light `prefers-color-scheme` themes,
+# keyframe animations, gradient text classes (`.custom-log`, `.metric-text`,
+# `.pointcloud-text`, `.cameras-text`, `.gaussians-text`, `.example-log`), and
+# layout rules for `#my_radio`, `.navigation-row`, `.clickable-thumbnail` and
+# `.scene-info`.
+GRADIO_CSS = """
+/* Add Font Awesome CDN with all styles including brands and colors */
+@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');
+/* Add custom styles for colored icons */
+.fa-color-blue {
+    color: #3b82f6;
+}
+.fa-color-purple {
+    color: #8b5cf6;
+}
+.fa-color-cyan {
+    color: #06b6d4;
+}
+.fa-color-green {
+    color: #10b981;
+}
+.fa-color-yellow {
+    color: #f59e0b;
+}
+.fa-color-red {
+    color: #ef4444;
+}
+.link-btn {
+    display: inline-flex;
+    align-items: center;
+    gap: 8px;
+    text-decoration: none;
+    padding: 12px 24px;
+    border-radius: 50px;
+    font-weight: 500;
+    transition: all 0.3s ease;
+}
+/* Dark mode tech theme */
+@media (prefers-color-scheme: dark) {
+    html, body {
+        background: #1e293b;
+        color: #ffffff;
+    }
+    .gradio-container {
+        background: #1e293b;
+        color: #ffffff;
+    }
+    .link-btn {
+        background: rgba(255, 255, 255, 0.2);
+        color: white;
+        backdrop-filter: blur(10px);
+        border: 1px solid rgba(255, 255, 255, 0.3);
+    }
+    .link-btn:hover {
+        background: rgba(255, 255, 255, 0.3);
+        transform: translateY(-2px);
+        box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
+    }
+    .tech-bg {
+        background: linear-gradient(135deg, #0f172a, #1e293b); /* Darker colors */
+        position: relative;
+        overflow: hidden;
+    }
+    .tech-bg::before {
+        content: '';
+        position: absolute;
+        top: 0;
+        left: 0;
+        right: 0;
+        bottom: 0;
+        background:
+            radial-gradient(circle at 20% 80%, rgba(59, 130, 246, 0.15) 0%, transparent 50%), /* Reduced opacity */
+            radial-gradient(circle at 80% 20%, rgba(139, 92, 246, 0.15) 0%, transparent 50%), /* Reduced opacity */
+            radial-gradient(circle at 40% 40%, rgba(18, 194, 233, 0.1) 0%, transparent 50%); /* Reduced opacity */
+        animation: techPulse 8s ease-in-out infinite;
+    }
+    .gradio-container .panel,
+    .gradio-container .block,
+    .gradio-container .form {
+        background: rgba(0, 0, 0, 0.3);
+        border: 1px solid rgba(59, 130, 246, 0.2);
+        border-radius: 10px;
+    }
+    .gradio-container * {
+        color: #ffffff;
+    }
+    .gradio-container label {
+        color: #e0e0e0;
+    }
+    .gradio-container .markdown {
+        color: #e0e0e0;
+    }
+}
+/* Light mode tech theme */
+@media (prefers-color-scheme: light) {
+    html, body {
+        background: #ffffff;
+        color: #1e293b;
+    }
+    .gradio-container {
+        background: #ffffff;
+        color: #1e293b;
+    }
+    .tech-bg {
+        background: linear-gradient(135deg, #ffffff, #f1f5f9);
+        position: relative;
+        overflow: hidden;
+    }
+    .link-btn {
+        background: rgba(59, 130, 246, 0.15);
+        color: var(--body-text-color);
+        border: 1px solid rgba(59, 130, 246, 0.3);
+    }
+    .link-btn:hover {
+        background: rgba(59, 130, 246, 0.25);
+        transform: translateY(-2px);
+        box-shadow: 0 8px 25px rgba(59, 130, 246, 0.2);
+    }
+    .tech-bg::before {
+        content: '';
+        position: absolute;
+        top: 0;
+        left: 0;
+        right: 0;
+        bottom: 0;
+        background:
+            radial-gradient(circle at 20% 80%, rgba(59, 130, 246, 0.1) 0%, transparent 50%),
+            radial-gradient(circle at 80% 20%, rgba(139, 92, 246, 0.1) 0%, transparent 50%),
+            radial-gradient(circle at 40% 40%, rgba(18, 194, 233, 0.08) 0%, transparent 50%);
+        animation: techPulse 8s ease-in-out infinite;
+    }
+    .gradio-container .panel,
+    .gradio-container .block,
+    .gradio-container .form {
+        background: rgba(255, 255, 255, 0.8);
+        border: 1px solid rgba(59, 130, 246, 0.3);
+        border-radius: 10px;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    .gradio-container * {
+        color: #1e293b;
+    }
+    .gradio-container label {
+        color: #334155;
+    }
+    .gradio-container .markdown {
+        color: #334155;
+    }
+}
+@keyframes techPulse {
+    0%, 100% { opacity: 0.5; }
+    50% { opacity: 0.8; }
+}
+/* Custom log with tech gradient */
+.custom-log * {
+    font-style: italic;
+    font-size: 22px !important;
+    background: linear-gradient(135deg, #3b82f6, #8b5cf6);
+    background-size: 400% 400%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    font-weight: bold !important;
+    color: transparent !important;
+    text-align: center !important;
+    animation: techGradient 3s ease infinite;
+}
+@keyframes techGradient {
+    0% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+    100% { background-position: 0% 50%; }
+}
+@keyframes metricPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+@keyframes pointcloudPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+@keyframes camerasPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+@keyframes gaussiansPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+/* Special colors for key terms - Global styles */
+.metric-text {
+    background: linear-gradient(45deg, #ff6b6b, #ff8e53, #ff6b6b);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: metricPulse 2s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
+}
+.pointcloud-text {
+    background: linear-gradient(45deg, #4ecdc4, #44a08d, #4ecdc4);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: pointcloudPulse 2.5s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(78, 205, 196, 0.5);
+}
+.cameras-text {
+    background: linear-gradient(45deg, #667eea, #764ba2, #667eea);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: camerasPulse 3s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(102, 126, 234, 0.5);
+}
+.gaussians-text {
+    background: linear-gradient(45deg, #f093fb, #f5576c, #f093fb);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: gaussiansPulse 2.2s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(240, 147, 251, 0.5);
+}
+.example-log * {
+    font-style: italic;
+    font-size: 16px !important;
+    background: linear-gradient(135deg, #3b82f6, #8b5cf6);
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+}
+#my_radio .wrap {
+    display: flex;
+    flex-wrap: nowrap;
+    justify-content: center;
+    align-items: center;
+}
+#my_radio .wrap label {
+    display: flex;
+    width: 50%;
+    justify-content: center;
+    align-items: center;
+    margin: 0;
+    padding: 10px 0;
+    box-sizing: border-box;
+}
+/* Align navigation buttons with dropdown bottom */
+.navigation-row {
+    display: flex !important;
+    align-items: flex-end !important;
+    gap: 8px !important;
+}
+.navigation-row > div:nth-child(1),
+.navigation-row > div:nth-child(3) {
+    align-self: flex-end !important;
+}
+.navigation-row > div:nth-child(2) {
+    flex: 1 !important;
+}
+/* Make thumbnails clickable with pointer cursor */
+.clickable-thumbnail img {
+    cursor: pointer !important;
+}
+.clickable-thumbnail:hover img {
+    cursor: pointer !important;
+    opacity: 0.8;
+    transition: opacity 0.3s ease;
+}
+/* Make thumbnail containers narrower horizontally */
+.clickable-thumbnail {
+    padding: 5px 2px !important;
+    margin: 0 2px !important;
+}
+.clickable-thumbnail .image-container {
+    margin: 0 !important;
+    padding: 0 !important;
+}
+.scene-info {
+    text-align: center !important;
+    padding: 5px 2px !important;
+    margin: 0 !important;
+}
+"""
+def get_header_html(logo_base64=None):
+    """
+    Build the static header block: title, subtitle and the three link buttons.
+    The markup is followed by an inline <style> element with dark/light rules for
+    `.header-subtitle` and `.tech-bg`.
+    Args:
+        logo_base64 (str, optional): Accepted for API compatibility; the current
+            markup does not reference it.
+    Returns:
+        str: HTML string for the header
+    """
+    header_markup = """
+    <div class="tech-bg" style="text-align: center; margin-bottom: 5px; padding: 40px 20px; border-radius: 15px; position: relative; overflow: hidden;">
+        <div style="position: relative; z-index: 2;">
+            <h1 style="margin: 0; font-size: 3.5em; font-weight: 700;
+                background: linear-gradient(135deg, #3b82f6, #8b5cf6);
+                background-size: 400% 400%;
+                -webkit-background-clip: text;
+                background-clip: text;
+                color: transparent;
+                animation: techGradient 3s ease infinite;
+                text-shadow: 0 0 30px rgba(59, 130, 246, 0.5);
+                letter-spacing: 2px;">
+                Depth Anything 3
+            </h1>
+            <p style="margin: 15px 0 0 0; font-size: 2.16em; font-weight: 300;" class="header-subtitle">
+                Recovering the Visual Space from Any Views
+            </p>
+            <div style="margin-top: 20px;">
+                <!-- Revert buttons to original inline styles -->
+                <a href="https://depth-anything-3.github.io" target="_blank" class="link-btn">
+                    <i class="fas fa-globe" style="margin-right: 8px;"></i> Project Page
+                </a>
+                <a href="https://arxiv.org/abs/2406.09414" target="_blank" class="link-btn">
+                    <i class="fas fa-file-pdf" style="margin-right: 8px;"></i> Paper
+                </a>
+                <a href="https://github.com/ByteDance-Seed/Depth-Anything-3" target="_blank" class="link-btn">
+                    <i class="fab fa-github" style="margin-right: 8px;"></i> Code
+                </a>
+            </div>
+        </div>
+    </div>
+    <style>
+        /* Ensure tech-bg class is properly applied in dark mode */
+        @media (prefers-color-scheme: dark) {
+            .header-subtitle {
+                color: #cbd5e1;
+            }
+            /* Increase priority to ensure background color is properly applied */
+            .tech-bg {
+                background: linear-gradient(135deg, #0f172a, #1e293b) !important;
+            }
+        }
+        @media (prefers-color-scheme: light) {
+            .header-subtitle {
+                color: #475569;
+            }
+            /* Also add explicit background color for light mode */
+            .tech-bg {
+                background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%) !important;
+            }
+        }
+    </style>
+    """
+    return header_markup
+def get_description_html():
+    """
+    Build the static "What This Demo Does" panel with its usage tip.
+    The markup is followed by an inline <style> element that colours the
+    `description-*` classes for dark and light colour schemes.
+    Returns:
+        str: HTML string for the description
+    """
+    description_markup = """
+    <div class="description-container" style="padding: 25px; border-radius: 15px; margin: 0 0 20px 0;">
+        <h2 class="description-title" style="margin-top: 0; font-size: 1.6em; text-align: center;">
+            <i class="fas fa-bullseye fa-color-red" style="margin-right: 8px;"></i> What This Demo Does
+        </h2>
+        <div class="description-content" style="padding: 20px; border-radius: 10px; margin: 15px 0; text-align: center;">
+            <p class="description-main" style="line-height: 1.6; margin: 0; font-size: 1.45em;">
+                <strong>Upload images or videos</strong> → <strong>Get <span class="metric-text">Metric</span> <span class="pointcloud-text">Point Clouds</span>, <span class="cameras-text">Cameras</span> and <span class="gaussians-text">Novel Views</span></strong> → <strong>Explore in 3D</strong>
+            </p>
+        </div>
+        <div style="text-align: center; margin-top: 15px;">
+            <p class="description-tip" style="font-style: italic; margin: 0;">
+                <i class="fas fa-lightbulb fa-color-yellow" style="margin-right: 8px;"></i> <strong>Tip:</strong> Landscape-oriented images or videos are preferred for best 3D recovering.
+            </p>
+        </div>
+    </div>
+    <style>
+        @media (prefers-color-scheme: dark) {
+            .description-container {
+                background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%);
+                border: 1px solid rgba(59, 130, 246, 0.2);
+            }
+            .description-title { color: #3b82f6; }
+            .description-content { background: rgba(0, 0, 0, 0.3); }
+            .description-main { color: #e0e0e0; }
+            .description-text { color: #cbd5e1; }
+            .description-tip { color: #cbd5e1; }
+        }
+        @media (prefers-color-scheme: light) {
+            .description-container {
+                background: linear-gradient(135deg, rgba(59, 130, 246, 0.05) 0%, rgba(139, 92, 246, 0.05) 100%);
+                border: 1px solid rgba(59, 130, 246, 0.3);
+            }
+            .description-title { color: #3b82f6; }
+            .description-content { background: transparent; }
+            .description-main { color: #1e293b; }
+            .description-text { color: #475569; }
+            .description-tip { color: #475569; }
+        }
+    </style>
+    """
+    return description_markup
+def get_acknowledgements_html():
+    """
+    Build the static credits panel shown in the app.
+    It links to the Depth Anything 3 project page, the two earlier
+    Depth-Anything demo spaces, and the Map Anything space the demo was adapted from.
+    Returns:
+        str: HTML string for the acknowledgements
+    """
+    credits_markup = """
+    <div style="background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%);
+                padding: 25px; border-radius: 15px; margin: 20px 0; border: 1px solid rgba(59, 130, 246, 0.2);">
+        <h3 style="color: #3b82f6; margin-top: 0; text-align: center; font-size: 1.4em;">
+            <i class="fas fa-trophy fa-color-yellow" style="margin-right: 8px;"></i> Research Credits & Acknowledgments
+        </h3>
+        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin: 15px 0;">
+            <!-- Original Research Section (Left) -->
+            <div style="text-align: center;">
+                <h4 style="color: #8b5cf6; margin: 10px 0;"><i class="fas fa-flask fa-color-green" style="margin-right: 8px;"></i> Original Research</h4>
+                <p style="color: #e0e0e0; margin: 5px 0;">
+                    <a href="https://depth-anything-3.github.io" target="_blank"
+                       style="color: #3b82f6; text-decoration: none; font-weight: 600;">
+                        Depth Anything 3
+                    </a>
+                </p>
+            </div>
+            <!-- Previous Versions Section (Right) -->
+            <div style="text-align: center;">
+                <h4 style="color: #8b5cf6; margin: 10px 0;"><i class="fas fa-history fa-color-blue" style="margin-right: 8px;"></i> Previous Versions</h4>
+                <div style="display: flex; flex-direction: row; gap: 15px; justify-content: center; align-items: center;">
+                    <p style="color: #e0e0e0; margin: 0;">
+                        <a href="https://huggingface.co/spaces/LiheYoung/Depth-Anything" target="_blank"
+                           style="color: #3b82f6; text-decoration: none; font-weight: 600;">
+                            Depth-Anything
+                        </a>
+                    </p>
+                    <span style="color: #e0e0e0;">•</span>
+                    <p style="color: #e0e0e0; margin: 0;">
+                        <a href="https://huggingface.co/spaces/depth-anything/Depth-Anything-V2" target="_blank"
+                           style="color: #3b82f6; text-decoration: none; font-weight: 600;">
+                            Depth-Anything-V2
+                        </a>
+                    </p>
+                </div>
+            </div>
+        </div>
+        <!-- HF Demo Adapted from - Centered at the bottom of the whole block -->
+        <div style="margin-top: 20px; padding-top: 15px; border-top: 1px solid rgba(59, 130, 246, 0.3); text-align: center;">
+            <p style="color: #a0a0a0; font-size: 0.9em; margin: 0;">
+                <i class="fas fa-code-branch fa-color-gray" style="margin-right: 5px;"></i> HF demo adapted from <a href="https://huggingface.co/spaces/facebook/map-anything" target="_blank" style="color: inherit; text-decoration: none;">Map Anything</a>
+            </p>
+        </div>
+    </div>
+    """
+    return credits_markup
+def get_gradio_theme():
+    """
+    Build the Gradio base theme with the app's primary, secondary and neutral palettes.
+    Gradio is imported inside the function, so importing this module does not
+    require it.
+    Returns:
+        gr.themes.Base: Configured Gradio theme
+    """
+    import gradio as gr
+    # Keyword names of the eleven shades accepted by gr.themes.Color, lightest first.
+    shade_keys = (
+        "c50", "c100", "c200", "c300", "c400", "c500",
+        "c600", "c700", "c800", "c900", "c950",
+    )
+    def _palette(*hex_codes):
+        return gr.themes.Color(**dict(zip(shade_keys, hex_codes)))
+    primary = _palette(
+        "#eff6ff", "#dbeafe", "#bfdbfe", "#93c5fd", "#60a5fa", "#3b82f6",
+        "#2563eb", "#1d4ed8", "#1e40af", "#1e3a8a", "#172554",
+    )
+    secondary = _palette(
+        "#f5f3ff", "#ede9fe", "#ddd6fe", "#c4b5fd", "#a78bfa", "#8b5cf6",
+        "#7c3aed", "#6d28d9", "#5b21b6", "#4c1d95", "#2e1065",
+    )
+    neutral = _palette(
+        "#f8fafc", "#f1f5f9", "#e2e8f0", "#cbd5e1", "#94a3b8", "#64748b",
+        "#475569", "#334155", "#1e293b", "#0f172a", "#020617",
+    )
+    return gr.themes.Base(
+        primary_hue=primary,
+        secondary_hue=secondary,
+        neutral_hue=neutral,
+    )
+# Measure tab instructions.
+# NOTE(review): despite the _HTML suffix, the text uses Markdown syntax (a "###"
+# heading and a ">" blockquote) with one inline HTML icon — presumably rendered
+# through a Markdown component; confirm against the caller.
+MEASURE_INSTRUCTIONS_HTML = """
+### Click points on the image to compute distance.
+> <i class="fas fa-triangle-exclamation fa-color-red" style="margin-right: 5px;"></i> Metric scale estimation is difficult on aerial/drone images.
+"""

Depth-Anything-3/src/depth_anything_3/app/gradio_app.py ADDED Viewed

	@@ -0,0 +1,724 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Refactored Gradio App for Depth Anything 3.
+This is the main application file that orchestrates all components.
+The original functionality has been split into modular components for better maintainability.
+"""
+import argparse
+import os
+from typing import Any, Dict, List
+import gradio as gr
+from depth_anything_3.app.css_and_html import GRADIO_CSS, get_gradio_theme
+from depth_anything_3.app.modules.event_handlers import EventHandlers
+from depth_anything_3.app.modules.ui_components import UIComponents
+# Set the PyTorch CUDA allocator configuration at import time of this module.
+# NOTE(review): "expandable_segments:True" is presumably chosen to limit memory
+# fragmentation, and presumably has to be set before CUDA is first initialized —
+# confirm against the PyTorch CUDA memory-management notes.
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+class DepthAnything3App:
+    """
+    Main application class for Depth Anything 3 Gradio app.
+    """
+    def __init__(self, model_dir: str = None, workspace_dir: str = None, gallery_dir: str = None):
+        """
+        Store the directory settings, export them to the environment and build the helpers.
+        Each directory that is provided (truthy) is also written to its `DA3_*`
+        environment variable before `EventHandlers` and `UIComponents` are created.
+        Args:
+            model_dir: Path to the model directory
+            workspace_dir: Path to the workspace directory
+            gallery_dir: Path to the gallery directory
+        """
+        self.model_dir = model_dir
+        self.workspace_dir = workspace_dir
+        self.gallery_dir = gallery_dir
+        # Missing directories leave any existing environment value untouched.
+        directory_env = (
+            ("DA3_MODEL_DIR", self.model_dir),
+            ("DA3_WORKSPACE_DIR", self.workspace_dir),
+            ("DA3_GALLERY_DIR", self.gallery_dir),
+        )
+        for env_name, directory in directory_env:
+            if directory:
+                os.environ[env_name] = directory
+        self.event_handlers = EventHandlers()
+        self.ui_components = UIComponents()
+    def cache_examples(
+        self,
+        show_cam: bool = True,
+        filter_black_bg: bool = False,
+        filter_white_bg: bool = False,
+        save_percentage: float = 20.0,
+        num_max_points: int = 1000,
+        cache_gs_tag: str = "",
+        gs_trj_mode: str = "smooth",
+        gs_video_quality: str = "low",
+    ) -> None:
+        """
+        Pre-cache all example scenes at startup.
+        Every scene found under `<workspace_dir>/examples` is loaded and run through
+        `event_handlers.gradio_demo`. A failure in one scene is printed and does not
+        stop the remaining scenes. Nothing is done when no workspace directory is
+        set, the examples directory is missing, or no scenes are found.
+        Args:
+            show_cam: Whether to show camera in visualization
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            save_percentage: Filter percentage for point cloud
+            num_max_points: Maximum number of points
+            cache_gs_tag: Tag to match scene names for high-res+3DGS caching (e.g., "dl3dv")
+            gs_trj_mode: Trajectory mode for 3DGS
+            gs_video_quality: Video quality for 3DGS
+        """
+        # workspace_dir defaults to None in __init__; os.path.join would raise on it.
+        if self.workspace_dir is None:
+            print("Workspace directory is not set; skipping example caching.")
+            return
+        from depth_anything_3.app.modules.utils import get_scene_info
+        examples_dir = os.path.join(self.workspace_dir, "examples")
+        if not os.path.exists(examples_dir):
+            print(f"Examples directory not found: {examples_dir}")
+            return
+        scenes = get_scene_info(examples_dir)
+        if not scenes:
+            print("No example scenes found to cache.")
+            return
+        print(f"\n{'='*60}")
+        print(f"Caching {len(scenes)} example scenes...")
+        print(f"{'='*60}\n")
+        for i, scene in enumerate(scenes, 1):
+            scene_name = scene["name"]
+            # Check if scene name matches the gs tag for high-res+3DGS caching.
+            # bool() keeps this a real boolean: an empty tag would otherwise leave
+            # the empty string itself here and pass it on as `infer_gs`.
+            use_high_res_gs = bool(cache_gs_tag) and cache_gs_tag.lower() in scene_name.lower()
+            if use_high_res_gs:
+                print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (HIGH-RES + 3DGS)")
+                print(f"  - Number of images: {scene['num_images']}")
+                print(f"  - Matched tag: '{cache_gs_tag}' - using high_res + 3DGS")
+            else:
+                print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (LOW-RES)")
+                print(f"  - Number of images: {scene['num_images']}")
+            try:
+                # Load example scene
+                _, target_dir, _, _, _, _, _, _, _ = self.event_handlers.load_example_scene(
+                    scene_name
+                )
+                if target_dir and target_dir != "None":
+                    # Run reconstruction with appropriate settings
+                    print("  - Running reconstruction...")
+                    result = self.event_handlers.gradio_demo(
+                        target_dir=target_dir,
+                        show_cam=show_cam,
+                        filter_black_bg=filter_black_bg,
+                        filter_white_bg=filter_white_bg,
+                        process_res_method="high_res" if use_high_res_gs else "low_res",
+                        save_percentage=save_percentage,
+                        num_max_points=num_max_points,
+                        infer_gs=use_high_res_gs,
+                        ref_view_strategy="saddle_balanced",
+                        gs_trj_mode=gs_trj_mode,
+                        gs_video_quality=gs_video_quality,
+                    )
+                    # Check if successful
+                    if result[0] is not None:  # reconstruction_output
+                        print(f"  ✓ Scene '{scene_name}' cached successfully")
+                    else:
+                        print(f"  ✗ Scene '{scene_name}' caching failed: {result[1]}")
+                else:
+                    print(f"  ✗ Scene '{scene_name}' loading failed")
+            except Exception as e:
+                # Deliberately broad: caching is best-effort and one bad scene must
+                # not abort the rest of the start-up caching.
+                print(f"  ✗ Error caching scene '{scene_name}': {e}")
+            print()
+        print("=" * 60)
+        print("Example scene caching completed!")
+        print("=" * 60 + "\n")
+    def create_app(self) -> gr.Blocks:
+        """
+        Create and configure the Gradio application.
+        Builds hidden state components, the upload column, the tabbed result
+        viewers and the control sections inside one ``gr.Blocks`` context, then
+        hands every component to ``_setup_event_handlers`` for wiring.
+        Returns:
+            Configured Gradio Blocks interface
+        """
+        # Initialize theme
+        # (thin local wrapper; it only forwards to get_gradio_theme)
+        def get_theme():
+            return get_gradio_theme()
+        with gr.Blocks(theme=get_theme(), css=GRADIO_CSS) as demo:
+            # State variables for the tabbed interface
+            # is_example is a hidden Textbox holding the strings "None"/"True"/"False"
+            is_example = gr.Textbox(label="is_example", visible=False, value="None")
+            processed_data_state = gr.State(value=None)
+            measure_points_state = gr.State(value=[])
+            selected_image_index_state = gr.State(value=0)  # Track selected image index
+            # current_view_index = gr.State(value=0)  # noqa: F841 Track current view index
+            # Header and description
+            self.ui_components.create_header_section()
+            self.ui_components.create_description_section()
+            # Hidden Textbox carrying the working directory; "None" means nothing uploaded yet
+            target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
+            # Main content area
+            with gr.Row():
+                with gr.Column(scale=2):
+                    # Upload section
+                    (
+                        input_video,
+                        s_time_interval,
+                        input_images,
+                        image_gallery,
+                    ) = self.ui_components.create_upload_section()
+                with gr.Column(scale=4):
+                    with gr.Column():
+                        # gr.Markdown("**Metric 3D Reconstruction (Point Cloud and Camera Poses)**")
+                        # Reconstruction control section (buttons) - moved below tabs
+                        log_output = gr.Markdown(
+                            "Please upload a video or images, then click Reconstruct.",
+                            elem_classes=["custom-log"],
+                        )
+                        # Tabbed interface
+                        with gr.Tabs():
+                            with gr.Tab("Point Cloud & Cameras"):
+                                reconstruction_output = (
+                                    self.ui_components.create_3d_viewer_section()
+                                )
+                            with gr.Tab("Metric Depth"):
+                                (
+                                    prev_measure_btn,
+                                    measure_view_selector,
+                                    next_measure_btn,
+                                    measure_image,
+                                    measure_depth_image,
+                                    measure_text,
+                                ) = self.ui_components.create_measure_section()
+                            with gr.Tab("3DGS Rendered Novel Views"):
+                                gs_video, gs_info = self.ui_components.create_nvs_video()
+                        # Inference control section (before inference)
+                        (process_res_method_dropdown, infer_gs, ref_view_strategy_dropdown) = (
+                            self.ui_components.create_inference_control_section()
+                        )
+                        # Display control section - includes 3DGS options, buttons, and Visualization Options  # noqa: E501
+                        (
+                            show_cam,
+                            filter_black_bg,
+                            filter_white_bg,
+                            save_percentage,
+                            num_max_points,
+                            gs_trj_mode,
+                            gs_video_quality,
+                            submit_btn,
+                            clear_btn,
+                        ) = self.ui_components.create_display_control_section()
+                        # bind visibility of gs_trj_mode to infer_gs
+                        # gs_trj_mode, gs_video_quality and gs_video are visible only while
+                        # infer_gs is checked; gs_info is visible only while it is unchecked.
+                        infer_gs.change(
+                            fn=lambda checked: (
+                                gr.update(visible=checked),
+                                gr.update(visible=checked),
+                                gr.update(visible=checked),
+                                gr.update(visible=(not checked)),
+                            ),
+                            inputs=infer_gs,
+                            outputs=[gs_trj_mode, gs_video_quality, gs_video, gs_info],
+                        )
+            # Example scenes section
+            gr.Markdown("## Example Scenes")
+            scenes = self.ui_components.create_example_scenes_section()
+            scene_components = self.ui_components.create_example_scene_grid(scenes)
+            # Set up event handlers
+            # Arguments are positional, so their order must match the
+            # _setup_event_handlers signature exactly.
+            self._setup_event_handlers(
+                demo,
+                is_example,
+                processed_data_state,
+                measure_points_state,
+                target_dir_output,
+                input_video,
+                input_images,
+                s_time_interval,
+                image_gallery,
+                reconstruction_output,
+                log_output,
+                show_cam,
+                filter_black_bg,
+                filter_white_bg,
+                process_res_method_dropdown,
+                save_percentage,
+                submit_btn,
+                clear_btn,
+                num_max_points,
+                infer_gs,
+                ref_view_strategy_dropdown,
+                selected_image_index_state,
+                measure_view_selector,
+                measure_image,
+                measure_depth_image,
+                measure_text,
+                prev_measure_btn,
+                next_measure_btn,
+                scenes,
+                scene_components,
+                gs_video,
+                gs_info,
+                gs_trj_mode,
+                gs_video_quality,
+            )
+            # Acknowledgements
+            self.ui_components.create_acknowledgements_section()
+        return demo
+    def _setup_event_handlers(
+        self,
+        demo: gr.Blocks,
+        is_example: gr.Textbox,
+        processed_data_state: gr.State,
+        measure_points_state: gr.State,
+        target_dir_output: gr.Textbox,
+        input_video: gr.Video,
+        input_images: gr.File,
+        s_time_interval: gr.Slider,
+        image_gallery: gr.Gallery,
+        reconstruction_output: gr.Model3D,
+        log_output: gr.Markdown,
+        show_cam: gr.Checkbox,
+        filter_black_bg: gr.Checkbox,
+        filter_white_bg: gr.Checkbox,
+        process_res_method_dropdown: gr.Dropdown,
+        save_percentage: gr.Slider,
+        submit_btn: gr.Button,
+        clear_btn: gr.ClearButton,
+        num_max_points: gr.Slider,
+        infer_gs: gr.Checkbox,
+        ref_view_strategy_dropdown: gr.Dropdown,
+        selected_image_index_state: gr.State,
+        measure_view_selector: gr.Dropdown,
+        measure_image: gr.Image,
+        measure_depth_image: gr.Image,
+        measure_text: gr.Markdown,
+        prev_measure_btn: gr.Button,
+        next_measure_btn: gr.Button,
+        scenes: List[Dict[str, Any]],
+        scene_components: List[gr.Image],
+        gs_video: gr.Video,
+        gs_info: gr.Markdown,
+        gs_trj_mode: gr.Dropdown,
+        gs_video_quality: gr.Dropdown,
+    ) -> None:
+        """
+        Set up all event handlers for the application.
+        Args:
+            demo: Gradio Blocks interface
+            All other arguments: Gradio components to connect
+        """
+        # Configure clear button
+        clear_btn.add(
+            [
+                input_video,
+                input_images,
+                reconstruction_output,
+                log_output,
+                target_dir_output,
+                image_gallery,
+                gs_video,
+            ]
+        )
+        # Main reconstruction button
+        submit_btn.click(
+            fn=self.event_handlers.clear_fields, inputs=[], outputs=[reconstruction_output]
+        ).then(fn=self.event_handlers.update_log, inputs=[], outputs=[log_output]).then(
+            fn=self.event_handlers.gradio_demo,
+            inputs=[
+                target_dir_output,
+                show_cam,
+                filter_black_bg,
+                filter_white_bg,
+                process_res_method_dropdown,
+                save_percentage,
+                # pass num_max_points
+                num_max_points,
+                infer_gs,
+                ref_view_strategy_dropdown,
+                gs_trj_mode,
+                gs_video_quality,
+            ],
+            outputs=[
+                reconstruction_output,
+                log_output,
+                processed_data_state,
+                measure_image,
+                measure_depth_image,
+                measure_text,
+                measure_view_selector,
+                gs_video,
+                gs_video,  # gs_video visibility
+                gs_info,  # gs_info visibility
+            ],
+        ).then(
+            fn=lambda: "False",
+            inputs=[],
+            outputs=[is_example],  # set is_example to "False"
+        )
+        # Real-time visualization updates
+        self._setup_visualization_handlers(
+            show_cam,
+            filter_black_bg,
+            filter_white_bg,
+            process_res_method_dropdown,
+            target_dir_output,
+            is_example,
+            reconstruction_output,
+            log_output,
+        )
+        # File upload handlers
+        input_video.change(
+            fn=self.event_handlers.handle_uploads,
+            inputs=[input_video, input_images, s_time_interval],
+            outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
+        )
+        input_images.change(
+            fn=self.event_handlers.handle_uploads,
+            inputs=[input_video, input_images, s_time_interval],
+            outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
+        )
+        # Navigation handlers
+        self._setup_navigation_handlers(
+            prev_measure_btn,
+            next_measure_btn,
+            measure_view_selector,
+            measure_image,
+            measure_depth_image,
+            measure_points_state,
+            processed_data_state,
+        )
+        # Measurement handler
+        measure_image.select(
+            fn=self.event_handlers.measure,
+            inputs=[processed_data_state, measure_points_state, measure_view_selector],
+            outputs=[measure_image, measure_depth_image, measure_points_state, measure_text],
+        )
+        # Example scene handlers
+        self._setup_example_scene_handlers(
+            scenes,
+            scene_components,
+            reconstruction_output,
+            target_dir_output,
+            image_gallery,
+            log_output,
+            is_example,
+            processed_data_state,
+            measure_view_selector,
+            measure_image,
+            measure_depth_image,
+            gs_video,
+            gs_info,
+        )
+    def _setup_visualization_handlers(
+        self,
+        show_cam: gr.Checkbox,
+        filter_black_bg: gr.Checkbox,
+        filter_white_bg: gr.Checkbox,
+        process_res_method_dropdown: gr.Dropdown,
+        target_dir_output: gr.Textbox,
+        is_example: gr.Textbox,
+        reconstruction_output: gr.Model3D,
+        log_output: gr.Markdown,
+    ) -> None:
+        """Set up visualization update handlers."""
+        # Common inputs for visualization updates
+        viz_inputs = [
+            target_dir_output,
+            show_cam,
+            is_example,
+            filter_black_bg,
+            filter_white_bg,
+            process_res_method_dropdown,
+        ]
+        # Set up change handlers for all visualization controls
+        for component in [show_cam, filter_black_bg, filter_white_bg]:
+            component.change(
+                fn=self.event_handlers.update_visualization,
+                inputs=viz_inputs,
+                outputs=[reconstruction_output, log_output],
+            )
+    def _setup_navigation_handlers(
+        self,
+        prev_measure_btn: gr.Button,
+        next_measure_btn: gr.Button,
+        measure_view_selector: gr.Dropdown,
+        measure_image: gr.Image,
+        measure_depth_image: gr.Image,
+        measure_points_state: gr.State,
+        processed_data_state: gr.State,
+    ) -> None:
+        """Set up navigation handlers for measure tab."""
+        # Measure tab navigation
+        prev_measure_btn.click(
+            fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
+                processed_data, current_selector, -1
+            ),
+            inputs=[processed_data_state, measure_view_selector],
+            outputs=[
+                measure_view_selector,
+                measure_image,
+                measure_depth_image,
+                measure_points_state,
+            ],
+        )
+        next_measure_btn.click(
+            fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
+                processed_data, current_selector, 1
+            ),
+            inputs=[processed_data_state, measure_view_selector],
+            outputs=[
+                measure_view_selector,
+                measure_image,
+                measure_depth_image,
+                measure_points_state,
+            ],
+        )
+        measure_view_selector.change(
+            fn=lambda processed_data, selector_value: (
+                self.event_handlers.update_measure_view(
+                    processed_data, int(selector_value.split()[1]) - 1
+                )
+                if selector_value
+                else (None, None, [])
+            ),
+            inputs=[processed_data_state, measure_view_selector],
+            outputs=[measure_image, measure_depth_image, measure_points_state],
+        )
+    def _setup_example_scene_handlers(
+        self,
+        scenes: List[Dict[str, Any]],
+        scene_components: List[gr.Image],
+        reconstruction_output: gr.Model3D,
+        target_dir_output: gr.Textbox,
+        image_gallery: gr.Gallery,
+        log_output: gr.Markdown,
+        is_example: gr.Textbox,
+        processed_data_state: gr.State,
+        measure_view_selector: gr.Dropdown,
+        measure_image: gr.Image,
+        measure_depth_image: gr.Image,
+        gs_video: gr.Video,
+        gs_info: gr.Markdown,
+    ) -> None:
+        """Set up example scene handlers."""
+        def load_and_update_measure(name):
+            result = self.event_handlers.load_example_scene(name)
+            # result = (reconstruction_output, target_dir, image_paths, log_message, processed_data, measure_view_selector, gs_video, gs_video_vis, gs_info_vis)  # noqa: E501
+            # Update measure view if processed_data is available
+            measure_img = None
+            measure_depth = None
+            if result[4] is not None:  # processed_data exists
+                measure_img, measure_depth, _ = (
+                    self.event_handlers.visualization_handler.update_measure_view(result[4], 0)
+                )
+            return result + ("True", measure_img, measure_depth)
+        for i, scene in enumerate(scenes):
+            if i < len(scene_components):
+                scene_components[i].select(
+                    fn=lambda name=scene["name"]: load_and_update_measure(name),
+                    outputs=[
+                        reconstruction_output,
+                        target_dir_output,
+                        image_gallery,
+                        log_output,
+                        processed_data_state,
+                        measure_view_selector,
+                        gs_video,
+                        gs_video,  # gs_video_visibility
+                        gs_info,  # gs_info_visibility
+                        is_example,
+                        measure_image,
+                        measure_depth_image,
+                    ],
+                )
+    def launch(self, host: str = "127.0.0.1", port: int = 7860, **kwargs) -> None:
+        """
+        Launch the application.
+        Args:
+            host: Host address to bind to
+            port: Port number to bind to
+            **kwargs: Additional arguments for demo.launch()
+        """
+        demo = self.create_app()
+        demo.queue(max_size=20).launch(
+            show_error=True, ssr_mode=False, server_name=host, server_port=port, **kwargs
+        )
+def main():
+    """Main function to run the application."""
+    parser = argparse.ArgumentParser(
+        description="Depth Anything 3 Gradio Application",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Basic usage
+  python gradio_app.py --help
+  python gradio_app.py --host 0.0.0.0 --port 8080
+  python gradio_app.py --model-dir /path/to/model --workspace-dir /path/to/workspace
+  # Cache examples at startup (all low-res)
+  python gradio_app.py --cache-examples
+  # Cache with selective high-res+3DGS for scenes matching tag
+  python gradio_app.py --cache-examples --cache-gs-tag dl3dv
+  # This will use high-res + 3DGS for scenes containing "dl3dv" in their name,
+  # and low-res only for other scenes
+        """,
+    )
+    # Server configuration
+    parser.add_argument(
+        "--host", default="127.0.0.1", help="Host address to bind to (default: 127.0.0.1)"
+    )
+    parser.add_argument(
+        "--port", type=int, default=7860, help="Port number to bind to (default: 7860)"
+    )
+    # Directory configuration
+    parser.add_argument(
+        "--model-dir",
+        default="depth-anything/DA3NESTED-GIANT-LARGE",
+        help="Path to the model directory (default: depth-anything/DA3NESTED-GIANT-LARGE)",
+    )
+    parser.add_argument(
+        "--workspace-dir",
+        default="workspace/gradio",  # noqa: E501
+        help="Path to the workspace directory (default: workspace/gradio)",  # noqa: E501
+    )
+    parser.add_argument(
+        "--gallery-dir",
+        default="workspace/gallery",
+        help="Path to the gallery directory (default: workspace/gallery)",  # noqa: E501
+    )
+    # Additional Gradio options
+    parser.add_argument("--share", action="store_true", help="Create a public link for the app")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    # Example caching options
+    parser.add_argument(
+        "--cache-examples",
+        action="store_true",
+        help="Pre-cache all example scenes at startup for faster loading",
+    )
+    parser.add_argument(
+        "--cache-gs-tag",
+        type=str,
+        default="",
+        help="Tag to match scene names for high-res+3DGS caching (e.g., 'dl3dv'). Scenes containing this tag will use high_res and infer_gs=True; others will use low_res only.",  # noqa: E501
+    )
+    args = parser.parse_args()
+    # Create directories if they don't exist
+    os.makedirs(args.workspace_dir, exist_ok=True)
+    os.makedirs(args.gallery_dir, exist_ok=True)
+    # Initialize and launch the application
+    app = DepthAnything3App(
+        model_dir=args.model_dir, workspace_dir=args.workspace_dir, gallery_dir=args.gallery_dir
+    )
+    # Prepare launch arguments
+    launch_kwargs = {"share": args.share, "debug": args.debug}
+    print("Starting Depth Anything 3 Gradio App...")
+    print(f"Host: {args.host}")
+    print(f"Port: {args.port}")
+    print(f"Model Directory: {args.model_dir}")
+    print(f"Workspace Directory: {args.workspace_dir}")
+    print(f"Gallery Directory: {args.gallery_dir}")
+    print(f"Share: {args.share}")
+    print(f"Debug: {args.debug}")
+    print(f"Cache Examples: {args.cache_examples}")
+    if args.cache_examples:
+        if args.cache_gs_tag:
+            print(
+                f"Cache GS Tag: '{args.cache_gs_tag}' (scenes matching this tag will use high-res + 3DGS)"  # noqa: E501
+            )  # noqa: E501
+        else:
+            print("Cache GS Tag: None (all scenes will use low-res only)")
+    # Pre-cache examples if requested
+    if args.cache_examples:
+        print("\n" + "=" * 60)
+        print("Pre-caching mode enabled")
+        if args.cache_gs_tag:
+            print(f"Scenes containing '{args.cache_gs_tag}' will use HIGH-RES + 3DGS")
+            print("Other scenes will use LOW-RES only")
+        else:
+            print("All scenes will use LOW-RES only")
+        print("=" * 60)
+        app.cache_examples(
+            show_cam=True,
+            filter_black_bg=False,
+            filter_white_bg=False,
+            save_percentage=5.0,
+            num_max_points=1000,
+            cache_gs_tag=args.cache_gs_tag,
+            gs_trj_mode="smooth",
+            gs_video_quality="low",
+        )
+    app.launch(host=args.host, port=args.port, **launch_kwargs)
+if __name__ == "__main__":
+    main()

Depth-Anything-3/src/depth_anything_3/app/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,43 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Modules package for Depth Anything 3 Gradio app.
+This package contains all the modular components for the Gradio application.
+"""
+from depth_anything_3.app.modules.event_handlers import EventHandlers
+from depth_anything_3.app.modules.file_handlers import FileHandler
+from depth_anything_3.app.modules.model_inference import ModelInference
+from depth_anything_3.app.modules.ui_components import UIComponents
+from depth_anything_3.app.modules.utils import (
+    create_depth_visualization,
+    get_logo_base64,
+    get_scene_info,
+    save_to_gallery_func,
+)
+from depth_anything_3.app.modules.visualization import VisualizationHandler
+# Names exported by ``from depth_anything_3.app.modules import *``;
+# each entry is one of the classes or helper functions imported above.
+__all__ = [
+    "ModelInference",
+    "FileHandler",
+    "VisualizationHandler",
+    "EventHandlers",
+    "UIComponents",
+    "create_depth_visualization",
+    "save_to_gallery_func",
+    "get_scene_info",
+    "get_logo_base64",
+]

Depth-Anything-3/src/depth_anything_3/app/modules/event_handlers.py ADDED Viewed

	@@ -0,0 +1,619 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Event handling module for Depth Anything 3 Gradio app.
+This module handles all event callbacks and user interactions.
+"""
+import os
+import time
+from glob import glob
+from typing import Any, Dict, List, Optional, Tuple
+import gradio as gr
+import numpy as np
+import torch
+from depth_anything_3.app.modules.file_handlers import FileHandler
+from depth_anything_3.app.modules.model_inference import ModelInference
+from depth_anything_3.utils.memory import cleanup_cuda_memory
+from depth_anything_3.app.modules.visualization import VisualizationHandler
+class EventHandlers:
+    """
+    Handles all event callbacks and user interactions for the Gradio app.
+    """
+    def __init__(self) -> None:
+        """Initialize the event handlers."""
+        # Helper objects, each constructed with no arguments
+        self.model_inference = ModelInference()
+        self.file_handler = FileHandler()
+        self.visualization_handler = VisualizationHandler()
+    def clear_fields(self) -> None:
+        """
+        Return ``None`` so that the Gradio output bound to this callback is reset.
+        NOTE(review): the body only returns None; it does not itself touch the
+        stored target_dir or the gallery. Which components are cleared depends
+        on the ``outputs`` the caller binds to this handler.
+        """
+        return None
+    def update_log(self) -> str:
+        """
+        Return the fixed placeholder log message to display while waiting.
+        Returns:
+            The string "Loading and Reconstructing..."
+        """
+        return "Loading and Reconstructing..."
+    def save_current_visualization(
+        self,
+        target_dir: str,
+        save_percentage: float,
+        show_cam: bool,
+        filter_black_bg: bool,
+        filter_white_bg: bool,
+        processed_data: Optional[Dict],
+        scene_name: str = "",
+    ) -> str:
+        """
+        Save current visualization results to gallery with specified save percentage.
+        Args:
+            target_dir: Directory containing results
+            save_percentage: Percentage of points to save (0-100)
+            show_cam: Whether to show cameras
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            processed_data: Processed data from reconstruction
+        Returns:
+            Status message
+        """
+        if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
+            return "No reconstruction available. Please run 'Reconstruct' first."
+        if processed_data is None:
+            return "No processed data available. Please run 'Reconstruct' first."
+        try:
+            # Add debug information
+            print("[DEBUG] save_current_visualization called with:")
+            print(f"  target_dir: {target_dir}")
+            print(f"  save_percentage: {save_percentage}")
+            print(f"  show_cam: {show_cam}")
+            print(f"  filter_black_bg: {filter_black_bg}")
+            print(f"  filter_white_bg: {filter_white_bg}")
+            print(f"  processed_data: {processed_data is not None}")
+            # Import the gallery save function
+            # Create gallery name with user input or auto-generated
+            import datetime
+            from .utils import save_to_gallery_func
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            if scene_name and scene_name.strip():
+                gallery_name = f"{scene_name.strip()}_{timestamp}_pct{save_percentage:.0f}"
+            else:
+                gallery_name = f"save_{timestamp}_pct{save_percentage:.0f}"
+            print(f"[DEBUG] Saving to gallery with name: {gallery_name}")
+            # Save entire process folder to gallery
+            success, message = save_to_gallery_func(
+                target_dir=target_dir, processed_data=processed_data, gallery_name=gallery_name
+            )
+            if success:
+                print(f"[DEBUG] Gallery save completed successfully: {message}")
+                return (
+                    "Successfully saved to gallery!\n"
+                    f"Gallery name: {gallery_name}\n"
+                    f"Save percentage: {save_percentage}%\n"
+                    f"Show cameras: {show_cam}\n"
+                    f"Filter black bg: {filter_black_bg}\n"
+                    f"Filter white bg: {filter_white_bg}\n\n"
+                    f"{message}"
+                )
+            else:
+                print(f"[DEBUG] Gallery save failed: {message}")
+                return f"Failed to save to gallery: {message}"
+        except Exception as e:
+            return f"Error saving visualization: {str(e)}"
+    def gradio_demo(
+        self,
+        target_dir: str,
+        show_cam: bool = True,
+        filter_black_bg: bool = False,
+        filter_white_bg: bool = False,
+        process_res_method: str = "upper_bound_resize",
+        save_percentage: float = 30.0,
+        num_max_points: int = 1_000_000,
+        infer_gs: bool = False,
+        ref_view_strategy: str = "saddle_balanced",
+        gs_trj_mode: str = "extend",
+        gs_video_quality: str = "high",
+    ) -> Tuple[
+        Optional[str],
+        str,
+        Optional[Dict],
+        Optional[np.ndarray],
+        Optional[np.ndarray],
+        str,
+        gr.Dropdown,
+        Optional[str],  # gs video path
+        gr.update,  # gs video visibility update
+        gr.update,  # gs info visibility update
+    ]:
+        """
+        Perform reconstruction using the already-created target_dir/images.
+        Args:
+            target_dir: Directory containing images
+            show_cam: Whether to show camera
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            process_res_method: Method for resizing input images
+            save_percentage: Filter percentage for point cloud
+            num_max_points: Maximum number of points
+            infer_gs: Whether to infer 3D Gaussian Splatting
+            ref_view_strategy: Reference view selection strategy
+        Returns:
+            Tuple of reconstruction results
+        """
+        if not os.path.isdir(target_dir) or target_dir == "None":
+            return (
+                None,
+                "No valid target directory found. Please upload first.",
+                None,
+                None,
+                None,
+                "",
+                None,
+                None,
+                gr.update(visible=False),  # gs_video
+                gr.update(visible=True),  # gs_info
+            )
+        start_time = time.time()
+        cleanup_cuda_memory()
+        # Get image files for logging
+        target_dir_images = os.path.join(target_dir, "images")
+        all_files = (
+            sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
+        )
+        print("Running DepthAnything3 model...")
+        print(f"Reference view strategy: {ref_view_strategy}")
+        with torch.no_grad():
+            prediction, processed_data = self.model_inference.run_inference(
+                target_dir,
+                process_res_method=process_res_method,
+                show_camera=show_cam,
+                save_percentage=save_percentage,
+                num_max_points=int(num_max_points * 1000),  # Convert K to actual count
+                infer_gs=infer_gs,
+                ref_view_strategy=ref_view_strategy,
+                gs_trj_mode=gs_trj_mode,
+                gs_video_quality=gs_video_quality,
+            )
+        # The GLB file is already generated by the API
+        glbfile = os.path.join(target_dir, "scene.glb")
+        # Handle 3DGS video based on infer_gs flag
+        gsvideo_path = None
+        gs_video_visible = False
+        gs_info_visible = True
+        if infer_gs:
+            try:
+                gsvideo_path = sorted(glob(os.path.join(target_dir, "gs_video", "*.mp4")))[-1]
+                gs_video_visible = True
+                gs_info_visible = False
+            except IndexError:
+                gsvideo_path = None
+                print("3DGS video not found, but infer_gs was enabled")
+        # Cleanup
+        cleanup_cuda_memory()
+        end_time = time.time()
+        print(f"Total time: {end_time - start_time:.2f} seconds")
+        log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
+        # Populate visualization tabs with processed data
+        depth_vis, measure_img, measure_depth_vis, measure_pts = (
+            self.visualization_handler.populate_visualization_tabs(processed_data)
+        )
+        # Update view selectors based on available views
+        depth_selector, measure_selector = self.visualization_handler.update_view_selectors(
+            processed_data
+        )
+        return (
+            glbfile,
+            log_msg,
+            processed_data,
+            measure_img,  # measure_image
+            measure_depth_vis,  # measure_depth_image
+            "",  # measure_text (empty initially)
+            measure_selector,  # measure_view_selector
+            gsvideo_path,
+            gr.update(visible=gs_video_visible),  # gs_video visibility
+            gr.update(visible=gs_info_visible),  # gs_info visibility
+        )
+    def update_visualization(
+        self,
+        target_dir: str,
+        show_cam: bool,
+        is_example: str,
+        filter_black_bg: bool = False,
+        filter_white_bg: bool = False,
+        process_res_method: str = "upper_bound_resize",
+    ) -> Tuple[gr.update, str]:
+        """
+        Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
+        and return it for the 3D viewer.
+        Args:
+            target_dir: Directory containing results
+            show_cam: Whether to show camera
+            is_example: Whether this is an example scene
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            process_res_method: Method for resizing input images
+        Returns:
+            Tuple of (glb_file, log_message)
+        """
+        if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
+            return (
+                gr.update(),
+                "No reconstruction available. Please click the Reconstruct button first.",
+            )
+        # Check if GLB exists (could be cached example or reconstructed scene)
+        glbfile = os.path.join(target_dir, "scene.glb")
+        if os.path.exists(glbfile):
+            return (
+                glbfile,
+                (
+                    "Visualization loaded from cache."
+                    if is_example == "True"
+                    else "Visualization updated."
+                ),
+            )
+        # If no GLB but it's an example that hasn't been reconstructed yet
+        if is_example == "True":
+            return (
+                gr.update(),
+                "No reconstruction available. Please click the Reconstruct button first.",
+            )
+        # For non-examples, check predictions.npz
+        predictions_path = os.path.join(target_dir, "predictions.npz")
+        if not os.path.exists(predictions_path):
+            error_message = (
+                f"No reconstruction available at {predictions_path}. "
+                "Please run 'Reconstruct' first."
+            )
+            return gr.update(), error_message
+        loaded = np.load(predictions_path, allow_pickle=True)
+        predictions = {key: loaded[key] for key in loaded.keys()}  # noqa: F841
+        return (
+            glbfile,
+            "Visualization updated.",
+        )
+    def handle_uploads(
+        self,
+        input_video: Optional[str],
+        input_images: Optional[List],
+        s_time_interval: float = 10.0,
+    ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
+        """
+        Handle file uploads and update gallery.
+        Args:
+            input_video: Path to input video file
+            input_images: List of input image files
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message)
+        """
+        return self.file_handler.update_gallery_on_upload(
+            input_video, input_images, s_time_interval
+        )
+    def load_example_scene(self, scene_name: str, examples_dir: str = None) -> Tuple[
+        Optional[str],
+        Optional[str],
+        Optional[List],
+        str,
+        Optional[Dict],
+        gr.Dropdown,
+        Optional[str],
+        gr.update,
+        gr.update,
+    ]:
+        """
+        Load a scene from examples directory.
+        Args:
+            scene_name: Name of the scene to load
+            examples_dir: Path to examples directory (if None, uses workspace_dir/examples)
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message, processed_data, measure_view_selector, gs_video, gs_video_vis, gs_info_vis)  # noqa: E501
+        """
+        if examples_dir is None:
+            # Get workspace directory from environment variable
+            workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
+            examples_dir = os.path.join(workspace_dir, "examples")
+        reconstruction_output, target_dir, image_paths, log_message = (
+            self.file_handler.load_example_scene(scene_name, examples_dir)
+        )
+        # Try to load cached processed data if available
+        processed_data = None
+        measure_view_selector = gr.Dropdown(choices=["View 1"], value="View 1")
+        gs_video_path = None
+        gs_video_visible = False
+        gs_info_visible = True
+        if target_dir and target_dir != "None":
+            predictions_path = os.path.join(target_dir, "predictions.npz")
+            if os.path.exists(predictions_path):
+                try:
+                    # Load predictions from cache
+                    loaded = np.load(predictions_path, allow_pickle=True)
+                    predictions = {key: loaded[key] for key in loaded.keys()}
+                    # Reconstruct processed_data structure
+                    num_images = len(predictions.get("images", []))
+                    processed_data = {}
+                    for i in range(num_images):
+                        processed_data[i] = {
+                            "image": predictions["images"][i] if "images" in predictions else None,
+                            "depth": predictions["depths"][i] if "depths" in predictions else None,
+                            "depth_image": os.path.join(
+                                target_dir, "depth_vis", f"{i:04d}.jpg"  # Fixed: use .jpg not .png
+                            ),
+                            "intrinsics": (
+                                predictions["intrinsics"][i]
+                                if "intrinsics" in predictions
+                                and i < len(predictions["intrinsics"])
+                                else None
+                            ),
+                            "mask": None,
+                        }
+                    # Update measure view selector
+                    choices = [f"View {i + 1}" for i in range(num_images)]
+                    measure_view_selector = gr.Dropdown(choices=choices, value=choices[0])
+                except Exception as e:
+                    print(f"Error loading cached data: {e}")
+            # Check for cached 3DGS video
+            gs_video_dir = os.path.join(target_dir, "gs_video")
+            if os.path.exists(gs_video_dir):
+                try:
+                    from glob import glob
+                    gs_videos = sorted(glob(os.path.join(gs_video_dir, "*.mp4")))
+                    if gs_videos:
+                        gs_video_path = gs_videos[-1]
+                        gs_video_visible = True
+                        gs_info_visible = False
+                        print(f"Loaded cached 3DGS video: {gs_video_path}")
+                except Exception as e:
+                    print(f"Error loading cached 3DGS video: {e}")
+        return (
+            reconstruction_output,
+            target_dir,
+            image_paths,
+            log_message,
+            processed_data,
+            measure_view_selector,
+            gs_video_path,
+            gr.update(visible=gs_video_visible),
+            gr.update(visible=gs_info_visible),
+        )
+    def navigate_depth_view(
+        self,
+        processed_data: Optional[Dict[int, Dict[str, Any]]],
+        current_selector: str,
+        direction: int,
+    ) -> Tuple[str, Optional[str]]:
+        """
+        Navigate depth view.
+        Args:
+            processed_data: Processed data dictionary
+            current_selector: Current selector value
+            direction: Direction to navigate
+        Returns:
+            Tuple of (new_selector_value, depth_vis)
+        """
+        return self.visualization_handler.navigate_depth_view(
+            processed_data, current_selector, direction
+        )
+    def update_depth_view(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
+    ) -> Optional[str]:
+        """
+        Update depth view for a specific view index.
+        Args:
+            processed_data: Processed data dictionary
+            view_index: Index of the view to update
+        Returns:
+            Path to depth visualization image or None
+        """
+        return self.visualization_handler.update_depth_view(processed_data, view_index)
+    def navigate_measure_view(
+        self,
+        processed_data: Optional[Dict[int, Dict[str, Any]]],
+        current_selector: str,
+        direction: int,
+    ) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
+        """
+        Navigate measure view.
+        Args:
+            processed_data: Processed data dictionary
+            current_selector: Current selector value
+            direction: Direction to navigate
+        Returns:
+            Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
+        """
+        return self.visualization_handler.navigate_measure_view(
+            processed_data, current_selector, direction
+        )
+    def update_measure_view(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
+    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
+        """
+        Update measure view for a specific view index.
+        Args:
+            processed_data: Processed data dictionary
+            view_index: Index of the view to update
+        Returns:
+            Tuple of (measure_image, depth_right_half, measure_points)
+        """
+        return self.visualization_handler.update_measure_view(processed_data, view_index)
+    def measure(
+        self,
+        processed_data: Optional[Dict[int, Dict[str, Any]]],
+        measure_points: List,
+        current_view_selector: str,
+        event: gr.SelectData,
+    ) -> List:
+        """
+        Handle measurement on images.
+        Args:
+            processed_data: Processed data dictionary
+            measure_points: List of current measure points
+            current_view_selector: Current view selector value
+            event: Gradio select event
+        Returns:
+            List of [image, depth_right_half, measure_points, text]
+        """
+        return self.visualization_handler.measure(
+            processed_data, measure_points, current_view_selector, event
+        )
+    def select_first_frame(
+        self, image_gallery: List, selected_index: int = 0
+    ) -> Tuple[List, str, str]:
+        """
+        Select the first frame from the image gallery.
+        Args:
+            image_gallery: List of images in the gallery
+            selected_index: Index of the selected image (default: 0)
+        Returns:
+            Tuple of (updated_image_gallery, log_message, selected_frame_path)
+        """
+        try:
+            if not image_gallery or len(image_gallery) == 0:
+                return image_gallery, "No images available to select as first frame.", ""
+            # Handle None or invalid selected_index
+            if (
+                selected_index is None
+                or selected_index < 0
+                or selected_index >= len(image_gallery)
+            ):
+                selected_index = 0
+                print(f"Invalid selected_index: {selected_index}, using default: 0")
+            # Get the selected image based on index
+            selected_image = image_gallery[selected_index]
+            print(f"Selected image index: {selected_index}")
+            print(f"Total images: {len(image_gallery)}")
+            # Extract the file path from the selected image
+            selected_frame_path = ""
+            print(f"Selected image type: {type(selected_image)}")
+            print(f"Selected image: {selected_image}")
+            if isinstance(selected_image, tuple):
+                # Gradio Gallery returns tuple (path, None)
+                selected_frame_path = selected_image[0]
+            elif isinstance(selected_image, str):
+                selected_frame_path = selected_image
+            elif hasattr(selected_image, "name"):
+                selected_frame_path = selected_image.name
+            elif isinstance(selected_image, dict):
+                if "name" in selected_image:
+                    selected_frame_path = selected_image["name"]
+                elif "path" in selected_image:
+                    selected_frame_path = selected_image["path"]
+                elif "src" in selected_image:
+                    selected_frame_path = selected_image["src"]
+            else:
+                # Try to convert to string
+                selected_frame_path = str(selected_image)
+            print(f"Extracted path: {selected_frame_path}")
+            # Extract filename from the path for matching
+            import os
+            selected_filename = os.path.basename(selected_frame_path)
+            print(f"Selected filename: {selected_filename}")
+            # Move the selected image to the front
+            updated_gallery = [selected_image] + [
+                img for img in image_gallery if img != selected_image
+            ]
+            log_message = (
+                f"Selected frame: {selected_filename}. "
+                f"Moved to first position. Total frames: {len(updated_gallery)}"
+            )
+            return updated_gallery, log_message, selected_filename
+        except Exception as e:
+            print(f"Error selecting first frame: {e}")
+            return image_gallery, f"Error selecting first frame: {e}", ""

Depth-Anything-3/src/depth_anything_3/app/modules/file_handlers.py ADDED Viewed

	@@ -0,0 +1,304 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+File handling module for Depth Anything 3 Gradio app.
+This module handles file uploads, video processing, and file operations.
+"""
+import os
+import shutil
+import time
+from datetime import datetime
+from typing import List, Optional, Tuple
+import cv2
+from PIL import Image
+from pillow_heif import register_heif_opener
+register_heif_opener()
+class FileHandler:
+    """
+    Handles file uploads and processing for the Gradio app.
+    """
+    def __init__(self):
+        """Initialize the file handler."""
+    def handle_uploads(
+        self,
+        input_video: Optional[str],
+        input_images: Optional[List],
+        s_time_interval: float = 10.0,
+    ) -> Tuple[str, List[str]]:
+        """
+        Create a new 'target_dir' + 'images' subfolder, and place user-uploaded
+        images or extracted frames from video into it.
+        Args:
+            input_video: Path to input video file
+            input_images: List of input image files
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+        Returns:
+            Tuple of (target_dir, image_paths)
+        """
+        start_time = time.time()
+        # Get workspace directory from environment variable or use default
+        workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
+        if not os.path.exists(workspace_dir):
+            os.makedirs(workspace_dir)
+        # Create input_images subdirectory
+        input_images_dir = os.path.join(workspace_dir, "input_images")
+        if not os.path.exists(input_images_dir):
+            os.makedirs(input_images_dir)
+        # Create a unique folder name within input_images
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        target_dir = os.path.join(input_images_dir, f"session_{timestamp}")
+        target_dir_images = os.path.join(target_dir, "images")
+        # Clean up if somehow that folder already exists
+        if os.path.exists(target_dir):
+            shutil.rmtree(target_dir)
+        os.makedirs(target_dir)
+        os.makedirs(target_dir_images)
+        image_paths = []
+        # Handle images
+        if input_images is not None:
+            image_paths.extend(self._process_images(input_images, target_dir_images))
+        # Handle video
+        if input_video is not None:
+            image_paths.extend(
+                self._process_video(input_video, target_dir_images, s_time_interval)
+            )
+        # Sort final images for gallery
+        image_paths = sorted(image_paths)
+        end_time = time.time()
+        print(f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds")
+        return target_dir, image_paths
+    def _process_images(self, input_images: List, target_dir_images: str) -> List[str]:
+        """
+        Process uploaded images.
+        Args:
+            input_images: List of input image files
+            target_dir_images: Target directory for images
+        Returns:
+            List of processed image paths
+        """
+        image_paths = []
+        for file_data in input_images:
+            if isinstance(file_data, dict) and "name" in file_data:
+                file_path = file_data["name"]
+            else:
+                file_path = file_data
+            # Check if the file is a HEIC image
+            file_ext = os.path.splitext(file_path)[1].lower()
+            if file_ext in [".heic", ".heif"]:
+                # Convert HEIC to JPEG for better gallery compatibility
+                try:
+                    with Image.open(file_path) as img:
+                        # Convert to RGB if necessary (HEIC can have different color modes)
+                        if img.mode not in ("RGB", "L"):
+                            img = img.convert("RGB")
+                        # Create JPEG filename
+                        base_name = os.path.splitext(os.path.basename(file_path))[0]
+                        dst_path = os.path.join(target_dir_images, f"{base_name}.jpg")
+                        # Save as JPEG with high quality
+                        img.save(dst_path, "JPEG", quality=95)
+                        image_paths.append(dst_path)
+                        print(
+                            f"Converted HEIC to JPEG: {os.path.basename(file_path)} -> "
+                            f"{os.path.basename(dst_path)}"
+                        )
+                except Exception as e:
+                    print(f"Error converting HEIC file {file_path}: {e}")
+                    # Fall back to copying as is
+                    dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
+                    shutil.copy(file_path, dst_path)
+                    image_paths.append(dst_path)
+            else:
+                # Regular image files - copy as is
+                dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
+                shutil.copy(file_path, dst_path)
+                image_paths.append(dst_path)
+        return image_paths
+    def _process_video(
+        self, input_video: str, target_dir_images: str, s_time_interval: float
+    ) -> List[str]:
+        """
+        Process video file and extract frames.
+        Args:
+            input_video: Path to input video file
+            target_dir_images: Target directory for extracted frames
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+        Returns:
+            List of extracted frame paths
+        """
+        image_paths = []
+        if isinstance(input_video, dict) and "name" in input_video:
+            video_path = input_video["name"]
+        else:
+            video_path = input_video
+        vs = cv2.VideoCapture(video_path)
+        fps = vs.get(cv2.CAP_PROP_FPS)
+        frame_interval = max(1, int(fps / s_time_interval))  # Convert FPS to frame interval
+        count = 0
+        video_frame_num = 0
+        while True:
+            gotit, frame = vs.read()
+            if not gotit:
+                break
+            count += 1
+            if count % frame_interval == 0:
+                image_path = os.path.join(target_dir_images, f"{video_frame_num:06}.png")
+                cv2.imwrite(image_path, frame)
+                image_paths.append(image_path)
+                video_frame_num += 1
+        return image_paths
+    def update_gallery_on_upload(
+        self,
+        input_video: Optional[str],
+        input_images: Optional[List],
+        s_time_interval: float = 10.0,
+    ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
+        """
+        Handle file uploads and update gallery.
+        Args:
+            input_video: Path to input video file
+            input_images: List of input image files
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message)
+        """
+        if not input_video and not input_images:
+            return None, None, None, None
+        target_dir, image_paths = self.handle_uploads(input_video, input_images, s_time_interval)
+        return (
+            None,
+            target_dir,
+            image_paths,
+            "Upload complete. Click 'Reconstruct' to begin 3D processing.",
+        )
+    def load_example_scene(
+        self, scene_name: str, examples_dir: str = "examples"
+    ) -> Tuple[Optional[str], Optional[str], Optional[List], str]:
+        """
+        Load a scene from examples directory.
+        Args:
+            scene_name: Name of the scene to load
+            examples_dir: Path to examples directory
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message)
+        """
+        from depth_anything_3.app.modules.utils import get_scene_info
+        scenes = get_scene_info(examples_dir)
+        # Find the selected scene
+        selected_scene = None
+        for scene in scenes:
+            if scene["name"] == scene_name:
+                selected_scene = scene
+                break
+        if selected_scene is None:
+            return None, None, None, "Scene not found"
+        # Use fixed directory name for examples (not timestamp-based)
+        workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
+        input_images_dir = os.path.join(workspace_dir, "input_images")
+        if not os.path.exists(input_images_dir):
+            os.makedirs(input_images_dir)
+        # Create a fixed folder name based on scene name
+        target_dir = os.path.join(input_images_dir, f"example_{scene_name}")
+        target_dir_images = os.path.join(target_dir, "images")
+        # Check if already cached (GLB file exists)
+        glb_path = os.path.join(target_dir, "scene.glb")
+        is_cached = os.path.exists(glb_path)
+        # Create directory if it doesn't exist
+        if not os.path.exists(target_dir):
+            os.makedirs(target_dir)
+            os.makedirs(target_dir_images)
+        # Copy images if directory is new or empty
+        if not os.path.exists(target_dir_images) or len(os.listdir(target_dir_images)) == 0:
+            os.makedirs(target_dir_images, exist_ok=True)
+            image_paths = []
+            for file_path in selected_scene["image_files"]:
+                dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
+                shutil.copy(file_path, dst_path)
+                image_paths.append(dst_path)
+        else:
+            # Use existing images
+            image_paths = sorted(
+                [
+                    os.path.join(target_dir_images, f)
+                    for f in os.listdir(target_dir_images)
+                    if f.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"))
+                ]
+            )
+        # Return cached GLB if available
+        if is_cached:
+            return (
+                glb_path,  # Return cached reconstruction
+                target_dir,  # Set target directory
+                image_paths,  # Set gallery
+                f"Loaded cached scene '{scene_name}' with {selected_scene['num_images']} images.",
+            )
+        else:
+            return (
+                None,  # No cached reconstruction
+                target_dir,  # Set target directory
+                image_paths,  # Set gallery
+                (
+                    f"Loaded scene '{scene_name}' with {selected_scene['num_images']} images. "
+                    "Click 'Reconstruct' to begin 3D processing."
+                ),
+            )

Depth-Anything-3/src/depth_anything_3/app/modules/model_inference.py ADDED Viewed

	@@ -0,0 +1,260 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Model inference module for Depth Anything 3 Gradio app.
+This module handles all model-related operations including inference,
+data processing, and result preparation.
+"""
+import glob
+import os
+from typing import Any, Dict, Optional, Tuple
+import numpy as np
+import torch
+from depth_anything_3.api import DepthAnything3
+from depth_anything_3.utils.memory import cleanup_cuda_memory
+from depth_anything_3.utils.export.glb import export_to_glb
+from depth_anything_3.utils.export.gs import export_to_gs_video
+class ModelInference:
+    """
+    Handles model inference and data processing for Depth Anything 3.
+    """
+    def __init__(self):
+        """Initialize the model inference handler."""
+        self.model = None
+    def initialize_model(self, device: str = "cuda") -> None:
+        """
+        Initialize the DepthAnything3 model.
+        Args:
+            device: Device to load the model on
+        """
+        if self.model is None:
+            # Get model directory from environment variable or use default
+            model_dir = os.environ.get(
+                "DA3_MODEL_DIR", "/dev/shm/da3_models/DA3HF-VITG-METRIC_VITL"
+            )
+            self.model = DepthAnything3.from_pretrained(model_dir)
+            self.model = self.model.to(device)
+        else:
+            self.model = self.model.to(device)
+        self.model.eval()
+    def run_inference(
+        self,
+        target_dir: str,
+        filter_black_bg: bool = False,
+        filter_white_bg: bool = False,
+        process_res_method: str = "upper_bound_resize",
+        show_camera: bool = True,
+        save_percentage: float = 30.0,
+        num_max_points: int = 1_000_000,
+        infer_gs: bool = False,
+        ref_view_strategy: str = "saddle_balanced",
+        gs_trj_mode: str = "extend",
+        gs_video_quality: str = "high",
+    ) -> Tuple[Any, Dict[int, Dict[str, Any]]]:
+        """
+        Run DepthAnything3 model inference on images.
+        Args:
+            target_dir: Directory containing images
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            process_res_method: Method for resizing input images
+            show_camera: Whether to show camera in 3D view
+            save_percentage: Percentage of points to save (0-100)
+            num_max_points: Maximum number of points in point cloud
+            infer_gs: Whether to infer 3D Gaussian Splatting
+            ref_view_strategy: Reference view selection strategy
+            gs_trj_mode: Trajectory mode for 3DGS
+            gs_video_quality: Video quality for 3DGS
+        Returns:
+            Tuple of (prediction, processed_data)
+        """
+        print(f"Processing images from {target_dir}")
+        # Device check
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        device = torch.device(device)
+        # Initialize model if needed
+        self.initialize_model(device)
+        # Get image paths
+        print("Loading images...")
+        image_folder_path = os.path.join(target_dir, "images")
+        all_image_paths = sorted(glob.glob(os.path.join(image_folder_path, "*")))
+        # Filter for image files
+        image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
+        all_image_paths = [
+            path
+            for path in all_image_paths
+            if any(path.lower().endswith(ext) for ext in image_extensions)
+        ]
+        print(f"Found {len(all_image_paths)} images")
+        print(f"All image paths: {all_image_paths}")
+        # Use sorted image order (reference view will be selected automatically)
+        image_paths = all_image_paths
+        print(f"Reference view selection strategy: {ref_view_strategy}")
+        if len(image_paths) == 0:
+            raise ValueError("No images found. Check your upload.")
+        # Map UI options to actual method names
+        method_mapping = {"high_res": "lower_bound_resize", "low_res": "upper_bound_resize"}
+        actual_method = method_mapping.get(process_res_method, "upper_bound_crop")
+        # Run model inference
+        print(f"Running inference with method: {actual_method}")
+        with torch.no_grad():
+            prediction = self.model.inference(
+                image_paths,
+                export_dir=None,
+                process_res_method=actual_method,
+                infer_gs=infer_gs,
+                ref_view_strategy=ref_view_strategy,
+            )
+        # num_max_points: int = 1_000_000,
+        export_to_glb(
+            prediction,
+            filter_black_bg=filter_black_bg,
+            filter_white_bg=filter_white_bg,
+            export_dir=target_dir,
+            show_cameras=show_camera,
+            conf_thresh_percentile=save_percentage,
+            num_max_points=int(num_max_points),
+        )
+        # export to gs video if needed
+        if infer_gs:
+            mode_mapping = {"extend": "extend", "smooth": "interpolate_smooth"}
+            print(f"GS mode: {gs_trj_mode}; Backend mode: {mode_mapping[gs_trj_mode]}")
+            export_to_gs_video(
+                prediction,
+                export_dir=target_dir,
+                chunk_size=4,
+                trj_mode=mode_mapping.get(gs_trj_mode, "extend"),
+                enable_tqdm=True,
+                vis_depth="hcat",
+                video_quality=gs_video_quality,
+            )
+        # Save predictions.npz for caching metric depth data
+        self._save_predictions_cache(target_dir, prediction)
+        # Process results
+        processed_data = self._process_results(target_dir, prediction, image_paths)
+        # Clean up using centralized memory utilities for consistency with backend
+        cleanup_cuda_memory()
+        return prediction, processed_data
+    def _save_predictions_cache(self, target_dir: str, prediction: Any) -> None:
+        """
+        Write the available prediction arrays to ``predictions.npz`` in ``target_dir``.
+        Saving is best-effort: any failure is reported on stdout and swallowed so the
+        caller can continue without the cache.
+        Args:
+            target_dir: Directory that receives the compressed ``predictions.npz`` file
+            prediction: Object exposing ``processed_images``, ``depth``, ``conf``,
+                ``extrinsics`` and ``intrinsics``; attributes set to None are skipped
+        """
+        try:
+            cache_path = os.path.join(target_dir, "predictions.npz")
+            # (npz key, value, decimals to round to; None stores the value untouched)
+            candidates = (
+                ("images", prediction.processed_images, None),
+                ("depths", prediction.depth, 6),
+                ("conf", prediction.conf, 2),
+                ("extrinsics", prediction.extrinsics, None),
+                ("intrinsics", prediction.intrinsics, None),
+            )
+            payload = {
+                key: value if decimals is None else np.round(value, decimals)
+                for key, value, decimals in candidates
+                if value is not None
+            }
+            np.savez_compressed(cache_path, **payload)
+            print(f"Saved predictions cache to: {cache_path}")
+        except Exception as err:
+            print(f"Warning: Failed to save predictions cache: {err}")
+    def _process_results(
+        self, target_dir: str, prediction: Any, image_paths: list
+    ) -> Dict[int, Dict[str, Any]]:
+        """
+        Process model results into structured per-view data.
+        One entry is produced for each ``*.jpg`` found in ``<target_dir>/depth_vis``
+        (sorted by path). Per-view values are looked up by that position; any source
+        that is None or shorter than the number of depth images yields None for the
+        affected views instead of raising.
+        Args:
+            target_dir: Directory containing results
+            prediction: Model prediction object exposing ``processed_images``,
+                ``depth`` and ``intrinsics`` (each may be None)
+            image_paths: List of input image paths
+        Returns:
+            Dictionary mapping view index to a dict with the keys ``depth_image``,
+            ``image``, ``original_image_path``, ``depth``, ``intrinsics`` and ``mask``
+        """
+        processed_data: Dict[int, Dict[str, Any]] = {}
+        depth_vis_dir = os.path.join(target_dir, "depth_vis")
+        if not os.path.exists(depth_vis_dir):
+            return processed_data
+        def _item_or_none(sequence: Any, index: int) -> Any:
+            # Tolerate missing (None) or too-short sequences uniformly.
+            if sequence is None or index >= len(sequence):
+                return None
+            return sequence[index]
+        # Read generated depth visualization files
+        depth_files = sorted(glob.glob(os.path.join(depth_vis_dir, "*.jpg")))
+        for i, depth_file in enumerate(depth_files):
+            processed_data[i] = {
+                "depth_image": depth_file,
+                # Use processed images directly from API
+                "image": _item_or_none(prediction.processed_images, i),
+                "original_image_path": image_paths[i] if i < len(image_paths) else None,
+                # depth is optional, like the other prediction fields
+                "depth": _item_or_none(prediction.depth, i),
+                "intrinsics": _item_or_none(prediction.intrinsics, i),
+                "mask": None,  # No mask information available
+            }
+        return processed_data
+    # cleanup() removed: call cleanup_cuda_memory() directly where needed.

Depth-Anything-3/src/depth_anything_3/app/modules/ui_components.py ADDED Viewed

	@@ -0,0 +1,477 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+UI components module for Depth Anything 3 Gradio app.
+This module contains UI component definitions and layout functions.
+"""
+import os
+from typing import Any, Dict, List, Tuple
+import gradio as gr
+from depth_anything_3.app.modules.utils import get_logo_base64, get_scene_info
+class UIComponents:
+    """
+    Handles UI component creation and layout for the Gradio app.
+    """
+    def __init__(self):
+        """Initialize the UI components handler (no state is set up here)."""
+    def create_upload_section(self) -> Tuple[gr.Video, gr.Slider, gr.File, gr.Gallery]:
+        """
+        Build the input widgets: video upload, sampling slider, image upload and preview gallery.
+        Components are instantiated in the same order in which they are returned.
+        Returns:
+            A tuple of Gradio components: (input_video, s_time_interval, input_images, image_gallery).
+        """
+        video_upload = gr.Video(label="Upload Video", interactive=True)
+        # Slider presented to the user as the frame sampling rate.
+        slider_options = {
+            "minimum": 0.1,
+            "maximum": 60,
+            "value": 10,
+            "step": 0.1,
+            "label": "Sampling FPS (Frames Per Second)",
+            "interactive": True,
+            "visible": True,
+        }
+        sampling_slider = gr.Slider(**slider_options)
+        image_upload = gr.File(file_count="multiple", label="Upload Images", interactive=True)
+        # Read-only preview of the uploaded inputs.
+        gallery_options = {
+            "label": "Preview",
+            "columns": 4,
+            "height": "300px",
+            "show_download_button": True,
+            "object_fit": "contain",
+            "preview": True,
+            "interactive": False,
+        }
+        preview_gallery = gr.Gallery(**gallery_options)
+        return video_upload, sampling_slider, image_upload, preview_gallery
+    def create_3d_viewer_section(self) -> gr.Model3D:
+        """
+        Build the 3D model viewer widget.
+        Returns:
+            The configured gr.Model3D component
+        """
+        viewer_options = {
+            "height": 520,
+            "zoom_speed": 0.5,
+            "pan_speed": 0.5,
+            # RGBA clear colour with zero alpha
+            "clear_color": [0.0, 0.0, 0.0, 0.0],
+            "key": "persistent_3d_viewer",
+            "elem_id": "reconstruction_3d_viewer",
+        }
+        viewer = gr.Model3D(**viewer_options)
+        return viewer
+    def create_nvs_video(self) -> Tuple[gr.Video, gr.Markdown]:
+        """
+        Build the 3DGS video player together with the notice shown while rendering is off.
+        The notice starts visible and the video starts hidden; both live in one column.
+        Returns:
+            Tuple of (video component, info message component)
+        """
+        disabled_notice = (
+            "‼️ **3D Gaussian Splatting rendering is currently DISABLED.** <br><br><br>"
+            "To render novel views from 3DGS, "
+            "enable **Infer 3D Gaussian Splatting** below. <br>"
+            "Next, in **Visualization Options**, "
+            "*optionally* configure the **rendering trajectory** (default: smooth) "
+            "and **video quality** (default: low), "
+            "then click **Reconstruct**."
+        )
+        panel_height = 520
+        with gr.Column():
+            info_panel = gr.Markdown(disabled_notice, visible=True, height=panel_height)
+            video_panel = gr.Video(
+                height=panel_height,
+                label="3DGS Rendered NVS Video (depth shown for reference only)",
+                interactive=False,
+                visible=False,
+            )
+        return video_panel, info_panel
+    def create_depth_section(self) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image]:
+        """
+        Build the depth tab: a previous/selector/next navigation row above the depth image.
+        Returns:
+            A tuple of (prev_depth_btn, depth_view_selector, next_depth_btn, depth_map)
+        """
+        initial_view = "View 1"
+        with gr.Row(elem_classes=["navigation-row"]):
+            back_button = gr.Button("◀ Previous", size="sm", scale=1)
+            # Starts with a single placeholder entry.
+            view_dropdown = gr.Dropdown(
+                choices=[initial_view],
+                value=initial_view,
+                label="Select View",
+                scale=2,
+                interactive=True,
+                allow_custom_value=True,
+            )
+            forward_button = gr.Button("Next ▶", size="sm", scale=1)
+        depth_image = gr.Image(
+            type="numpy", label="Colorized Depth Map", format="png", interactive=False
+        )
+        return back_button, view_dropdown, forward_button, depth_image
+    def create_measure_section(
+        self,
+    ) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image, gr.Image, gr.Markdown]:
+        """
+        Build the measurement tab: instructions, navigation row, RGB/depth image pair,
+        a usage note and the (initially empty) measurement read-out.
+        Returns:
+            A tuple of (prev_measure_btn, measure_view_selector, next_measure_btn, measure_image,
+            measure_depth_image, measure_text)
+        """
+        from depth_anything_3.app.css_and_html import MEASURE_INSTRUCTIONS_HTML
+        def _image_panel(panel_label: str) -> gr.Image:
+            # Both panels share every setting except their label.
+            return gr.Image(
+                type="numpy",
+                show_label=False,
+                format="webp",
+                interactive=False,
+                sources=[],
+                label=panel_label,
+                scale=1,
+                height=275,
+            )
+        initial_view = "View 1"
+        gr.Markdown(MEASURE_INSTRUCTIONS_HTML)
+        with gr.Row(elem_classes=["navigation-row"]):
+            back_button = gr.Button("◀ Previous", size="sm", scale=1)
+            view_dropdown = gr.Dropdown(
+                choices=[initial_view],
+                value=initial_view,
+                label="Select View",
+                scale=2,
+                interactive=True,
+                allow_custom_value=True,
+            )
+            forward_button = gr.Button("Next ▶", size="sm", scale=1)
+        with gr.Row():
+            rgb_panel = _image_panel("RGB Image")
+            depth_panel = _image_panel("Depth Visualization (Right Half)")
+        gr.Markdown(
+            "**Note:** Images have been adjusted to model processing size. "
+            "Click two points on the RGB image to measure distance."
+        )
+        readout = gr.Markdown("")
+        return back_button, view_dropdown, forward_button, rgb_panel, depth_panel, readout
+    def create_inference_control_section(self) -> Tuple[gr.Dropdown, gr.Checkbox, gr.Dropdown]:
+        """
+        Build the row of options that are chosen before inference is run.
+        Returns:
+            Tuple of (process_res_method_dropdown, infer_gs, ref_view_strategy)
+        """
+        # The checkbox hint embeds an icon tag styled with the fa-color-red class.
+        gs_hint = (
+            'Enable novel view rendering from 3DGS (<i class="fas fa-triangle-exclamation '
+            'fa-color-red"></i> requires extra processing time)'
+        )
+        strategy_names = ["saddle_balanced", "saddle_sim_range", "first", "middle"]
+        with gr.Row():
+            resolution_dropdown = gr.Dropdown(
+                choices=["high_res", "low_res"],
+                value="low_res",
+                label="Image Processing Method",
+                info="low_res for much more images",
+                scale=1,
+            )
+            gs_checkbox = gr.Checkbox(
+                label="Infer 3D Gaussian Splatting", value=False, info=gs_hint, scale=1
+            )
+            strategy_dropdown = gr.Dropdown(
+                choices=strategy_names,
+                value="saddle_balanced",
+                label="Reference View Strategy",
+                info="Strategy for selecting reference view from multiple inputs",
+                scale=1,
+            )
+        return resolution_dropdown, gs_checkbox, strategy_dropdown
+    def create_display_control_section(
+        self,
+    ) -> Tuple[
+        gr.Checkbox,
+        gr.Checkbox,
+        gr.Checkbox,
+        gr.Slider,
+        gr.Slider,
+        gr.Dropdown,
+        gr.Dropdown,
+        gr.Button,
+        gr.ClearButton,
+    ]:
+        """
+        Build the display controls: 3DGS options, action buttons and visualization options.
+        The two 3DGS dropdowns are created hidden (``visible=False``).
+        Returns:
+            Tuple of (show_cam, filter_black_bg, filter_white_bg, save_percentage,
+            num_max_points, gs_trj_mode, gs_video_quality, submit_btn, clear_btn)
+        """
+        with gr.Column():
+            # 3DGS options come first in the column
+            with gr.Row():
+                trajectory_dropdown = gr.Dropdown(
+                    choices=["smooth", "extend"],
+                    value="smooth",
+                    label="Rendering trajectory for 3DGS viewpoints (requires n_views ≥ 2)",
+                    info="'smooth' for view interpolation; 'extend' for longer trajectory",
+                    visible=False,
+                )
+                quality_dropdown = gr.Dropdown(
+                    choices=["low", "medium", "high"],
+                    value="low",
+                    label="Video quality for 3DGS rendered outputs",
+                    info="'low' for faster loading speed; 'high' for better visual quality",
+                    visible=False,
+                )
+            # Action buttons sit above the visualization options
+            with gr.Row():
+                reconstruct_button = gr.Button("Reconstruct", scale=1, variant="primary")
+                reset_button = gr.ClearButton(scale=1)
+            gr.Markdown("### Visualization Options: (Click Reconstruct to update)")
+            camera_checkbox = gr.Checkbox(label="Show Camera", value=True)
+            black_bg_checkbox = gr.Checkbox(label="Filter Black Background", value=False)
+            white_bg_checkbox = gr.Checkbox(label="Filter White Background", value=False)
+            percentage_slider = gr.Slider(
+                minimum=0,
+                maximum=100,
+                value=10,
+                step=1,
+                label="Filter Percentage",
+                info="Confidence Threshold (%): Higher values filter more points.",
+            )
+            max_points_slider = gr.Slider(
+                minimum=1000,
+                maximum=100000,
+                value=1000,
+                step=1000,
+                label="Max Points (K points)",
+                info="Maximum number of points to export to GLB (in thousands)",
+            )
+        controls = (
+            camera_checkbox,
+            black_bg_checkbox,
+            white_bg_checkbox,
+            percentage_slider,
+            max_points_slider,
+            trajectory_dropdown,
+            quality_dropdown,
+            reconstruct_button,
+            reset_button,
+        )
+        return controls
+    def create_control_section(
+        self,
+    ) -> Tuple[
+        gr.Button,
+        gr.ClearButton,
+        gr.Dropdown,
+        gr.Checkbox,
+        gr.Checkbox,
+        gr.Checkbox,
+        gr.Checkbox,
+        gr.Checkbox,
+        gr.Dropdown,
+        gr.Checkbox,
+        gr.Textbox,
+    ]:
+        """
+        Build the control panel: action buttons, frame filter, visualization options,
+        reconstruction options and the save-to-gallery inputs.
+        Returns:
+            Tuple of (submit_btn, clear_btn, frame_filter, show_cam, show_mesh,
+            filter_black_bg, filter_white_bg, apply_mask_checkbox,
+            process_res_method_dropdown, save_to_gallery_checkbox, gallery_name_input)
+        """
+        resize_methods = [
+            "upper_bound_resize",
+            "upper_bound_crop",
+            "lower_bound_resize",
+            "lower_bound_crop",
+        ]
+        with gr.Row():
+            reconstruct_button = gr.Button("Reconstruct", scale=1, variant="primary")
+            reset_button = gr.ClearButton(scale=1)
+        with gr.Row():
+            frame_dropdown = gr.Dropdown(
+                choices=["All"], value="All", label="Show Points from Frame"
+            )
+            with gr.Column():
+                gr.Markdown("### Visualization Option: (Click Reconstruct to update)")
+                camera_checkbox = gr.Checkbox(label="Show Camera", value=True)
+                mesh_checkbox = gr.Checkbox(label="Show Mesh", value=True)
+                black_bg_checkbox = gr.Checkbox(label="Filter Black Background", value=False)
+                white_bg_checkbox = gr.Checkbox(label="Filter White Background", value=False)
+                gr.Markdown("### Reconstruction Options: (updated on next run)")
+                mask_checkbox = gr.Checkbox(
+                    label="Apply mask for predicted ambiguous depth classes & edges",
+                    value=True,
+                )
+                resize_dropdown = gr.Dropdown(
+                    choices=resize_methods,
+                    value="upper_bound_resize",
+                    label="Image Processing Method",
+                    info="Method for resizing input images",
+                )
+                gallery_checkbox = gr.Checkbox(
+                    label="Save to Gallery",
+                    value=False,
+                    info="Save current reconstruction results to gallery directory",
+                )
+                gallery_textbox = gr.Textbox(
+                    label="Gallery Name",
+                    placeholder="Enter a name for the gallery folder",
+                    value="",
+                    info="Leave empty for auto-generated name with timestamp",
+                )
+        controls = (
+            reconstruct_button,
+            reset_button,
+            frame_dropdown,
+            camera_checkbox,
+            mesh_checkbox,
+            black_bg_checkbox,
+            white_bg_checkbox,
+            mask_checkbox,
+            resize_dropdown,
+            gallery_checkbox,
+            gallery_textbox,
+        )
+        return controls
+    def create_example_scenes_section(self) -> List[Dict[str, Any]]:
+        """
+        Collect the example scenes found under ``<workspace>/examples``.
+        The workspace is read from the ``DA3_WORKSPACE_DIR`` environment variable and
+        falls back to ``gradio_workspace`` when the variable is unset.
+        Returns:
+            List of scene information dictionaries
+        """
+        workspace_root = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
+        return get_scene_info(os.path.join(workspace_root, "examples"))
+    def create_example_scene_grid(self, scenes: List[Dict[str, Any]]) -> List[gr.Image]:
+        """
+        Lay the example scenes out as rows of four thumbnail cards.
+        Each card holds a thumbnail image and a caption with the scene name and image
+        count; a short final row is padded with empty columns so the grid stays aligned.
+        Args:
+            scenes: List of scene information dictionaries
+        Returns:
+            List of scene image components
+        """
+        thumbnails = []
+        if not scenes:
+            return thumbnails
+        cards_per_row = 4
+        for row_start in range(0, len(scenes), cards_per_row):
+            row_scenes = scenes[row_start : row_start + cards_per_row]
+            with gr.Row():
+                for scene in row_scenes:
+                    with gr.Column(scale=1, elem_classes=["clickable-thumbnail"]):
+                        # Thumbnail first, caption underneath
+                        thumbnail = gr.Image(
+                            value=scene["thumbnail"],
+                            height=150,
+                            interactive=False,
+                            show_label=False,
+                            elem_id=f"scene_thumb_{scene['name']}",
+                            sources=[],
+                        )
+                        thumbnails.append(thumbnail)
+                        gr.Markdown(
+                            f"**{scene['name']}** \n {scene['num_images']} images",
+                            elem_classes=["scene-info"],
+                        )
+                # Fill the remaining slots of a short row with blank columns
+                for _ in range(cards_per_row - len(row_scenes)):
+                    with gr.Column(scale=1):
+                        pass
+        return thumbnails
+    def create_header_section(self) -> gr.HTML:
+        """
+        Build the page header from the header template and the base64-encoded logo.
+        Returns:
+            Header HTML component
+        """
+        from depth_anything_3.app.css_and_html import get_header_html
+        logo_data = get_logo_base64()
+        header_markup = get_header_html(logo_data)
+        return gr.HTML(header_markup)
+    def create_description_section(self) -> gr.HTML:
+        """
+        Build the description block from the description template.
+        Returns:
+            Description HTML component
+        """
+        from depth_anything_3.app.css_and_html import get_description_html
+        description_markup = get_description_html()
+        return gr.HTML(description_markup)
+    def create_acknowledgements_section(self) -> gr.HTML:
+        """
+        Build the acknowledgements block from the acknowledgements template.
+        Returns:
+            Acknowledgements HTML component
+        """
+        from depth_anything_3.app.css_and_html import get_acknowledgements_html
+        acknowledgements_markup = get_acknowledgements_html()
+        return gr.HTML(acknowledgements_markup)

Depth-Anything-3/src/depth_anything_3/app/modules/utils.py ADDED Viewed

	@@ -0,0 +1,207 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utility functions for Depth Anything 3 Gradio app.
+This module contains helper functions for data processing, visualization,
+and file operations.
+"""
+import json
+import os
+import shutil
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+def create_depth_visualization(depth: Optional[np.ndarray]) -> Optional[np.ndarray]:
+    """
+    Create a colored depth visualization.
+    The depth is normalized to [0, 1] between the smallest strictly positive value
+    and the maximum, then mapped through matplotlib's viridis colormap.
+    Args:
+        depth: Depth array, or None. The colormap output is sliced as
+            ``[:, :, :3]``, so a 2D array is expected.
+    Returns:
+        ``uint8`` RGB image, or None when ``depth`` is None, empty, or has no usable
+        range (maximum not greater than the normalization minimum)
+    """
+    # An empty array has no maximum; treat it like missing input instead of raising.
+    if depth is None or depth.size == 0:
+        return None
+    # Non-positive values are excluded when choosing the lower bound.
+    positive = depth > 0
+    depth_min = depth[positive].min() if positive.any() else 0
+    depth_max = depth.max()
+    if depth_max <= depth_min:
+        return None
+    # Values below depth_min (the non-positive ones) are clipped to 0.
+    depth_norm = np.clip((depth - depth_min) / (depth_max - depth_min), 0, 1)
+    # Imported lazily so matplotlib is only needed when a visualization is built.
+    import matplotlib.cm as cm
+    depth_colored = cm.viridis(depth_norm)[:, :, :3]  # Remove alpha channel
+    return (depth_colored * 255).astype(np.uint8)
+def save_to_gallery_func(
+    target_dir: str, processed_data: Dict[int, Dict[str, Any]], gallery_name: Optional[str] = None
+) -> Tuple[bool, str]:
+    """
+    Save the current reconstruction results to the gallery directory.
+    The gallery root is taken from the ``DA3_GALLERY_DIR`` environment variable
+    (default ``workspace/gallery``). ``scene.glb``, ``scene.jpg``, ``depth_vis/`` and
+    ``images/`` are copied from ``target_dir`` when present, and a ``metadata.json``
+    is written next to them. Failures never raise: they are reported through the
+    returned tuple, and a folder created by a failed call is removed again so the
+    same name can be retried.
+    Args:
+        target_dir: Source directory containing reconstruction results
+        processed_data: Processed data dictionary (only its length is recorded)
+        gallery_name: Name for the gallery folder; None or blank selects a
+            timestamped ``reconstruction_<YYYYmmdd_HHMMSS>`` name
+    Returns:
+        Tuple of (success, message)
+    """
+    gallery_path = None
+    created_here = False
+    try:
+        # Get gallery directory from environment variable or use default
+        gallery_dir = os.environ.get("DA3_GALLERY_DIR", "workspace/gallery")
+        os.makedirs(gallery_dir, exist_ok=True)
+        # Use provided name or create a unique name
+        if gallery_name is None or gallery_name.strip() == "":
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            gallery_name = f"reconstruction_{timestamp}"
+        gallery_path = os.path.join(gallery_dir, gallery_name)
+        # The name may be user supplied: refuse anything that resolves outside the
+        # gallery root ("..", absolute paths) before touching the filesystem.
+        gallery_root = os.path.abspath(gallery_dir)
+        resolved_path = os.path.abspath(gallery_path)
+        if os.path.commonpath([gallery_root, resolved_path]) != gallery_root:
+            return False, f"Save failed: invalid gallery name '{gallery_name}'"
+        # Check if directory already exists
+        if os.path.exists(gallery_path):
+            return False, f"Save failed: folder '{gallery_name}' already exists"
+        # Create the gallery directory (raises if it appeared in the meantime)
+        os.makedirs(gallery_path)
+        created_here = True
+        # Copy the single-file artifacts that exist; each one is optional
+        for file_name in ("scene.glb", "scene.jpg"):
+            source_file = os.path.join(target_dir, file_name)
+            if os.path.exists(source_file):
+                shutil.copy2(source_file, os.path.join(gallery_path, file_name))
+        # Copy depth visualizations and original images
+        for folder_name in ("depth_vis", "images"):
+            source_folder = os.path.join(target_dir, folder_name)
+            if os.path.exists(source_folder):
+                shutil.copytree(source_folder, os.path.join(gallery_path, folder_name))
+        # Save metadata
+        metadata = {
+            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
+            "num_images": len(processed_data) if processed_data else 0,
+            "gallery_name": gallery_name,
+        }
+        with open(os.path.join(gallery_path, "metadata.json"), "w") as f:
+            json.dump(metadata, f, indent=2)
+        print(f"Saved reconstruction to gallery: {gallery_path}")
+        return True, f"Save successful: saved to {gallery_path}"
+    except Exception as e:
+        print(f"Error saving to gallery: {e}")
+        # Do not leave a half-written folder behind; it would block a retry.
+        if created_here and gallery_path is not None:
+            shutil.rmtree(gallery_path, ignore_errors=True)
+        return False, f"Save failed: {str(e)}"
+def get_scene_info(examples_dir: str) -> List[Dict[str, Any]]:
+    """
+    Get information about scenes in the examples directory.
+    Every sub-directory that contains at least one image becomes a scene. Image
+    extensions are compared case-insensitively, so each file is counted exactly
+    once regardless of how its extension is capitalized or whether the filesystem
+    is case-sensitive. Entries whose name starts with a dot are ignored.
+    Args:
+        examples_dir: Path to examples directory
+    Returns:
+        List of scene information dictionaries (sorted by folder name) with the keys
+        ``name``, ``path``, ``thumbnail`` (first image in sorted order),
+        ``num_images`` and ``image_files``; empty when ``examples_dir`` is missing
+    """
+    scenes: List[Dict[str, Any]] = []
+    if not os.path.isdir(examples_dir):
+        return scenes
+    image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif")
+    for scene_folder in sorted(os.listdir(examples_dir)):
+        scene_path = os.path.join(examples_dir, scene_folder)
+        if not os.path.isdir(scene_path):
+            continue
+        # Find all image files in the scene folder, sorted so the thumbnail is stable
+        image_files = sorted(
+            os.path.join(scene_path, entry)
+            for entry in os.listdir(scene_path)
+            if not entry.startswith(".") and entry.lower().endswith(image_extensions)
+        )
+        if not image_files:
+            continue
+        scenes.append(
+            {
+                "name": scene_folder,
+                "path": scene_path,
+                "thumbnail": image_files[0],
+                "num_images": len(image_files),
+                "image_files": image_files,
+            }
+        )
+    return scenes
+# NOTE: cleanup was moved to a single canonical helper in
+# `depth_anything_3.utils.memory.cleanup_cuda_memory`.
+# Callers should import and call that directly instead of using this module.
+def get_logo_base64() -> Optional[str]:
+    """
+    Read the WAI logo and encode it as a base64 ``data:`` URL for embedding in HTML.
+    The logo path is relative, so it is resolved against the current working directory.
+    Returns:
+        ``data:image/png;base64,...`` string, or None when the logo file is not found
+    """
+    import base64
+    try:
+        with open("examples/WAI-Logo/wai_logo.png", "rb") as logo_file:
+            encoded_logo = base64.b64encode(logo_file.read()).decode()
+    except FileNotFoundError:
+        return None
+    return f"data:image/png;base64,{encoded_logo}"

Depth-Anything-3/src/depth_anything_3/app/modules/visualization.py ADDED Viewed

	@@ -0,0 +1,434 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Visualization module for Depth Anything 3 Gradio app.
+This module handles visualization updates, navigation, and measurement functionality.
+"""
+import os
+from typing import Any, Dict, List, Optional, Tuple
+import cv2
+import gradio as gr
+import numpy as np
+class VisualizationHandler:
+    """
+    Handles visualization updates and navigation for the Gradio app.
+    """
+    def __init__(self):
+        """Initialize the visualization handler (no state is set up here)."""
+    def update_view_selectors(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]]
+    ) -> Tuple[gr.Dropdown, gr.Dropdown]:
+        """
+        Rebuild both view selectors so they list one "View N" entry per available view.
+        With no data, a single "View 1" placeholder is offered. The first entry is
+        always selected.
+        Args:
+            processed_data: Processed data dictionary
+        Returns:
+            Tuple of (depth_view_selector, measure_view_selector)
+        """
+        view_count = 0
+        if processed_data is not None:
+            view_count = len(processed_data)
+        # Fall back to the placeholder when there is nothing to list.
+        labels = [f"View {number}" for number in range(1, view_count + 1)] or ["View 1"]
+        depth_selector = gr.Dropdown(choices=labels, value=labels[0])
+        measure_selector = gr.Dropdown(choices=labels, value=labels[0])
+        return depth_selector, measure_selector
+    def get_view_data_by_index(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Look up a view by its position among the dictionary's keys.
+        An out-of-range position (negative or past the end) falls back to the first view.
+        Args:
+            processed_data: Processed data dictionary
+            view_index: Position of the view in key order
+        Returns:
+            View data dictionary, or None when there is no data
+        """
+        if processed_data is None or len(processed_data) == 0:
+            return None
+        ordered_keys = list(processed_data)
+        position = view_index if 0 <= view_index < len(ordered_keys) else 0
+        return processed_data[ordered_keys[position]]
+    def update_depth_view(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
+    ) -> Optional[str]:
+        """
+        Fetch the depth visualization stored for one view.
+        Args:
+            processed_data: Processed data dictionary
+            view_index: Index of the view to show
+        Returns:
+            The view's ``depth_image`` entry, or None when the view or the entry is missing
+        """
+        selected_view = self.get_view_data_by_index(processed_data, view_index)
+        if selected_view is None:
+            return None
+        # .get() covers both a missing key and an explicit None value.
+        return selected_view.get("depth_image")
+    def navigate_depth_view(
+        self,
+        processed_data: Optional[Dict[int, Dict[str, Any]]],
+        current_selector_value: str,
+        direction: int,
+    ) -> Tuple[str, Optional[str]]:
+        """
+        Navigate depth view (direction: -1 for previous, +1 for next).
+        Navigation wraps around at both ends. A selector value that cannot be parsed
+        as "View N" is treated as the first view.
+        Args:
+            processed_data: Processed data dictionary
+            current_selector_value: Current selector value, expected as "View N"
+            direction: Direction to navigate (-1 for previous, +1 for next)
+        Returns:
+            Tuple of (new_selector_value, depth_vis); ("View 1", None) when there is no data
+        """
+        if processed_data is None or len(processed_data) == 0:
+            return "View 1", None
+        # Parse current view number. Only parse failures are tolerated: a non-string
+        # value (AttributeError), a missing number (IndexError) or a non-numeric
+        # one (ValueError). Anything else, e.g. KeyboardInterrupt, must propagate.
+        try:
+            current_view = int(current_selector_value.split()[1]) - 1
+        except (AttributeError, IndexError, ValueError):
+            current_view = 0
+        num_views = len(processed_data)
+        # Modulo wraps past either end and also normalizes an out-of-range start.
+        new_view = (current_view + direction) % num_views
+        new_selector_value = f"View {new_view + 1}"
+        depth_vis = self.update_depth_view(processed_data, new_view)
+        return new_selector_value, depth_vis
+    def update_measure_view(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
+    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
+        """
+        Update measure view for a specific view index.
+        Args:
+            processed_data: Processed data dictionary
+            view_index: Index of the view to update
+        Returns:
+            Tuple of (measure_image, depth_right_half, measure_points)
+        """
+        view_data = self.get_view_data_by_index(processed_data, view_index)
+        if view_data is None:
+            return None, None, []  # image, depth_right_half, measure_points
+        # Get the processed (resized) image
+        if "image" in view_data and view_data["image"] is not None:
+            image = view_data["image"].copy()
+        else:
+            return None, None, []
+        # Ensure image is in uint8 format
+        if image.dtype != np.uint8:
+            if image.max() <= 1.0:
+                image = (image * 255).astype(np.uint8)
+            else:
+                image = image.astype(np.uint8)
+        # Extract right half of the depth visualization (pure depth part)
+        depth_image_path = view_data.get("depth_image", None)
+        depth_right_half = None
+        if depth_image_path and os.path.exists(depth_image_path):
+            try:
+                # Load the combined depth visualization image
+                depth_combined = cv2.imread(depth_image_path)
+                depth_combined = cv2.cvtColor(depth_combined, cv2.COLOR_BGR2RGB)
+                if depth_combined is not None:
+                    height, width = depth_combined.shape[:2]
+                    # Extract right half (depth visualization part)
+                    depth_right_half = depth_combined[:, width // 2 :]
+            except Exception as e:
+                print(f"Error extracting depth right half: {e}")
+        return image, depth_right_half, []
+    def navigate_measure_view(
+        self,
+        processed_data: Optional[Dict[int, Dict[str, Any]]],
+        current_selector_value: str,
+        direction: int,
+    ) -> Tuple[str, Optional[np.ndarray], Optional[str], List]:
+        """
+        Navigate measure view (direction: -1 for previous, +1 for next).
+        Args:
+            processed_data: Processed data dictionary
+            current_selector_value: Current selector value
+            direction: Direction to navigate (-1 for previous, +1 for next)
+        Returns:
+            Tuple of (new_selector_value, measure_image, depth_image_path, measure_points)
+        """
+        if processed_data is None or len(processed_data) == 0:
+            return "View 1", None, None, []
+        # Parse current view number
+        try:
+            current_view = int(current_selector_value.split()[1]) - 1
+        except:  # noqa
+            current_view = 0
+        num_views = len(processed_data)
+        new_view = (current_view + direction) % num_views
+        new_selector_value = f"View {new_view + 1}"
+        measure_image, depth_right_half, measure_points = self.update_measure_view(
+            processed_data, new_view
+        )
+        return new_selector_value, measure_image, depth_right_half, measure_points
+    def populate_visualization_tabs(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]]
+    ) -> Tuple[Optional[str], Optional[np.ndarray], Optional[str], List]:
+        """
+        Populate the depth and measure tabs with processed data.
+        Args:
+            processed_data: Processed data dictionary
+        Returns:
+            Tuple of (depth_vis, measure_img, depth_image_path, measure_points)
+        """
+        if processed_data is None or len(processed_data) == 0:
+            return None, None, None, []
+        # Use update function to get depth visualization
+        depth_vis = self.update_depth_view(processed_data, 0)
+        measure_img, depth_right_half, _ = self.update_measure_view(processed_data, 0)
+        return depth_vis, measure_img, depth_right_half, []
+    def reset_measure(
+        self, processed_data: Optional[Dict[int, Dict[str, Any]]]
+    ) -> Tuple[Optional[np.ndarray], List, str]:
+        """
+        Reset measure points.
+        Args:
+            processed_data: Processed data dictionary
+        Returns:
+            Tuple of (image, measure_points, text)
+        """
+        if processed_data is None or len(processed_data) == 0:
+            return None, [], ""
+        # Return the first view image
+        first_view = list(processed_data.values())[0]
+        return first_view["image"], [], ""
+    def measure(
+        self,
+        processed_data: Optional[Dict[int, Dict[str, Any]]],
+        measure_points: List,
+        current_view_selector: str,
+        event: gr.SelectData,
+    ) -> List:
+        """
+        Handle measurement on images.
+        Args:
+            processed_data: Processed data dictionary
+            measure_points: List of current measure points
+            current_view_selector: Current view selector value
+            event: Gradio select event
+        Returns:
+            List of [image, depth_right_half, measure_points, text]
+        """
+        try:
+            print(f"Measure function called with selector: {current_view_selector}")
+            if processed_data is None or len(processed_data) == 0:
+                return [None, [], "No data available"]
+            # Use the currently selected view instead of always using the first view
+            try:
+                current_view_index = int(current_view_selector.split()[1]) - 1
+            except:  # noqa
+                current_view_index = 0
+            print(f"Using view index: {current_view_index}")
+            # Get view data safely
+            if current_view_index < 0 or current_view_index >= len(processed_data):
+                current_view_index = 0
+            view_keys = list(processed_data.keys())
+            current_view = processed_data[view_keys[current_view_index]]
+            if current_view is None:
+                return [None, [], "No view data available"]
+            point2d = event.index[0], event.index[1]
+            print(f"Clicked point: {point2d}")
+            measure_points.append(point2d)
+            # Get image and depth visualization
+            image, depth_right_half, _ = self.update_measure_view(
+                processed_data, current_view_index
+            )
+            if image is None:
+                return [None, [], "No image available"]
+            image = image.copy()
+            # Ensure image is in uint8 format for proper cv2 operations
+            try:
+                if image.dtype != np.uint8:
+                    if image.max() <= 1.0:
+                        # Image is in [0, 1] range, convert to [0, 255]
+                        image = (image * 255).astype(np.uint8)
+                    else:
+                        # Image is already in [0, 255] range
+                        image = image.astype(np.uint8)
+            except Exception as e:
+                print(f"Image conversion error: {e}")
+                return [None, [], f"Image conversion error: {e}"]
+            # Draw circles for points
+            try:
+                for p in measure_points:
+                    if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
+                        image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)
+            except Exception as e:
+                print(f"Drawing error: {e}")
+                return [None, [], f"Drawing error: {e}"]
+            # Get depth information from processed_data
+            depth_text = ""
+            try:
+                for i, p in enumerate(measure_points):
+                    if (
+                        current_view["depth"] is not None
+                        and 0 <= p[1] < current_view["depth"].shape[0]
+                        and 0 <= p[0] < current_view["depth"].shape[1]
+                    ):
+                        d = current_view["depth"][p[1], p[0]]
+                        depth_text += f"- **P{i + 1} depth: {d:.2f}m**\n"
+                    else:
+                        depth_text += f"- **P{i + 1}: Click position ({p[0]}, {p[1]}) - No depth information**\n"  # noqa: E501
+            except Exception as e:
+                print(f"Depth text error: {e}")
+                depth_text = f"Error computing depth: {e}\n"
+            if len(measure_points) == 2:
+                try:
+                    point1, point2 = measure_points
+                    # Draw line
+                    if (
+                        0 <= point1[0] < image.shape[1]
+                        and 0 <= point1[1] < image.shape[0]
+                        and 0 <= point2[0] < image.shape[1]
+                        and 0 <= point2[1] < image.shape[0]
+                    ):
+                        image = cv2.line(image, point1, point2, color=(255, 0, 0), thickness=2)
+                    # Compute 3D distance using depth information and camera intrinsics
+                    distance_text = "- **Distance: Unable to calculate 3D distance**"
+                    if (
+                        current_view["depth"] is not None
+                        and 0 <= point1[1] < current_view["depth"].shape[0]
+                        and 0 <= point1[0] < current_view["depth"].shape[1]
+                        and 0 <= point2[1] < current_view["depth"].shape[0]
+                        and 0 <= point2[0] < current_view["depth"].shape[1]
+                    ):
+                        try:
+                            # Get depth values at the two points
+                            d1 = current_view["depth"][point1[1], point1[0]]
+                            d2 = current_view["depth"][point2[1], point2[0]]
+                            # Convert 2D pixel coordinates to 3D world coordinates
+                            if current_view["intrinsics"] is not None:
+                                # Get camera intrinsics
+                                K = current_view["intrinsics"]  # 3x3 intrinsic matrix
+                                fx, fy = K[0, 0], K[1, 1]  # focal lengths
+                                cx, cy = K[0, 2], K[1, 2]  # principal point
+                                # Convert pixel coordinates to normalized camera coordinates
+                                # Point 1: (u1, v1) -> (x1, y1, z1)
+                                u1, v1 = point1[0], point1[1]
+                                x1 = (u1 - cx) * d1 / fx
+                                y1 = (v1 - cy) * d1 / fy
+                                z1 = d1
+                                # Point 2: (u2, v2) -> (x2, y2, z2)
+                                u2, v2 = point2[0], point2[1]
+                                x2 = (u2 - cx) * d2 / fx
+                                y2 = (v2 - cy) * d2 / fy
+                                z2 = d2
+                                # Calculate 3D Euclidean distance
+                                p1_3d = np.array([x1, y1, z1])
+                                p2_3d = np.array([x2, y2, z2])
+                                distance_3d = np.linalg.norm(p1_3d - p2_3d)
+                                distance_text = f"- **Distance: {distance_3d:.2f}m**"
+                            else:
+                                # Fallback to simplified calculation if no intrinsics
+                                pixel_distance = np.sqrt(
+                                    (point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2
+                                )
+                                avg_depth = (d1 + d2) / 2
+                                scale_factor = avg_depth / 1000  # Rough scaling factor
+                                estimated_3d_distance = pixel_distance * scale_factor
+                                distance_text = f"- **Distance: {estimated_3d_distance:.2f}m (estimated, no intrinsics)**"  # noqa: E501
+                        except Exception as e:
+                            print(f"Distance computation error: {e}")
+                            distance_text = f"- **Distance computation error: {e}**"
+                    measure_points = []
+                    text = depth_text + distance_text
+                    print(f"Measurement complete: {text}")
+                    return [image, depth_right_half, measure_points, text]
+                except Exception as e:
+                    print(f"Final measurement error: {e}")
+                    return [None, [], f"Measurement error: {e}"]
+            else:
+                print(f"Single point measurement: {depth_text}")
+                return [image, depth_right_half, measure_points, depth_text]
+        except Exception as e:
+            print(f"Overall measure function error: {e}")
+            return [None, [], f"Measure function error: {e}"]

Depth-Anything-3/src/depth_anything_3/bench/__init__.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Depth Anything 3 Benchmark Evaluation Module.
+This module provides tools for evaluating the DepthAnything3 model on various benchmark datasets.
+Currently supported datasets:
+- DTU (3D Reconstruction)
+- DTU-64 (Pose Evaluation Only)
+- ETH3D (3D Reconstruction)
+- 7Scenes (3D Reconstruction)
+- ScanNet++ (3D Reconstruction)
+- HiRoom (3D Reconstruction)
+Supported evaluation modes:
+- pose: Camera pose estimation evaluation
+- recon_unposed: 3D reconstruction with predicted poses
+- recon_posed: 3D reconstruction with ground truth poses
+"""
+from depth_anything_3.bench.registries import MV_REGISTRY, MONO_REGISTRY
+def __getattr__(name):
+    """Lazy import to avoid circular import when running as __main__."""
+    if name == "Evaluator":
+        from depth_anything_3.bench.evaluator import Evaluator
+        return Evaluator
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+__all__ = ["Evaluator", "MV_REGISTRY", "MONO_REGISTRY"]

Depth-Anything-3/src/depth_anything_3/bench/configs/eval_bench.yaml ADDED Viewed

	@@ -0,0 +1,98 @@

+# DepthAnything3 Benchmark Evaluation Configuration
+#
+# This config can be loaded and overridden via command line.
+# Example: python -m depth_anything_3.bench.evaluator --model /path/to/model --work_dir /path/to/workspace
+#
+# See depth_anything_3.cfg for config utility functions.
+# ==============================================================================
+# Model Configuration
+# ==============================================================================
+model:
+  # Path to model checkpoint or HuggingFace model ID
+  path: depth-anything/DA3-GIANT
+# ==============================================================================
+# Workspace Configuration
+# ==============================================================================
+workspace:
+  # Working directory for outputs (model results, metrics, etc.)
+  work_dir: ./workspace/evaluation
+# ==============================================================================
+# Evaluation Configuration
+# ==============================================================================
+eval:
+  # Datasets to evaluate
+  # Options: dtu, dtu64, eth3d, 7scenes (sevenscenes), scannetpp, hiroom
+  datasets:
+    - eth3d
+    - 7scenes
+    - scannetpp
+    - hiroom
+    - dtu
+    - dtu64
+  # Evaluation modes
+  # Options: pose, recon_unposed, recon_posed, view_syn
+  modes:
+    - pose
+    - recon_unposed
+    - recon_posed
+  # Reference view selection strategy for inference
+  # Options: first, saddle_balanced, auto, mid
+  ref_view_strategy: "first"
+  # Specific scenes to evaluate (null = all scenes)
+  # Example: [courtyard, relief] for eth3d
+  scenes: null
+  # Maximum number of frames per scene (for sampling)
+  # If a scene has more frames, randomly sample to this limit.
+  # Set to -1 to disable sampling.
+  max_frames: 100
+  # Only run evaluation (skip inference)
+  eval_only: false
+  # Only print saved metrics (skip inference and evaluation)
+  print_only: false
+# ==============================================================================
+# Inference Configuration
+# ==============================================================================
+inference:
+  # Number of parallel workers for TSDF fusion
+  num_fusion_workers: 4
+  # Enable debug mode with verbose output
+  debug: false
+# ==============================================================================
+# Preset Configurations
+# ==============================================================================
+# These can be activated via command line: --preset full_eval
+presets:
+  # Full evaluation on all 6 datasets
+  full_eval:
+    datasets: [eth3d, 7scenes, scannetpp, hiroom, dtu, dtu64]
+    modes: [pose, recon_unposed, recon_posed]
+  # Pose-only evaluation
+  pose_only:
+    datasets: [eth3d, 7scenes, scannetpp, hiroom, dtu64]
+    modes: [pose]
+  # Reconstruction-only evaluation (5 datasets, excluding dtu64)
+  recon_only:
+    datasets: [eth3d, 7scenes, scannetpp, hiroom, dtu]
+    modes: [recon_unposed, recon_posed]
+  # Quick test (a single scene from a single dataset)
+  quick_test:
+    datasets: [eth3d]
+    modes: [pose, recon_unposed]
+    scenes: [courtyard]

Depth-Anything-3/src/depth_anything_3/bench/dataset.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Base dataset class for benchmark evaluation.
+All dataset implementations should inherit from this class and implement
+the required abstract methods.
+"""
+import os
+import time
+from abc import abstractmethod
+from typing import Dict as TDict
+import numpy as np
+import torch
+from addict import Dict
+from depth_anything_3.bench.utils import compute_pose
+from depth_anything_3.utils.geometry import as_homogeneous
+def _wait_for_file_ready(path: str, timeout: float = 3.0, interval: float = 0.2) -> None:
+    """Wait until file size stabilizes for 2 consecutive checks."""
+    last_size = -1
+    stable_count = 0
+    start = time.time()
+    while time.time() - start < timeout:
+        time.sleep(interval)
+        size = os.path.getsize(path)
+        if size == last_size and size > 0:
+            stable_count += 1
+            if stable_count >= 2:  # Need 2 consecutive stable checks
+                return
+        else:
+            stable_count = 0
+        last_size = size
+class Dataset:
+    """
+    Base class for all benchmark datasets.
+    Subclasses must implement:
+        - SCENES: List of scene identifiers
+        - data_root: Path to dataset root
+        - get_data(scene): Return scene data (images, intrinsics, extrinsics, etc.)
+        - eval3d(scene, fuse_path): Evaluate 3D reconstruction
+        - fuse3d(scene, result_path, fuse_path, mode): Fuse depth maps into point cloud
+    Optional overrides:
+        - eval_pose(scene, result_path): Evaluate pose estimation (default provided)
+    """
+    # Subclasses should define these
+    SCENES: list = []
+    data_root: str = ""
+    def __init__(self):
+        pass
+    def eval_pose(self, scene: str, result_path: str) -> TDict[str, float]:
+        """
+        Evaluate camera pose estimation accuracy.
+        Args:
+            scene: Scene identifier
+            result_path: Path to .npz file containing predicted extrinsics
+        Returns:
+            Dict with pose metrics (auc30, auc15, auc05, auc03)
+        """
+        _wait_for_file_ready(result_path)
+        pred = np.load(result_path)
+        gt = self.get_data(scene)
+        return compute_pose(
+            torch.from_numpy(as_homogeneous(pred["extrinsics"])),
+            torch.from_numpy(as_homogeneous(gt["extrinsics"])),
+        )
+    @abstractmethod
+    def get_data(self, scene: str) -> Dict:
+        """
+        Get scene data including images, camera parameters, and auxiliary info.
+        Args:
+            scene: Scene identifier
+        Returns:
+            Dict with:
+                - image_files: List[str] - paths to images
+                - extrinsics: np.ndarray [N, 4, 4] - camera extrinsics (world-to-camera)
+                - intrinsics: np.ndarray [N, 3, 3] - camera intrinsics
+                - aux: Dict - auxiliary data (masks, GT paths, etc.)
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def eval3d(self, scene: str, fuse_path: str) -> TDict[str, float]:
+        """
+        Evaluate 3D reconstruction quality against ground truth.
+        Args:
+            scene: Scene identifier
+            fuse_path: Path to fused point cloud (.ply)
+        Returns:
+            Dict with reconstruction metrics (e.g., acc, comp, overall)
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def fuse3d(self, scene: str, result_path: str, fuse_path: str, mode: str) -> None:
+        """
+        Fuse per-view depth maps into a single point cloud.
+        Args:
+            scene: Scene identifier
+            result_path: Path to .npz file with predicted depths and poses
+            fuse_path: Output path for fused point cloud (.ply)
+            mode: Fusion mode ("recon_unposed" or "recon_posed")
+        """
+        raise NotImplementedError

Depth-Anything-3/src/depth_anything_3/bench/datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Benchmark dataset implementations.
+Datasets are auto-registered via decorators when imported.
+Add new dataset files here and they will be automatically discovered.
+"""

Depth-Anything-3/src/depth_anything_3/bench/datasets/dtu.py ADDED Viewed

	@@ -0,0 +1,681 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+DTU Benchmark dataset implementation.
+DTU is a multi-view stereo benchmark for 3D reconstruction evaluation.
+Reference: https://roboimagedata.compute.dtu.dk/
+Note: DepthAnything3 was never trained on any images from DTU.
+"""
+import glob
+import os
+from typing import Dict as TDict, List
+import numpy as np
+import open3d as o3d
+import torch
+import torch.nn.functional as F
+from addict import Dict
+from PIL import Image
+from plyfile import PlyData
+from scipy.io import loadmat
+from sklearn import neighbors as skln
+from tqdm import tqdm
+from depth_anything_3.bench.dataset import Dataset
+from depth_anything_3.bench.registries import MONO_REGISTRY, MV_REGISTRY
+from depth_anything_3.utils.constants import (
+    DTU_DIST_THRESH,
+    DTU_EVAL_DATA_ROOT,
+    DTU_MAX_POINTS,
+    DTU_NUM_CONSIST,
+    DTU_SCENES,
+)
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+@MV_REGISTRY.register(name="dtu")
+@MONO_REGISTRY.register(name="dtu")
+class DTU(Dataset):
+    """
+    DTU Benchmark dataset wrapper for DepthAnything3 evaluation.
+    Supports:
+        - Camera pose estimation evaluation (AUC metrics)
+        - 3D reconstruction evaluation (accuracy, completeness, overall)
+        - Point cloud fusion from depth maps
+    The dataset uses MVSNet evaluation protocol:
+    https://drive.google.com/file/d/1rX0EXlUL4prRxrRu2DgLJv2j7-tpUD4D/view
+    """
+    data_root = DTU_EVAL_DATA_ROOT
+    SCENES = DTU_SCENES
+    # Evaluation/triangulation hyperparameters from constants
+    dist_thresh = DTU_DIST_THRESH
+    num_consist = DTU_NUM_CONSIST
+    # ------------------------------
+    # Public API
+    # ------------------------------
+    def read_cam_file(self, filename: str) -> tuple:
+        """
+        Read DTU camera file containing extrinsics and intrinsics.
+        Args:
+            filename: Path to camera text file
+        Returns:
+            Tuple of (intrinsics [3,3], extrinsics [4,4])
+        """
+        with open(filename) as f:
+            lines = [line.rstrip() for line in f.readlines()]
+        extrinsics = np.fromstring(" ".join(lines[1:5]), dtype=np.float32, sep=" ").reshape((4, 4))
+        intrinsics = np.fromstring(" ".join(lines[7:10]), dtype=np.float32, sep=" ").reshape((3, 3))
+        return intrinsics, extrinsics
+    def get_data(self, scene: str) -> Dict:
+        """
+        Collect per-view image paths, intrinsics/extrinsics, and GT masks.
+        Args:
+            scene: Scene identifier (e.g., "scan1")
+        Returns:
+            Dict with:
+                - image_files: List[str] - paths to images
+                - extrinsics: np.ndarray [N, 4, 4]
+                - intrinsics: np.ndarray [N, 3, 3]
+                - aux.mask_files: List[str] - paths to depth masks
+        """
+        rgb_folder = os.path.join(self.data_root, "Rectified", scene)
+        camera_folder = os.path.join(self.data_root, "Cameras")
+        files = sorted(glob.glob(os.path.join(rgb_folder, "*.png")))
+        # Reorder: place index 33 first (reference view convention)
+        files = [files[33]] + files[:33] + files[34:]
+        out = Dict(
+            {
+                "image_files": files,
+                "extrinsics": [],
+                "intrinsics": [],
+                "aux": Dict({"mask_files": []}),
+            }
+        )
+        for rgb_file in files:
+            basename = os.path.basename(rgb_file)
+            file_idx = basename.split("_")[1]
+            cam_idx = depth_idx = int(file_idx) - 1
+            mask_file = self._depth_mask_path(scene, depth_idx)
+            proj_mat_filename = os.path.join(camera_folder, f"{cam_idx:0>8}_cam.txt")
+            ixt, ext = self.read_cam_file(proj_mat_filename)
+            out.extrinsics.append(ext)
+            out.intrinsics.append(ixt)
+            out.aux.mask_files.append(mask_file)
+        out.extrinsics = np.asarray(out.extrinsics, dtype=np.float32)
+        out.intrinsics = np.asarray(out.intrinsics, dtype=np.float32)
+        return out
+    def get_3dgtpath(self, scene: str) -> str:
+        """Get path to ground truth point cloud for a scene."""
+        scene_id = int(scene[4:])
+        return os.path.join(self.data_root, f"Points/stl/stl{scene_id:03}_total.ply")
+    def eval3d(self, scene: str, fuse_path: str, use_gpu: bool = False) -> TDict[str, float]:
+        """
+        Evaluate fused point cloud against DTU GT with ObsMask/Plane.
+        Args:
+            scene: Scene identifier
+            fuse_path: Path to fused point cloud
+            use_gpu: If True, use GPU-accelerated distance computation (faster but may have minor numerical differences)
+        Returns:
+            Dict with metrics: {"comp": float, "acc": float, "overall": float}
+        """
+        scene_id = int(scene[4:])
+        gt_ply = os.path.join(self.data_root, f"Points/stl/stl{scene_id:03}_total.ply")
+        mask_file = os.path.join(
+            self.data_root, f"SampleSet/mvs_data/ObsMask/ObsMask{scene_id}_10.mat"
+        )
+        plane_file = os.path.join(
+            self.data_root, f"SampleSet/mvs_data/ObsMask/Plane{scene_id}.mat"
+        )
+        result = self._evaluate_reconstruction(
+            scene, fuse_path, gt_ply, mask_file, plane_file, use_gpu=use_gpu
+        )
+        return {"comp": result[0], "acc": result[1], "overall": result[2]}
+    def load_masks(self, mask_files: List[str]) -> np.ndarray:
+        """
+        Load DTU depth validity masks.
+        Args:
+            mask_files: List of paths to mask images
+        Returns:
+            Boolean array [N, H, W] indicating valid depth regions
+        """
+        masks = []
+        for mask_file in mask_files:
+            mask = Image.open(mask_file)
+            mask = np.array(mask, dtype=np.float32)
+            masks.append(mask > 10)
+        return np.asarray(masks)
+    def fuse3d(self, scene: str, result_path: str, fuse_path: str, mode: str) -> None:
+        """
+        Fuse per-view depths into a point cloud and save to PLY.
+        In "recon_unposed" mode every view is back-projected on its own and
+        only pixels with depth > 0.001 are kept; in "recon_posed" mode the
+        points of each view come from _fuse_consistent_points.
+        Args:
+            scene: Scene identifier (e.g., "scan114")
+            result_path: Path to npz file containing predicted depths/poses
+            fuse_path: Output path for fused point cloud (.ply)
+            mode: "recon_unposed" or "recon_posed"
+        Raises:
+            ValueError: If mode is neither "recon_unposed" nor "recon_posed"
+        """
+        gt_data = self.get_data(scene)
+        # Copy every array of the npz archive into an attribute-style Dict
+        pred_data = Dict({k: v for k, v in np.load(result_path).items()})
+        masks = self.load_masks(gt_data.aux.mask_files)
+        # The _prep_* helpers are defined elsewhere in the class; they hand back
+        # the depths, intrinsics and extrinsics used for fusion in the given mode
+        if mode == "recon_unposed":
+            depths, intrinsics, extrinsics = self._prep_unposed(pred_data, gt_data, masks)
+        elif mode == "recon_posed":
+            depths, intrinsics, extrinsics = self._prep_posed(pred_data, gt_data, masks)
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+        proj_mat = self._build_proj_mats(intrinsics, extrinsics)
+        # Run on the GPU when one is available, otherwise on the CPU
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        dtype = torch.float32
+        # unsqueeze(1) inserts a singleton axis after the view axis
+        depths_t = torch.from_numpy(depths).to(device=device, dtype=dtype).unsqueeze(1)
+        proj_t = torch.from_numpy(proj_mat).to(device=device, dtype=dtype)
+        height, width = depths_t.shape[-2:]
+        points: List[np.ndarray] = []
+        for idx in range(len(gt_data.image_files)):
+            if mode == "recon_unposed":
+                # Simple unfiltered back-projection per frame
+                cur_p_pcd = self._generate_points_from_depth(
+                    depths_t[idx : idx + 1], proj_t[idx : idx + 1]
+                )
+                # Keep only pixels whose depth exceeds 0.001
+                mask = (depths_t[idx : idx + 1] > 0.001).squeeze()
+                cur_p_pcd = cur_p_pcd[:, :, mask]
+                # Drop the batch axis and move the coordinate axis last before leaving torch
+                no_filter_pc = cur_p_pcd.squeeze(0).permute(1, 0).cpu().numpy()
+                points.append(no_filter_pc)
+            else:  # recon_posed
+                final_pc = self._fuse_consistent_points(depths_t, proj_t, idx, height, width)
+                points.append(final_pc)
+        # Concatenate and optionally downsample to hard cap
+        points_np = np.concatenate(points, axis=0)
+        points_np = self._cap_points(points_np, max_points=DTU_MAX_POINTS)
+        # NOTE(review): os.path.dirname returns "" for a bare file name, and
+        # os.makedirs("") raises FileNotFoundError - callers presumably always
+        # pass a path with a directory component; confirm
+        os.makedirs(os.path.dirname(fuse_path), exist_ok=True)
+        pcd = o3d.geometry.PointCloud()
+        pcd.points = o3d.utility.Vector3dVector(points_np)
+        o3d.io.write_point_cloud(fuse_path, pcd)
+    # ------------------------------
+    # Geometry helpers
+    # ------------------------------
+    def _generate_points_from_depth(
+        self, depth: torch.Tensor, proj: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Back-project depth map into 3D world coordinates.
+        Args:
+            depth: Depth tensor [B, 1, H, W]
+            proj: Projection matrix [B, 4, 4] = [[K@R, K@t], [0,0,0,1]]
+        Returns:
+            Point cloud tensor [B, 3, H, W]
+        """
+        batch, height, width = depth.shape[0], depth.shape[2], depth.shape[3]
+        inv_proj = torch.inverse(proj)
+        rot = inv_proj[:, :3, :3]
+        trans = inv_proj[:, :3, 3:4]
+        y, x = torch.meshgrid(
+            [
+                torch.arange(0, height, dtype=torch.float32, device=depth.device),
+                torch.arange(0, width, dtype=torch.float32, device=depth.device),
+            ],
+            indexing="ij",
+        )
+        y, x = y.contiguous(), x.contiguous()
+        y, x = y.view(height * width), x.view(height * width)
+        xyz = torch.stack((x, y, torch.ones_like(x)))
+        xyz = torch.unsqueeze(xyz, 0).repeat(batch, 1, 1)
+        rot_xyz = torch.matmul(rot, xyz)
+        rot_depth_xyz = rot_xyz * depth.view(batch, 1, -1)
+        proj_xyz = rot_depth_xyz + trans.view(batch, 3, 1)
+        return proj_xyz.view(batch, 3, height, width)
+    def _homo_warping(
+        self,
+        src_fea: torch.Tensor,
+        src_proj: torch.Tensor,
+        ref_proj: torch.Tensor,
+        depth_values: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Homography warping for multi-view consistency checking.
+        Args:
+            src_fea: Source features [B, C, H, W]
+            src_proj: Source projection [B, 4, 4]
+            ref_proj: Reference projection [B, 4, 4]
+            depth_values: Depth values [B, Ndepth] or [B, Ndepth, H, W]
+        Returns:
+            Warped features [B, C, H, W]
+        """
+        batch, channels = src_fea.shape[0], src_fea.shape[1]
+        height, width = src_fea.shape[2], src_fea.shape[3]
+        with torch.no_grad():
+            proj = torch.matmul(src_proj, torch.inverse(ref_proj))
+            rot = proj[:, :3, :3]
+            trans = proj[:, :3, 3:4]
+            y, x = torch.meshgrid(
+                [
+                    torch.arange(0, height, dtype=torch.float32, device=src_fea.device),
+                    torch.arange(0, width, dtype=torch.float32, device=src_fea.device),
+                ],
+                indexing="ij",
+            )
+            y, x = y.contiguous(), x.contiguous()
+            y, x = y.view(height * width), x.view(height * width)
+            xyz = torch.stack((x, y, torch.ones_like(x)))
+            xyz = torch.unsqueeze(xyz, 0).repeat(batch, 1, 1)
+            rot_xyz = torch.matmul(rot, xyz)
+            rot_depth_xyz = rot_xyz.unsqueeze(2) * depth_values.view(-1, 1, 1, height * width)
+            proj_xyz = rot_depth_xyz + trans.view(batch, 3, 1, 1)
+            proj_xy = proj_xyz[:, :2, :, :] / proj_xyz[:, 2:3, :, :]
+            proj_x_normalized = proj_xy[:, 0, :, :] / ((width - 1) / 2) - 1
+            proj_y_normalized = proj_xy[:, 1, :, :] / ((height - 1) / 2) - 1
+            grid = torch.stack((proj_x_normalized, proj_y_normalized), dim=3)
+        warped_src_fea = F.grid_sample(
+            src_fea,
+            grid.view(batch, height, width, 2),
+            mode="bilinear",
+            padding_mode="zeros",
+            align_corners=True,
+        )
+        return warped_src_fea.view(batch, channels, height, width)
+    def _filter_depth(
+        self,
+        ref_depth: torch.Tensor,
+        src_depths: torch.Tensor,
+        ref_proj: torch.Tensor,
+        src_projs: torch.Tensor,
+    ) -> tuple:
+        """
+        Compute geometric consistency between reference and source depths.
+        Args:
+            ref_depth: Reference depth [1, 1, H, W]
+            src_depths: Source depths [B, 1, H, W]
+            ref_proj: Reference projection [1, 4, 4]
+            src_projs: Source projections [B, 4, 4]
+        Returns:
+            Tuple of (ref_pc, aligned_pcs, dist)
+        """
+        ref_pc = self._generate_points_from_depth(ref_depth, ref_proj)
+        src_pcs = self._generate_points_from_depth(src_depths, src_projs)
+        aligned_pcs = self._homo_warping(src_pcs, src_projs, ref_proj, ref_depth)
+        x_2 = (ref_pc[:, 0] - aligned_pcs[:, 0]) ** 2
+        y_2 = (ref_pc[:, 1] - aligned_pcs[:, 1]) ** 2
+        z_2 = (ref_pc[:, 2] - aligned_pcs[:, 2]) ** 2
+        dist = torch.sqrt(x_2 + y_2 + z_2).unsqueeze(1)
+        return ref_pc, aligned_pcs, dist
+    def _extract_points(
+        self, pc: torch.Tensor, mask: torch.Tensor, rgb: np.ndarray = None
+    ) -> np.ndarray:
+        """Extract masked points from a dense grid."""
+        pc = pc.cpu().numpy()
+        mask = mask.cpu().numpy().reshape(-1)
+        pc = pc.reshape(-1, 3)
+        points = pc[np.where(mask)]
+        if rgb is not None:
+            rgb = rgb.reshape(-1, 3)
+            colors = rgb[np.where(mask)]
+            return np.concatenate([points, colors], axis=1)
+        return points
+    # ------------------------------
+    # 3D Reconstruction Evaluation
+    # ------------------------------
+    def _evaluate_reconstruction(
+        self,
+        scanid: str,
+        pred_ply: str,
+        gt_ply: str,
+        mask_file: str,
+        plane_file: str,
+        down_dense: float = 0.2,
+        patch: int = 60,
+        max_dist: int = 20,
+        use_gpu: bool = False,
+    ) -> tuple:
+        """
+        Compute accuracy, completeness, and overall metrics for one scan.
+        Args:
+            scanid: Scan identifier
+            pred_ply: Predicted point cloud path or array
+            gt_ply: Ground truth point cloud path or array
+            mask_file: ObsMask file path
+            plane_file: Plane file path
+            down_dense: Downsample density (min distance between points)
+            patch: Patch size for boundary
+            max_dist: Outlier threshold in mm
+            use_gpu: If True, use GPU-accelerated distance computation
+        Returns:
+            Tuple of (mean_d2s, mean_s2d, overall)
+        """
+        thresh = down_dense
+        # Load and downsample predicted point cloud
+        data_pcd = self._read_ply(pred_ply) if isinstance(pred_ply, str) else pred_ply
+        # Use fixed seed for reproducibility
+        shuffle_rng = np.random.default_rng(seed=42)
+        shuffle_rng.shuffle(data_pcd, axis=0)
+        # Downsample point cloud
+        nn_engine = skln.NearestNeighbors(
+            n_neighbors=1, radius=thresh, algorithm="kd_tree", n_jobs=-1
+        )
+        nn_engine.fit(data_pcd)
+        rnn_idxs = nn_engine.radius_neighbors(data_pcd, radius=thresh, return_distance=False)
+        mask = np.ones(data_pcd.shape[0], dtype=np.bool_)
+        for curr, idxs in enumerate(rnn_idxs):
+            if mask[curr]:
+                mask[idxs] = 0
+                mask[curr] = 1
+        data_down = data_pcd[mask]
+        # Restrict to observed volume (ObsMask)
+        obs_mask_file = loadmat(mask_file)
+        ObsMask, BB, Res = (obs_mask_file[attr] for attr in ["ObsMask", "BB", "Res"])
+        BB = BB.astype(np.float32)
+        inbound = ((data_down >= BB[:1] - patch) & (data_down < BB[1:] + patch * 2)).sum(
+            axis=-1
+        ) == 3
+        data_in = data_down[inbound]
+        data_grid = np.around((data_in - BB[:1]) / Res).astype(np.int32)
+        grid_inbound = ((data_grid >= 0) & (data_grid < np.expand_dims(ObsMask.shape, 0))).sum(
+            axis=-1
+        ) == 3
+        data_grid_in = data_grid[grid_inbound]
+        in_obs = ObsMask[data_grid_in[:, 0], data_grid_in[:, 1], data_grid_in[:, 2]].astype(
+            np.bool_
+        )
+        data_in_obs = data_in[grid_inbound][in_obs]
+        # Compute accuracy (pred -> GT) and completeness (GT -> pred)
+        stl = self._read_ply(gt_ply) if isinstance(gt_ply, str) else gt_ply
+        if use_gpu and torch.cuda.is_available():
+            # GPU-accelerated distance computation
+            mean_d2s = self._knn_dist_gpu(data_in_obs, stl, max_dist)
+        else:
+            # CPU version (original, for exact reproduction)
+            nn_engine.fit(stl)
+            dist_d2s, _ = nn_engine.kneighbors(data_in_obs, n_neighbors=1, return_distance=True)
+            mean_d2s = dist_d2s[dist_d2s < max_dist].mean()
+        ground_plane = loadmat(plane_file)["P"]
+        stl_hom = np.concatenate([stl, np.ones_like(stl[:, :1])], -1)
+        above = (ground_plane.reshape((1, 4)) * stl_hom).sum(-1) > 0
+        stl_above = stl[above]
+        if use_gpu and torch.cuda.is_available():
+            # GPU-accelerated distance computation
+            mean_s2d = self._knn_dist_gpu(stl_above, data_in, max_dist)
+        else:
+            # CPU version (original, for exact reproduction)
+            nn_engine.fit(data_in)
+            dist_s2d, _ = nn_engine.kneighbors(stl_above, n_neighbors=1, return_distance=True)
+            mean_s2d = dist_s2d[dist_s2d < max_dist].mean()
+        overall = (mean_d2s + mean_s2d) / 2
+        return mean_d2s, mean_s2d, overall
+    def _knn_dist_gpu(
+        self,
+        query: np.ndarray,
+        target: np.ndarray,
+        max_dist: float,
+        batch_size: int = 8192,
+        target_batch_size: int = 50000,
+    ) -> float:
+        """
+        GPU-accelerated nearest neighbor distance computation.
+        Args:
+            query: Query points [N, 3]
+            target: Target points [M, 3]
+            max_dist: Outlier threshold
+            batch_size: Batch size for query to avoid OOM (tuned for 16GB GPU)
+            target_batch_size: Batch size for target to avoid OOM
+        Returns:
+            Mean distance (excluding outliers)
+        """
+        device = torch.device("cuda")
+        all_min_dists = []
+        n_query_batches = (len(query) + batch_size - 1) // batch_size
+        n_target_batches = (len(target) + target_batch_size - 1) // target_batch_size
+        # Pre-load target batches to GPU to avoid repeated transfers
+        # Memory: ~50000 pts * 3 coords * 4 bytes * n_batches
+        target_batches = []
+        for j in range(0, len(target), target_batch_size):
+            target_batch = target[j : j + target_batch_size]
+            target_t = torch.from_numpy(target_batch).float().to(device)
+            target_batches.append(target_t)
+        with tqdm(total=n_query_batches, desc="  GPU KNN", leave=False, ncols=100) as pbar:
+            for i in range(0, len(query), batch_size):
+                batch = query[i : i + batch_size]
+                query_t = torch.from_numpy(batch).float().to(device)
+                # Compute distances to all target batches
+                # Memory peak: query_batch × target_batch_size × 4 bytes
+                # = 8192 × 50000 × 4 = ~1.6 GB per cdist call
+                batch_min_dists = []
+                for target_t in target_batches:
+                    dists = torch.cdist(query_t, target_t)
+                    batch_min_dists.append(dists.min(dim=1).values)
+                    del dists  # Free immediately
+                # Get minimum distance across all target batches
+                min_dists = torch.stack(batch_min_dists, dim=1).min(dim=1).values
+                all_min_dists.append(min_dists.cpu().numpy())
+                del query_t, min_dists, batch_min_dists
+                pbar.update(1)
+        # Clean up target batches
+        for target_t in target_batches:
+            del target_t
+        torch.cuda.empty_cache()
+        all_min_dists = np.concatenate(all_min_dists)
+        return all_min_dists[all_min_dists < max_dist].mean()
+    def _read_ply(self, file: str) -> np.ndarray:
+        """Read point cloud from PLY file."""
+        data = PlyData.read(file)
+        vertex = data["vertex"]
+        return np.stack([vertex["x"], vertex["y"], vertex["z"]], axis=-1)
+    # ------------------------------
+    # Private helpers
+    # ------------------------------
+    def _depth_mask_path(self, scene: str, depth_idx: int) -> str:
+        """Get path to depth mask for a scene and frame."""
+        return os.path.join(
+            self.data_root, "depth_raw", "Depths", scene, f"depth_visual_{depth_idx:04d}.png"
+        )
+    def _prep_unposed(
+        self, pred_data: Dict, gt_data: Dict, masks: np.ndarray
+    ) -> tuple:
+        """
+        Prepare depths/intrinsics/extrinsics for recon_unposed mode.
+        Applies Umeyama scale, rescales intrinsics if depth resolution differs,
+        and zeroes invalid-mask depths (nearest interpolation as in paper).
+        """
+        _, _, scale, extrinsics = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            ransac=True,
+            return_aligned=True,
+            random_state=42,
+        )
+        depths = pred_data.depth * scale
+        intrinsics = pred_data.intrinsics.copy()
+        if depths.shape[-2:] != masks.shape[-2:]:
+            # When resizing depths to mask size, adjust intrinsics accordingly
+            sx = masks.shape[-1] / depths.shape[-1]
+            sy = masks.shape[-2] / depths.shape[-2]
+            intrinsics[:, 0:1] *= sx
+            intrinsics[:, 1:2] *= sy
+            depths = F.interpolate(
+                torch.from_numpy(depths)[None].float(),
+                size=(masks.shape[-2], masks.shape[-1]),
+                mode="nearest",
+            )[0].numpy()
+            depths[masks == False] = 0.0  # noqa: E712
+        return depths, intrinsics, extrinsics
+    def _prep_posed(
+        self, pred_data: Dict, gt_data: Dict, masks: np.ndarray
+    ) -> tuple:
+        """
+        Prepare depths/intrinsics/extrinsics for recon_posed mode.
+        Uses GT intrinsics/extrinsics but aligns scale via Umeyama.
+        Same mask order as other datasets: mask BEFORE scale.
+        """
+        _, _, scale, _ = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            ransac=True,
+            return_aligned=True,
+            random_state=42,
+        )
+        depths = pred_data.depth.copy()
+        intrinsics = gt_data.intrinsics.copy()
+        extrinsics = gt_data.extrinsics.copy()
+        if depths.shape[-2:] != masks.shape[-2:]:
+            depths = F.interpolate(
+                torch.from_numpy(depths)[None].float(),
+                size=(masks.shape[-2], masks.shape[-1]),
+                mode="nearest",
+            )[0].numpy()
+        # Mask BEFORE scale (same as other datasets)
+        depths[masks == False] = 0.0  # noqa: E712
+        depths = depths * scale
+        return depths, intrinsics, extrinsics
+    def _build_proj_mats(
+        self, intrinsics: np.ndarray, extrinsics: np.ndarray
+    ) -> np.ndarray:
+        """Compute per-view 4x4 projection matrices from K and [R|t]."""
+        proj_mat_list = []
+        for i in range(len(intrinsics)):
+            proj_mat = np.eye(4, dtype=np.float32)
+            proj_mat[:3, :4] = np.dot(intrinsics[i], extrinsics[i][:3])
+            proj_mat_list.append(proj_mat)
+        return np.stack(proj_mat_list, axis=0)
+    def _fuse_consistent_points(
+        self,
+        depths_t: torch.Tensor,
+        proj_t: torch.Tensor,
+        idx: int,
+        H: int,
+        W: int,
+    ) -> np.ndarray:
+        """Fuse points consistent across multiple source views for a reference index."""
+        device, dtype = depths_t.device, depths_t.dtype
+        pc_buff = torch.zeros((3, H, W), device=device, dtype=dtype)
+        val_cnt = torch.zeros((1, H, W), device=device, dtype=dtype)
+        j = 0
+        batch_size = 20
+        tot_frame = depths_t.shape[0]
+        while True:
+            ref_pc, pcs, dist = self._filter_depth(
+                ref_depth=depths_t[idx : idx + 1],
+                src_depths=depths_t[j : min(j + batch_size, tot_frame)],
+                ref_proj=proj_t[idx : idx + 1],
+                src_projs=proj_t[j : min(j + batch_size, tot_frame)],
+            )
+            masks = (dist < self.dist_thresh).float()
+            masked_pc = pcs * masks
+            pc_buff += masked_pc.sum(dim=0, keepdim=False)
+            val_cnt += masks.sum(dim=0, keepdim=False)
+            j += batch_size
+            if j >= tot_frame:
+                break
+        final_mask = (val_cnt >= self.num_consist).squeeze(0)
+        avg_points = torch.div(pc_buff, val_cnt).permute(1, 2, 0)
+        final_pc = self._extract_points(avg_points, final_mask)
+        return final_pc
+    def _cap_points(self, points: np.ndarray, max_points: int) -> np.ndarray:
+        """Downsample points if exceeding max count."""
+        if len(points) <= max_points:
+            return points
+        # Use fixed seed for reproducibility
+        rng = np.random.default_rng(seed=42)
+        random_idx = rng.choice(len(points), max_points, replace=False)
+        return points[random_idx]

Depth-Anything-3/src/depth_anything_3/bench/datasets/dtu64.py ADDED Viewed

	@@ -0,0 +1,182 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+DTU-64 Dataset implementation for POSE EVALUATION ONLY.
+This is a subset of DTU with 64 images per scene, specifically designed for
+camera pose estimation evaluation. It does NOT support 3D reconstruction.
+Note: GT depth loading is not implemented as it's not needed for pose evaluation.
+"""
+import glob
+import os
+from typing import Dict as TDict
+import numpy as np
+from addict import Dict
+from depth_anything_3.bench.dataset import Dataset
+from depth_anything_3.bench.registries import MONO_REGISTRY, MV_REGISTRY
+from depth_anything_3.utils.constants import (
+    DTU64_CAMERA_ROOT,
+    DTU64_EVAL_DATA_ROOT,
+    DTU64_SCENES,
+)
+@MV_REGISTRY.register(name="dtu64")
+@MONO_REGISTRY.register(name="dtu64")
+class DTU64(Dataset):
+    """
+    DTU-64 Dataset wrapper for DepthAnything3 POSE EVALUATION ONLY.
+    This dataset is a subset of DTU with 64 images per scene.
+    It is specifically designed for camera pose estimation evaluation
+    and does NOT support 3D reconstruction evaluation.
+    Dataset structure:
+        DTU/scans/
+        ├── {scene}/
+        │   └── image/          # RGB images (64 per scene)
+        └── Cameras/
+            └── {idx}_cam.txt   # Camera parameters
+    Supported modes:
+        - pose: Camera pose estimation evaluation
+    NOT supported:
+        - recon_unposed: 3D reconstruction (no GT depth available)
+        - recon_posed: 3D reconstruction (no GT depth available)
+    """
+    data_root = DTU64_EVAL_DATA_ROOT
+    camera_root = DTU64_CAMERA_ROOT
+    SCENES = DTU64_SCENES
+    def __init__(self):
+        super().__init__()
+        self._scene_cache = {}
+    # ------------------------------
+    # Camera file parsing
+    # ------------------------------
+    def read_cam_file(self, filename: str) -> tuple:
+        """
+        Read DTU camera file containing extrinsics and intrinsics.
+        Args:
+            filename: Path to camera text file
+        Returns:
+            Tuple of (intrinsics [3,3], extrinsics [4,4])
+        """
+        with open(filename) as f:
+            lines = [line.rstrip() for line in f.readlines()]
+        # extrinsics: line [1,5), 4x4 matrix
+        extrinsics = np.fromstring(" ".join(lines[1:5]), dtype=np.float32, sep=" ").reshape((4, 4))
+        # intrinsics: line [7-10), 3x3 matrix
+        intrinsics = np.fromstring(" ".join(lines[7:10]), dtype=np.float32, sep=" ").reshape((3, 3))
+        return intrinsics, extrinsics
+    # ------------------------------
+    # Public API
+    # ------------------------------
+    def get_data(self, scene: str) -> Dict:
+        """
+        Collect per-view image paths, intrinsics/extrinsics for a scene.
+        Args:
+            scene: Scene identifier (e.g., "scan105")
+        Returns:
+            Dict with:
+                - image_files: List[str] - paths to images (64 per scene)
+                - extrinsics: np.ndarray [N, 4, 4] - world-to-camera transforms
+                - intrinsics: np.ndarray [N, 3, 3] - camera intrinsics
+                - aux: Dict (empty for this dataset)
+        """
+        if scene in self._scene_cache:
+            return self._scene_cache[scene]
+        rgb_folder = os.path.join(self.data_root, scene, "image")
+        # Get all PNG files sorted
+        files = sorted(glob.glob(os.path.join(rgb_folder, "*.png")))
+        # Reorder: place index 33 first (reference view convention)
+        if len(files) > 33:
+            files = [files[33]] + files[:33] + files[34:]
+        out = Dict({
+            "image_files": [],
+            "extrinsics": [],
+            "intrinsics": [],
+            "aux": Dict({}),
+        })
+        for rgb_file in files:
+            basename = os.path.basename(rgb_file)
+            # File naming: "00000033.png" -> cam_idx = 33
+            file_idx = basename.split(".")[0]
+            cam_idx = int(file_idx)
+            # Camera file path
+            cam_file = os.path.join(self.camera_root, f"{cam_idx:0>8}_cam.txt")
+            if not os.path.exists(cam_file):
+                print(f"[DTU-64] Warning: Camera file not found: {cam_file}")
+                continue
+            intrinsics, extrinsics = self.read_cam_file(cam_file)
+            out.image_files.append(rgb_file)
+            out.extrinsics.append(extrinsics)
+            out.intrinsics.append(intrinsics)
+        out.extrinsics = np.asarray(out.extrinsics, dtype=np.float32)
+        out.intrinsics = np.asarray(out.intrinsics, dtype=np.float32)
+        print(f"[DTU-64] {scene}: {len(out.image_files)} images (pose evaluation only)")
+        self._scene_cache[scene] = out
+        return out
+    def eval3d(self, scene: str, fuse_path: str) -> TDict[str, float]:
+        """
+        NOT SUPPORTED for DTU-64.
+        DTU-64 is only for pose evaluation, not 3D reconstruction.
+        """
+        raise NotImplementedError(
+            "DTU-64 dataset is for POSE EVALUATION ONLY. "
+            "3D reconstruction evaluation is not supported. "
+            "Use the standard 'dtu' dataset for 3D reconstruction evaluation."
+        )
+    def fuse3d(self, scene: str, result_path: str, fuse_path: str, mode: str) -> None:
+        """
+        NOT SUPPORTED for DTU-64.
+        DTU-64 is only for pose evaluation, not 3D reconstruction.
+        """
+        raise NotImplementedError(
+            "DTU-64 dataset is for POSE EVALUATION ONLY. "
+            "3D reconstruction (fuse3d) is not supported. "
+            "Use the standard 'dtu' dataset for 3D reconstruction."
+        )

Depth-Anything-3/src/depth_anything_3/bench/datasets/eth3d.py ADDED Viewed

	@@ -0,0 +1,594 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ETH3D Benchmark dataset implementation.
+ETH3D is a multi-view stereo benchmark with high-resolution images and
+accurate ground truth geometry from laser scanning.
+Reference: https://www.eth3d.net/
+Evaluation metrics:
+- 3D reconstruction: Accuracy, Completeness, F-score
+- Camera pose estimation: AUC metrics
+"""
+import glob
+import os
+from typing import Dict as TDict, List, Optional
+import cv2
+import numpy as np
+import open3d as o3d
+import torch
+import torch.nn.functional as F
+from addict import Dict
+from PIL import Image
+from depth_anything_3.bench.dataset import Dataset, _wait_for_file_ready
+from depth_anything_3.bench.registries import MONO_REGISTRY, MV_REGISTRY
+from depth_anything_3.bench.utils import (
+    create_tsdf_volume,
+    evaluate_3d_reconstruction,
+    fuse_depth_to_tsdf,
+    quat2rotmat,
+    sample_points_from_mesh,
+)
+from depth_anything_3.utils.constants import (
+    ETH3D_DOWN_SAMPLE,
+    ETH3D_EVAL_DATA_ROOT,
+    ETH3D_EVAL_THRESHOLD,
+    ETH3D_FILTER_KEYS,
+    ETH3D_MAX_DEPTH,
+    ETH3D_SAMPLING_NUMBER,
+    ETH3D_SCENES,
+    ETH3D_SDF_TRUNC,
+    ETH3D_VOXEL_LENGTH,
+)
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+@MV_REGISTRY.register(name="eth3d")
+@MONO_REGISTRY.register(name="eth3d")
+class ETH3D(Dataset):
+    """
+    ETH3D Benchmark dataset wrapper for DepthAnything3 evaluation.
+    Supports:
+        - Camera pose estimation evaluation (AUC metrics)
+        - 3D reconstruction evaluation (Accuracy, Completeness, F-score)
+        - TSDF-based point cloud fusion
+    Dataset structure:
+        eth3d/multiview/
+        ├── scene_name/
+        │   ├── images/                    # RGB images
+        │   ├── dslr_calibration_jpg/
+        │   │   ├── cameras.txt            # Camera intrinsics
+        │   │   └── images.txt             # Camera poses
+        │   ├── combined_mesh.ply          # Ground truth mesh
+        │   └── ground_truth_depth/        # GT depth maps (optional)
+    """
+    data_root = ETH3D_EVAL_DATA_ROOT
+    SCENES = ETH3D_SCENES
+    # Evaluation hyperparameters from constants
+    max_depth = ETH3D_MAX_DEPTH
+    sampling_number = ETH3D_SAMPLING_NUMBER
+    voxel_length = ETH3D_VOXEL_LENGTH
+    sdf_trunc = ETH3D_SDF_TRUNC
+    eval_threshold = ETH3D_EVAL_THRESHOLD
+    down_sample = ETH3D_DOWN_SAMPLE
+    def __init__(self):
+        super().__init__()
+        # Pre-load scene data for efficiency
+        self._scene_cache = {}
+    # ------------------------------
+    # Camera file parsing
+    # ------------------------------
+    def _parse_cameras_txt(self, filepath: str) -> dict:
+        """
+        Parse COLMAP-style cameras.txt file.
+        Returns:
+            Dict mapping camera_id to intrinsic parameters
+        """
+        camera_dict = {}
+        with open(filepath) as f:
+            lines = f.readlines()
+            for line in lines[3:]:  # Skip header
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                parts = line.split()
+                if len(parts) < 8:
+                    continue
+                cam_id = parts[0]
+                # Format: ID, MODEL, WIDTH, HEIGHT, fx, fy, cx, cy, [distortion params...]
+                camera_dict[cam_id] = {
+                    "width": float(parts[2]),
+                    "height": float(parts[3]),
+                    "fx": float(parts[4]),
+                    "fy": float(parts[5]),
+                    "cx": float(parts[6]),
+                    "cy": float(parts[7]),
+                }
+        return camera_dict
+    def _parse_images_txt(self, filepath: str) -> dict:
+        """
+        Parse COLMAP-style images.txt file.
+        Returns:
+            Dict mapping image path to pose parameters
+        """
+        pose_dict = {}
+        with open(filepath) as f:
+            lines = f.readlines()
+            for idx, line in enumerate(lines[4:]):  # Skip header
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                # Every other line contains pose info
+                if idx % 2 == 0:
+                    parts = line.split()
+                    if len(parts) < 10:
+                        continue
+                    # Format: IMAGE_ID, QW, QX, QY, QZ, TX, TY, TZ, CAMERA_ID, NAME
+                    image_id = parts[0]
+                    qw, qx, qy, qz = float(parts[1]), float(parts[2]), float(parts[3]), float(parts[4])
+                    tx, ty, tz = float(parts[5]), float(parts[6]), float(parts[7])
+                    camera_id = parts[8]
+                    name = parts[9]
+                    pose_dict[name] = {
+                        "image_id": image_id,
+                        "quat": [qw, qx, qy, qz],
+                        "trans": [tx, ty, tz],
+                        "camera_id": camera_id,
+                    }
+        return pose_dict
+    def _should_filter_image(self, scene: str, image_name: str) -> bool:
+        """Check if image should be filtered out based on known problematic views."""
+        filter_keys = ETH3D_FILTER_KEYS.get(scene, [])
+        for key in filter_keys:
+            if image_name.endswith(key):
+                return True
+        return False
+    # ------------------------------
+    # Public API
+    # ------------------------------
+    def get_data(self, scene: str) -> Dict:
+        """
+        Collect per-view image paths, intrinsics/extrinsics for a scene.
+        Args:
+            scene: Scene identifier (e.g., "courtyard")
+        Returns:
+            Dict with:
+                - image_files: List[str] - paths to images
+                - extrinsics: np.ndarray [N, 4, 4] - world-to-camera transforms
+                - intrinsics: np.ndarray [N, 3, 3] - camera intrinsics
+                - aux: Dict with gt_mesh_path
+        """
+        # Check cache
+        if scene in self._scene_cache:
+            return self._scene_cache[scene]
+        scene_dir = os.path.join(self.data_root, scene)
+        # Parse camera files
+        cameras_file = os.path.join(scene_dir, "dslr_calibration_jpg", "cameras.txt")
+        images_file = os.path.join(scene_dir, "dslr_calibration_jpg", "images.txt")
+        camera_dict = self._parse_cameras_txt(cameras_file)
+        pose_dict = self._parse_images_txt(images_file)
+        # Ground truth mesh path
+        gt_mesh_path = os.path.join(scene_dir, "combined_mesh.ply")
+        out = Dict({
+            "image_files": [],
+            "extrinsics": [],
+            "intrinsics": [],
+            "aux": Dict({
+                "gt_mesh_path": gt_mesh_path,
+                "heights": [],
+                "widths": [],
+            }),
+        })
+        # Process each image (preserve original order from images.txt)
+        filtered_count = 0
+        for image_name, pose_info in pose_dict.items():
+            # Filter problematic views
+            if self._should_filter_image(scene, image_name):
+                filtered_count += 1
+                continue
+            image_path = os.path.join(scene_dir, "images", image_name)
+            if not os.path.exists(image_path):
+                continue
+            cam_info = camera_dict.get(pose_info["camera_id"])
+            if cam_info is None:
+                continue
+            # Build intrinsics matrix
+            ixt = np.array([
+                [cam_info["fx"], 0, cam_info["cx"]],
+                [0, cam_info["fy"], cam_info["cy"]],
+                [0, 0, 1],
+            ], dtype=np.float32)
+            # Build extrinsics matrix (world-to-camera)
+            # COLMAP format: world point -> camera point
+            rot = quat2rotmat(pose_info["quat"])
+            ext = np.eye(4, dtype=np.float32)
+            ext[:3, :3] = rot
+            ext[:3, 3] = pose_info["trans"]
+            out.image_files.append(image_path)
+            out.extrinsics.append(ext)
+            out.intrinsics.append(ixt)
+            out.aux.heights.append(cam_info["height"])
+            out.aux.widths.append(cam_info["width"])
+        out.extrinsics = np.asarray(out.extrinsics, dtype=np.float32)
+        out.intrinsics = np.asarray(out.intrinsics, dtype=np.float32)
+        # Print scene info
+        total_images = len(pose_dict)
+        used_images = len(out.image_files)
+        print(f"[ETH3D] {scene}: {used_images}/{total_images} images "
+              f"(filtered {filtered_count}, missing {total_images - used_images - filtered_count})")
+        if used_images < 3:
+            print(f"[ETH3D] ⚠️  WARNING: {scene} has only {used_images} images - evaluation may fail!")
+        # Cache result
+        self._scene_cache[scene] = out
+        return out
+    def eval3d(self, scene: str, fuse_path: str) -> TDict[str, float]:
+        """
+        Evaluate fused point cloud against ETH3D ground truth mesh.
+        Args:
+            scene: Scene identifier
+            fuse_path: Path to fused point cloud (.ply)
+        Returns:
+            Dict with metrics: acc, comp, overall, precision, recall, fscore
+        """
+        gt_data = self.get_data(scene)
+        gt_mesh_path = gt_data.aux.gt_mesh_path
+        # Load and sample ground truth mesh
+        gt_mesh = o3d.io.read_triangle_mesh(gt_mesh_path)
+        gt_pcd = sample_points_from_mesh(gt_mesh, self.sampling_number)
+        # Load predicted point cloud
+        pred_pcd = o3d.io.read_point_cloud(fuse_path)
+        # Evaluate using shared utility function
+        metrics = evaluate_3d_reconstruction(
+            pred_pcd,
+            gt_pcd,
+            threshold=self.eval_threshold,
+            down_sample=self.down_sample,
+        )
+        return metrics
+    def _load_gt_meta(self, result_path: str) -> Dict:
+        """
+        Load saved GT meta (extrinsics, intrinsics, image_files) for fusion.
+        This is needed when frames are sampled, so fuse3d uses the correct
+        (sampled) GT instead of full dataset GT.
+        Args:
+            result_path: Path to npz file (used to derive gt_meta.npz path)
+        Returns:
+            Dict with GT data, or None if gt_meta.npz doesn't exist
+        """
+        # gt_meta.npz is in the same exports/ directory as results.npz
+        export_dir = os.path.dirname(result_path)  # exports/mini_npz/
+        gt_meta_path = os.path.join(os.path.dirname(export_dir), "gt_meta.npz")
+        if os.path.exists(gt_meta_path):
+            data = np.load(gt_meta_path, allow_pickle=True)
+            return Dict({
+                "extrinsics": data["extrinsics"],
+                "intrinsics": data["intrinsics"],
+                "image_files": data["image_files"] if "image_files" in data else None,
+            })
+        return None
+    def fuse3d(self, scene: str, result_path: str, fuse_path: str, mode: str) -> None:
+        """
+        Fuse per-view depths into a point cloud using TSDF fusion.
+        Pipeline:
+        1. Load original images (keep original size)
+        2. Resize depth to original image size (nearest interpolation)
+        3. Adjust intrinsics to original image size
+        4. Apply scale alignment and mask invalid depths
+        5. TSDF fusion
+        Args:
+            scene: Scene identifier
+            result_path: Path to npz file with predicted depths/poses
+            fuse_path: Output path for fused point cloud (.ply)
+            mode: "recon_unposed" or "recon_posed"
+        """
+        # Try to load saved GT meta (handles frame sampling)
+        gt_meta = self._load_gt_meta(result_path)
+        if gt_meta is not None:
+            gt_data = gt_meta
+        else:
+            gt_data = self.get_data(scene)
+        _wait_for_file_ready(result_path)
+        pred_data = Dict({k: v for k, v in np.load(result_path).items()})
+        # Load original images (keep original size)
+        images = []
+        orig_sizes = []  # (H, W) for each image
+        for img_path in gt_data.image_files:
+            img = cv2.imread(img_path)
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            images.append(img)
+            orig_sizes.append((img.shape[0], img.shape[1]))
+        # Prepare depths, intrinsics, extrinsics with resize to original size
+        if mode == "recon_unposed":
+            depths, intrinsics, extrinsics = self._prep_unposed(
+                pred_data, gt_data, orig_sizes, scene=scene
+            )
+        elif mode == "recon_posed":
+            depths, intrinsics, extrinsics = self._prep_posed(
+                pred_data, gt_data, orig_sizes, scene=scene
+            )
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+        images = np.stack(images, axis=0)
+        # Create TSDF volume and fuse
+        volume = create_tsdf_volume(
+            voxel_length=self.voxel_length,
+            sdf_trunc=self.sdf_trunc,
+        )
+        mesh = fuse_depth_to_tsdf(
+            volume, depths, images, intrinsics, extrinsics, max_depth=self.max_depth
+        )
+        # Sample points from mesh
+        pcd = sample_points_from_mesh(mesh, self.sampling_number)
+        # Save point cloud
+        os.makedirs(os.path.dirname(fuse_path), exist_ok=True)
+        o3d.io.write_point_cloud(fuse_path, pcd)
+    # ------------------------------
+    # Private helpers
+    # ------------------------------
+    def _prep_unposed(
+        self, pred_data: Dict, gt_data: Dict, orig_sizes: list, scene: str = None
+    ) -> tuple:
+        """
+        Prepare depths/intrinsics/extrinsics for recon_unposed mode.
+        Pipeline:
+        1. Umeyama scale alignment
+        2. Load GT mask for each frame
+        3. Resize depth to original image size (nearest)
+        4. Apply GT mask BEFORE scale
+        5. Apply scale
+        6. Adjust intrinsics to original image size
+        """
+        # Scale alignment with fixed random_state for reproducibility
+        _, _, scale, extrinsics = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        # Get model output size
+        model_h, model_w = pred_data.depth.shape[1], pred_data.depth.shape[2]
+        # Process each frame
+        depths_out = []
+        intrinsics_out = []
+        for i in range(len(pred_data.depth)):
+            orig_h, orig_w = orig_sizes[i]
+            image_name = os.path.basename(gt_data.image_files[i])
+            # Resize depth to original image size (nearest interpolation)
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Load GT mask (apply BEFORE scale)
+            gt_zero_mask = None
+            if scene is not None:
+                gt_zero_mask = self._load_gt_mask(scene, image_name, (orig_h, orig_w))
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            # Adjust intrinsics to original image size
+            h_ratio = orig_h / model_h
+            w_ratio = orig_w / model_w
+            ixt = pred_data.intrinsics[i].copy()
+            ixt[0, :] *= w_ratio  # fx, 0, cx
+            ixt[1, :] *= h_ratio  # 0, fy, cy
+            depths_out.append(depth)
+            intrinsics_out.append(ixt)
+        return np.stack(depths_out), np.stack(intrinsics_out), extrinsics
+    def _prep_posed(
+        self, pred_data: Dict, gt_data: Dict, orig_sizes: list, scene: str = None
+    ) -> tuple:
+        """
+        Prepare depths/intrinsics/extrinsics for recon_posed mode.
+        Uses GT intrinsics/extrinsics but aligns depth scale via Umeyama.
+        Depth is resized to original image size.
+        """
+        # Scale alignment with fixed random_state for reproducibility
+        _, _, scale, _ = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        # Process each frame
+        depths_out = []
+        for i in range(len(pred_data.depth)):
+            orig_h, orig_w = orig_sizes[i]
+            image_name = os.path.basename(gt_data.image_files[i])
+            # Resize depth to original image size (nearest interpolation)
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Load GT mask (apply BEFORE scale)
+            gt_zero_mask = None
+            if scene is not None:
+                gt_zero_mask = self._load_gt_mask(scene, image_name, (orig_h, orig_w))
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            depths_out.append(depth)
+        # Use GT intrinsics and extrinsics (already at original image size)
+        return np.stack(depths_out), gt_data.intrinsics.copy(), gt_data.extrinsics.copy()
+    def _load_gt_mask(self, scene: str, image_name: str, shape: tuple) -> np.ndarray:
+        """
+        Load GT mask for masking invalid regions.
+        GT mask marks occluded or invalid regions that should be excluded
+        from depth fusion and evaluation.
+        Args:
+            scene: Scene identifier
+            image_name: Image filename (e.g., "DSC_0307.JPG")
+            shape: (height, width) of the image
+        Returns:
+            Boolean mask where True = valid region to keep
+        """
+        h, w = shape
+        # GT mask file path
+        gt_mask_path = os.path.join(
+            self.data_root, scene, "masks_for_images", "dslr_images",
+            image_name.replace(".JPG", ".png")
+        )
+        # GT depth file path (used to determine valid depth regions)
+        gt_depth_path = os.path.join(
+            self.data_root, scene, "ground_truth_depth", "dslr_images", image_name
+        )
+        # Load GT depth
+        if os.path.exists(gt_depth_path):
+            gt_depth = np.fromfile(gt_depth_path, dtype=np.float32).reshape(h, w)
+        else:
+            gt_depth = np.ones((h, w), dtype=np.float32)
+        # Load GT mask
+        if os.path.exists(gt_mask_path):
+            gt_mask = cv2.imread(gt_mask_path, cv2.IMREAD_GRAYSCALE)
+            gt_mask = np.asarray(gt_mask)
+        else:
+            gt_mask = np.zeros((h, w), dtype=np.uint8)
+        # Compute zero_mask
+        # gt_mask == 1 means occluded/invalid region
+        invalid_mask_from_gt = gt_mask == 1
+        gt_depth_copy = gt_depth.copy()
+        gt_depth_copy[gt_mask == 1] = 0
+        invalid_mask_from_gt_depth = np.logical_or(gt_depth_copy == 0, gt_depth_copy == np.inf)
+        # zero_mask: valid region that should be kept
+        zero_mask = np.logical_and(
+            np.logical_not(invalid_mask_from_gt),
+            np.logical_not(invalid_mask_from_gt_depth)
+        )
+        return zero_mask
+    def _mask_invalid_depth(
+        self, depth: np.ndarray, gt_zero_mask: np.ndarray = None
+    ) -> np.ndarray:
+        """
+        Mask invalid depth values by setting them to 0.
+        Logic:
+        1. Apply GT mask (if provided) - marks occluded/invalid regions
+        2. Mask pred invalid values (nan, inf)
+        Args:
+            depth: Depth map to mask
+            gt_zero_mask: Optional GT mask (True = valid region)
+        Returns:
+            Masked depth map with invalid regions set to 0
+        """
+        depth = depth.copy()
+        # Apply GT mask first (before scale)
+        if gt_zero_mask is not None:
+            # Also mask out invalid pred depth
+            pred_invalid = np.isnan(depth) | np.isinf(depth)
+            combined_mask = np.logical_and(gt_zero_mask, np.logical_not(pred_invalid))
+            depth = depth * combined_mask.astype(np.float32)
+        else:
+            # Fallback: only mask pred invalid values
+            invalid_mask = np.isnan(depth) | np.isinf(depth) | (depth <= 0)
+            depth[invalid_mask] = 0.0
+        return depth

Depth-Anything-3/src/depth_anything_3/bench/datasets/hiroom.py ADDED Viewed

	@@ -0,0 +1,440 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+HiRoom Dataset implementation.
+HiRoom is an indoor RGB-D dataset containing ground truth camera poses,
+depth maps, and fused point clouds.
+Evaluation metrics:
+- 3D reconstruction: Accuracy, Completeness, F-score
+- Camera pose estimation: AUC metrics
+"""
+import os
+from typing import Dict as TDict, List
+import cv2
+import numpy as np
+import open3d as o3d
+from addict import Dict
+from depth_anything_3.bench.dataset import Dataset, _wait_for_file_ready
+from depth_anything_3.bench.registries import MONO_REGISTRY, MV_REGISTRY
+from depth_anything_3.bench.utils import (
+    create_tsdf_volume,
+    evaluate_3d_reconstruction,
+    fuse_depth_to_tsdf,
+    sample_points_from_mesh,
+)
+from depth_anything_3.utils.constants import (
+    HIROOM_DOWN_SAMPLE,
+    HIROOM_EVAL_DATA_ROOT,
+    HIROOM_EVAL_THRESHOLD,
+    HIROOM_GT_ROOT_PATH,
+    HIROOM_MAX_DEPTH,
+    HIROOM_SAMPLING_NUMBER,
+    HIROOM_SCENE_LIST_PATH,
+    HIROOM_SDF_TRUNC,
+    HIROOM_VOXEL_LENGTH,
+)
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+def _load_scene_list() -> List[str]:
+    """Load scene list from file."""
+    if os.path.exists(HIROOM_SCENE_LIST_PATH):
+        with open(HIROOM_SCENE_LIST_PATH, "r") as f:
+            return f.read().splitlines()
+    return []
+@MV_REGISTRY.register(name="hiroom")
+@MONO_REGISTRY.register(name="hiroom")
+class HiRoomDataset(Dataset):
+    """
+    HiRoom Dataset wrapper for DepthAnything3 evaluation.
+    Supports:
+        - Camera pose estimation evaluation (AUC metrics)
+        - 3D reconstruction evaluation (Accuracy, Completeness, F-score)
+        - TSDF-based point cloud fusion
+    Dataset structure:
+        HiRoom/
+        ├── {scene_path}/
+        │   ├── image/           # RGB images
+        │   ├── depth/           # GT depth maps
+        │   ├── pose/            # Camera poses (.npy)
+        │   ├── cam_K.npy        # Camera intrinsics
+        │   └── aliasing_mask/   # Aliasing masks
+        fused_pcd/
+        └── {scene_name}.ply     # Ground truth fused point cloud
+    """
+    data_root = HIROOM_EVAL_DATA_ROOT
+    gt_root_path = HIROOM_GT_ROOT_PATH
+    SCENES = _load_scene_list()
+    # Evaluation hyperparameters from constants
+    max_depth = HIROOM_MAX_DEPTH
+    sampling_number = HIROOM_SAMPLING_NUMBER
+    voxel_length = HIROOM_VOXEL_LENGTH
+    sdf_trunc = HIROOM_SDF_TRUNC
+    eval_threshold = HIROOM_EVAL_THRESHOLD
+    down_sample = HIROOM_DOWN_SAMPLE
+    def __init__(self):
+        super().__init__()
+        self._scene_cache = {}
+    # ------------------------------
+    # Public API
+    # ------------------------------
+    def get_data(self, scene: str) -> Dict:
+        """
+        Collect per-view image paths, intrinsics/extrinsics for a scene.
+        Args:
+            scene: Scene path (e.g., "xxx/yyy/zzz")
+        Returns:
+            Dict with:
+                - image_files: List[str] - paths to images
+                - extrinsics: np.ndarray [N, 4, 4] - world-to-camera transforms
+                - intrinsics: np.ndarray [N, 3, 3] - camera intrinsics
+                - aux: Dict with gt_pcd_path, gt_depth_files, aliasing_mask_files
+        """
+        if scene in self._scene_cache:
+            return self._scene_cache[scene]
+        scene_dir = os.path.join(self.data_root, scene)
+        image_dir = os.path.join(scene_dir, "image")
+        # Get scene name for GT point cloud
+        scene_name = "-".join(scene.split("/")[-3:])
+        gt_pcd_path = os.path.join(self.gt_root_path, f"{scene_name}.ply")
+        # Load shared camera intrinsics
+        intrin_path = os.path.join(scene_dir, "cam_K.npy")
+        ixt_shared = np.load(intrin_path).astype(np.float32)
+        # Get all image names sorted
+        image_names = sorted(os.listdir(image_dir))
+        out = Dict({
+            "image_files": [],
+            "extrinsics": [],
+            "intrinsics": [],
+            "aux": Dict({
+                "gt_pcd_path": gt_pcd_path,
+                "gt_depth_files": [],
+                "aliasing_mask_files": [],
+            }),
+        })
+        for img_name in image_names:
+            img_path = os.path.join(image_dir, img_name)
+            frame_name = img_name.split(".")[0]
+            # Depth and pose paths
+            depth_path = os.path.join(scene_dir, "depth", f"{frame_name}.png")
+            pose_path = os.path.join(scene_dir, "pose", f"{frame_name}.npy")
+            aliasing_mask_path = os.path.join(scene_dir, "aliasing_mask", f"{frame_name}.png")
+            if not os.path.exists(pose_path):
+                continue
+            # Load extrinsics (world-to-camera)
+            ext = np.load(pose_path).astype(np.float32)
+            out.image_files.append(img_path)
+            out.extrinsics.append(ext)
+            out.intrinsics.append(ixt_shared.copy())
+            out.aux.gt_depth_files.append(depth_path)
+            out.aux.aliasing_mask_files.append(aliasing_mask_path)
+        out.extrinsics = np.asarray(out.extrinsics, dtype=np.float32)
+        out.intrinsics = np.asarray(out.intrinsics, dtype=np.float32)
+        print(f"[HiRoom] {scene}: {len(out.image_files)} images")
+        self._scene_cache[scene] = out
+        return out
+    def eval3d(self, scene: str, fuse_path: str) -> TDict[str, float]:
+        """
+        Evaluate fused point cloud against HiRoom ground truth point cloud.
+        Args:
+            scene: Scene identifier
+            fuse_path: Path to fused point cloud (.ply)
+        Returns:
+            Dict with metrics: acc, comp, overall, precision, recall, fscore
+        """
+        gt_data = self.get_data(scene)
+        gt_pcd_path = gt_data.aux.gt_pcd_path
+        # Load ground truth point cloud
+        gt_pcd = o3d.io.read_point_cloud(gt_pcd_path)
+        # Load predicted point cloud
+        pred_pcd = o3d.io.read_point_cloud(fuse_path)
+        # Evaluate using shared utility function
+        metrics = evaluate_3d_reconstruction(
+            pred_pcd,
+            gt_pcd,
+            threshold=self.eval_threshold,
+            down_sample=self.down_sample,
+        )
+        return metrics
+    def _load_gt_meta(self, result_path: str) -> Dict:
+        """Load saved GT meta for fusion."""
+        export_dir = os.path.dirname(result_path)
+        gt_meta_path = os.path.join(os.path.dirname(export_dir), "gt_meta.npz")
+        if os.path.exists(gt_meta_path):
+            data = np.load(gt_meta_path, allow_pickle=True)
+            image_files = list(data["image_files"])
+            return Dict({
+                "extrinsics": data["extrinsics"],
+                "intrinsics": data["intrinsics"],
+                "image_files": image_files,
+            })
+        return None
+    def fuse3d(self, scene: str, result_path: str, fuse_path: str, mode: str) -> None:
+        """
+        Fuse per-view depths into a point cloud using TSDF fusion.
+        Args:
+            scene: Scene identifier
+            result_path: Path to npz file with predicted depths/poses
+            fuse_path: Output path for fused point cloud (.ply)
+            mode: "recon_unposed" or "recon_posed"
+        """
+        # Get full GT data
+        full_gt_data = self.get_data(scene)
+        # Try to load saved GT meta (handles frame sampling)
+        gt_meta = self._load_gt_meta(result_path)
+        if gt_meta is not None:
+            gt_data = gt_meta
+            image_indices = [
+                full_gt_data.image_files.index(f)
+                for f in gt_data.image_files
+                if f in full_gt_data.image_files
+            ]
+        else:
+            gt_data = full_gt_data
+            image_indices = list(range(len(full_gt_data.image_files)))
+        _wait_for_file_ready(result_path)
+        pred_data = Dict({k: v for k, v in np.load(result_path).items()})
+        # Load images
+        images = []
+        orig_sizes = []
+        for img_idx in image_indices:
+            img_path = full_gt_data.image_files[img_idx]
+            img = cv2.imread(img_path)
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            images.append(img)
+            orig_sizes.append((img.shape[0], img.shape[1]))
+        images = np.stack(images, axis=0)
+        # Prepare depths, intrinsics, extrinsics
+        if mode == "recon_unposed":
+            depths, intrinsics, extrinsics = self._prep_unposed(
+                pred_data, gt_data, full_gt_data, image_indices, orig_sizes, scene=scene
+            )
+        elif mode == "recon_posed":
+            depths, intrinsics, extrinsics = self._prep_posed(
+                pred_data, gt_data, full_gt_data, image_indices, orig_sizes, scene=scene
+            )
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+        # Create TSDF volume and fuse
+        volume = create_tsdf_volume(
+            voxel_length=self.voxel_length,
+            sdf_trunc=self.sdf_trunc,
+        )
+        mesh = fuse_depth_to_tsdf(
+            volume, depths, images, intrinsics, extrinsics, max_depth=self.max_depth
+        )
+        # Sample points from mesh
+        pcd = sample_points_from_mesh(mesh, self.sampling_number)
+        # Save point cloud
+        os.makedirs(os.path.dirname(fuse_path), exist_ok=True)
+        o3d.io.write_point_cloud(fuse_path, pcd)
+    # ------------------------------
+    # Private helpers
+    # ------------------------------
+    def _prep_unposed(
+        self, pred_data: Dict, gt_data: Dict, full_gt_data: Dict,
+        image_indices: list, orig_sizes: list, scene: str = None
+    ) -> tuple:
+        """Prepare depths/intrinsics/extrinsics for recon_unposed mode."""
+        # Scale alignment with fixed random_state for reproducibility
+        _, _, scale, extrinsics = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        model_h, model_w = pred_data.depth.shape[1], pred_data.depth.shape[2]
+        depths_out = []
+        intrinsics_out = []
+        for i in range(len(pred_data.depth)):
+            orig_h, orig_w = orig_sizes[i]
+            img_idx = image_indices[i]
+            # Resize depth to original image size
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Load GT mask
+            gt_zero_mask = self._load_gt_mask(
+                full_gt_data.aux.gt_depth_files[img_idx],
+                full_gt_data.aux.aliasing_mask_files[img_idx],
+            )
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            # Adjust intrinsics to original image size
+            h_ratio = orig_h / model_h
+            w_ratio = orig_w / model_w
+            ixt = pred_data.intrinsics[i].copy()
+            ixt[0, :] *= w_ratio
+            ixt[1, :] *= h_ratio
+            depths_out.append(depth)
+            intrinsics_out.append(ixt)
+        return np.stack(depths_out), np.stack(intrinsics_out), extrinsics
+    def _prep_posed(
+        self, pred_data: Dict, gt_data: Dict, full_gt_data: Dict,
+        image_indices: list, orig_sizes: list, scene: str = None
+    ) -> tuple:
+        """Prepare depths/intrinsics/extrinsics for recon_posed mode."""
+        # Scale alignment
+        _, _, scale, _ = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        depths_out = []
+        for i in range(len(pred_data.depth)):
+            orig_h, orig_w = orig_sizes[i]
+            img_idx = image_indices[i]
+            # Resize depth to original image size
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Load GT mask
+            gt_zero_mask = self._load_gt_mask(
+                full_gt_data.aux.gt_depth_files[img_idx],
+                full_gt_data.aux.aliasing_mask_files[img_idx],
+            )
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            depths_out.append(depth)
+        # Use GT intrinsics and extrinsics
+        gt_intrinsics = np.stack([full_gt_data.intrinsics[idx] for idx in image_indices])
+        gt_extrinsics = np.stack([full_gt_data.extrinsics[idx] for idx in image_indices])
+        return np.stack(depths_out), gt_intrinsics, gt_extrinsics
+    def _load_gt_mask(self, gt_depth_path: str, aliasing_mask_path: str) -> np.ndarray:
+        """
+        Load GT depth and aliasing mask to create valid mask.
+        For HiRoom:
+        - GT depth is stored as 16-bit PNG, scaled to 100m range
+        - Aliasing mask marks regions to exclude
+        Returns:
+            Boolean mask where True = valid region to keep
+        """
+        # Load GT depth
+        if os.path.exists(gt_depth_path):
+            gt_depth = cv2.imread(gt_depth_path, -1) / 65535.0 * 100.0
+        else:
+            return None
+        # Load aliasing mask
+        aliasing_mask = None
+        if os.path.exists(aliasing_mask_path):
+            aliasing_mask = cv2.imread(aliasing_mask_path, -1) > 0
+        # Valid mask: depth > 0 and not in aliasing region
+        valid_mask = gt_depth > 0
+        if aliasing_mask is not None:
+            valid_mask = np.logical_and(valid_mask, np.logical_not(aliasing_mask))
+        return valid_mask
+    def _mask_invalid_depth(
+        self, depth: np.ndarray, gt_zero_mask: np.ndarray = None
+    ) -> np.ndarray:
+        """Mask invalid depth values by setting them to 0."""
+        depth = depth.copy()
+        if gt_zero_mask is not None:
+            pred_invalid = np.isnan(depth) | np.isinf(depth)
+            combined_mask = np.logical_and(gt_zero_mask, np.logical_not(pred_invalid))
+            depth = depth * combined_mask.astype(np.float32)
+        else:
+            invalid_mask = np.isnan(depth) | np.isinf(depth) | (depth <= 0)
+            depth[invalid_mask] = 0.0
+        return depth

Depth-Anything-3/src/depth_anything_3/bench/datasets/scannetpp.py ADDED Viewed

	@@ -0,0 +1,591 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ScanNet++ Benchmark dataset implementation.
+ScanNet++ is a high-quality indoor RGB-D dataset with iPhone and DSLR images,
+ground truth camera poses from COLMAP, and high-resolution 3D meshes.
+Reference: https://kaldir.vc.in.tum.de/scannetpp/
+Evaluation metrics:
+- 3D reconstruction: Accuracy, Completeness, F-score
+- Camera pose estimation: AUC metrics
+"""
+import os
+from typing import Dict as TDict
+import cv2
+import imageio
+import numpy as np
+import open3d as o3d
+from addict import Dict
+from depth_anything_3.bench.dataset import Dataset, _wait_for_file_ready
+from depth_anything_3.bench.registries import MONO_REGISTRY, MV_REGISTRY
+from depth_anything_3.bench.utils import (
+    create_tsdf_volume,
+    fuse_depth_to_tsdf,
+    nn_correspondance,
+    sample_points_from_mesh,
+)
+from depth_anything_3.utils.constants import (
+    SCANNETPP_DOWN_SAMPLE,
+    SCANNETPP_EVAL_DATA_ROOT,
+    SCANNETPP_EVAL_THRESHOLD,
+    SCANNETPP_INPUT_H,
+    SCANNETPP_INPUT_W,
+    SCANNETPP_MAX_DEPTH,
+    SCANNETPP_SAMPLING_NUMBER,
+    SCANNETPP_SCENES,
+    SCANNETPP_SDF_TRUNC,
+    SCANNETPP_VOXEL_LENGTH,
+)
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+from depth_anything_3.utils.read_write_model import read_model
+@MV_REGISTRY.register(name="scannetpp")
+@MONO_REGISTRY.register(name="scannetpp")
+class ScanNetPP(Dataset):
+    """
+    ScanNet++ Benchmark dataset wrapper for DepthAnything3 evaluation.
+    Supports:
+        - Camera pose estimation evaluation (AUC metrics)
+        - 3D reconstruction evaluation (Accuracy, Completeness, F-score)
+        - TSDF-based point cloud fusion
+    Dataset structure:
+        scannetpp/data/
+        ├── {scene_id}/
+        │   ├── merge_dslr_iphone/
+        │   │   ├── colmap/sparse_render_rgb/  # COLMAP reconstruction
+        │   │   ├── images/                     # RGB images
+        │   │   └── render_depth/               # GT depth maps
+        │   └── scans/
+        │       └── mesh_aligned_0.05.ply       # Ground truth mesh
+    """
+    data_root = SCANNETPP_EVAL_DATA_ROOT
+    SCENES = SCANNETPP_SCENES
+    # Input resolution after undistortion and resize
+    input_h = SCANNETPP_INPUT_H
+    input_w = SCANNETPP_INPUT_W
+    # Evaluation hyperparameters from constants
+    max_depth = SCANNETPP_MAX_DEPTH
+    sampling_number = SCANNETPP_SAMPLING_NUMBER
+    voxel_length = SCANNETPP_VOXEL_LENGTH
+    sdf_trunc = SCANNETPP_SDF_TRUNC
+    eval_threshold = SCANNETPP_EVAL_THRESHOLD
+    down_sample = SCANNETPP_DOWN_SAMPLE
+    def __init__(self):
+        super().__init__()
+        self._scene_cache = {}
+    # ------------------------------
+    # Public API
+    # ------------------------------
+    def get_data(self, scene: str) -> Dict:
+        """
+        Collect per-view image paths, intrinsics/extrinsics for a scene.
+        Only uses iPhone images (not DSLR).
+        Args:
+            scene: Scene identifier (e.g., "09c1414f1b")
+        Returns:
+            Dict with:
+                - image_files: List[str] - paths to images
+                - extrinsics: np.ndarray [N, 4, 4] - world-to-camera transforms
+                - intrinsics: np.ndarray [N, 3, 3] - camera intrinsics
+                - aux: Dict with gt_mesh_path, dist, roi, cam_hw, etc.
+        """
+        if scene in self._scene_cache:
+            return self._scene_cache[scene]
+        input_path = os.path.join(self.data_root, scene, "merge_dslr_iphone")
+        colmap_path = os.path.join(input_path, "colmap/sparse_render_rgb")
+        image_path = os.path.join(input_path, "images")
+        depth_path_dir = os.path.join(input_path, "render_depth")
+        # Read COLMAP model
+        cams, images, points3d = read_model(colmap_path)
+        # Map image names to IDs
+        name2id = {image.name: k for k, image in images.items()}
+        names = sorted([image.name for k, image in images.items()])
+        # Only use iPhone images
+        names = [name for name in names if "iphone" in name]
+        gt_mesh_path = os.path.join(
+            input_path.replace("merge_dslr_iphone", "scans"), "mesh_aligned_0.05.ply"
+        )
+        out = Dict({
+            "image_files": [],
+            "extrinsics": [],
+            "intrinsics": [],
+            "aux": Dict({
+                "gt_mesh_path": gt_mesh_path,
+                "dist_list": [],
+                "roi_list": [],
+                "cam_hw_list": [],
+                "ixt_raw_list": [],
+                "gt_depth_files": [],
+            }),
+        })
+        for name in names:
+            image = images[name2id[name]]
+            img_path = os.path.join(image_path, name)
+            if not os.path.exists(img_path):
+                continue
+            # Build extrinsics (world-to-camera)
+            ext = np.eye(4, dtype=np.float32)
+            ext[:3, :3] = image.qvec2rotmat()
+            ext[:3, 3] = image.tvec
+            # Get camera parameters
+            cam_id = image.camera_id
+            camera = cams[cam_id]
+            cam_height, cam_width = camera.height, camera.width
+            # Build intrinsics
+            ixt = np.eye(3, dtype=np.float32)
+            ixt[0, 0], ixt[1, 1], ixt[0, 2], ixt[1, 2] = camera.params[:4]
+            ixt[:2, 2] -= 0.5  # COLMAP convention adjustment
+            ixt_raw = ixt.copy()
+            # Handle distortion (OPENCV model)
+            dist = np.zeros(5, dtype=np.float32)
+            roi = (0, 0, cam_width, cam_height)
+            if camera.model == "OPENCV":
+                dist[:4] = camera.params[4:]
+                ixt, roi = cv2.getOptimalNewCameraMatrix(
+                    ixt, dist, (cam_width, cam_height), 1, (cam_width, cam_height)
+                )
+            # Depth file path
+            frame_name = os.path.basename(name)[:-4]  # Remove .jpg
+            depth_file = os.path.join(depth_path_dir, f"{frame_name}.png")
+            out.image_files.append(img_path)
+            out.extrinsics.append(ext)
+            out.intrinsics.append(ixt)
+            out.aux.dist_list.append(dist)
+            out.aux.roi_list.append(roi)
+            out.aux.cam_hw_list.append((cam_height, cam_width))
+            out.aux.ixt_raw_list.append(ixt_raw)
+            out.aux.gt_depth_files.append(depth_file)
+        out.extrinsics = np.asarray(out.extrinsics, dtype=np.float32)
+        out.intrinsics = np.asarray(out.intrinsics, dtype=np.float32)
+        print(f"[ScanNet++] {scene}: {len(out.image_files)} images")
+        self._scene_cache[scene] = out
+        return out
+    def load_image(self, img_path: str, idx: int, aux: Dict) -> np.ndarray:
+        """
+        Load and preprocess image with undistortion and cropping.
+        Args:
+            img_path: Path to image file
+            idx: Index of the image in the dataset
+            aux: Auxiliary data from get_data
+        Returns:
+            Preprocessed RGB image
+        """
+        image = imageio.imread(img_path).astype(np.uint8)
+        ixt_raw = aux.ixt_raw_list[idx]
+        ixt = aux.intrinsics[idx] if hasattr(aux, 'intrinsics') else None
+        dist = aux.dist_list[idx]
+        roi = aux.roi_list[idx]
+        # Undistort using raw intrinsics
+        # Use the stored intrinsics from get_data for newCameraMatrix
+        stored_ixt = self._scene_cache.get(aux.scene, {}).get('intrinsics', [None])[idx] if hasattr(aux, 'scene') else None
+        if stored_ixt is None:
+            # Recompute optimal camera matrix for undistortion
+            cam_h, cam_w = aux.cam_hw_list[idx]
+            ixt_for_undistort = ixt_raw.copy()
+            ixt_for_undistort, _ = cv2.getOptimalNewCameraMatrix(
+                ixt_raw, dist, (cam_w, cam_h), 1, (cam_w, cam_h)
+            )
+        else:
+            ixt_for_undistort = stored_ixt
+        image = cv2.undistort(image, ixt_raw, dist, newCameraMatrix=ixt_for_undistort)
+        # Crop to ROI
+        x, y, w, h = roi
+        image = image[y:y+h, x:x+w]
+        # Resize to target resolution
+        image = cv2.resize(image, (self.input_w, self.input_h), interpolation=cv2.INTER_AREA)
+        return image
+    def eval3d(self, scene: str, fuse_path: str) -> TDict[str, float]:
+        """
+        Evaluate fused point cloud against ScanNet++ ground truth mesh.
+        Uses AABB cropping to only evaluate points within GT bounding box.
+        Args:
+            scene: Scene identifier
+            fuse_path: Path to fused point cloud (.ply)
+        Returns:
+            Dict with metrics: acc, comp, overall, precision, recall, fscore
+        """
+        gt_data = self.get_data(scene)
+        gt_mesh_path = gt_data.aux.gt_mesh_path
+        # Load ground truth mesh and sample points
+        gt_mesh = o3d.io.read_triangle_mesh(gt_mesh_path)
+        gt_pcd = sample_points_from_mesh(gt_mesh, self.sampling_number)
+        # Load predicted point cloud
+        pred_pcd = o3d.io.read_point_cloud(fuse_path)
+        # Crop prediction to GT bounding box (with 0.1m margin)
+        aabb = gt_pcd.get_axis_aligned_bounding_box()
+        points = np.asarray(pred_pcd.points)
+        inside_mask = (
+            (points[:, 0] >= aabb.min_bound[0] - 0.1) &
+            (points[:, 0] <= aabb.max_bound[0] + 0.1) &
+            (points[:, 1] >= aabb.min_bound[1] - 0.1) &
+            (points[:, 1] <= aabb.max_bound[1] + 0.1) &
+            (points[:, 2] >= aabb.min_bound[2] - 0.1) &
+            (points[:, 2] <= aabb.max_bound[2] + 0.1)
+        )
+        pred_pcd = pred_pcd.select_by_index(inside_mask.nonzero()[0])
+        # Downsample
+        if self.down_sample > 0:
+            pred_pcd = pred_pcd.voxel_down_sample(self.down_sample)
+            gt_pcd = gt_pcd.voxel_down_sample(self.down_sample)
+        verts_pred = np.asarray(pred_pcd.points)
+        verts_gt = np.asarray(gt_pcd.points)
+        if len(verts_pred) == 0 or len(verts_gt) == 0:
+            return {
+                "acc": float("inf"),
+                "comp": float("inf"),
+                "overall": float("inf"),
+                "precision": 0.0,
+                "recall": 0.0,
+                "fscore": 0.0,
+            }
+        # Compute distances
+        dist_pred_to_gt = nn_correspondance(verts_gt, verts_pred)
+        dist_gt_to_pred = nn_correspondance(verts_pred, verts_gt)
+        # Compute metrics
+        accuracy = float(np.mean(dist_pred_to_gt))
+        completeness = float(np.mean(dist_gt_to_pred))
+        overall = (accuracy + completeness) / 2
+        precision = float(np.mean((dist_pred_to_gt < self.eval_threshold).astype(float)))
+        recall = float(np.mean((dist_gt_to_pred < self.eval_threshold).astype(float)))
+        if precision + recall > 0:
+            fscore = 2 * precision * recall / (precision + recall)
+        else:
+            fscore = 0.0
+        return {
+            "acc": accuracy,
+            "comp": completeness,
+            "overall": overall,
+            "precision": precision,
+            "recall": recall,
+            "fscore": fscore,
+        }
+    def _load_gt_meta(self, result_path: str) -> Dict:
+        """Load saved GT meta for fusion."""
+        export_dir = os.path.dirname(result_path)
+        gt_meta_path = os.path.join(os.path.dirname(export_dir), "gt_meta.npz")
+        if os.path.exists(gt_meta_path):
+            data = np.load(gt_meta_path, allow_pickle=True)
+            image_files = list(data["image_files"])
+            # Reconstruct aux data from image files
+            return Dict({
+                "extrinsics": data["extrinsics"],
+                "intrinsics": data["intrinsics"],
+                "image_files": image_files,
+            })
+        return None
+    def fuse3d(self, scene: str, result_path: str, fuse_path: str, mode: str) -> None:
+        """
+        Fuse per-view depths into a point cloud using TSDF fusion.
+        Args:
+            scene: Scene identifier
+            result_path: Path to npz file with predicted depths/poses
+            fuse_path: Output path for fused point cloud (.ply)
+            mode: "recon_unposed" or "recon_posed"
+        """
+        # Get GT data
+        full_gt_data = self.get_data(scene)
+        # Try to load saved GT meta (handles frame sampling)
+        gt_meta = self._load_gt_meta(result_path)
+        if gt_meta is not None:
+            gt_data = gt_meta
+            # Need to rebuild aux from full GT data based on image indices
+            image_indices = [
+                full_gt_data.image_files.index(f)
+                for f in gt_data.image_files
+                if f in full_gt_data.image_files
+            ]
+        else:
+            gt_data = full_gt_data
+            image_indices = list(range(len(full_gt_data.image_files)))
+        _wait_for_file_ready(result_path)
+        pred_data = Dict({k: v for k, v in np.load(result_path).items()})
+        # Load and preprocess images
+        images = []
+        for idx, img_idx in enumerate(image_indices):
+            img_path = full_gt_data.image_files[img_idx]
+            image = imageio.imread(img_path).astype(np.uint8)
+            # Undistort and crop
+            ixt_raw = full_gt_data.aux.ixt_raw_list[img_idx]
+            ixt = full_gt_data.intrinsics[img_idx]
+            dist = full_gt_data.aux.dist_list[img_idx]
+            roi = full_gt_data.aux.roi_list[img_idx]
+            image = cv2.undistort(image, ixt_raw, dist, newCameraMatrix=ixt)
+            x, y, w, h = roi
+            image = image[y:y+h, x:x+w]
+            image = cv2.resize(image, (self.input_w, self.input_h), interpolation=cv2.INTER_AREA)
+            images.append(image)
+        images = np.stack(images, axis=0)
+        # Prepare depths, intrinsics, extrinsics
+        if mode == "recon_unposed":
+            depths, intrinsics, extrinsics = self._prep_unposed(
+                pred_data, gt_data, full_gt_data, image_indices, scene=scene
+            )
+        elif mode == "recon_posed":
+            depths, intrinsics, extrinsics = self._prep_posed(
+                pred_data, gt_data, full_gt_data, image_indices, scene=scene
+            )
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+        # Create TSDF volume and fuse
+        volume = create_tsdf_volume(
+            voxel_length=self.voxel_length,
+            sdf_trunc=self.sdf_trunc,
+        )
+        mesh = fuse_depth_to_tsdf(
+            volume, depths, images, intrinsics, extrinsics, max_depth=self.max_depth
+        )
+        # Sample points from mesh
+        pcd = sample_points_from_mesh(mesh, self.sampling_number)
+        # Save point cloud
+        os.makedirs(os.path.dirname(fuse_path), exist_ok=True)
+        o3d.io.write_point_cloud(fuse_path, pcd)
+    # ------------------------------
+    # Private helpers
+    # ------------------------------
+    def _prep_unposed(
+        self, pred_data: Dict, gt_data: Dict, full_gt_data: Dict,
+        image_indices: list, scene: str = None
+    ) -> tuple:
+        """Prepare depths/intrinsics/extrinsics for recon_unposed mode."""
+        # Scale alignment with fixed random_state for reproducibility
+        _, _, scale, extrinsics = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        model_h, model_w = pred_data.depth.shape[1], pred_data.depth.shape[2]
+        depths_out = []
+        intrinsics_out = []
+        for i in range(len(pred_data.depth)):
+            img_idx = image_indices[i]
+            # Get original image size (after undistort+crop, before resize to input_h/w)
+            orig_h, orig_w = full_gt_data.aux.cam_hw_list[img_idx]
+            # Step 1: nearest resize to original image size
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Step 2: linear resize to target resolution
+            depth = cv2.resize(
+                depth,
+                (self.input_w, self.input_h),
+                interpolation=cv2.INTER_LINEAR,
+            ).astype(np.float32)
+            # Load GT depth for masking
+            gt_zero_mask = self._load_gt_mask(full_gt_data.aux.gt_depth_files[img_idx])
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            # Adjust intrinsics to target resolution
+            h_ratio = self.input_h / model_h
+            w_ratio = self.input_w / model_w
+            ixt = pred_data.intrinsics[i].copy()
+            ixt[0, :] *= w_ratio
+            ixt[1, :] *= h_ratio
+            depths_out.append(depth)
+            intrinsics_out.append(ixt)
+        return np.stack(depths_out), np.stack(intrinsics_out), extrinsics
+    def _prep_posed(
+        self, pred_data: Dict, gt_data: Dict, full_gt_data: Dict,
+        image_indices: list, scene: str = None
+    ) -> tuple:
+        """Prepare depths/intrinsics/extrinsics for recon_posed mode."""
+        # Scale alignment
+        _, _, scale, _ = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        depths_out = []
+        intrinsics_out = []
+        extrinsics_out = []
+        for i in range(len(pred_data.depth)):
+            img_idx = image_indices[i]
+            # Get original image size (after undistort+crop, before resize to input_h/w)
+            orig_h, orig_w = full_gt_data.aux.cam_hw_list[img_idx]
+            # Step 1: nearest resize to original image size
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Step 2: linear resize to target resolution
+            depth = cv2.resize(
+                depth,
+                (self.input_w, self.input_h),
+                interpolation=cv2.INTER_LINEAR,
+            ).astype(np.float32)
+            # Load GT depth for masking
+            gt_zero_mask = self._load_gt_mask(full_gt_data.aux.gt_depth_files[img_idx])
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            depths_out.append(depth)
+            # Get GT intrinsics and scale to target resolution
+            ixt = full_gt_data.intrinsics[img_idx].copy()
+            cam_h, cam_w = full_gt_data.aux.cam_hw_list[img_idx]
+            ixt[:2, 2] += 0.5  # Undo COLMAP convention
+            ixt[0, :] *= self.input_w / cam_w
+            ixt[1, :] *= self.input_h / cam_h
+            intrinsics_out.append(ixt)
+            extrinsics_out.append(full_gt_data.extrinsics[img_idx])
+        return np.stack(depths_out), np.stack(intrinsics_out), np.stack(extrinsics_out)
+    def _load_gt_mask(self, gt_depth_path: str) -> np.ndarray:
+        """
+        Load GT depth and create valid mask.
+        For ScanNet++, GT depth is stored as 16-bit PNG in millimeters.
+        Returns:
+            Boolean mask where True = valid region to keep
+        """
+        if not os.path.exists(gt_depth_path):
+            return None
+        gt_depth = imageio.imread(gt_depth_path) / 1000.0  # mm to meters
+        # Resize to target resolution
+        gt_depth = cv2.resize(
+            gt_depth,
+            (self.input_w, self.input_h),
+            interpolation=cv2.INTER_LINEAR,
+        ).astype(np.float32)
+        # Valid mask: depth > 0 and not inf
+        valid_mask = np.logical_and(gt_depth > 0, gt_depth != np.inf)
+        return valid_mask
+    def _mask_invalid_depth(
+        self, depth: np.ndarray, gt_zero_mask: np.ndarray = None
+    ) -> np.ndarray:
+        """Mask invalid depth values by setting them to 0."""
+        depth = depth.copy()
+        if gt_zero_mask is not None:
+            pred_invalid = np.isnan(depth) | np.isinf(depth)
+            combined_mask = np.logical_and(gt_zero_mask, np.logical_not(pred_invalid))
+            depth = depth * combined_mask.astype(np.float32)
+        else:
+            invalid_mask = np.isnan(depth) | np.isinf(depth) | (depth <= 0)
+            depth[invalid_mask] = 0.0
+        return depth

Depth-Anything-3/src/depth_anything_3/bench/datasets/sevenscenes.py ADDED Viewed

	@@ -0,0 +1,449 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+7Scenes Benchmark dataset implementation.
+7Scenes is an indoor RGB-D dataset with ground truth camera poses and 3D meshes.
+Reference: https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/
+Evaluation metrics:
+- 3D reconstruction: Accuracy, Completeness, F-score
+- Camera pose estimation: AUC metrics
+"""
+import os
+from typing import Dict as TDict
+import cv2
+import numpy as np
+import open3d as o3d
+from addict import Dict
+from depth_anything_3.bench.dataset import Dataset, _wait_for_file_ready
+from depth_anything_3.bench.registries import MONO_REGISTRY, MV_REGISTRY
+from depth_anything_3.bench.utils import (
+    create_tsdf_volume,
+    evaluate_3d_reconstruction,
+    fuse_depth_to_tsdf,
+    sample_points_from_mesh,
+)
+from depth_anything_3.utils.constants import (
+    SEVENSCENES_CX,
+    SEVENSCENES_CY,
+    SEVENSCENES_DOWN_SAMPLE,
+    SEVENSCENES_EVAL_DATA_ROOT,
+    SEVENSCENES_EVAL_THRESHOLD,
+    SEVENSCENES_FX,
+    SEVENSCENES_FY,
+    SEVENSCENES_MAX_DEPTH,
+    SEVENSCENES_SAMPLING_NUMBER,
+    SEVENSCENES_SCENES,
+    SEVENSCENES_SDF_TRUNC,
+    SEVENSCENES_VOXEL_LENGTH,
+)
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+@MV_REGISTRY.register(name="7scenes")
+@MONO_REGISTRY.register(name="7scenes")
+class SevenScenes(Dataset):
+    """
+    7Scenes Benchmark dataset wrapper for DepthAnything3 evaluation.
+    Supports:
+        - Camera pose estimation evaluation (AUC metrics)
+        - 3D reconstruction evaluation (Accuracy, Completeness, F-score)
+        - TSDF-based point cloud fusion
+    Dataset structure:
+        7scenes/
+        ├── 7Scenes/
+        │   ├── {scene}/
+        │   │   └── seq-01/  (or seq-02 for stairs)
+        │   │       ├── frame-XXXXXX.color.png
+        │   │       ├── frame-XXXXXX.depth.png
+        │   │       └── frame-XXXXXX.pose.txt
+        │   └── meshes/
+        │       └── {scene}.ply  # Ground truth mesh
+    """
+    data_root = SEVENSCENES_EVAL_DATA_ROOT
+    SCENES = SEVENSCENES_SCENES
+    # Evaluation hyperparameters from constants
+    max_depth = SEVENSCENES_MAX_DEPTH
+    sampling_number = SEVENSCENES_SAMPLING_NUMBER
+    voxel_length = SEVENSCENES_VOXEL_LENGTH
+    sdf_trunc = SEVENSCENES_SDF_TRUNC
+    eval_threshold = SEVENSCENES_EVAL_THRESHOLD
+    down_sample = SEVENSCENES_DOWN_SAMPLE
+    # Fixed camera intrinsics for all 7Scenes images
+    fx = SEVENSCENES_FX
+    fy = SEVENSCENES_FY
+    cx = SEVENSCENES_CX
+    cy = SEVENSCENES_CY
+    def __init__(self):
+        super().__init__()
+        self._scene_cache = {}
+    # ------------------------------
+    # Public API
+    # ------------------------------
+    def get_data(self, scene: str) -> Dict:
+        """
+        Collect per-view image paths, intrinsics/extrinsics for a scene.
+        Args:
+            scene: Scene identifier (e.g., "chess")
+        Returns:
+            Dict with:
+                - image_files: List[str] - paths to images
+                - extrinsics: np.ndarray [N, 4, 4] - world-to-camera transforms
+                - intrinsics: np.ndarray [N, 3, 3] - camera intrinsics
+                - aux: Dict with gt_mesh_path, gt_depth_files
+        """
+        if scene in self._scene_cache:
+            return self._scene_cache[scene]
+        # Different sequence for stairs scene
+        if scene == "stairs":
+            data_folder = os.path.join(self.data_root, "7Scenes", scene, "seq-02")
+            n_imgs = 500
+        else:
+            data_folder = os.path.join(self.data_root, "7Scenes", scene, "seq-01")
+            n_imgs = 1000
+        gt_mesh_path = os.path.join(self.data_root, "7Scenes", "meshes", f"{scene}.ply")
+        # Fixed intrinsics for all images
+        ixt = np.array([
+            [self.fx, 0, self.cx],
+            [0, self.fy, self.cy],
+            [0, 0, 1],
+        ], dtype=np.float32)
+        out = Dict({
+            "image_files": [],
+            "extrinsics": [],
+            "intrinsics": [],
+            "aux": Dict({
+                "gt_mesh_path": gt_mesh_path,
+                "gt_depth_files": [],
+            }),
+        })
+        for i in range(0, n_imgs, 1):
+            img_path = os.path.join(data_folder, f"frame-{i:06d}.color.png")
+            pose_path = os.path.join(data_folder, f"frame-{i:06d}.pose.txt")
+            depth_path = os.path.join(data_folder, f"frame-{i:06d}.depth.png")
+            if not os.path.exists(img_path) or not os.path.exists(pose_path):
+                continue
+            # Load camera-to-world pose and convert to world-to-camera (extrinsic)
+            c2w = np.loadtxt(pose_path)
+            ext = np.linalg.inv(c2w).astype(np.float32)
+            out.image_files.append(img_path)
+            out.extrinsics.append(ext)
+            out.intrinsics.append(ixt.copy())
+            out.aux.gt_depth_files.append(depth_path)
+        out.extrinsics = np.asarray(out.extrinsics, dtype=np.float32)
+        out.intrinsics = np.asarray(out.intrinsics, dtype=np.float32)
+        print(f"[7Scenes] {scene}: {len(out.image_files)} images")
+        self._scene_cache[scene] = out
+        return out
+    def eval3d(self, scene: str, fuse_path: str) -> TDict[str, float]:
+        """
+        Evaluate fused point cloud against 7Scenes ground truth mesh.
+        Args:
+            scene: Scene identifier
+            fuse_path: Path to fused point cloud (.ply)
+        Returns:
+            Dict with metrics: acc, comp, overall, precision, recall, fscore
+        """
+        gt_data = self.get_data(scene)
+        gt_mesh_path = gt_data.aux.gt_mesh_path
+        # Load and sample ground truth mesh
+        gt_mesh = o3d.io.read_triangle_mesh(gt_mesh_path)
+        gt_pcd = sample_points_from_mesh(gt_mesh, self.sampling_number)
+        # Load predicted point cloud
+        pred_pcd = o3d.io.read_point_cloud(fuse_path)
+        # Evaluate using shared utility function
+        metrics = evaluate_3d_reconstruction(
+            pred_pcd,
+            gt_pcd,
+            threshold=self.eval_threshold,
+            down_sample=self.down_sample,
+        )
+        return metrics
+    def _load_gt_meta(self, result_path: str) -> Dict:
+        """
+        Load saved GT meta (extrinsics, intrinsics, image_files) for fusion.
+        This is needed when frames are sampled, so fuse3d uses the correct
+        (sampled) GT instead of full dataset GT.
+        Args:
+            result_path: Path to npz file (used to derive gt_meta.npz path)
+        Returns:
+            Dict with GT data, or None if gt_meta.npz doesn't exist
+        """
+        export_dir = os.path.dirname(result_path)  # exports/mini_npz/
+        gt_meta_path = os.path.join(os.path.dirname(export_dir), "gt_meta.npz")
+        if os.path.exists(gt_meta_path):
+            data = np.load(gt_meta_path, allow_pickle=True)
+            # Build aux with gt_depth_files derived from image_files
+            image_files = list(data["image_files"])
+            gt_depth_files = [
+                img_path.replace("color", "depth").replace(".color.", ".depth.")
+                for img_path in image_files
+            ]
+            return Dict({
+                "extrinsics": data["extrinsics"],
+                "intrinsics": data["intrinsics"],
+                "image_files": image_files,
+                "aux": Dict({"gt_depth_files": gt_depth_files}),
+            })
+        return None
+    def fuse3d(self, scene: str, result_path: str, fuse_path: str, mode: str) -> None:
+        """
+        Fuse per-view depths into a point cloud using TSDF fusion.
+        Args:
+            scene: Scene identifier
+            result_path: Path to npz file with predicted depths/poses
+            fuse_path: Output path for fused point cloud (.ply)
+            mode: "recon_unposed" or "recon_posed"
+        """
+        # Try to load saved GT meta (handles frame sampling)
+        gt_meta = self._load_gt_meta(result_path)
+        if gt_meta is not None:
+            gt_data = gt_meta
+        else:
+            gt_data = self.get_data(scene)
+        _wait_for_file_ready(result_path)
+        pred_data = Dict({k: v for k, v in np.load(result_path).items()})
+        # Load original images (keep original size)
+        images = []
+        orig_sizes = []
+        for img_path in gt_data.image_files:
+            img = cv2.imread(img_path)
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            images.append(img)
+            orig_sizes.append((img.shape[0], img.shape[1]))
+        # Prepare depths, intrinsics, extrinsics
+        if mode == "recon_unposed":
+            depths, intrinsics, extrinsics = self._prep_unposed(
+                pred_data, gt_data, orig_sizes, scene=scene
+            )
+        elif mode == "recon_posed":
+            depths, intrinsics, extrinsics = self._prep_posed(
+                pred_data, gt_data, orig_sizes, scene=scene
+            )
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+        images = np.stack(images, axis=0)
+        # Create TSDF volume and fuse
+        volume = create_tsdf_volume(
+            voxel_length=self.voxel_length,
+            sdf_trunc=self.sdf_trunc,
+        )
+        mesh = fuse_depth_to_tsdf(
+            volume, depths, images, intrinsics, extrinsics, max_depth=self.max_depth
+        )
+        # Sample points from mesh
+        pcd = sample_points_from_mesh(mesh, self.sampling_number)
+        # Save point cloud
+        os.makedirs(os.path.dirname(fuse_path), exist_ok=True)
+        o3d.io.write_point_cloud(fuse_path, pcd)
+    # ------------------------------
+    # Private helpers
+    # ------------------------------
+    def _prep_unposed(
+        self, pred_data: Dict, gt_data: Dict, orig_sizes: list, scene: str
+    ) -> tuple:
+        """
+        Prepare depths/intrinsics/extrinsics for recon_unposed mode.
+        Similar to ETH3D but uses GT depth for masking instead of separate mask files.
+        """
+        # Scale alignment with fixed random_state for reproducibility
+        _, _, scale, extrinsics = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        model_h, model_w = pred_data.depth.shape[1], pred_data.depth.shape[2]
+        depths_out = []
+        intrinsics_out = []
+        for i in range(len(pred_data.depth)):
+            orig_h, orig_w = orig_sizes[i]
+            # Resize depth to original image size (nearest interpolation)
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Load GT depth for masking
+            gt_zero_mask = self._load_gt_mask(gt_data.aux.gt_depth_files[i])
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            # Adjust intrinsics to original image size
+            h_ratio = orig_h / model_h
+            w_ratio = orig_w / model_w
+            ixt = pred_data.intrinsics[i].copy()
+            ixt[0, :] *= w_ratio
+            ixt[1, :] *= h_ratio
+            depths_out.append(depth)
+            intrinsics_out.append(ixt)
+        return np.stack(depths_out), np.stack(intrinsics_out), extrinsics
+    def _prep_posed(
+        self, pred_data: Dict, gt_data: Dict, orig_sizes: list, scene: str
+    ) -> tuple:
+        """
+        Prepare depths/intrinsics/extrinsics for recon_posed mode.
+        Uses GT intrinsics/extrinsics but aligns depth scale via Umeyama.
+        """
+        # Scale alignment with fixed random_state
+        _, _, scale, _ = align_poses_umeyama(
+            gt_data.extrinsics.copy(),
+            pred_data.extrinsics.copy(),
+            return_aligned=True,
+            ransac=True,
+            random_state=42,
+        )
+        model_h, model_w = pred_data.depth.shape[1], pred_data.depth.shape[2]
+        depths_out = []
+        for i in range(len(pred_data.depth)):
+            orig_h, orig_w = orig_sizes[i]
+            # Resize depth to original image size
+            depth = cv2.resize(
+                pred_data.depth[i],
+                (orig_w, orig_h),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            # Load GT depth for masking
+            gt_zero_mask = self._load_gt_mask(gt_data.aux.gt_depth_files[i])
+            # Mask invalid depths BEFORE scale
+            depth = self._mask_invalid_depth(depth, gt_zero_mask)
+            # Apply scale AFTER mask
+            depth = depth * scale
+            depths_out.append(depth)
+        # Use GT intrinsics and extrinsics
+        return np.stack(depths_out), gt_data.intrinsics.copy(), gt_data.extrinsics.copy()
+    def _load_gt_mask(self, gt_depth_path: str) -> np.ndarray:
+        """
+        Load GT depth and create valid mask.
+        For 7Scenes, GT depth is stored as 16-bit PNG in millimeters.
+        Value 65535 indicates invalid depth.
+        Returns:
+            Boolean mask where True = valid region to keep
+        """
+        if not os.path.exists(gt_depth_path):
+            return None
+        gt_depth = cv2.imread(gt_depth_path, -1)
+        if gt_depth is None:
+            return None
+        # 65535 is invalid depth marker in 7Scenes
+        gt_depth[gt_depth == 65535] = 0
+        # Convert to meters
+        gt_depth = gt_depth / 1000.0
+        # Valid mask: depth > 0
+        valid_mask = gt_depth > 0
+        return valid_mask
+    def _mask_invalid_depth(
+        self, depth: np.ndarray, gt_zero_mask: np.ndarray = None
+    ) -> np.ndarray:
+        """
+        Mask invalid depth values by setting them to 0.
+        Args:
+            depth: Depth map to mask
+            gt_zero_mask: Optional GT mask (True = valid region)
+        Returns:
+            Masked depth map with invalid regions set to 0
+        """
+        depth = depth.copy()
+        if gt_zero_mask is not None:
+            # Also mask out invalid pred depth
+            pred_invalid = np.isnan(depth) | np.isinf(depth)
+            combined_mask = np.logical_and(gt_zero_mask, np.logical_not(pred_invalid))
+            depth = depth * combined_mask.astype(np.float32)
+        else:
+            # Fallback: only mask pred invalid values
+            invalid_mask = np.isnan(depth) | np.isinf(depth) | (depth <= 0)
+            depth[invalid_mask] = 0.0
+        return depth

Depth-Anything-3/src/depth_anything_3/bench/evaluator.py ADDED Viewed

	@@ -0,0 +1,752 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Main Evaluator class for DepthAnything3 benchmark evaluation.
+Supports multiple datasets and evaluation modes:
+- pose: Camera pose estimation (AUC metrics)
+- recon_unposed: 3D reconstruction with predicted poses
+- recon_posed: 3D reconstruction with GT poses
+- view_syn: Novel view synthesis (TODO)
+"""
+import json
+import os
+import random
+from typing import Dict as TDict, Iterable, List
+import numpy as np
+import torch
+from addict import Dict
+from tqdm import tqdm
+from depth_anything_3.bench.print_metrics import MetricsPrinter
+from depth_anything_3.utils.parallel_utils import parallel_execution
+from depth_anything_3.bench.registries import MV_REGISTRY
+from depth_anything_3.utils.constants import EVAL_REF_VIEW_STRATEGY
+class Evaluator:
+    """
+    Main evaluation orchestrator for DepthAnything3 benchmarks.
+    Usage:
+        evaluator = Evaluator(
+            work_dir="./eval_workspace",
+            datas=["dtu"],
+            modes=["pose", "recon_unposed", "recon_posed"],
+        )
+        api = DepthAnything3.from_pretrained("...")
+        evaluator.infer(api)
+        metrics = evaluator.eval()
+        evaluator.print_metrics()
+    """
+    VALID_MODES = {"pose", "recon_unposed", "recon_posed", "view_syn"}
+    def __init__(
+        self,
+        work_dir: str = "./eval_workspace",
+        datas: List[str] = ("dtu",),
+        modes: List[str] = ("recon_unposed",),
+        ref_view_strategy: str = EVAL_REF_VIEW_STRATEGY,
+        scenes: List[str] = None,
+        debug: bool = False,
+        num_fusion_workers: int = 4,
+        max_frames: int = 100,
+        gpu_id: int = 0,
+        total_gpus: int = 1,
+    ):
+        """
+        Initialize the evaluator.
+        Args:
+            work_dir: Base directory for model outputs and metric files
+            datas: List of dataset names (must be registered in MV_REGISTRY)
+            modes: List of evaluation modes to run
+            ref_view_strategy: Reference view selection strategy for inference
+                               ("first", "saddle_balanced", etc.)
+            scenes: Specific scenes to evaluate (None = all scenes)
+            debug: Enable verbose debug output
+            num_fusion_workers: Number of parallel workers for TSDF fusion (default: 4)
+            max_frames: Maximum number of frames per scene (default: 100).
+                        If a scene has more frames, randomly sample to this limit.
+                        Set to -1 to disable sampling.
+            gpu_id: GPU index for multi-GPU (0-indexed)
+            total_gpus: Total number of GPUs for task distribution
+        """
+        self.work_dir = work_dir
+        self.datas = list(datas)
+        self.modes = set(modes)
+        self.ref_view_strategy = ref_view_strategy
+        self.scenes_filter = scenes
+        self.debug = debug
+        self.num_fusion_workers = num_fusion_workers
+        self.max_frames = max_frames
+        self.gpu_id = gpu_id
+        self.total_gpus = total_gpus
+        # Validate modes
+        unknown = self.modes - self.VALID_MODES
+        if unknown:
+            raise ValueError(f"Unknown modes: {unknown}. Valid: {sorted(self.VALID_MODES)}")
+        os.makedirs(self.work_dir, exist_ok=True)
+        # Initialize datasets
+        self.datasets = Dict()
+        for data in self.datas:
+            if not MV_REGISTRY.has(data):
+                available = list(MV_REGISTRY.all().keys())
+                raise ValueError(f"Dataset '{data}' not found. Available: {available}")
+            self.datasets[data] = MV_REGISTRY.get(data)()
+        # Initialize metrics printer
+        self._printer = MetricsPrinter()
+    # -------------------- Public APIs -------------------- #
+    def all(self, api) -> TDict[str, dict]:
+        """
+        Run complete evaluation pipeline: inference + evaluation.
+        Args:
+            api: DepthAnything3 API instance
+        Returns:
+            Combined metrics dictionary
+        """
+        self.infer(api)
+        return self.eval()
+    def _get_scenes(self, dataset) -> List[str]:
+        """Get list of scenes to evaluate, optionally filtered."""
+        all_scenes = dataset.SCENES
+        if self.scenes_filter:
+            scenes = [s for s in all_scenes if s in self.scenes_filter]
+            if self.debug:
+                print(f"[DEBUG] Filtered scenes: {scenes} (from {len(all_scenes)} total)")
+            return scenes
+        return all_scenes
+    def infer(self, api, model_path: str = None) -> None:
+        """
+        Run inference according to requested modes.
+        - Unposed export if 'pose' or 'recon_unposed' is in modes
+        - Posed export if 'recon_posed' or 'view_syn' is in modes
+        Multi-GPU: Use --gpu_id and --total_gpus to distribute tasks.
+        Example: Launch 4 processes with gpu_id=0,1,2,3 and total_gpus=4
+        Args:
+            api: DepthAnything3 API instance
+            model_path: Model path (unused, kept for API compatibility)
+        """
+        need_unposed = {"pose", "recon_unposed"} & self.modes
+        need_posed = {"recon_posed", "view_syn"} & self.modes
+        export_format = "mini_npz-glb" if self.debug else "mini_npz"
+        # Collect all tasks
+        all_tasks = []
+        for data in self.datas:
+            dataset = self.datasets[data]
+            for scene in self._get_scenes(dataset):
+                all_tasks.append((data, scene))
+        # Distribute tasks across GPUs
+        if self.total_gpus > 1:
+            tasks = [t for i, t in enumerate(all_tasks) if i % self.total_gpus == self.gpu_id]
+            print(f"[INFO] GPU {self.gpu_id}/{self.total_gpus}: {len(tasks)}/{len(all_tasks)} tasks")
+        else:
+            tasks = all_tasks
+            print(f"[INFO] Total inference tasks: {len(tasks)}")
+        for data, scene in tqdm(tasks, desc=f"Inference (GPU {self.gpu_id})"):
+            dataset = self.datasets[data]
+            scene_data = dataset.get_data(scene)
+            scene_data = self._sample_frames(scene_data, scene)
+            if need_unposed:
+                export_dir = self._export_dir(data, scene, posed=False)
+                api.inference(
+                    scene_data.image_files,
+                    export_dir=export_dir,
+                    export_format=export_format,
+                    ref_view_strategy=self.ref_view_strategy,
+                )
+                self._save_gt_meta(export_dir, scene_data)
+            if need_posed:
+                export_dir = self._export_dir(data, scene, posed=True)
+                api.inference(
+                    scene_data.image_files,
+                    scene_data.extrinsics,
+                    scene_data.intrinsics,
+                    export_dir=export_dir,
+                    export_format=export_format,
+                    ref_view_strategy=self.ref_view_strategy,
+                )
+                self._save_gt_meta(export_dir, scene_data)
+    def eval(self) -> TDict[str, dict]:
+        """
+        Evaluate for all configured modes and write JSON files.
+        Evaluation order by mode (all datasets per mode):
+        1. pose - all datasets
+        2. recon_unposed - all datasets
+        3. recon_posed - all datasets
+        Returns:
+            Summary mapping: {"<data>_<mode>": metrics_dict}
+        """
+        summary: TDict[str, dict] = {}
+        # Evaluate by mode (all datasets per mode)
+        if "pose" in self.modes:
+            print(f"\n{'='*60}")
+            print(f"📊 Evaluating POSE for all datasets...")
+            print(f"{'='*60}")
+            for data, result in self._eval_pose():
+                summary[f"{data}_pose"] = result
+        if "recon_unposed" in self.modes:
+            print(f"\n{'='*60}")
+            print(f"📊 Evaluating RECON_UNPOSED for all datasets...")
+            print(f"{'='*60}")
+            for data, result in self._eval_reconstruction("recon_unposed"):
+                summary[f"{data}_recon_unposed"] = result
+        if "recon_posed" in self.modes:
+            print(f"\n{'='*60}")
+            print(f"📊 Evaluating RECON_POSED for all datasets...")
+            print(f"{'='*60}")
+            for data, result in self._eval_reconstruction("recon_posed"):
+                summary[f"{data}_recon_posed"] = result
+        if "view_syn" in self.modes:
+            # TODO: Add view synthesis metrics here when available
+            pass
+        return summary
+    def print_metrics(self, metrics: TDict[str, dict] = None) -> None:
+        """
+        Print evaluation metrics in a beautiful tabular format.
+        Args:
+            metrics: Metrics dictionary. If None, loads from saved JSON files.
+        """
+        if metrics is None:
+            metrics = self._load_metrics()
+        self._printer.print_results(metrics)
+    # -------------------- Evaluation Methods -------------------- #
+    def _eval_pose(self) -> Iterable[tuple]:
+        """Compute pose-estimation metrics for each dataset and scene."""
+        os.makedirs(self._metric_dir, exist_ok=True)
+        for data in tqdm(self.datas, desc="Datasets (pose eval)"):
+            dataset = self.datasets[data]
+            dataset_results = Dict()
+            scenes = self._get_scenes(dataset)
+            for scene in tqdm(scenes, desc=f"{data} scenes", leave=False):
+                export_dir = self._export_dir(data, scene, posed=False)
+                result_path = os.path.join(export_dir, "exports", "mini_npz", "results.npz")
+                # Check if result file exists and is valid
+                if not os.path.exists(result_path):
+                    print(f"\n[ERROR] Result file not found: {result_path}")
+                    print(f"[ERROR] CWD: {os.getcwd()}")
+                    print(f"[ERROR] Please run inference first (remove --eval_only)")
+                    continue
+                try:
+                    # Use saved GT meta (handles frame sampling correctly)
+                    gt_meta = self._load_gt_meta(export_dir)
+                    if gt_meta is not None:
+                        result = self._compute_pose_with_gt(result_path, gt_meta)
+                    else:
+                        # Fallback to dataset GT (no sampling was done)
+                        result = dataset.eval_pose(scene, result_path)
+                    dataset_results[scene] = self._to_float_dict(result)
+                except Exception as e:
+                    print(f"\n[ERROR] Failed to evaluate pose for {data}/{scene}: {e}")
+                    print(f"[ERROR] File path: {os.path.abspath(result_path)}")
+                    if self.debug:
+                        import traceback
+                        traceback.print_exc()
+                    continue
+            if not dataset_results:
+                print(f"[WARNING] No valid results for {data}")
+                continue
+            dataset_results["mean"] = self._mean_of_dicts(dataset_results.values())
+            out_path = os.path.join(self._metric_dir, f"{data}_pose.json")
+            self._dump_json(out_path, dataset_results)
+            yield data, dataset_results
+    def _eval_reconstruction(self, mode: str) -> Iterable[tuple]:
+        """
+        Compute reconstruction metrics for each dataset and scene.
+        Args:
+            mode: "recon_unposed" or "recon_posed"
+        """
+        assert mode in {"recon_unposed", "recon_posed"}
+        os.makedirs(self._metric_dir, exist_ok=True)
+        posed_flag = mode == "recon_posed"
+        # Filter out datasets that don't support reconstruction (e.g., dtu64)
+        recon_datas = [d for d in self.datas if d != "dtu64"]
+        for data in tqdm(recon_datas, desc=f"Datasets ({mode} eval)"):
+            dataset = self.datasets[data]
+            dataset_results = Dict()
+            scenes = self._get_scenes(dataset)
+            # Prepare paths for all scenes
+            scene_list = []
+            result_paths = []
+            fuse_paths = []
+            for scene in scenes:
+                export_dir = self._export_dir(data, scene, posed=posed_flag)
+                result_path = os.path.join(export_dir, "exports", "mini_npz", "results.npz")
+                fuse_path = os.path.join(export_dir, "exports", "fuse", "pcd.ply")
+                scene_list.append(scene)
+                result_paths.append(result_path)
+                fuse_paths.append(fuse_path)
+            # Parallel fusion (default 4 workers)
+            # DTU uses CUDA operations in fusion, which doesn't work well with ThreadPool
+            use_sequential = (data == "dtu")
+            parallel_execution(
+                scene_list,
+                result_paths,
+                fuse_paths,
+                action=lambda s, rp, fp: dataset.fuse3d(s, rp, fp, mode),
+                num_processes=self.num_fusion_workers,
+                print_progress=True,
+                desc=f"{data} fusion",
+                sequential=use_sequential,
+            )
+            # Sequential evaluation (fast, no need to parallelize)
+            for scene, fuse_path in zip(scene_list, fuse_paths):
+                # DTU supports CPU-based evaluation
+                if data == "dtu" and hasattr(dataset, "eval3d"):
+                    result = dataset.eval3d(scene, fuse_path)
+                else:
+                    result = dataset.eval3d(scene, fuse_path)
+                dataset_results[scene] = self._to_float_dict(result)
+                print(f"  {mode} | {data} | {scene}: {result}")
+            dataset_results["mean"] = self._mean_of_dicts(dataset_results.values())
+            out_path = os.path.join(self._metric_dir, f"{data}_{mode}.json")
+            self._dump_json(out_path, dataset_results)
+            yield data, dataset_results
+    # -------------------- Helpers -------------------- #
+    def _save_gt_meta(self, export_dir: str, scene_data: Dict) -> None:
+        """
+        Save GT extrinsics/intrinsics/image_files for evaluation.
+        This is needed when frames are sampled, so eval_pose and fuse3d can use
+        the correct (sampled) GT instead of full dataset GT.
+        Args:
+            export_dir: Export directory for the scene
+            scene_data: Sampled scene data
+        """
+        meta_path = os.path.join(export_dir, "exports", "gt_meta.npz")
+        os.makedirs(os.path.dirname(meta_path), exist_ok=True)
+        np.savez_compressed(
+            meta_path,
+            extrinsics=scene_data.extrinsics,
+            intrinsics=scene_data.intrinsics,
+            image_files=np.array(scene_data.image_files, dtype=object),
+        )
+    def _load_gt_meta(self, export_dir: str) -> Dict:
+        """
+        Load saved GT extrinsics/intrinsics for evaluation.
+        Returns:
+            Dict with extrinsics and intrinsics, or None if not found
+        """
+        meta_path = os.path.join(export_dir, "exports", "gt_meta.npz")
+        if os.path.exists(meta_path):
+            data = np.load(meta_path)
+            return Dict({
+                "extrinsics": data["extrinsics"],
+                "intrinsics": data["intrinsics"],
+            })
+        return None
+    def _compute_pose_with_gt(self, result_path: str, gt_meta: Dict) -> TDict[str, float]:
+        """
+        Compute pose metrics using saved GT meta (handles frame sampling).
+        Args:
+            result_path: Path to npz with predicted extrinsics
+            gt_meta: Dict with GT extrinsics from saved meta
+        Returns:
+            Dict with pose metrics
+        """
+        from depth_anything_3.bench.dataset import _wait_for_file_ready
+        from depth_anything_3.bench.utils import compute_pose
+        from depth_anything_3.utils.geometry import as_homogeneous
+        _wait_for_file_ready(result_path)
+        pred = np.load(result_path)
+        return compute_pose(
+            torch.from_numpy(as_homogeneous(pred["extrinsics"])),
+            torch.from_numpy(as_homogeneous(gt_meta["extrinsics"])),
+        )
+    def _sample_frames(self, scene_data: Dict, scene: str) -> Dict:
+        """
+        Sample frames if scene has more than max_frames.
+        Uses fixed random seed (42) for reproducibility.
+        Args:
+            scene_data: Scene data dict with image_files, extrinsics, intrinsics, aux
+            scene: Scene name (for logging)
+        Returns:
+            Sampled scene_data if num_frames > max_frames, otherwise original
+        """
+        if self.max_frames <= 0:
+            return scene_data
+        num_frames = len(scene_data.image_files)
+        if num_frames <= self.max_frames:
+            return scene_data
+        # Sample with fixed seed for reproducibility
+        random.seed(42)
+        indices = list(range(num_frames))
+        random.shuffle(indices)
+        sampled_indices = sorted(indices[:self.max_frames])
+        print(f"  [Sampling] {scene}: {num_frames} -> {self.max_frames} frames")
+        # Create new scene_data with sampled frames
+        sampled = Dict()
+        sampled.image_files = [scene_data.image_files[i] for i in sampled_indices]
+        sampled.extrinsics = scene_data.extrinsics[sampled_indices]
+        sampled.intrinsics = scene_data.intrinsics[sampled_indices]
+        # Copy aux data, sampling lists if needed
+        sampled.aux = Dict()
+        for key, val in scene_data.aux.items():
+            if isinstance(val, list) and len(val) == num_frames:
+                sampled.aux[key] = [val[i] for i in sampled_indices]
+            elif isinstance(val, np.ndarray) and len(val) == num_frames:
+                sampled.aux[key] = val[sampled_indices]
+            else:
+                sampled.aux[key] = val
+        return sampled
+    @property
+    def _metric_dir(self) -> str:
+        """Directory for storing metric JSON files."""
+        return os.path.join(self.work_dir, "metric_results")
+    def _export_dir(self, data: str, scene: str, posed: bool) -> str:
+        """
+        Get export directory path.
+        Structure: .../model_results/{data}/{scene}/{posed|unposed}
+        """
+        suffix = "posed" if posed else "unposed"
+        export_dir = os.path.join(self.work_dir, "model_results", data, scene, suffix)
+        os.makedirs(export_dir, exist_ok=True)
+        return export_dir
+    @staticmethod
+    def _to_float_dict(d: TDict[str, float]) -> dict:
+        """Convert numpy scalars to plain Python floats for JSON safety."""
+        return {k: float(v) for k, v in d.items()}
+    @staticmethod
+    def _mean_of_dicts(dicts: Iterable[dict]) -> dict:
+        """Compute elementwise mean across a list of homogeneous metric dicts."""
+        dicts = list(dicts)
+        if not dicts:
+            return {}
+        keys = dicts[0].keys()
+        return {k: float(np.mean([d[k] for d in dicts]).item()) for k in keys}
+    @staticmethod
+    def _dump_json(path: str, obj: dict, indent: int = 4) -> None:
+        """Write JSON with UTF-8 and pretty indentation."""
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(obj, f, indent=indent, ensure_ascii=False)
+    def _load_metrics(self) -> TDict[str, dict]:
+        """Load evaluation metrics from JSON files."""
+        metrics = {}
+        metric_dir = self._metric_dir
+        if not os.path.exists(metric_dir):
+            return metrics
+        for filename in os.listdir(metric_dir):
+            if filename.endswith(".json"):
+                filepath = os.path.join(metric_dir, filename)
+                try:
+                    with open(filepath, encoding="utf-8") as f:
+                        data = json.load(f)
+                    key = filename[:-5]  # Remove .json extension
+                    metrics[key] = data
+                except Exception as e:
+                    print(f"Warning: Failed to read metrics file: {filename} - {e}")
+        return metrics
+# -------------------- CLI Entry Point -------------------- #
+if __name__ == "__main__":
+    import sys
+    from omegaconf import OmegaConf
+    from depth_anything_3.cfg import load_config
+    # Get default config path (relative to this file)
+    _default_config = os.path.join(
+        os.path.dirname(__file__), "configs", "eval_bench.yaml"
+    )
+    # Check for help flag first (we need to handle this before OmegaConf)
+    if "--help" in sys.argv or "-h" in sys.argv:
+        pass  # Will handle after config loading
+    # Set up argv for OmegaConf processing
+    argv = sys.argv[1:]
+    # Check if user provides custom config
+    config_path = _default_config
+    if "--config" in argv:
+        config_idx = argv.index("--config")
+        if config_idx + 1 < len(argv):
+            config_path = argv[config_idx + 1]
+            # Remove --config and its value
+            argv = argv[:config_idx] + argv[config_idx + 2:]
+    # Print help if requested
+    if "--help" in sys.argv or "-h" in sys.argv:
+        print("""
+DepthAnything3 Benchmark Evaluation
+Usage:
+  python -m depth_anything_3.bench.evaluator [OPTIONS] [KEY=VALUE ...]
+Configuration:
+  --config PATH                      Config YAML file (default: bench/configs/eval_bench.yaml)
+Config Overrides (using dotlist notation):
+  model.path=VALUE                   Model path or HuggingFace ID
+  workspace.work_dir=VALUE           Working directory for outputs
+  eval.datasets=[dataset1,dataset2]  Datasets to evaluate (eth3d,7scenes,scannetpp,hiroom,dtu,dtu64)
+  eval.modes=[mode1,mode2]           Evaluation modes (pose,recon_unposed,recon_posed)
+  eval.scenes=[scene1,scene2]        Specific scenes to evaluate (null=all)
+  eval.max_frames=VALUE              Max frames per scene (-1=no limit, default: 100)
+  eval.ref_view_strategy=VALUE       Reference view strategy (default: first)
+  eval.eval_only=VALUE               Only run evaluation (skip inference) (true/false)
+  eval.print_only=VALUE              Only print saved metrics (true/false)
+  inference.num_fusion_workers=VALUE Number of parallel workers (default: 4)
+  inference.debug=VALUE              Enable debug mode (true/false)
+Special Flags:
+  --help, -h                         Show this help message
+Multi-GPU:
+  Use CUDA_VISIBLE_DEVICES to specify GPUs (auto-detected and distributed)
+Examples:
+  # Use default config
+  python -m depth_anything_3.bench.evaluator
+  # Override model path
+  python -m depth_anything_3.bench.evaluator model.path=depth-anything/DA3-LARGE
+  # Evaluate specific datasets and modes
+  python -m depth_anything_3.bench.evaluator \\
+      eval.datasets=[eth3d,hiroom] \\
+      eval.modes=[pose]
+  # Use custom config with overrides
+  python -m depth_anything_3.bench.evaluator \\
+      --config my_config.yaml \\
+      model.path=/path/to/model \\
+      eval.max_frames=50
+  # Multi-GPU inference (auto-distributed)
+  CUDA_VISIBLE_DEVICES=0,1,2,3 python -m depth_anything_3.bench.evaluator
+  # Debug specific scenes
+  python -m depth_anything_3.bench.evaluator \\
+      eval.datasets=[eth3d] \\
+      eval.scenes=[courtyard] \\
+      inference.debug=true
+  # Only evaluate (skip inference)
+  python -m depth_anything_3.bench.evaluator eval.eval_only=true
+  # Only print saved metrics
+  python -m depth_anything_3.bench.evaluator eval.print_only=true
+          """)
+        sys.exit(0)
+    # Load config with CLI overrides using OmegaConf dotlist
+    # Example: python evaluator.py model.path=/path/to/model eval.datasets=[eth3d,dtu]
+    config = load_config(config_path, argv=argv)
+    # Extract config values
+    work_dir = config.workspace.work_dir
+    model_path = config.model.path
+    datasets = config.eval.datasets
+    modes = config.eval.modes
+    ref_view_strategy = config.eval.ref_view_strategy
+    scenes = config.eval.scenes
+    max_frames = config.eval.max_frames
+    eval_only = config.eval.eval_only
+    print_only = config.eval.print_only
+    debug = config.inference.debug
+    num_fusion_workers = config.inference.num_fusion_workers
+    # GPU settings: parse from CLI dotlist args (gpu_id=X total_gpus=Y)
+    # These are passed by the main process when spawning workers
+    gpu_id = 0
+    total_gpus = 1
+    for arg in argv:
+        if arg.startswith("gpu_id="):
+            gpu_id = int(arg.split("=")[1])
+        elif arg.startswith("total_gpus="):
+            total_gpus = int(arg.split("=")[1])
+    # Override dataset scenes if specified
+    if scenes:
+        print(f"[INFO] Running on specific scenes: {scenes}")
+    evaluator = Evaluator(
+        work_dir=work_dir,
+        datas=datasets,
+        modes=modes,
+        ref_view_strategy=ref_view_strategy,
+        scenes=scenes,
+        debug=debug,
+        num_fusion_workers=num_fusion_workers,
+        max_frames=max_frames,
+        gpu_id=gpu_id,
+        total_gpus=total_gpus,
+    )
+    if print_only:
+        evaluator.print_metrics()
+    elif eval_only:
+        metrics = evaluator.eval()
+        evaluator.print_metrics(metrics)
+    else:
+        # Parse CUDA_VISIBLE_DEVICES to get GPU list
+        # If not set, use all available GPUs
+        cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cuda_devices is not None and cuda_devices.strip():
+            gpu_list = [g.strip() for g in cuda_devices.split(",") if g.strip()]
+        else:
+            # CUDA_VISIBLE_DEVICES not set, use all available GPUs
+            num_available = torch.cuda.device_count()
+            gpu_list = [str(i) for i in range(num_available)] if num_available > 0 else ["0"]
+        # Auto multi-GPU: if multiple GPUs and not a worker process
+        is_worker = os.environ.get("_DA3_WORKER") == "1"
+        if len(gpu_list) > 1 and not is_worker:
+            # Launch worker processes
+            import subprocess
+            num_gpus = len(gpu_list)
+            print(f"[INFO] Detected {num_gpus} GPUs: {gpu_list}")
+            print(f"[INFO] Launching {num_gpus} workers...")
+            # Build base command
+            base_cmd = [sys.executable, "-m", "depth_anything_3.bench.evaluator"]
+            # Pass config via dotlist instead of CLI args
+            if config_path != _default_config:
+                base_cmd += ["--config", config_path]
+            base_cmd += [f"model.path={model_path}"]
+            base_cmd += [f"workspace.work_dir={work_dir}"]
+            base_cmd += [f"eval.datasets=[{','.join(datasets)}]"]
+            base_cmd += [f"eval.modes=[{','.join(modes)}]"]
+            if scenes:
+                base_cmd += [f"eval.scenes=[{','.join(scenes)}]"]
+            base_cmd += [f"eval.max_frames={max_frames}"]
+            base_cmd += [f"eval.ref_view_strategy={ref_view_strategy}"]
+            base_cmd += [f"inference.debug={str(debug).lower()}"]
+            base_cmd += [f"inference.num_fusion_workers={num_fusion_workers}"]
+            # Launch workers
+            processes = []
+            for idx, gpu_id in enumerate(gpu_list):
+                env = os.environ.copy()
+                env["CUDA_VISIBLE_DEVICES"] = gpu_id
+                env["_DA3_WORKER"] = "1"  # Mark as worker process
+                cmd = base_cmd.copy()
+                # GPU-specific worker config
+                cmd += [f"gpu_id={idx}", f"total_gpus={num_gpus}"]
+                print(f"[INFO] Starting worker {idx} on GPU {gpu_id}")
+                p = subprocess.Popen(cmd, env=env)
+                processes.append(p)
+            # Wait for all workers
+            for p in processes:
+                p.wait()
+            print(f"[INFO] All {num_gpus} workers completed")
+            # Run evaluation after all inference is done
+            metrics = evaluator.eval()
+            evaluator.print_metrics(metrics)
+        else:
+            # Single GPU or worker process
+            from depth_anything_3.api import DepthAnything3
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            api = DepthAnything3.from_pretrained(model_path)
+            api = api.to(device)
+            evaluator.infer(api, model_path=model_path)
+            # Only run eval if single GPU mode (workers don't eval)
+            if not is_worker:
+                metrics = evaluator.eval()
+                evaluator.print_metrics(metrics)

Depth-Anything-3/src/depth_anything_3/bench/print_metrics.py ADDED Viewed

	@@ -0,0 +1,618 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Beautiful metrics printing utilities for benchmark evaluation.
+Provides colorized, well-formatted tabular output for evaluation results.
+Supports highlighting best/worst values and grouping by dataset/mode.
+"""
+import argparse
+import json
+import os
+import re
+from typing import Dict as TDict, List, Optional
+# ANSI color codes for terminal output
+class Colors:
+    """ANSI escape codes for terminal colors."""
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+    RED = "\033[31m"
+    GREEN = "\033[32m"
+    YELLOW = "\033[33m"
+    BLUE = "\033[34m"
+    MAGENTA = "\033[35m"
+    CYAN = "\033[36m"
+    WHITE = "\033[37m"
+    # Bold variants
+    BOLD_RED = "\033[1;31m"
+    BOLD_GREEN = "\033[1;32m"
+    BOLD_YELLOW = "\033[1;33m"
+    BOLD_BLUE = "\033[1;34m"
+    BOLD_MAGENTA = "\033[1;35m"
+    BOLD_CYAN = "\033[1;36m"
+    # Background
+    BG_DARK = "\033[48;5;236m"
+def strip_ansi(text: str) -> str:
+    """Remove ANSI escape sequences from string for length calculation."""
+    ansi_escape = re.compile(r"\x1b\[[0-9;]*m")
+    return ansi_escape.sub("", text)
+def colorize_value(
+    value: str,
+    is_best: bool = False,
+    is_worst: bool = False,
+    lower_is_better: bool = False,
+) -> str:
+    """
+    Apply color to a metric value based on whether it's best/worst.
+    Args:
+        value: String representation of the value
+        is_best: Whether this is the best value in its column
+        is_worst: Whether this is the worst value in its column
+        lower_is_better: If True, lower values are better (e.g., error metrics)
+    Returns:
+        Colorized string
+    """
+    if lower_is_better:
+        # For metrics like error/distance, lower is better
+        if is_best:
+            return f"{Colors.BOLD_GREEN}{value}{Colors.RESET}"
+        elif is_worst:
+            return f"{Colors.BOLD_RED}{value}{Colors.RESET}"
+    else:
+        # For metrics like accuracy/AUC, higher is better
+        if is_best:
+            return f"{Colors.BOLD_GREEN}{value}{Colors.RESET}"
+        elif is_worst:
+            return f"{Colors.BOLD_RED}{value}{Colors.RESET}"
+    return value
+class MetricsPrinter:
+    """
+    Beautiful tabular metrics printer with color support.
+    Features:
+    - Colorized best/worst values
+    - Grouped by dataset and evaluation mode
+    - Automatic column width calculation
+    - Support for multiple input directories comparison
+    """
+    # Metrics where lower values are better
+    LOWER_IS_BETTER = {"comp", "acc", "overall", "error", "loss", "rmse", "mae"}
+    def __init__(self, use_color: bool = True):
+        """
+        Initialize the printer.
+        Args:
+            use_color: Whether to use ANSI colors in output
+        """
+        self.use_color = use_color
+    def print_results(self, metrics: TDict[str, dict], summary_only: bool = True) -> None:
+        """
+        Print evaluation metrics in a beautiful tabular format.
+        Args:
+            metrics: Dictionary mapping "dataset_mode" to metric results
+            summary_only: If True, only print summary table. If False, print per-dataset details too.
+        """
+        if not metrics:
+            print(f"\n{Colors.BOLD_RED}❌ No evaluation metrics found{Colors.RESET}")
+            return
+        if not summary_only:
+            self._print_header()
+            grouped = self._group_by_dataset(metrics)
+            for dataset, modes_data in grouped.items():
+                self._print_dataset_section(dataset, modes_data)
+        # Print summary table with average metrics across datasets
+        self._print_summary(metrics)
+        self._print_footer()
+    def print_comparison(
+        self,
+        metrics_list: List[TDict[str, dict]],
+        labels: List[str],
+    ) -> None:
+        """
+        Print comparison table for multiple evaluation runs.
+        Args:
+            metrics_list: List of metrics dictionaries
+            labels: Labels for each metrics dictionary
+        """
+        if not metrics_list or not all(metrics_list):
+            print(f"\n{Colors.BOLD_RED}❌ No metrics to compare{Colors.RESET}")
+            return
+        # Collect all datasets and modes
+        all_keys = set()
+        for metrics in metrics_list:
+            all_keys.update(metrics.keys())
+        self._print_header("COMPARISON")
+        for key in sorted(all_keys):
+            parts = key.rsplit("_", 1)
+            if len(parts) == 2:
+                dataset, mode = parts[0], parts[1]
+            else:
+                dataset, mode = key, "unknown"
+            print(f"\n{Colors.BOLD_CYAN}📊 {dataset.upper()} - {mode.upper()}{Colors.RESET}")
+            print("-" * 100)
+            # Collect metrics from all runs
+            all_metric_names = set()
+            for metrics in metrics_list:
+                if key in metrics and "mean" in metrics[key]:
+                    all_metric_names.update(metrics[key]["mean"].keys())
+            if not all_metric_names:
+                continue
+            # Build comparison table
+            metric_width = max(15, max(len(m) for m in all_metric_names) + 2)
+            label_width = max(15, max(len(l) for l in labels) + 2)
+            # Header
+            header = f"{'Metric':<{metric_width}}"
+            for label in labels:
+                header += f"{label:<{label_width}}"
+            print(header)
+            print("-" * len(strip_ansi(header)))
+            # Collect values for highlighting
+            for metric_name in sorted(all_metric_names):
+                values = []
+                for metrics in metrics_list:
+                    if key in metrics and "mean" in metrics[key]:
+                        val = metrics[key]["mean"].get(metric_name)
+                        values.append(val if val is not None else float("nan"))
+                    else:
+                        values.append(float("nan"))
+                # Find best/worst
+                valid_values = [v for v in values if not (v != v)]  # Filter NaN
+                if valid_values:
+                    lower_better = any(
+                        lb in metric_name.lower() for lb in self.LOWER_IS_BETTER
+                    )
+                    best_val = min(valid_values) if lower_better else max(valid_values)
+                    worst_val = max(valid_values) if lower_better else min(valid_values)
+                else:
+                    best_val = worst_val = None
+                # Print row
+                row = f"{metric_name:<{metric_width}}"
+                for val in values:
+                    if val != val:  # NaN check
+                        val_str = "N/A"
+                    else:
+                        val_str = f"{val:.4f}"
+                        if self.use_color and len(valid_values) > 1:
+                            lower_better = any(
+                                lb in metric_name.lower() for lb in self.LOWER_IS_BETTER
+                            )
+                            is_best = abs(val - best_val) < 1e-8 if best_val else False
+                            is_worst = abs(val - worst_val) < 1e-8 if worst_val else False
+                            val_str_padded = f"{val_str:<{label_width}}"
+                            val_str = colorize_value(
+                                val_str_padded, is_best, is_worst, lower_better
+                            )
+                            row += val_str
+                            continue
+                    row += f"{val_str:<{label_width}}"
+                print(row)
+        self._print_footer()
+    def _print_header(self, title: str = "EVALUATION RESULTS") -> None:
+        """Print report header."""
+        width = 100
+        print()
+        print("=" * width)
+        print(f"{Colors.BOLD_CYAN}📊 DEPTH ANYTHING 3 {title}{Colors.RESET}")
+        print("=" * width)
+    def _print_footer(self) -> None:
+        """Print report footer."""
+        width = 100
+        print()
+        print("=" * width)
+        print(f"{Colors.BOLD_GREEN}✅ Evaluation Complete{Colors.RESET}")
+        print("=" * width)
+        print()
+    def _group_by_dataset(self, metrics: TDict[str, dict]) -> TDict[str, dict]:
+        """Group metrics by dataset."""
+        grouped = {}
+        for key, data in metrics.items():
+            if not isinstance(data, dict) or "mean" not in data:
+                continue
+            # Parse key format: "dataset_mode" (e.g., "dtu_recon_unposed")
+            parts = key.split("_", 1)
+            if len(parts) == 2:
+                dataset, mode = parts
+                if dataset not in grouped:
+                    grouped[dataset] = {}
+                grouped[dataset][mode] = data
+        return grouped
+    def _print_dataset_section(self, dataset: str, modes_data: TDict[str, dict]) -> None:
+        """Print metrics section for a single dataset."""
+        print(f"\n{Colors.BOLD_MAGENTA}🔍 {dataset.upper()}{Colors.RESET}")
+        print("-" * 100)
+        # Collect all unique metrics across all modes
+        all_metrics = set()
+        for mode_data in modes_data.values():
+            all_metrics.update(mode_data["mean"].keys())
+        all_metrics = sorted(list(all_metrics))
+        if not all_metrics:
+            print("  No metrics available")
+            return
+        # Calculate column widths
+        metric_width = max(18, max(len(m) for m in all_metrics) + 2)
+        mode_width = 18
+        modes = list(modes_data.keys())
+        # Print header
+        header = f"{'Metric':<{metric_width}}"
+        for mode in modes:
+            header += f"{mode.upper():<{mode_width}}"
+        print(f"{Colors.BOLD}{header}{Colors.RESET}")
+        print("-" * len(header))
+        # Print each metric row
+        for metric in all_metrics:
+            row = f"{metric:<{metric_width}}"
+            # Collect values for this metric across modes
+            values = []
+            for mode in modes:
+                if metric in modes_data[mode]["mean"]:
+                    values.append(modes_data[mode]["mean"][metric])
+                else:
+                    values.append(None)
+            # Find best/worst values
+            valid_values = [v for v in values if v is not None]
+            if valid_values:
+                lower_better = any(lb in metric.lower() for lb in self.LOWER_IS_BETTER)
+                best_val = min(valid_values) if lower_better else max(valid_values)
+                worst_val = max(valid_values) if lower_better else min(valid_values)
+            else:
+                best_val = worst_val = None
+            # Format each value
+            for val in values:
+                if val is None:
+                    row += f"{'N/A':<{mode_width}}"
+                else:
+                    val_str = f"{val:.4f}"
+                    if self.use_color and len(valid_values) > 1:
+                        is_best = abs(val - best_val) < 1e-8 if best_val else False
+                        is_worst = abs(val - worst_val) < 1e-8 if worst_val else False
+                        lower_better = any(
+                            lb in metric.lower() for lb in self.LOWER_IS_BETTER
+                        )
+                        # Pad before colorizing to maintain alignment
+                        val_str_padded = f"{val_str:<{mode_width}}"
+                        row += colorize_value(
+                            val_str_padded, is_best, is_worst, lower_better
+                        )
+                    else:
+                        row += f"{val_str:<{mode_width}}"
+            print(row)
+        # Show scene counts
+        scene_info = []
+        for mode, mode_data in modes_data.items():
+            scene_count = len([k for k in mode_data.keys() if k != "mean"])
+            scene_info.append(f"{mode}: {scene_count} scenes")
+        print(f"\n{Colors.CYAN}📈 {' | '.join(scene_info)}{Colors.RESET}")
+    def _print_summary(self, metrics: TDict[str, dict]) -> None:
+        """
+        Print summary table with key metrics across all datasets.
+        Format: One row per metric, datasets as columns.
+        Order: HiRoom, ETH3D, DTU, 7Scenes, ScanNet++, (DTU-64 for pose only)
+        """
+        print(f"\n{Colors.BOLD_CYAN}{'=' * 120}{Colors.RESET}")
+        print(f"{Colors.BOLD_CYAN}📊 SUMMARY{Colors.RESET}")
+        print(f"{Colors.BOLD_CYAN}{'=' * 120}{Colors.RESET}")
+        # Dataset display order and names
+        DATASET_ORDER = ["hiroom", "eth3d", "dtu", "7scenes", "scannetpp", "dtu64"]
+        DATASET_DISPLAY = {
+            "hiroom": "HiRoom",
+            "eth3d": "ETH3D",
+            "dtu": "DTU",
+            "7scenes": "7Scenes",
+            "scannetpp": "ScanNet++",
+            "dtu64": "DTU-64",
+        }
+        # Collect all metrics into a structured dict
+        # metric_data[dataset][mode] = {"Auc_3": x, "Auc_30": x, "fscore": x, "overall": x}
+        metric_data = {}
+        for key, data in metrics.items():
+            if not isinstance(data, dict) or "mean" not in data:
+                continue
+            parts = key.split("_", 1)
+            if len(parts) != 2:
+                continue
+            dataset, mode = parts
+            dataset_lower = dataset.lower()
+            if dataset_lower not in metric_data:
+                metric_data[dataset_lower] = {}
+            metric_data[dataset_lower][mode] = data["mean"]
+        col_width = 12
+        def fmt_val(val):
+            """Format value or return N/A."""
+            if val is None:
+                return "N/A"
+            return f"{val:.4f}"
+        def get_metric(dataset, mode, metric_name):
+            """Get metric value or None."""
+            if dataset not in metric_data:
+                return None
+            if mode not in metric_data[dataset]:
+                return None
+            return metric_data[dataset][mode].get(metric_name)
+        # ============ POSE METRICS ============
+        print(f"\n{Colors.BOLD_MAGENTA}🎯 POSE ESTIMATION{Colors.RESET}")
+        # Pose: show all datasets except DTU (keep DTU-64 only)
+        # Order: HiRoom, ETH3D, DTU-64, 7Scenes, ScanNet++
+        pose_datasets = ["hiroom", "eth3d", "dtu64", "7scenes", "scannetpp"]
+        # Header: Avg first, then datasets
+        header = f"{'Metric':<15}{'Avg':<{col_width}}"
+        for ds in pose_datasets:
+            header += f"{DATASET_DISPLAY[ds]:<{col_width}}"
+        print("-" * len(strip_ansi(header)))
+        print(f"{Colors.BOLD}{header}{Colors.RESET}")
+        print("-" * len(strip_ansi(header)))
+        # Helper to get metric with fallback names
+        def get_pose_metric(dataset, metric_name):
+            """Get pose metric with fallback for different naming conventions."""
+            # Try different naming conventions
+            names = {
+                "Auc3": ["Auc_3", "auc03", "auc_3", "AUC_3", "Auc3", "auc3"],
+                "Auc30": ["Auc_30", "auc30", "auc_30", "AUC_30", "Auc30"],
+            }
+            for name in names.get(metric_name, [metric_name]):
+                val = get_metric(dataset, "pose", name)
+                if val is not None:
+                    return val
+            return None
+        # Auc3 row
+        values = []
+        for ds in pose_datasets:
+            val = get_pose_metric(ds, "Auc3")
+            if val is not None:
+                values.append(val)
+        avg = sum(values) / len(values) if values else None
+        row = f"{'Auc3':<15}{Colors.BOLD_GREEN}{fmt_val(avg):<{col_width}}{Colors.RESET}"
+        for ds in pose_datasets:
+            val = get_pose_metric(ds, "Auc3")
+            row += f"{fmt_val(val):<{col_width}}"
+        print(row)
+        # Auc30 row
+        values = []
+        for ds in pose_datasets:
+            val = get_pose_metric(ds, "Auc30")
+            if val is not None:
+                values.append(val)
+        avg = sum(values) / len(values) if values else None
+        row = f"{'Auc30':<15}{Colors.BOLD_GREEN}{fmt_val(avg):<{col_width}}{Colors.RESET}"
+        for ds in pose_datasets:
+            val = get_pose_metric(ds, "Auc30")
+            row += f"{fmt_val(val):<{col_width}}"
+        print(row)
+        # ============ RECON_UNPOSED METRICS ============
+        print(f"\n{Colors.BOLD_MAGENTA}🏗️  RECON_UNPOSED (Pred Pose){Colors.RESET}")
+        # For recon, exclude dtu64 from columns
+        recon_datasets = ["hiroom", "eth3d", "dtu", "7scenes", "scannetpp"]
+        avg_datasets = ["hiroom", "eth3d", "7scenes", "scannetpp"]  # Exclude DTU from avg
+        # Header: Avg first, then datasets
+        header = f"{'Metric':<15}{'Avg*':<{col_width}}"
+        for ds in recon_datasets:
+            header += f"{DATASET_DISPLAY[ds]:<{col_width}}"
+        print("-" * len(strip_ansi(header)))
+        print(f"{Colors.BOLD}{header}{Colors.RESET}")
+        print("-" * len(strip_ansi(header)))
+        # F-score row (only metric for avg)
+        values = []
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_unposed", "fscore")
+            if val is not None and ds in avg_datasets:
+                values.append(val)
+        avg = sum(values) / len(values) if values else None
+        row = f"{'F-score':<15}{Colors.BOLD_GREEN}{fmt_val(avg):<{col_width}}{Colors.RESET}"
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_unposed", "fscore")
+            row += f"{fmt_val(val):<{col_width}}"
+        print(row)
+        # Overall row (avg over 4 datasets excluding DTU)
+        values = []
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_unposed", "overall")
+            if val is not None and ds in avg_datasets:
+                values.append(val)
+        avg = sum(values) / len(values) if values else None
+        row = f"{'Overall':<15}{Colors.BOLD_GREEN}{fmt_val(avg):<{col_width}}{Colors.RESET}"
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_unposed", "overall")
+            row += f"{fmt_val(val):<{col_width}}"
+        print(row)
+        # ============ RECON_POSED METRICS ============
+        print(f"\n{Colors.BOLD_MAGENTA}🏗️  RECON_POSED (GT Pose){Colors.RESET}")
+        # Header: Avg first, then datasets
+        header = f"{'Metric':<15}{'Avg*':<{col_width}}"
+        for ds in recon_datasets:
+            header += f"{DATASET_DISPLAY[ds]:<{col_width}}"
+        print("-" * len(strip_ansi(header)))
+        print(f"{Colors.BOLD}{header}{Colors.RESET}")
+        print("-" * len(strip_ansi(header)))
+        # F-score row (only metric for avg)
+        values = []
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_posed", "fscore")
+            if val is not None and ds in avg_datasets:
+                values.append(val)
+        avg = sum(values) / len(values) if values else None
+        row = f"{'F-score':<15}{Colors.BOLD_GREEN}{fmt_val(avg):<{col_width}}{Colors.RESET}"
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_posed", "fscore")
+            row += f"{fmt_val(val):<{col_width}}"
+        print(row)
+        # Overall row (avg over 4 datasets excluding DTU)
+        values = []
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_posed", "overall")
+            if val is not None and ds in avg_datasets:
+                values.append(val)
+        avg = sum(values) / len(values) if values else None
+        row = f"{'Overall':<15}{Colors.BOLD_GREEN}{fmt_val(avg):<{col_width}}{Colors.RESET}"
+        for ds in recon_datasets:
+            val = get_metric(ds, "recon_posed", "overall")
+            row += f"{fmt_val(val):<{col_width}}"
+        print(row)
+        print(f"\n{Colors.CYAN}* Avg F-score / Overall = average over HiRoom, ETH3D, 7Scenes, ScanNet++ (4 datasets){Colors.RESET}")
+def load_metrics_from_dir(metric_dir: str) -> TDict[str, dict]:
+    """
+    Load all metrics JSON files from a directory.
+    Args:
+        metric_dir: Path to directory containing metric JSON files
+    Returns:
+        Dictionary mapping filename (without .json) to metric data
+    """
+    metrics = {}
+    if not os.path.exists(metric_dir):
+        return metrics
+    for filename in os.listdir(metric_dir):
+        if filename.endswith(".json"):
+            filepath = os.path.join(metric_dir, filename)
+            try:
+                with open(filepath, encoding="utf-8") as f:
+                    content = f.read()
+                # Handle trailing commas in JSON
+                content = re.sub(r",\s*([\]\}])", r"\1", content)
+                data = json.loads(content)
+                key = filename[:-5]
+                metrics[key] = data
+            except Exception as e:
+                print(f"Warning: Failed to load {filename}: {e}")
+    return metrics
+def main():
+    """Command-line interface for metrics printing."""
+    parser = argparse.ArgumentParser(
+        description="Print DepthAnything3 benchmark evaluation metrics."
+    )
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        default="./eval_workspace/metric_results",
+        help="Directory containing metric JSON files (comma-separated for comparison)",
+    )
+    parser.add_argument(
+        "--no_color",
+        action="store_true",
+        help="Disable colored output",
+    )
+    parser.add_argument(
+        "--key",
+        type=str,
+        default=None,
+        help="Specific metric key to highlight",
+    )
+    args = parser.parse_args()
+    # Support multiple directories for comparison
+    input_dirs = [d.strip() for d in args.input_dir.split(",") if d.strip()]
+    printer = MetricsPrinter(use_color=not args.no_color)
+    if len(input_dirs) == 1:
+        # Single directory - simple print
+        metrics = load_metrics_from_dir(input_dirs[0])
+        printer.print_results(metrics)
+    else:
+        # Multiple directories - comparison mode
+        metrics_list = []
+        labels = []
+        for d in input_dirs:
+            metrics = load_metrics_from_dir(d)
+            if metrics:
+                metrics_list.append(metrics)
+                labels.append(os.path.basename(d.rstrip("/")))
+        if metrics_list:
+            printer.print_comparison(metrics_list, labels)
+        else:
+            print("No metrics found in specified directories")
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

Depth-Anything-3/src/depth_anything_3/bench/registries.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Auto-loading registry system for benchmark datasets.
+This module provides registry classes that automatically discover and import
+dataset implementations from the datasets subpackage on first access.
+"""
+import importlib
+import pkgutil
+import threading
+from depth_anything_3.utils.registry import Registry
+# Public names of this module: the four registry instances created below.
+__all__ = ["METRIC_REGISTRY", "MONO_REGISTRY", "MV_REGISTRY", "NVS_REGISTRY"]
+# ---- Lazy import: Only scan and import all datasets submodules on first registry access ----
+# Becomes True after the datasets package has been scanned once; it is only
+# written while _lock is held.
+_loaded = False
+# Serializes the one-time scan performed by _import_all_datasets_once().
+_lock = threading.Lock()
+def _import_all_datasets_once():
+    """
+    Scan and import all .py submodules under depth_anything_3.bench.datasets
+    (skip files/packages starting with underscore), to trigger @REGISTRY.register(...) in each module.
+    """
+    global _loaded
+    if _loaded:
+        return
+    with _lock:
+        if _loaded:
+            return
+        pkg_name = "depth_anything_3.bench.datasets"
+        pkg = importlib.import_module(pkg_name)
+        pkg_paths = list(getattr(pkg, "__path__", []))
+        for finder, name, ispkg in pkgutil.walk_packages(pkg_paths, prefix=pkg_name + "."):
+            base = name.rsplit(".", 1)[-1]
+            if base.startswith("_"):
+                continue
+            try:
+                importlib.import_module(name)
+            except Exception as e:
+                print(f"[datasets auto-import] Failed to import {name}: {e}")
+        _loaded = True
+class AutoRegistry(Registry):
+    """Registry that ensures all datasets are auto-discovered and imported on first use."""
+    def get(self, name):
+        _import_all_datasets_once()
+        return super().get(name)
+    def all(self):
+        _import_all_datasets_once()
+        return super().all()
+    def has(self, name):
+        _import_all_datasets_once()
+        return name in self._map
+# Four auto-lazy registry instances for different evaluation types.
+# Each instance is a separate AutoRegistry; the first get()/all()/has() call on
+# any of them triggers the shared one-time import of the datasets package.
+METRIC_REGISTRY = AutoRegistry()  # For metric depth evaluation
+MONO_REGISTRY = AutoRegistry()  # For monocular depth evaluation
+MV_REGISTRY = AutoRegistry()  # For multi-view evaluation
+NVS_REGISTRY = AutoRegistry()  # For novel view synthesis evaluation

Depth-Anything-3/src/depth_anything_3/bench/utils.py ADDED Viewed

	@@ -0,0 +1,525 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utility functions for benchmark evaluation.
+Contains:
+- Pose evaluation metrics (AUC) and helper functions
+- 3D reconstruction evaluation metrics (Acc/Comp/F-score)
+- Geometry utilities (quaternion conversion, etc.)
+"""
+from typing import Dict as TDict, Optional, Tuple, Union
+import numpy as np
+import open3d as o3d
+import torch
+from addict import Dict
+from scipy.spatial import KDTree
+from depth_anything_3.utils.geometry import mat_to_quat
+# =============================================================================
+# Geometry Utilities
+# =============================================================================
+def quat2rotmat(qvec: list) -> np.ndarray:
+    """
+    Convert quaternion (WXYZ order) to rotation matrix.
+    Args:
+        qvec: Quaternion as [w, x, y, z]
+    Returns:
+        3x3 rotation matrix
+    """
+    rotmat = np.array(
+        [
+            1 - 2 * qvec[2] ** 2 - 2 * qvec[3] ** 2,
+            2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3],
+            2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2],
+            2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3],
+            1 - 2 * qvec[1] ** 2 - 2 * qvec[3] ** 2,
+            2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1],
+            2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2],
+            2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1],
+            1 - 2 * qvec[1] ** 2 - 2 * qvec[2] ** 2,
+        ]
+    )
+    rotmat = rotmat.reshape(3, 3)
+    return rotmat
+# =============================================================================
+# 3D Reconstruction Evaluation
+# =============================================================================
+def nn_correspondance(verts1: np.ndarray, verts2: np.ndarray) -> np.ndarray:
+    """
+    Compute nearest neighbor distances from verts2 to verts1 using KDTree.
+    Args:
+        verts1: Reference point cloud [N, 3]
+        verts2: Query point cloud [M, 3]
+    Returns:
+        Distance array [M,] - distance from each point in verts2 to nearest in verts1
+    """
+    if len(verts1) == 0 or len(verts2) == 0:
+        return np.array([])
+    kdtree = KDTree(verts1)
+    distances, _ = kdtree.query(verts2)
+    return distances.reshape(-1)
+def evaluate_3d_reconstruction(
+    pcd_pred: Union[o3d.geometry.PointCloud, np.ndarray],
+    pcd_trgt: Union[o3d.geometry.PointCloud, np.ndarray],
+    threshold: float = 0.05,
+    down_sample: Optional[float] = None,
+) -> TDict[str, float]:
+    """
+    Evaluate 3D reconstruction quality using standard metrics.
+    This function computes:
+    - Accuracy: Mean distance from predicted points to GT surface
+    - Completeness: Mean distance from GT points to predicted surface
+    - Overall: Average of accuracy and completeness
+    - Precision: Fraction of predicted points within threshold of GT
+    - Recall: Fraction of GT points within threshold of prediction
+    - F-score: Harmonic mean of precision and recall
+    Args:
+        pcd_pred: Predicted point cloud (Open3D or numpy array)
+        pcd_trgt: Ground truth point cloud (Open3D or numpy array)
+        threshold: Distance threshold for precision/recall (meters)
+        down_sample: Voxel size for downsampling (None to skip)
+    Returns:
+        Dict with metrics: acc, comp, overall, precision, recall, fscore
+    """
+    # Convert to Open3D if needed
+    if isinstance(pcd_pred, np.ndarray):
+        pcd_pred_o3d = o3d.geometry.PointCloud()
+        pcd_pred_o3d.points = o3d.utility.Vector3dVector(pcd_pred)
+        pcd_pred = pcd_pred_o3d
+    if isinstance(pcd_trgt, np.ndarray):
+        pcd_trgt_o3d = o3d.geometry.PointCloud()
+        pcd_trgt_o3d.points = o3d.utility.Vector3dVector(pcd_trgt)
+        pcd_trgt = pcd_trgt_o3d
+    # Downsample if requested
+    if down_sample is not None and down_sample > 0:
+        pcd_pred = pcd_pred.voxel_down_sample(down_sample)
+        pcd_trgt = pcd_trgt.voxel_down_sample(down_sample)
+    verts_pred = np.asarray(pcd_pred.points)
+    verts_trgt = np.asarray(pcd_trgt.points)
+    # Handle empty point clouds
+    if len(verts_pred) == 0 or len(verts_trgt) == 0:
+        return {
+            "acc": float("inf"),
+            "comp": float("inf"),
+            "overall": float("inf"),
+            "precision": 0.0,
+            "recall": 0.0,
+            "fscore": 0.0,
+        }
+    # Compute distances
+    dist_pred_to_gt = nn_correspondance(verts_trgt, verts_pred)  # Accuracy
+    dist_gt_to_pred = nn_correspondance(verts_pred, verts_trgt)  # Completeness
+    # Compute metrics
+    accuracy = float(np.mean(dist_pred_to_gt))
+    completeness = float(np.mean(dist_gt_to_pred))
+    overall = (accuracy + completeness) / 2
+    precision = float(np.mean((dist_pred_to_gt < threshold).astype(float)))
+    recall = float(np.mean((dist_gt_to_pred < threshold).astype(float)))
+    if precision + recall > 0:
+        fscore = 2 * precision * recall / (precision + recall)
+    else:
+        fscore = 0.0
+    return {
+        "acc": accuracy,
+        "comp": completeness,
+        "overall": overall,
+        "precision": precision,
+        "recall": recall,
+        "fscore": fscore,
+    }
+def create_tsdf_volume(
+    voxel_length: float = 4.0 / 512.0,
+    sdf_trunc: float = 0.04,
+    color_type: str = "RGB8",
+) -> o3d.pipelines.integration.ScalableTSDFVolume:
+    """
+    Build an empty Open3D ScalableTSDFVolume ready for depth integration.
+    Args:
+        voxel_length: Edge length of one voxel
+        sdf_trunc: Truncation distance of the signed distance function
+        color_type: "RGB8" selects RGB8 color integration; any other value selects Gray32
+    Returns:
+        The newly constructed ScalableTSDFVolume
+    """
+    integration = o3d.pipelines.integration
+    # Anything that is not exactly "RGB8" falls back to Gray32.
+    color_enum = (
+        integration.TSDFVolumeColorType.RGB8
+        if color_type == "RGB8"
+        else integration.TSDFVolumeColorType.Gray32
+    )
+    return integration.ScalableTSDFVolume(
+        voxel_length=voxel_length,
+        sdf_trunc=sdf_trunc,
+        color_type=color_enum,
+    )
+def fuse_depth_to_tsdf(
+    volume: o3d.pipelines.integration.ScalableTSDFVolume,
+    depths: np.ndarray,
+    images: np.ndarray,
+    intrinsics: np.ndarray,
+    extrinsics: np.ndarray,
+    max_depth: float = 10.0,
+) -> o3d.geometry.TriangleMesh:
+    """
+    Integrate every view into the TSDF volume, then extract a triangle mesh from it.
+    Args:
+        volume: TSDF volume that receives the views (modified in place)
+        depths: Depth maps [N, H, W]
+        images: RGB images [N, H, W, 3]
+        intrinsics: Camera intrinsics [N, 3, 3]
+        extrinsics: Camera extrinsics (world-to-camera) [N, 4, 4]
+        max_depth: Passed as depth_trunc when the RGBD image is built
+    Returns:
+        Triangle mesh extracted from the volume after all views are integrated
+    """
+    for view_idx, depth_map in enumerate(depths):
+        rgb = images[view_idx]
+        cam_k = intrinsics[view_idx]
+        world_to_cam = extrinsics[view_idx]
+        height, width = depth_map.shape[:2]
+        # Wrap the arrays as Open3D images: float32 depth, uint8 color.
+        depth_image = o3d.geometry.Image(depth_map.astype(np.float32))
+        color_image = o3d.geometry.Image(rgb.astype(np.uint8))
+        rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
+            color_image,
+            depth_image,
+            depth_trunc=max_depth,
+            convert_rgb_to_intensity=False,
+            depth_scale=1.0,
+        )
+        # Pinhole model built from fx, fy (diagonal) and cx, cy (last column) of the 3x3 matrix.
+        pinhole = o3d.camera.PinholeCameraIntrinsic(
+            width, height, cam_k[0, 0], cam_k[1, 1], cam_k[0, 2], cam_k[1, 2]
+        )
+        volume.integrate(rgbd, pinhole, world_to_cam)
+    return volume.extract_triangle_mesh()
+def sample_points_from_mesh(
+    mesh: o3d.geometry.TriangleMesh,
+    num_points: int = 1000000,
+) -> o3d.geometry.PointCloud:
+    """
+    Uniformly sample points from a triangle mesh.
+    If sampling fails for any reason, a RuntimeWarning is emitted and a deterministic
+    (seed 42) cloud of random points in [-1, 1]^3 is returned instead, so callers always
+    receive a point cloud with num_points points.
+    Args:
+        mesh: Input triangle mesh
+        num_points: Number of points to sample
+    Returns:
+        Sampled point cloud (or the random fallback cloud described above)
+    """
+    import warnings
+    try:
+        pcd = mesh.sample_points_uniformly(number_of_points=num_points)
+        # Clamp colors to valid range [0, 1] for Open3D PLY export
+        if pcd.has_colors():
+            colors = np.asarray(pcd.colors)
+            colors = np.clip(colors, 0.0, 1.0)
+            pcd.colors = o3d.utility.Vector3dVector(colors)
+    except Exception as exc:
+        # Best-effort fallback is kept, but it must not be silent: metrics computed on
+        # random points are meaningless, so make the substitution visible to the user.
+        warnings.warn(
+            f"Mesh sampling failed ({exc!r}); falling back to random points.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        # Fixed seed keeps the fallback reproducible across runs.
+        rng = np.random.default_rng(seed=42)
+        points = rng.uniform(-1, 1, size=(num_points, 3))
+        pcd = o3d.geometry.PointCloud()
+        pcd.points = o3d.utility.Vector3dVector(points)
+    return pcd
+# =============================================================================
+# Pose Evaluation
+# =============================================================================
+def build_pair_index(N: int, B: int = 1):
+    """
+    Enumerate every unordered frame pair (i < j) and replicate it for each batch element.
+    Indices of batch element b are shifted by b * N, so they address a flattened [B * N] layout.
+    Args:
+        N: Number of frames
+        B: Batch size
+    Returns:
+        i1, i2: Flat 1-D index tensors holding the first and second member of each pair
+    """
+    pairs = torch.combinations(torch.arange(N), 2, with_replacement=False)
+    first, second = pairs.unbind(-1)
+    # One row of offsets per batch element; broadcasting adds it to every pair index.
+    batch_offsets = torch.arange(B)[:, None] * N
+    i1 = (first[None] + batch_offsets).reshape(-1)
+    i2 = (second[None] + batch_offsets).reshape(-1)
+    return i1, i2
+def compute_pose(pred_se3: torch.Tensor, gt_se3: torch.Tensor) -> Dict:
+    """
+    Score a predicted trajectory against ground truth with pairwise relative-pose AUC.
+    Both trajectories are first aligned to their own first camera; rotation and translation
+    angle errors are then computed over all frame pairs.
+    Args:
+        pred_se3: Predicted SE(3) transformations [N, 4, 4]
+        gt_se3: Ground truth SE(3) transformations [N, 4, 4]
+    Returns:
+        Dict with AUC metrics at different thresholds (auc30, auc15, auc05, auc03)
+    """
+    pred_aligned = align_to_first_camera(pred_se3)
+    gt_aligned = align_to_first_camera(gt_se3)
+    rot_err_deg, trans_err_deg = se3_to_relative_pose_error(
+        pred_aligned, gt_aligned, len(pred_aligned)
+    )
+    rot_err = rot_err_deg.cpu().numpy()
+    trans_err = trans_err_deg.cpu().numpy()
+    output = Dict()
+    # Same error arrays, evaluated at progressively tighter degree thresholds.
+    for field, threshold in (("auc30", 30), ("auc15", 15), ("auc05", 5), ("auc03", 3)):
+        auc_value, _ = calculate_auc_np(rot_err, trans_err, max_threshold=threshold)
+        setattr(output, field, auc_value)
+    return output
+def align_to_first_camera(camera_poses: torch.Tensor) -> torch.Tensor:
+    """
+    Express every pose relative to the first one.
+    Each pose is right-multiplied by the inverse of the first pose, so the first
+    returned pose is the identity.
+    Args:
+        camera_poses: Camera poses as SE3 transformations [N, 4, 4]
+    Returns:
+        Aligned camera poses [N, 4, 4]
+    """
+    reference_pose = camera_poses[0].unsqueeze(0)
+    reference_inv = closed_form_inverse_se3(reference_pose)
+    # The [1, 4, 4] inverse broadcasts against all N poses.
+    return torch.matmul(camera_poses, reference_inv)
+def rotation_angle(
+    rot_gt: torch.Tensor, rot_pred: torch.Tensor, batch_size: int = None, eps: float = 1e-15
+) -> torch.Tensor:
+    """
+    Angular distance, in degrees, between ground-truth and predicted rotations.
+    The rotations are converted to quaternions and the angle is derived from their dot product.
+    Args:
+        rot_gt: Ground truth rotation matrices
+        rot_pred: Predicted rotation matrices
+        batch_size: If given, the result is reshaped to [batch_size, -1]
+        eps: Lower clamp applied to 1 - <q_pred, q_gt>^2
+    Returns:
+        Rotation angle error in degrees
+    """
+    q_pred = mat_to_quat(rot_pred)
+    q_gt = mat_to_quat(rot_gt)
+    quat_dot = (q_pred * q_gt).sum(dim=1)
+    # Squaring the dot product makes the measure insensitive to the sign of either quaternion.
+    one_minus_dot_sq = (1 - quat_dot**2).clamp(min=eps)
+    angle_rad = torch.arccos(1 - 2 * one_minus_dot_sq)
+    angle_deg = angle_rad * 180 / np.pi
+    if batch_size is None:
+        return angle_deg
+    return angle_deg.reshape(batch_size, -1)
+def translation_angle(
+    tvec_gt: torch.Tensor,
+    tvec_pred: torch.Tensor,
+    batch_size: int = None,
+    ambiguity: bool = True,
+) -> torch.Tensor:
+    """
+    Angle, in degrees, between ground-truth and predicted translation directions.
+    Args:
+        tvec_gt: Ground truth translation vectors
+        tvec_pred: Predicted translation vectors
+        batch_size: If given, the result is reshaped to [batch_size, -1]
+        ambiguity: If True, an angle a is replaced by min(a, |180 - a|)
+    Returns:
+        Translation angle error in degrees
+    """
+    angle_deg = compare_translation_by_angle(tvec_gt, tvec_pred) * 180.0 / np.pi
+    if ambiguity:
+        # Treat a direction and its opposite as the same direction.
+        flipped = (180 - angle_deg).abs()
+        angle_deg = torch.min(angle_deg, flipped)
+    return angle_deg if batch_size is None else angle_deg.reshape(batch_size, -1)
+def compare_translation_by_angle(
+    t_gt: torch.Tensor, t: torch.Tensor, eps: float = 1e-15, default_err: float = 1e6
+) -> torch.Tensor:
+    """
+    Angle between translation directions after normalizing both vectors.
+    The angle is computed from the squared cosine, so it is the angle between the
+    two lines rather than the two directed vectors.
+    Args:
+        t_gt: Ground truth translation vectors
+        t: Predicted translation vectors
+        eps: Added to the norms before dividing and used as the lower clamp of 1 - cos^2
+        default_err: Value substituted wherever the result is NaN or infinite
+    Returns:
+        Angular error between translation vectors in radians
+    """
+    pred_unit = t / (torch.norm(t, dim=1, keepdim=True) + eps)
+    gt_unit = t_gt / (torch.norm(t_gt, dim=1, keepdim=True) + eps)
+    cosine = torch.sum(pred_unit * gt_unit, dim=1)
+    sin_sq = torch.clamp_min(1.0 - cosine**2, eps)
+    angle = torch.acos(torch.sqrt(1 - sin_sq))
+    # Non-finite results (NaN / inf) are replaced by the sentinel error.
+    invalid = ~torch.isfinite(angle)
+    return angle.masked_fill(invalid, default_err)
+def calculate_auc_np(
+    r_error: np.ndarray, t_error: np.ndarray, max_threshold: int = 30
+) -> tuple:
+    """
+    Area under the accuracy-vs-threshold curve for paired rotation/translation errors.
+    Each pair is scored by the larger of its two errors; those scores are binned into
+    unit-width bins over [0, max_threshold] and the mean of the cumulative histogram is returned.
+    Args:
+        r_error: Rotation error values in degrees
+        t_error: Translation error values in degrees
+        max_threshold: Upper edge of the last histogram bin
+    Returns:
+        Tuple of (AUC value, normalized histogram)
+    """
+    worst_error = np.stack((r_error, t_error), axis=1).max(axis=1)
+    bin_edges = np.arange(max_threshold + 1)
+    counts, _ = np.histogram(worst_error, bins=bin_edges)
+    # Normalize by the total number of pairs, including those beyond the last bin.
+    total_pairs = float(worst_error.shape[0])
+    normalized_histogram = counts.astype(float) / total_pairs
+    auc = np.mean(np.cumsum(normalized_histogram))
+    return auc, normalized_histogram
+def se3_to_relative_pose_error(
+    pred_se3: torch.Tensor, gt_se3: torch.Tensor, num_frames: int
+) -> tuple:
+    """
+    Rotation and translation angle errors over all pairs of frames.
+    For every pair (i, j) the relative pose inv(T_i) @ T_j is formed for both trajectories,
+    and the two relative poses are compared.
+    Args:
+        pred_se3: Predicted SE(3) transformations
+        gt_se3: Ground truth SE(3) transformations
+        num_frames: Number of frames
+    Returns:
+        Tuple of (rotation angle errors, translation angle errors) in degrees
+    """
+    idx_first, idx_second = build_pair_index(num_frames)
+    def _relative(poses: torch.Tensor) -> torch.Tensor:
+        # Closed-form inverse of the first pose composed with the second pose.
+        return closed_form_inverse_se3(poses[idx_first]).bmm(poses[idx_second])
+    rel_gt = _relative(gt_se3)
+    rel_pred = _relative(pred_se3)
+    rot_err_deg = rotation_angle(rel_gt[:, :3, :3], rel_pred[:, :3, :3])
+    trans_err_deg = translation_angle(rel_gt[:, :3, 3], rel_pred[:, :3, 3])
+    return rot_err_deg, trans_err_deg
+def closed_form_inverse_se3(
+    se3: torch.Tensor, R: torch.Tensor = None, T: torch.Tensor = None
+) -> torch.Tensor:
+    """
+    Invert SE3 matrices with the closed form [R^T | -R^T t] instead of a general inverse.
+    Works on torch tensors and numpy arrays (the branch is chosen from the type of se3)
+    and on any number of leading batch dimensions, including none.
+    Args:
+        se3: [..., 4, 4] or [..., 3, 4] SE3 matrices
+        R: Optional [..., 3, 3] rotation matrices; sliced from se3 when omitted
+        T: Optional [..., 3, 1] translation vectors; sliced from se3 when omitted
+    Returns:
+        Inverted matrices of shape [..., 4, 4]; a 3x4 input is completed with the
+        [0, 0, 0, 1] row. Torch output uses R's dtype and device; numpy output is float64.
+    Raises:
+        ValueError: If the last two dimensions of se3 are neither (4, 4) nor (3, 4)
+    """
+    is_numpy = isinstance(se3, np.ndarray)
+    if tuple(se3.shape[-2:]) not in ((4, 4), (3, 4)):
+        raise ValueError(
+            f"se3 must be of shape (..., 4, 4) or (..., 3, 4), got {se3.shape}."
+        )
+    # Ellipsis indexing keeps unbatched (4, 4) inputs and extra batch dimensions working.
+    if R is None:
+        R = se3[..., :3, :3]
+    if T is None:
+        T = se3[..., :3, 3:]
+    batch_shape = tuple(R.shape[:-2])
+    if is_numpy:
+        R_transposed = np.swapaxes(R, -1, -2)
+        top_right = -np.matmul(R_transposed, T)
+        inverted_matrix = np.broadcast_to(np.eye(4), batch_shape + (4, 4)).copy()
+    else:
+        R_transposed = R.transpose(-1, -2)
+        top_right = -torch.matmul(R_transposed, T)
+        identity = torch.eye(4, dtype=R.dtype, device=R.device)
+        inverted_matrix = identity.repeat(*batch_shape, 1, 1)
+    # Start from identity so the bottom row stays [0, 0, 0, 1]; fill rotation and translation.
+    inverted_matrix[..., :3, :3] = R_transposed
+    inverted_matrix[..., :3, 3:] = top_right
+    return inverted_matrix

Depth-Anything-3/src/depth_anything_3/cfg.py ADDED Viewed

	@@ -0,0 +1,144 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Configuration utility functions
+"""
+import importlib
+from pathlib import Path
+from typing import Any, Callable, List, Union
+from omegaconf import DictConfig, ListConfig, OmegaConf
+# SECURITY: this registers Python's builtin eval() as the OmegaConf resolver named "eval",
+# so a config value that invokes that resolver executes arbitrary Python code.
+# Only load configuration files from trusted sources.
+try:
+    OmegaConf.register_new_resolver("eval", eval)
+except Exception as e:
+    # Registration is best-effort: the failure is printed and module import continues.
+    # NOTE(review): presumably this fires when an "eval" resolver is already registered
+    # (e.g. the module is imported twice) — confirm against the OmegaConf documentation.
+    print(f"Error registering eval resolver: {e}")
+def load_config(path: str, argv: List[str] = None) -> Union[DictConfig, ListConfig]:
+    """
+    Load a configuration from a YAML file or a dotted module path and resolve inheritance.
+    A path is treated as a module path (e.g., depth_anything_3.configs.giant) when it
+    contains a dot, contains no "/" and does not end with ".yaml". Its first dotted
+    component is dropped and the remaining components are resolved, with a ".yaml"
+    suffix, relative to the directory of this file. Any other path is loaded as given.
+    Args:
+        path: File path or dotted module path of the configuration
+        argv: Optional dotlist overrides merged on top of the loaded configuration
+    Returns:
+        The loaded configuration with inheritance resolved recursively
+    """
+    is_module_path = "." in path and "/" not in path and not path.endswith(".yaml")
+    if is_module_path:
+        # Drop the leading package name; the rest mirrors the directory layout next to this file.
+        relative_parts = path.split(".")[1:]
+        package_dir = Path(__file__).resolve().parent
+        yaml_file = package_dir.joinpath(*relative_parts).with_suffix(".yaml")
+        config = OmegaConf.load(str(yaml_file))
+    else:
+        config = OmegaConf.load(path)
+    if argv is not None:
+        overrides = OmegaConf.from_dotlist(argv)
+        config = OmegaConf.merge(config, overrides)
+    return resolve_recursive(config, resolve_inheritance)
+def resolve_recursive(
+    config: Any,
+    resolver: Callable[[Union[DictConfig, ListConfig]], Union[DictConfig, ListConfig]],
+) -> Any:
+    """
+    Apply resolver to config, then to every nested DictConfig / ListConfig child.
+    The resolver runs on a node before its children are visited, and each resolved
+    child is written back into its parent.
+    """
+    config = resolver(config)
+    if isinstance(config, DictConfig):
+        child_keys = config.keys()
+    elif isinstance(config, ListConfig):
+        child_keys = range(len(config))
+    else:
+        # Leaf value: nothing to descend into.
+        return config
+    for key in child_keys:
+        child = config.get(key)
+        if isinstance(child, (DictConfig, ListConfig)):
+            config[key] = resolve_recursive(child, resolver)
+    return config
+def resolve_inheritance(config: Union[DictConfig, ListConfig]) -> Any:
+    """
+    Resolve a single level of inheritance declared via the "__inherit__" key:
+    __inherit__: path/to/parent.yaml or a ListConfig of such paths.
+    The key is removed from config. Parents are loaded with load_config and merged in
+    the listed order (later parents override earlier ones); the remaining keys of
+    config are then merged on top. Non-DictConfig inputs are returned unchanged.
+    """
+    if not isinstance(config, DictConfig):
+        return config
+    inherit = config.pop("__inherit__", None)
+    if not inherit:
+        return config
+    parent_paths = inherit if isinstance(inherit, ListConfig) else [inherit]
+    merged_parent = None
+    for parent_path in parent_paths:
+        assert isinstance(parent_path, str)
+        loaded = load_config(parent_path)
+        merged_parent = (
+            loaded if merged_parent is None else OmegaConf.merge(merged_parent, loaded)
+        )
+    # A child with no keys of its own is replaced by the merged parent as-is.
+    if len(config.keys()) > 0:
+        return OmegaConf.merge(merged_parent, config)
+    return merged_parent
+def import_item(path: str, name: str) -> Any:
+    """
+    Import module `path` and return its attribute `name`.
+    Example: import_item("path.to.file", "MyClass") -> MyClass
+    """
+    module = importlib.import_module(path)
+    return getattr(module, name)
+def create_object(config: DictConfig) -> Any:
+    """
+    Instantiate the object described by a config.
+    The config is expected to contain the following:
+    __object__:
+      path: path.to.module
+      name: MyClass
+      args: as_config | as_params (default to as_config)
+    With "as_config" the whole config is passed as the single positional argument;
+    with "as_params" the config is converted to a plain object, "__object__" is removed
+    and the remaining entries are passed as keyword arguments.
+    """
+    config = DictConfig(config)
+    spec = config.__object__
+    item = import_item(path=spec.path, name=spec.name)
+    args = spec.get("args", "as_config")
+    if args == "as_config":
+        return item(config)
+    if args == "as_params":
+        params = OmegaConf.to_object(config)
+        params.pop("__object__")
+        return item(**params)
+    raise NotImplementedError(f"Unknown args type: {args}")
+def create_dataset(path: str, *args, **kwargs) -> Any:
+    """
+    Build a dataset by calling the "create_dataset" function of module `path`.
+    All extra positional and keyword arguments are forwarded to that function.
+    """
+    factory = import_item(path, "create_dataset")
+    return factory(*args, **kwargs)
+def to_dict_recursive(config_obj):
+    """
+    Convert nested DictConfig / ListConfig containers into plain dicts and lists.
+    Any other value is returned unchanged.
+    """
+    if isinstance(config_obj, ListConfig):
+        return [to_dict_recursive(element) for element in config_obj]
+    if isinstance(config_obj, DictConfig):
+        return {key: to_dict_recursive(value) for key, value in config_obj.items()}
+    return config_obj

Depth-Anything-3/src/depth_anything_3/cli.py ADDED Viewed

	@@ -0,0 +1,803 @@

+# flake8: noqa: E402
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Refactored Depth Anything 3 CLI
+Clean, modular command-line interface
+"""
+from __future__ import annotations
+import os
+import typer
+from depth_anything_3.services import start_server
+from depth_anything_3.services.gallery import gallery as gallery_main
+from depth_anything_3.services.inference_service import run_inference
+from depth_anything_3.services.input_handlers import (
+    ColmapHandler,
+    ImageHandler,
+    ImagesHandler,
+    InputHandler,
+    VideoHandler,
+    parse_export_feat,
+)
+from depth_anything_3.utils.constants import (
+    DEFAULT_EXPORT_DIR,
+    DEFAULT_GALLERY_DIR,
+    DEFAULT_GRADIO_DIR,
+    DEFAULT_MODEL,
+)
+# Set at import time; this unconditionally overwrites any PYTORCH_CUDA_ALLOC_CONF value
+# already present in the environment.
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+# Typer application object; commands are attached to it with @app.command().
+app = typer.Typer(help="Depth Anything 3 - Video depth estimation CLI", add_completion=False)
+# ============================================================================
+# Input type detection utilities
+# ============================================================================
+# Supported file extensions (lowercase, with the leading dot); detect_input_type
+# lowercases a path's extension before looking it up in these sets.
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff", ".tif"}
+VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v"}
+def detect_input_type(input_path: str) -> str:
+    """
+    Classify an input path by inspecting the filesystem.
+    Returns:
+        - "image": Existing file whose extension is in IMAGE_EXTENSIONS
+        - "video": Existing file whose extension is in VIDEO_EXTENSIONS
+        - "colmap": Directory containing both "images" and "sparse" subdirectories
+        - "images": Directory with at least one direct child file that is an image
+        - "unknown": Path does not exist or matches none of the above
+    """
+    def _lower_ext(name: str) -> str:
+        return os.path.splitext(name)[1].lower()
+    if not os.path.exists(input_path):
+        return "unknown"
+    if os.path.isfile(input_path):
+        extension = _lower_ext(input_path)
+        if extension in IMAGE_EXTENSIONS:
+            return "image"
+        if extension in VIDEO_EXTENSIONS:
+            return "video"
+        return "unknown"
+    if not os.path.isdir(input_path):
+        return "unknown"
+    # The COLMAP layout takes precedence over a plain image directory.
+    has_images_dir = os.path.isdir(os.path.join(input_path, "images"))
+    if has_images_dir and os.path.isdir(os.path.join(input_path, "sparse")):
+        return "colmap"
+    # Only direct children are inspected; subdirectories are not searched.
+    contains_image = any(
+        os.path.isfile(os.path.join(input_path, entry))
+        and _lower_ext(entry) in IMAGE_EXTENSIONS
+        for entry in os.listdir(input_path)
+    )
+    return "images" if contains_image else "unknown"
+# ============================================================================
+# Common parameters and configuration
+# ============================================================================
+# ============================================================================
+# Inference commands
+# ============================================================================
+@app.command()
+def auto(
+    input_path: str = typer.Argument(
+        ..., help="Path to input (image, directory, video, or COLMAP)"
+    ),
+    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
+    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
+    export_format: str = typer.Option("glb", help="Export format"),
+    device: str = typer.Option("cuda", help="Device to use"),
+    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
+    backend_url: str = typer.Option(
+        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
+    ),
+    process_res: int = typer.Option(504, help="Processing resolution"),
+    process_res_method: str = typer.Option(
+        "upper_bound_resize", help="Processing resolution method"
+    ),
+    export_feat: str = typer.Option(
+        "",
+        help="[FEAT_VIS]Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
+    ),
+    auto_cleanup: bool = typer.Option(
+        False, help="Automatically clean export directory if it exists (no prompt)"
+    ),
+    # Video-specific options
+    fps: float = typer.Option(1.0, help="[Video] Sampling FPS for frame extraction"),
+    # COLMAP-specific options
+    sparse_subdir: str = typer.Option(
+        "", help="[COLMAP] Sparse reconstruction subdirectory (e.g., '0' for sparse/0/)"
+    ),
+    align_to_input_ext_scale: bool = typer.Option(
+        True, help="[COLMAP] Align prediction to input extrinsics scale"
+    ),
+    # Pose estimation options
+    use_ray_pose: bool = typer.Option(
+        False, help="Use ray-based pose estimation instead of camera decoder"
+    ),
+    ref_view_strategy: str = typer.Option(
+        "saddle_balanced",
+        help="Reference view selection strategy: empty, first, middle, saddle_balanced, saddle_sim_range",
+    ),
+    # GLB export options
+    conf_thresh_percentile: float = typer.Option(
+        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
+    ),
+    num_max_points: int = typer.Option(
+        1_000_000, help="[GLB] Maximum number of points in the point cloud"
+    ),
+    show_cameras: bool = typer.Option(
+        True, help="[GLB] Show camera wireframes in the exported scene"
+    ),
+    # Feat_vis export options
+    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
+):
+    """
+    Automatically detect input type and run appropriate processing.
+    Supports:
+    - Single image file (.jpg, .png, etc.)
+    - Directory of images
+    - Video file (.mp4, .avi, etc.)
+    - COLMAP directory (with 'images' and 'sparse' subdirectories)
+    """
+    # NOTE: the docstring above is left untouched because it doubles as the command's help text.
+    # Detect input type
+    input_type = detect_input_type(input_path)
+    if input_type == "unknown":
+        typer.echo(f"❌ Error: Cannot determine input type for: {input_path}", err=True)
+        typer.echo("Supported inputs:", err=True)
+        typer.echo("  - Single image file (.jpg, .png, etc.)", err=True)
+        typer.echo("  - Directory containing images", err=True)
+        typer.echo("  - Video file (.mp4, .avi, etc.)", err=True)
+        typer.echo("  - COLMAP directory (with 'images/' and 'sparse/' subdirectories)", err=True)
+        raise typer.Exit(1)
+    # Display detected type
+    typer.echo(f"🔍 Detected input type: {input_type.upper()}")
+    typer.echo(f"📁 Input path: {input_path}")
+    typer.echo()
+    # Determine backend URL based on use_backend flag
+    final_backend_url = backend_url if use_backend else None
+    # Parse export_feat parameter
+    export_feat_layers = parse_export_feat(export_feat)
+    # Only the COLMAP route passes camera parameters on to run_inference.
+    camera_kwargs = {}
+    # Each route resolves image_files and the export directory; inference runs once below.
+    if input_type == "image":
+        typer.echo("Processing single image...")
+        image_files = ImageHandler.process(input_path)
+        export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+    elif input_type == "images":
+        typer.echo("Processing directory of images...")
+        # Use default extensions.
+        # NOTE(review): detect_input_type accepts every IMAGE_EXTENSIONS suffix, but only
+        # png/jpg/jpeg are requested here — confirm whether other formats should be included.
+        image_files = ImagesHandler.process(input_path, "png,jpg,jpeg")
+        export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+    elif input_type == "video":
+        typer.echo(f"Processing video with FPS={fps}...")
+        # The export directory is handled first because it is passed to VideoHandler.process.
+        export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+        image_files = VideoHandler.process(input_path, export_dir, fps)
+    elif input_type == "colmap":
+        typer.echo(
+            f"Processing COLMAP directory (sparse subdirectory: '{sparse_subdir or 'default'}')..."
+        )
+        image_files, extrinsics, intrinsics = ColmapHandler.process(input_path, sparse_subdir)
+        export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+        camera_kwargs = {
+            "extrinsics": extrinsics,
+            "intrinsics": intrinsics,
+            "align_to_input_ext_scale": align_to_input_ext_scale,
+        }
+    else:
+        # Not reachable with the values detect_input_type returns today; fail loudly
+        # instead of reporting success without having processed anything.
+        typer.echo(f"❌ Error: Unsupported input type: {input_type}", err=True)
+        raise typer.Exit(1)
+    # Run inference
+    run_inference(
+        image_paths=image_files,
+        export_dir=export_dir,
+        model_dir=model_dir,
+        device=device,
+        backend_url=final_backend_url,
+        export_format=export_format,
+        process_res=process_res,
+        process_res_method=process_res_method,
+        export_feat_layers=export_feat_layers,
+        use_ray_pose=use_ray_pose,
+        ref_view_strategy=ref_view_strategy,
+        conf_thresh_percentile=conf_thresh_percentile,
+        num_max_points=num_max_points,
+        show_cameras=show_cameras,
+        feat_vis_fps=feat_vis_fps,
+        **camera_kwargs,
+    )
+    typer.echo()
+    typer.echo("✅ Processing completed successfully!")
+@app.command()
+def image(
+    image_path: str = typer.Argument(..., help="Path to input image file"),
+    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
+    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
+    export_format: str = typer.Option("glb", help="Export format"),
+    device: str = typer.Option("cuda", help="Device to use"),
+    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
+    backend_url: str = typer.Option(
+        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
+    ),
+    process_res: int = typer.Option(504, help="Processing resolution"),
+    process_res_method: str = typer.Option(
+        "upper_bound_resize", help="Processing resolution method"
+    ),
+    export_feat: str = typer.Option(
+        "",
+        help="[FEAT_VIS] Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
+    ),
+    auto_cleanup: bool = typer.Option(
+        False, help="Automatically clean export directory if it exists (no prompt)"
+    ),
+    # Pose estimation options
+    use_ray_pose: bool = typer.Option(
+        False, help="Use ray-based pose estimation instead of camera decoder"
+    ),
+    ref_view_strategy: str = typer.Option(
+        "saddle_balanced",
+        help="Reference view selection strategy: empty, first, middle, saddle_balanced, saddle_sim_range",
+    ),
+    # GLB export options
+    conf_thresh_percentile: float = typer.Option(
+        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
+    ),
+    num_max_points: int = typer.Option(
+        1_000_000, help="[GLB] Maximum number of points in the point cloud"
+    ),
+    show_cameras: bool = typer.Option(
+        True, help="[GLB] Show camera wireframes in the exported scene"
+    ),
+    # Feat_vis export options
+    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
+):
+    """Run camera pose and depth estimation on a single image."""
+    # Process input
+    image_files = ImageHandler.process(image_path)
+    # Handle export directory
+    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+    # Parse export_feat parameter
+    export_feat_layers = parse_export_feat(export_feat)
+    # Determine backend URL based on use_backend flag
+    final_backend_url = backend_url if use_backend else None
+    # Run inference
+    run_inference(
+        image_paths=image_files,
+        export_dir=export_dir,
+        model_dir=model_dir,
+        device=device,
+        backend_url=final_backend_url,
+        export_format=export_format,
+        process_res=process_res,
+        process_res_method=process_res_method,
+        export_feat_layers=export_feat_layers,
+        use_ray_pose=use_ray_pose,
+        # The option is named ref_view_strategy; the keyword matches the one used by
+        # the `auto` command's run_inference calls.
+        ref_view_strategy=ref_view_strategy,
+        conf_thresh_percentile=conf_thresh_percentile,
+        num_max_points=num_max_points,
+        show_cameras=show_cameras,
+        feat_vis_fps=feat_vis_fps,
+    )
+@app.command()
+def images(
+    images_dir: str = typer.Argument(..., help="Path to directory containing input images"),
+    image_extensions: str = typer.Option(
+        "png,jpg,jpeg", help="Comma-separated image file extensions to process"
+    ),
+    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
+    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
+    export_format: str = typer.Option("glb", help="Export format"),
+    device: str = typer.Option("cuda", help="Device to use"),
+    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
+    backend_url: str = typer.Option(
+        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
+    ),
+    process_res: int = typer.Option(504, help="Processing resolution"),
+    process_res_method: str = typer.Option(
+        "upper_bound_resize", help="Processing resolution method"
+    ),
+    export_feat: str = typer.Option(
+        "",
+        help="[FEAT_VIS] Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
+    ),
+    auto_cleanup: bool = typer.Option(
+        False, help="Automatically clean export directory if it exists (no prompt)"
+    ),
+    # Pose estimation options
+    use_ray_pose: bool = typer.Option(
+        False, help="Use ray-based pose estimation instead of camera decoder"
+    ),
+    ref_view_strategy: str = typer.Option(
+        "saddle_balanced",
+        help="Reference view selection strategy: empty, first, middle, saddle_balanced, saddle_sim_range",
+    ),
+    # GLB export options
+    conf_thresh_percentile: float = typer.Option(
+        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
+    ),
+    num_max_points: int = typer.Option(
+        1_000_000, help="[GLB] Maximum number of points in the point cloud"
+    ),
+    show_cameras: bool = typer.Option(
+        True, help="[GLB] Show camera wireframes in the exported scene"
+    ),
+    # Feat_vis export options
+    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
+):
+    """Run camera pose and depth estimation on a directory of images."""
+    # Process input
+    image_files = ImagesHandler.process(images_dir, image_extensions)
+    # Handle export directory
+    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+    # Parse export_feat parameter
+    export_feat_layers = parse_export_feat(export_feat)
+    # Determine backend URL based on use_backend flag
+    final_backend_url = backend_url if use_backend else None
+    # Run inference
+    run_inference(
+        image_paths=image_files,
+        export_dir=export_dir,
+        model_dir=model_dir,
+        device=device,
+        backend_url=final_backend_url,
+        export_format=export_format,
+        process_res=process_res,
+        process_res_method=process_res_method,
+        export_feat_layers=export_feat_layers,
+        use_ray_pose=use_ray_pose,
+        reference_view_strategy=reference_view_strategy,
+        conf_thresh_percentile=conf_thresh_percentile,
+        num_max_points=num_max_points,
+        show_cameras=show_cameras,
+        feat_vis_fps=feat_vis_fps,
+    )
+@app.command()
+def colmap(
+    colmap_dir: str = typer.Argument(
+        ..., help="Path to COLMAP directory containing 'images' and 'sparse' subdirectories"
+    ),
+    sparse_subdir: str = typer.Option(
+        "", help="Sparse reconstruction subdirectory (e.g., '0' for sparse/0/, empty for sparse/)"
+    ),
+    align_to_input_ext_scale: bool = typer.Option(
+        True, help="Align prediction to input extrinsics scale"
+    ),
+    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
+    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
+    export_format: str = typer.Option("glb", help="Export format"),
+    device: str = typer.Option("cuda", help="Device to use"),
+    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
+    backend_url: str = typer.Option(
+        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
+    ),
+    process_res: int = typer.Option(504, help="Processing resolution"),
+    process_res_method: str = typer.Option(
+        "upper_bound_resize", help="Processing resolution method"
+    ),
+    export_feat: str = typer.Option(
+        "",
+        help="Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
+    ),
+    auto_cleanup: bool = typer.Option(
+        False, help="Automatically clean export directory if it exists (no prompt)"
+    ),
+    # Pose estimation options
+    use_ray_pose: bool = typer.Option(
+        False, help="Use ray-based pose estimation instead of camera decoder"
+    ),
+    ref_view_strategy: str = typer.Option(
+        "saddle_balanced",
+        help="Reference view selection strategy: empty, first, middle, saddle_balanced, saddle_sim_range",
+    ),
+    # GLB export options
+    conf_thresh_percentile: float = typer.Option(
+        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
+    ),
+    num_max_points: int = typer.Option(
+        1_000_000, help="[GLB] Maximum number of points in the point cloud"
+    ),
+    show_cameras: bool = typer.Option(
+        True, help="[GLB] Show camera wireframes in the exported scene"
+    ),
+    # Feat_vis export options
+    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
+):
+    """Run pose conditioned depth estimation on COLMAP data."""
+    # Process input
+    image_files, extrinsics, intrinsics = ColmapHandler.process(colmap_dir, sparse_subdir)
+    # Handle export directory
+    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+    # Parse export_feat parameter
+    export_feat_layers = parse_export_feat(export_feat)
+    # Determine backend URL based on use_backend flag
+    final_backend_url = backend_url if use_backend else None
+    # Run inference
+    run_inference(
+        image_paths=image_files,
+        export_dir=export_dir,
+        model_dir=model_dir,
+        device=device,
+        backend_url=final_backend_url,
+        export_format=export_format,
+        process_res=process_res,
+        process_res_method=process_res_method,
+        export_feat_layers=export_feat_layers,
+        extrinsics=extrinsics,
+        intrinsics=intrinsics,
+        align_to_input_ext_scale=align_to_input_ext_scale,
+        use_ray_pose=use_ray_pose,
+        reference_view_strategy=reference_view_strategy,
+        conf_thresh_percentile=conf_thresh_percentile,
+        num_max_points=num_max_points,
+        show_cameras=show_cameras,
+        feat_vis_fps=feat_vis_fps,
+    )
+@app.command()
+def video(
+    video_path: str = typer.Argument(..., help="Path to input video file"),
+    fps: float = typer.Option(1.0, help="Sampling FPS for frame extraction"),
+    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
+    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
+    export_format: str = typer.Option("glb", help="Export format"),
+    device: str = typer.Option("cuda", help="Device to use"),
+    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
+    backend_url: str = typer.Option(
+        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
+    ),
+    process_res: int = typer.Option(504, help="Processing resolution"),
+    process_res_method: str = typer.Option(
+        "upper_bound_resize", help="Processing resolution method"
+    ),
+    export_feat: str = typer.Option(
+        "",
+        help="[FEAT_VIS] Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
+    ),
+    auto_cleanup: bool = typer.Option(
+        False, help="Automatically clean export directory if it exists (no prompt)"
+    ),
+    # Pose estimation options
+    use_ray_pose: bool = typer.Option(
+        False, help="Use ray-based pose estimation instead of camera decoder"
+    ),
+    ref_view_strategy: str = typer.Option(
+        "saddle_balanced",
+        help="Reference view selection strategy: empty, first, middle, saddle_balanced, saddle_sim_range",
+    ),
+    # GLB export options
+    conf_thresh_percentile: float = typer.Option(
+        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
+    ),
+    num_max_points: int = typer.Option(
+        1_000_000, help="[GLB] Maximum number of points in the point cloud"
+    ),
+    show_cameras: bool = typer.Option(
+        True, help="[GLB] Show camera wireframes in the exported scene"
+    ),
+    # Feat_vis export options
+    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
+):
+    """Run depth estimation on video by extracting frames and processing them."""
+    # Handle export directory
+    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
+    # Process input
+    image_files = VideoHandler.process(video_path, export_dir, fps)
+    # Parse export_feat parameter
+    export_feat_layers = parse_export_feat(export_feat)
+    # Determine backend URL based on use_backend flag
+    final_backend_url = backend_url if use_backend else None
+    # Run inference
+    run_inference(
+        image_paths=image_files,
+        export_dir=export_dir,
+        model_dir=model_dir,
+        device=device,
+        backend_url=final_backend_url,
+        export_format=export_format,
+        process_res=process_res,
+        process_res_method=process_res_method,
+        export_feat_layers=export_feat_layers,
+        use_ray_pose=use_ray_pose,
+        reference_view_strategy=reference_view_strategy,
+        conf_thresh_percentile=conf_thresh_percentile,
+        num_max_points=num_max_points,
+        show_cameras=show_cameras,
+        feat_vis_fps=feat_vis_fps,
+    )
+# ============================================================================
+# Service management commands
+# ============================================================================
+@app.command()
+def backend(
+    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
+    device: str = typer.Option("cuda", help="Device to use"),
+    host: str = typer.Option("127.0.0.1", help="Host to bind to"),
+    port: int = typer.Option(8008, help="Port to bind to"),
+    gallery_dir: str = typer.Option(DEFAULT_GALLERY_DIR, help="Gallery directory path (optional)"),
+):
+    """Start model backend service with integrated gallery."""
+    typer.echo("=" * 60)
+    typer.echo("🚀 Starting Depth Anything 3 Backend Server")
+    typer.echo("=" * 60)
+    typer.echo(f"Model directory: {model_dir}")
+    typer.echo(f"Device: {device}")
+    # Check if gallery directory exists
+    if gallery_dir and os.path.exists(gallery_dir):
+        typer.echo(f"Gallery directory: {gallery_dir}")
+    else:
+        gallery_dir = None  # Disable gallery if directory doesn't exist
+    typer.echo()
+    typer.echo("📡 Server URLs (Ctrl/CMD+Click to open):")
+    typer.echo(f"  🏠 Home:      http://{host}:{port}")
+    typer.echo(f"  📊 Dashboard: http://{host}:{port}/dashboard")
+    typer.echo(f"  📈 API Status: http://{host}:{port}/status")
+    if gallery_dir:
+        typer.echo(f"  🎨 Gallery:   http://{host}:{port}/gallery/")
+    typer.echo("=" * 60)
+    try:
+        start_server(model_dir, device, host, port, gallery_dir)
+    except KeyboardInterrupt:
+        typer.echo("\n👋 Backend server stopped.")
+    except Exception as e:
+        typer.echo(f"❌ Failed to start backend: {e}")
+        raise typer.Exit(1)
+# ============================================================================
+# Application launch commands
+# ============================================================================
+@app.command()
+def gradio(
+    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
+    workspace_dir: str = typer.Option(DEFAULT_GRADIO_DIR, help="Workspace directory path"),
+    gallery_dir: str = typer.Option(DEFAULT_GALLERY_DIR, help="Gallery directory path"),
+    host: str = typer.Option("127.0.0.1", help="Host address to bind to"),
+    port: int = typer.Option(7860, help="Port number to bind to"),
+    share: bool = typer.Option(False, help="Create a public link for the app"),
+    debug: bool = typer.Option(False, help="Enable debug mode"),
+    cache_examples: bool = typer.Option(
+        False, help="Pre-cache all example scenes at startup for faster loading"
+    ),
+    cache_gs_tag: str = typer.Option(
+        "",
+        help="Tag to match scene names for high-res+3DGS caching (e.g., 'dl3dv'). Scenes containing this tag will use high_res and infer_gs=True; others will use low_res only.",
+    ),
+):
+    """Launch Depth Anything 3 Gradio interactive web application"""
+    from depth_anything_3.app.gradio_app import DepthAnything3App
+    # Create necessary directories
+    os.makedirs(workspace_dir, exist_ok=True)
+    os.makedirs(gallery_dir, exist_ok=True)
+    typer.echo("Launching Depth Anything 3 Gradio application...")
+    typer.echo(f"Model directory: {model_dir}")
+    typer.echo(f"Workspace directory: {workspace_dir}")
+    typer.echo(f"Gallery directory: {gallery_dir}")
+    typer.echo(f"Host: {host}")
+    typer.echo(f"Port: {port}")
+    typer.echo(f"Share: {share}")
+    typer.echo(f"Debug mode: {debug}")
+    typer.echo(f"Cache examples: {cache_examples}")
+    if cache_examples:
+        if cache_gs_tag:
+            typer.echo(
+                f"Cache GS Tag: '{cache_gs_tag}' (scenes matching this tag will use high-res + 3DGS)"
+            )
+        else:
+            typer.echo(f"Cache GS Tag: None (all scenes will use low-res only)")
+    try:
+        # Initialize and launch application
+        app = DepthAnything3App(
+            model_dir=model_dir, workspace_dir=workspace_dir, gallery_dir=gallery_dir
+        )
+        # Pre-cache examples if requested
+        if cache_examples:
+            typer.echo("\n" + "=" * 60)
+            typer.echo("Pre-caching mode enabled")
+            if cache_gs_tag:
+                typer.echo(f"Scenes containing '{cache_gs_tag}' will use HIGH-RES + 3DGS")
+                typer.echo(f"Other scenes will use LOW-RES only")
+            else:
+                typer.echo(f"All scenes will use LOW-RES only")
+            typer.echo("=" * 60)
+            app.cache_examples(
+                show_cam=True,
+                filter_black_bg=False,
+                filter_white_bg=False,
+                save_percentage=20.0,
+                num_max_points=1000,
+                cache_gs_tag=cache_gs_tag,
+                gs_trj_mode="smooth",
+                gs_video_quality="low",
+            )
+        # Prepare launch arguments
+        launch_kwargs = {"share": share, "debug": debug}
+        app.launch(host=host, port=port, **launch_kwargs)
+    except KeyboardInterrupt:
+        typer.echo("\nGradio application stopped.")
+    except Exception as e:
+        typer.echo(f"Failed to launch Gradio application: {e}")
+        raise typer.Exit(1)
+@app.command()
+def gallery(
+    gallery_dir: str = typer.Option(DEFAULT_GALLERY_DIR, help="Gallery root directory"),
+    host: str = typer.Option("127.0.0.1", help="Host address to bind to"),
+    port: int = typer.Option(8007, help="Port number to bind to"),
+    open_browser: bool = typer.Option(False, help="Open browser after launch"),
+):
+    """Launch Depth Anything 3 Gallery server"""
+    # Validate gallery directory
+    if not os.path.exists(gallery_dir):
+        raise typer.BadParameter(f"Gallery directory not found: {gallery_dir}")
+    typer.echo("Launching Depth Anything 3 Gallery server...")
+    typer.echo(f"Gallery directory: {gallery_dir}")
+    typer.echo(f"Host: {host}")
+    typer.echo(f"Port: {port}")
+    typer.echo(f"Auto-open browser: {open_browser}")
+    try:
+        # Set command line arguments
+        import sys
+        sys.argv = ["gallery", "--dir", gallery_dir, "--host", host, "--port", str(port)]
+        if open_browser:
+            sys.argv.append("--open")
+        # Launch gallery server
+        gallery_main()
+    except KeyboardInterrupt:
+        typer.echo("\nGallery server stopped.")
+    except Exception as e:
+        typer.echo(f"Failed to launch Gallery server: {e}")
+        raise typer.Exit(1)
+if __name__ == "__main__":  # Only dispatch the CLI when this file is run as a script, not on import.
+    app()  # Invoke the Typer application object that the commands above are registered on.

Depth-Anything-3/src/depth_anything_3/configs/da3-base.yaml ADDED Viewed

	@@ -0,0 +1,45 @@

+__object__:  # Names the class this config describes (module `path`, class `name`); presumably consumed by the project's config loader - TODO confirm.
+  path: depth_anything_3.model.da3
+  name: DepthAnything3Net
+  args: as_params
+net:  # Backbone section (DinoV2, variant given by `name`).
+  __object__:
+    path: depth_anything_3.model.dinov2.dinov2
+    name: DinoV2
+    args: as_params
+  name: vitb
+  out_layers: [5, 7, 9, 11]
+  alt_start: 4
+  qknorm_start: 4
+  rope_start: 4
+  cat_token: True
+head:  # Prediction head section (DualDPT).
+  __object__:
+    path: depth_anything_3.model.dualdpt
+    name: DualDPT
+    args: as_params
+  dim_in: &head_dim_in 1536  # YAML anchor; no `*head_dim_in` alias appears in the visible part of this file.
+  output_dim: 2
+  features: &head_features 128  # YAML anchor; not aliased in the visible part of this file.
+  out_channels: &head_out_channels [96, 192, 384, 768]  # YAML anchor; not aliased in the visible part of this file.
+cam_enc:  # Camera encoder section.
+  __object__:
+    path: depth_anything_3.model.cam_enc
+    name: CameraEnc
+    args: as_params
+  dim_out: 768  # Half of head.dim_in (1536).
+cam_dec:  # Camera decoder section.
+  __object__:
+    path: depth_anything_3.model.cam_dec
+    name: CameraDec
+    args: as_params
+  dim_in: 1536  # Same value as head.dim_in.

Depth-Anything-3/src/depth_anything_3/configs/da3-giant.yaml ADDED Viewed

	@@ -0,0 +1,71 @@

+__object__:  # Names the class this config describes (module `path`, class `name`); presumably consumed by the project's config loader - TODO confirm.
+  path: depth_anything_3.model.da3
+  name: DepthAnything3Net
+  args: as_params
+net:  # Backbone section (DinoV2, variant given by `name`).
+  __object__:
+    path: depth_anything_3.model.dinov2.dinov2
+    name: DinoV2
+    args: as_params
+  name: vitg
+  out_layers: [19, 27, 33, 39]
+  alt_start: 13
+  qknorm_start: 13
+  rope_start: 13
+  cat_token: True
+head:  # Prediction head section (DualDPT).
+  __object__:
+    path: depth_anything_3.model.dualdpt
+    name: DualDPT
+    args: as_params
+  dim_in: &head_dim_in 3072  # YAML anchor, reused by gs_head.dim_in below.
+  output_dim: 2
+  features: &head_features 256  # YAML anchor, reused by gs_head.features below.
+  out_channels: &head_out_channels [256, 512, 1024, 1024]  # YAML anchor, reused by gs_head.out_channels below.
+cam_enc:  # Camera encoder section.
+  __object__:
+    path: depth_anything_3.model.cam_enc
+    name: CameraEnc
+    args: as_params
+  dim_out: 1536  # Half of head.dim_in (3072).
+cam_dec:  # Camera decoder section.
+  __object__:
+    path: depth_anything_3.model.cam_dec
+    name: CameraDec
+    args: as_params
+  dim_in: 3072  # Same value as head.dim_in.
+gs_head:  # GSDPT head; its sizes are aliases of the `head` anchors above.
+  __object__:
+    path: depth_anything_3.model.gsdpt
+    name: GSDPT
+    args: as_params
+  dim_in: *head_dim_in
+  output_dim: 38  # should align with gs_adapter's setting, for gs params
+  features: *head_features
+  out_channels: *head_out_channels
+gs_adapter:  # GaussianAdapter settings; gs_head.output_dim must stay consistent with these (see the note on output_dim).
+  __object__:
+    path: depth_anything_3.model.gs_adapter
+    name: GaussianAdapter
+    args: as_params
+  sh_degree: 2
+  pred_color: false  # predict SH coefficient if false
+  pred_offset_depth: true
+  pred_offset_xy: true
+  gaussian_scale_min: 1e-5
+  gaussian_scale_max: 30.0

Depth-Anything-3/src/depth_anything_3/configs/da3-large.yaml ADDED Viewed

	@@ -0,0 +1,45 @@

+__object__:  # Names the class this config describes (module `path`, class `name`); presumably consumed by the project's config loader - TODO confirm.
+  path: depth_anything_3.model.da3
+  name: DepthAnything3Net
+  args: as_params
+net:  # Backbone section (DinoV2, variant given by `name`).
+  __object__:
+    path: depth_anything_3.model.dinov2.dinov2
+    name: DinoV2
+    args: as_params
+  name: vitl
+  out_layers: [11, 15, 19, 23]
+  alt_start: 8
+  qknorm_start: 8
+  rope_start: 8
+  cat_token: True
+head:  # Prediction head section (DualDPT).
+  __object__:
+    path: depth_anything_3.model.dualdpt
+    name: DualDPT
+    args: as_params
+  dim_in: &head_dim_in 2048  # YAML anchor; no `*head_dim_in` alias appears in the visible part of this file.
+  output_dim: 2
+  features: &head_features 256  # YAML anchor; not aliased in the visible part of this file.
+  out_channels: &head_out_channels [256, 512, 1024, 1024]  # YAML anchor; not aliased in the visible part of this file.
+cam_enc:  # Camera encoder section.
+  __object__:
+    path: depth_anything_3.model.cam_enc
+    name: CameraEnc
+    args: as_params
+  dim_out: 1024  # Half of head.dim_in (2048).
+cam_dec:  # Camera decoder section.
+  __object__:
+    path: depth_anything_3.model.cam_dec
+    name: CameraDec
+    args: as_params
+  dim_in: 2048  # Same value as head.dim_in.

Depth-Anything-3/src/depth_anything_3/configs/da3-small.yaml ADDED Viewed

	@@ -0,0 +1,45 @@

+__object__:  # Names the class this config describes (module `path`, class `name`); presumably consumed by the project's config loader - TODO confirm.
+  path: depth_anything_3.model.da3
+  name: DepthAnything3Net
+  args: as_params
+net:  # Backbone section (DinoV2, variant given by `name`).
+  __object__:
+    path: depth_anything_3.model.dinov2.dinov2
+    name: DinoV2
+    args: as_params
+  name: vits
+  out_layers: [5, 7, 9, 11]
+  alt_start: 4
+  qknorm_start: 4
+  rope_start: 4
+  cat_token: True
+head:  # Prediction head section (DualDPT).
+  __object__:
+    path: depth_anything_3.model.dualdpt
+    name: DualDPT
+    args: as_params
+  dim_in: &head_dim_in 768  # YAML anchor; no `*head_dim_in` alias appears in the visible part of this file.
+  output_dim: 2
+  features: &head_features 64  # YAML anchor; not aliased in the visible part of this file.
+  out_channels: &head_out_channels [48, 96, 192, 384]  # YAML anchor; not aliased in the visible part of this file.
+cam_enc:  # Camera encoder section.
+  __object__:
+    path: depth_anything_3.model.cam_enc
+    name: CameraEnc
+    args: as_params
+  dim_out: 384  # Half of head.dim_in (768).
+cam_dec:  # Camera decoder section.
+  __object__:
+    path: depth_anything_3.model.cam_dec
+    name: CameraDec
+    args: as_params
+  dim_in: 768  # Same value as head.dim_in.

Depth-Anything-3/src/depth_anything_3/configs/da3metric-large.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+__object__:  # Names the class this config describes (module `path`, class `name`); presumably consumed by the project's config loader - TODO confirm.
+  path: depth_anything_3.model.da3
+  name: DepthAnything3Net
+  args: as_params
+net:  # Backbone section (DinoV2, variant given by `name`).
+  __object__:
+    path: depth_anything_3.model.dinov2.dinov2
+    name: DinoV2
+    args: as_params
+  name: vitl
+  out_layers: [4, 11, 17, 23]
+  alt_start: -1 # -1 means disable
+  qknorm_start: -1  # Same -1 sentinel as alt_start (presumably also "disabled" - TODO confirm).
+  rope_start: -1  # Same -1 sentinel as alt_start (presumably also "disabled" - TODO confirm).
+  cat_token: False
+head:  # Prediction head section (DPT); this config defines no cam_enc / cam_dec sections.
+  __object__:
+    path: depth_anything_3.model.dpt
+    name: DPT
+    args: as_params
+  dim_in: 1024
+  output_dim: 1
+  features: 256
+  out_channels: [256, 512, 1024, 1024]

Depth-Anything-3/src/depth_anything_3/configs/da3mono-large.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+__object__:  # Names the class this config describes (module `path`, class `name`); presumably consumed by the project's config loader - TODO confirm.
+  path: depth_anything_3.model.da3
+  name: DepthAnything3Net
+  args: as_params
+net:  # Backbone section (DinoV2, variant given by `name`).
+  __object__:
+    path: depth_anything_3.model.dinov2.dinov2
+    name: DinoV2
+    args: as_params
+  name: vitl
+  out_layers: [4, 11, 17, 23]
+  alt_start: -1 # -1 means disable
+  qknorm_start: -1  # Same -1 sentinel as alt_start (presumably also "disabled" - TODO confirm).
+  rope_start: -1  # Same -1 sentinel as alt_start (presumably also "disabled" - TODO confirm).
+  cat_token: False
+head:  # Prediction head section (DPT); this config defines no cam_enc / cam_dec sections.
+  __object__:
+    path: depth_anything_3.model.dpt
+    name: DPT
+    args: as_params
+  dim_in: 1024
+  output_dim: 1
+  features: 256
+  out_channels: [256, 512, 1024, 1024]

Depth-Anything-3/src/depth_anything_3/configs/da3nested-giant-large.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+__object__:  # Names the class this config describes (module `path`, class `name`); presumably consumed by the project's config loader - TODO confirm.
+  path: depth_anything_3.model.da3
+  name: NestedDepthAnything3Net
+  args: as_params
+anyview:  # First sub-config of the nested model.
+  __inherit__: depth_anything_3.configs.da3-giant  # Presumably pulls in the da3-giant config by dotted path - TODO confirm loader semantics.
+metric:  # Second sub-config of the nested model.
+  __inherit__: depth_anything_3.configs.da3metric-large  # Presumably pulls in the da3metric-large config by dotted path - TODO confirm loader semantics.

Depth-Anything-3/src/depth_anything_3/model/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from depth_anything_3.model.da3 import DepthAnything3Net, NestedDepthAnything3Net
+__export__ = [
+    NestedDepthAnything3Net,
+    DepthAnything3Net,
+]

Depth-Anything-3/src/depth_anything_3/model/cam_dec.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+class CameraDec(nn.Module):
+    def __init__(self, dim_in=1536):
+        super().__init__()
+        output_dim = dim_in
+        self.backbone = nn.Sequential(
+            nn.Linear(output_dim, output_dim),
+            nn.ReLU(),
+            nn.Linear(output_dim, output_dim),
+            nn.ReLU(),
+        )
+        self.fc_t = nn.Linear(output_dim, 3)
+        self.fc_qvec = nn.Linear(output_dim, 4)
+        self.fc_fov = nn.Sequential(nn.Linear(output_dim, 2), nn.ReLU())
+    def forward(self, feat, camera_encoding=None, *args, **kwargs):
+        B, N = feat.shape[:2]
+        feat = feat.reshape(B * N, -1)
+        feat = self.backbone(feat)
+        out_t = self.fc_t(feat.float()).reshape(B, N, 3)
+        if camera_encoding is None:
+            out_qvec = self.fc_qvec(feat.float()).reshape(B, N, 4)
+            out_fov = self.fc_fov(feat.float()).reshape(B, N, 2)
+        else:
+            out_qvec = camera_encoding[..., 3:7]
+            out_fov = camera_encoding[..., -2:]
+        pose_enc = torch.cat([out_t, out_qvec, out_fov], dim=-1)
+        return pose_enc