zeyuren2002

Add files using upload-large-folder tool

4165f20 verified 9 days ago

4.36 kB

	# Reference: https://github.com/ByteDance-Seed/Depth-Anything-3
	# Variant of `baselines/da3.py` that loads DA3's monocular preset(s).
	# DA3 README: "DA3 Monocular Series (DA3Mono-Large). A dedicated model for high-quality
	# relative monocular depth estimation. Unlike disparity-based models (e.g. Depth Anything 2),
	# it directly predicts depth, resulting in superior geometric accuracy."
	#
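	# Loads the model the same way as da3.py, but runs it through DA3's high-level
	# `inference()` API, because a direct `model(image)` call crashes for DA3-Mono
	# (see the comment in `Baseline.infer`):
	# model = DepthAnything3.from_pretrained(<hf_id>)
	# prediction = model.inference([np_img]) # np_img: [H, W, 3] uint8 numpy array
	# depth = prediction.depth[0] # [H, W]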
	#
	# NOTE on output key: DA3-Mono outputs depth directly (per README), not disparity.
	# We therefore return `depth_scale_invariant` instead of `disparity_affine_invariant`.

	import os
	import sys
	from typing import *
	from pathlib import Path

	import click
	import torch
	import torch.nn.functional as F
	import torchvision.transforms as T
	import torchvision.transforms.functional as TF

	from moge.test.baseline import MGEBaselineInterface


	class Baseline(MGEBaselineInterface):
	    """MoGe evaluation baseline that wraps a Depth-Anything-3 monocular (DA3-Mono) model.

	    The model is loaded from a Hugging Face id through the `DepthAnything3` class
	    found in a local clone of the Depth-Anything-3 repository, and is queried via
	    its high-level `inference()` API. The result is reported under the
	    `depth_scale_invariant` key.
	    """

	    def __init__(self, repo_path: str, hf_id: str, num_tokens: Optional[int], device: Union[torch.device, str]):
	        """Import DA3 from `repo_path` and load the `hf_id` checkpoint onto `device`.

	        Args:
	            repo_path: Path to a local clone of the Depth-Anything-3 repository;
	                its `src` sub-directory is put at the front of `sys.path`.
	            hf_id: Identifier passed to `DepthAnything3.from_pretrained`.
	            num_tokens: Stored on the instance; `infer` does not currently read it.
	            device: Device the model is moved to and on which results are returned.

	        Raises:
	            FileNotFoundError: If `repo_path` does not exist.
	        """
	        repo_path = os.path.abspath(repo_path)
	        if not Path(repo_path).exists():
	            raise FileNotFoundError(
	                f"Cannot find Depth-Anything-3 repo at {repo_path}. Clone "
	                f"https://github.com/ByteDance-Seed/Depth-Anything-3."
	            )
	        src_path = os.path.join(repo_path, 'src')
	        if src_path not in sys.path:
	            sys.path.insert(0, src_path)

	        # Silence DA3's verbose per-image INFO logs (DA3_LOG_LEVEL is read at logger init).
	        # `setdefault` keeps any level the user has already exported.
	        os.environ.setdefault('DA3_LOG_LEVEL', 'WARN')

	        # Imported lazily: the package only becomes importable after the sys.path tweak above.
	        from depth_anything_3.api import DepthAnything3

	        device = torch.device(device)
	        model = DepthAnything3.from_pretrained(hf_id)
	        model.to(device).eval()

	        self.model = model
	        self.num_tokens = num_tokens
	        self.device = device

	    @click.command()
	    @click.option('--repo', 'repo_path', type=click.Path(), default='../Depth-Anything-3',
	                  help='Path to the ByteDance-Seed/Depth-Anything-3 repository.')
	    @click.option('--hf_id', type=str, default='depth-anything/DA3MONO-LARGE',
	                  help='HF repo id of the DA3-Mono variant (e.g. depth-anything/DA3MONO-LARGE).')
	    @click.option('--num_tokens', type=int, default=None,
	                  help='Number of tokens. Kept for CLI parity with da3.py; currently not used by infer().')
	    @click.option('--device', type=str, default='cuda')
	    @staticmethod
	    def load(repo_path: str, hf_id: str, num_tokens: Optional[int], device: str = 'cuda'):
	        """Click entry point: build a `Baseline` from command-line options."""
	        return Baseline(repo_path, hf_id, num_tokens, device)

	    def _infer_single(self, frame: torch.Tensor, size: Tuple[int, int]) -> torch.Tensor:
	        """Run DA3-Mono on one `[3, H, W]` image and return depth resized to `size`.

	        Args:
	            frame: Single RGB image tensor, channels first; values are clamped to [0, 1].
	            size: `(height, width)` the returned depth map must have.

	        Returns:
	            A float32 tensor of shape `size` on `self.device`.
	        """
	        import numpy as np

	        # Cast to float32 first so half/bfloat16 inputs can be converted to numpy,
	        # and round (rather than truncate) so that values such as k/255 map back to k.
	        rgb = frame.detach().float().cpu().permute(1, 2, 0).clamp(0, 1)
	        np_img = rgb.mul(255).round().contiguous().numpy().astype(np.uint8)

	        # Use DA3's high-level `model.inference()` API per README. Direct `model(x)`
	        # goes through `forward(... export_feat_layers=None)` and the DA3-Mono backbone
	        # (DINOv2 fork) crashes inside `_get_intermediate_layers_not_chunked` because it
	        # tries `i in export_feat_layers` on None. `inference()` handles processing,
	        # autocast, and post-processing correctly.
	        prediction = self.model.inference([np_img])

	        # prediction.depth: [N, H, W] float32; N == 1 here because one image was passed.
	        depth_t = torch.as_tensor(prediction.depth[0], device=self.device, dtype=torch.float32)
	        if tuple(depth_t.shape) != tuple(size):
	            # The prediction may come back at a different resolution than the input;
	            # resample it to the caller's resolution.
	            depth_t = F.interpolate(depth_t[None, None], size=tuple(size),
	                                    mode='bilinear', align_corners=False)[0, 0]
	        return depth_t

	    @torch.inference_mode()
	    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
	        """Predict depth for one image or a batch of images.

	        Args:
	            image: RGB tensor of shape `[3, H, W]` or `[B, 3, H, W]`; values are
	                clamped to [0, 1] before conversion to uint8.
	            intrinsics: Must be `None`; this baseline does not use intrinsics.

	        Returns:
	            `{'depth_scale_invariant': depth}` where `depth` is `[H, W]` for an
	            unbatched input and `[B, H, W]` for a batched one.
	        """
	        assert intrinsics is None, "DA3-Mono does not consume intrinsics."
	        original_height, original_width = image.shape[-2:]

	        omit_batch_dim = image.ndim == 3
	        if omit_batch_dim:
	            image = image.unsqueeze(0)

	        # Each batch element goes through its own `inference()` call, so every
	        # image in the batch gets a prediction instead of only the first one.
	        target_size = (original_height, original_width)
	        depth_t = torch.stack([self._infer_single(frame, target_size) for frame in image], dim=0)

	        if omit_batch_dim:
	            depth_t = depth_t[0]
	        return {'depth_scale_invariant': depth_t}