# Reference: https://github.com/indu1ge/DepthMaster
# Strictly follows official `run.py`:
#     from depthmaster import DepthMasterPipeline
#     from depthmaster.modules.unet_2d_condition_s2 import UNet2DConditionModel
#     pipe = DepthMasterPipeline.from_pretrained(checkpoint_path, variant=variant, torch_dtype=dtype)
#     unet = UNet2DConditionModel.from_pretrained(os.path.join(checkpoint_path, 'unet'))
#     pipe.unet = unet
#     pipe = pipe.to(device)
#     pipe_out = pipe(input_pil_image, processing_res=..., match_input_res=...,
#                     batch_size=..., color_map=..., show_progress_bar=..., resample_method=...)
#     depth_pred = pipe_out.depth_np  # H x W float, affine-invariant depth
import os
import sys
from pathlib import Path
from typing import Dict, Optional, Union

import click
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

from moge.test.baseline import MGEBaselineInterface


class Baseline(MGEBaselineInterface):
    """MGE baseline adapter around the DepthMaster diffusion depth pipeline.

    Wraps ``DepthMasterPipeline`` (loaded from a local clone of the official
    repo) so it can be driven through the common ``MGEBaselineInterface``:
    tensors in, ``{'depth_affine_invariant': tensor}`` out.
    """

    def __init__(self, repo_path: str, checkpoint: str, processing_res: Optional[int], half_precision: bool, device: Union[torch.device, str]):
        """Build the pipeline, mirroring the official ``run.py`` setup.

        Args:
            repo_path: Path to a clone of https://github.com/indu1ge/DepthMaster.
            checkpoint: Local checkpoint directory containing the pipeline files
                plus a ``unet/`` subdirectory (HF: zysong212/DepthMaster).
            processing_res: Internal processing resolution forwarded to the
                pipeline at inference time.
            half_precision: If True, load in fp16 using the ``fp16`` variant.
            device: Device the pipeline is moved to.

        Raises:
            FileNotFoundError: If ``repo_path`` does not exist.
        """
        repo_path = os.path.abspath(repo_path)
        if not Path(repo_path).exists():
            raise FileNotFoundError(
                f"Cannot find DepthMaster repo at {repo_path}. Clone https://github.com/indu1ge/DepthMaster."
            )
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)
        # Lazy imports: `depthmaster` only becomes importable once the repo
        # clone has been pushed onto sys.path above.
        from depthmaster import DepthMasterPipeline
        from depthmaster.modules.unet_2d_condition_s2 import UNet2DConditionModel

        device = torch.device(device)
        dtype = torch.float16 if half_precision else torch.float32
        variant = "fp16" if half_precision else None
        pipe = DepthMasterPipeline.from_pretrained(checkpoint, variant=variant, torch_dtype=dtype)
        unet_dir = os.path.join(checkpoint, "unet")
        unet = UNet2DConditionModel.from_pretrained(unet_dir)
        # BUGFIX: from_pretrained loads the replacement UNet in fp32; cast it to
        # the pipeline dtype so half-precision runs do not hit a fp16/fp32
        # mismatch between the UNet and the rest of the pipeline. This is a
        # no-op when half_precision is False.
        pipe.unet = unet.to(dtype=dtype)
        try:
            pipe.enable_xformers_memory_efficient_attention()
        except ImportError:
            # xformers is an optional speedup; silently fall back to the
            # default attention implementation when it is not installed.
            pass
        pipe = pipe.to(device)
        self.pipe = pipe
        self.device = device
        self.processing_res = processing_res

    @click.command()
    @click.option('--repo', 'repo_path', type=click.Path(), default='../DepthMaster', help='Path to the indu1ge/DepthMaster repository.')
    @click.option('--checkpoint', type=click.Path(), required=True, help='Local checkpoint directory containing pipeline files + unet subdir (HF: zysong212/DepthMaster).')
    @click.option('--processing_res', type=int, default=768, help='Pipeline processing resolution (run.py default 768).')
    @click.option('--fp16', 'half_precision', is_flag=True, help='Run in half precision.')
    @click.option('--device', type=str, default='cuda')
    @staticmethod
    def load(repo_path: str, checkpoint: str, processing_res: Optional[int], half_precision: bool, device: str = 'cuda'):
        """CLI-compatible factory; options mirror the official run.py flags."""
        return Baseline(repo_path, checkpoint, processing_res, half_precision, device)

    @torch.inference_mode()
    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Predict affine-invariant depth for a single image.

        Args:
            image: (3, H, W) or (1, 3, H, W) float tensor, values assumed to be
                in [0, 1] (clamped before conversion to uint8).
            intrinsics: Ignored; accepted only for interface compatibility.

        Returns:
            ``{'depth_affine_invariant': depth}`` where ``depth`` is an (H, W)
            float tensor on ``self.device`` — or (1, H, W) when the input
            carried a batch dimension.
        """
        omit_batch = image.ndim == 3
        if omit_batch:
            image = image.unsqueeze(0)
        assert image.shape[0] == 1, "DepthMaster baseline only supports batch size 1"
        _, _, H, W = image.shape

        # Pipeline takes a PIL.Image (per run.py): CHW float [0,1] -> HWC uint8.
        arr = (image[0].cpu().permute(1, 2, 0).clamp(0, 1).numpy() * 255).astype(np.uint8)
        pil = Image.fromarray(arr)

        out = self.pipe(
            pil,
            processing_res=self.processing_res,
            match_input_res=True,
            batch_size=0,  # 0 follows run.py — presumably "auto-pick"; confirm against pipeline docs
            color_map='Spectral',
            show_progress_bar=False,
            resample_method='bilinear',
        )
        depth_np = out.depth_np

        depth = torch.from_numpy(np.ascontiguousarray(depth_np)).to(self.device).float()
        # match_input_res=True should already yield (H, W); this resize is a
        # defensive fallback in case the pipeline rounds the resolution.
        if depth.shape != (H, W):
            depth = F.interpolate(depth[None, None], size=(H, W), mode='bilinear', align_corners=False)[0, 0]

        # DepthMaster predicts affine-invariant depth (TCSVT 2026). Emit only this physical key.
        result = {'depth_affine_invariant': depth}
        if not omit_batch:
            # Restore the batch dimension the caller provided.
            result['depth_affine_invariant'] = result['depth_affine_invariant'].unsqueeze(0)
        return result