# Hugging Face upload-page residue (kept as a comment so this file parses as Python):
# Diffusers / Safetensors — EvalMDE/baselines/ppd.py — uploaded by zeyuren2002
# via the upload-large-folder tool (commit 4165f20, verified)
# Reference: https://github.com/gangweiX/Pixel-Perfect-Depth
# Strictly follows official `run.py`:
# from ppd.models.ppd import PixelPerfectDepth
# model = PixelPerfectDepth(semantics_model='DA2', semantics_pth='checkpoints/depth_anything_v2_vitl.pth',
# sampling_steps=4)
# model.load_state_dict(torch.load(model_pth, map_location='cpu'), strict=False)
# model = model.to(DEVICE).eval()
# image = cv2.imread(filename) # BGR uint8 numpy
# H, W = image.shape[:2]
# depth, _ = model.infer_image(image) # torch.Tensor, may be (1, 1, h, w)
# depth = F.interpolate(depth, size=(H, W), mode='bilinear', align_corners=False)[0, 0]
import os
import sys
from typing import *
from pathlib import Path
import click
import torch
import torch.nn.functional as F
import numpy as np
from moge.test.baseline import MGEBaselineInterface
class Baseline(MGEBaselineInterface):
    """Pixel-Perfect-Depth (PPD) wrapped as an MGE baseline.

    Strictly mirrors the official ``run.py``: build ``PixelPerfectDepth``, load
    the checkpoint with ``strict=False``, feed a BGR uint8 numpy image to
    ``infer_image``, and bilinearly resize the predicted depth back to the
    input resolution.
    """

    def __init__(self, repo_path: str, semantics_model: str, semantics_pth: str,
                 model_pth: str, sampling_steps: int, device: Union[torch.device, str]):
        """Import the PPD repo, resolve checkpoint paths, and build the model.

        Args:
            repo_path: Path to a local clone of gangweiX/Pixel-Perfect-Depth.
            semantics_model: Semantics encoder selector ('DA2' or 'MoGe2').
            semantics_pth: Semantics encoder checkpoint; resolved against
                ``repo_path`` when not absolute.
            model_pth: PPD model checkpoint; resolved against ``repo_path``
                when not absolute.
            sampling_steps: Number of DiT sampling steps (run.py default 4).
            device: Torch device (or device string) for inference.

        Raises:
            FileNotFoundError: If the repo or either checkpoint is missing.
        """
        repo_path = os.path.abspath(repo_path)
        if not Path(repo_path).exists():
            raise FileNotFoundError(
                f"Cannot find PPD repo at {repo_path}. Clone https://github.com/gangweiX/Pixel-Perfect-Depth."
            )
        # Make the cloned repo importable as the `ppd` package.
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)
        from ppd.models.ppd import PixelPerfectDepth
        from ppd.utils.set_seed import set_seed
        set_seed(666)  # mirror run.py

        # Allow relative paths against repo root (mirror run.py expectations).
        if not os.path.isabs(semantics_pth):
            semantics_pth = os.path.join(repo_path, semantics_pth)
        if not os.path.isabs(model_pth):
            model_pth = os.path.join(repo_path, model_pth)
        if not os.path.exists(semantics_pth):
            raise FileNotFoundError(f"Cannot find PPD semantics checkpoint at {semantics_pth}.")
        if not os.path.exists(model_pth):
            raise FileNotFoundError(f"Cannot find PPD model checkpoint at {model_pth}.")

        device = torch.device(device)
        model = PixelPerfectDepth(
            semantics_model=semantics_model,
            semantics_pth=semantics_pth,
            sampling_steps=sampling_steps,
        )
        # strict=False mirrors the official loader in run.py.
        model.load_state_dict(torch.load(model_pth, map_location='cpu'), strict=False)
        model = model.to(device).eval()

        self.model = model
        self.device = device

    @click.command()
    @click.option('--repo', 'repo_path', type=click.Path(), default='../Pixel-Perfect-Depth',
                  help='Path to the gangweiX/Pixel-Perfect-Depth repository.')
    @click.option('--semantics_model', type=click.Choice(['DA2', 'MoGe2']), default='DA2',
                  help='Semantics encoder used by PPD (run.py default DA2).')
    @click.option('--semantics_pth', type=click.Path(),
                  default='checkpoints/depth_anything_v2_vitl.pth',
                  help='Semantics encoder ckpt path (relative to --repo if not absolute).')
    @click.option('--model_pth', type=click.Path(), default='checkpoints/ppd.pth',
                  help='PPD model ckpt path (relative to --repo if not absolute).')
    @click.option('--sampling_steps', type=int, default=4,
                  help='Number of DiT sampling steps (run.py default 4).')
    @click.option('--device', type=str, default='cuda')
    @staticmethod
    def load(repo_path: str, semantics_model: str, semantics_pth: str,
             model_pth: str, sampling_steps: int, device: str = 'cuda'):
        """CLI entry point constructing a :class:`Baseline`; defaults mirror run.py."""
        return Baseline(repo_path, semantics_model, semantics_pth, model_pth, sampling_steps, device)

    @torch.inference_mode()
    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Predict affine-invariant depth for a single image.

        Args:
            image: Float RGB tensor, values assumed in [0, 1], shape (3, H, W)
                or (1, 3, H, W).
            intrinsics: Unused — PPD does not consume camera intrinsics.

        Returns:
            Dict with key ``'depth_affine_invariant'``: an (H, W) tensor, or
            (1, H, W) when the input carried a batch dimension.

        Raises:
            ValueError: If the batch size is not 1.
        """
        omit_batch = image.ndim == 3
        if omit_batch:
            image = image.unsqueeze(0)
        # Raise (not assert) so the check survives `python -O`.
        if image.shape[0] != 1:
            raise ValueError("PPD baseline only supports batch size 1")
        _, _, H, W = image.shape

        # run.py calls cv2.imread which returns BGR uint8 numpy (H, W, 3).
        rgb_uint8 = (image[0].cpu().permute(1, 2, 0).clamp(0, 1).numpy() * 255).astype(np.uint8)
        bgr_uint8 = rgb_uint8[..., ::-1].copy()  # BGR for cv2 parity
        depth, _ = self.model.infer_image(bgr_uint8)

        # Normalize any of (h, w), (1, h, w), (1, 1, h, w) to 4D, then resize to
        # the input resolution (mirrors run.py's bilinear interpolate). Resizing
        # is skipped when the size already matches — numerically equivalent.
        if depth.ndim == 2:
            depth = depth[None, None]
        elif depth.ndim == 3:
            depth = depth[None]
        if depth.shape[-2:] != (H, W):
            depth = F.interpolate(depth, size=(H, W), mode='bilinear', align_corners=False)
        depth = depth[0, 0].to(self.device).float()

        # PPD predicts affine-invariant depth (Xu et al., 2025). Emit only this physical key.
        result = {'depth_affine_invariant': depth if omit_batch else depth.unsqueeze(0)}
        return result