Diffusers
Safetensors
File size: 3,411 Bytes
4165f20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
EvalMDE-native dataloader for MoGe eval_baseline.py.

Reads Infinigen-style scene dirs (per EvalMDE convention):
    <scene>/rgb.png
    <scene>/gt_depth.npz   keys: depth (H,W) float32, intr (4,)=[fx,fy,cx,cy] pixels, valid (H,W) bool
Returns the same sample dict shape as MoGe's EvalDataLoaderPipeline.
"""
from pathlib import Path
from typing import Optional
import numpy as np
import torch
from PIL import Image


class EvalMDELoaderPipeline:
    """EvalMDE-native dataloader, drop-in compatible with MoGe's EvalDataLoaderPipeline.

    Scans Infinigen-style scene directories (``<scene>/rgb.png`` +
    ``<scene>/gt_depth.npz``) and serves one sample dict per ``get()`` call,
    shaped like MoGe's eval samples.
    """

    def __init__(self, path: str, split: Optional[str] = None,
                 has_sharp_boundary: bool = False,
                 include_segmentation: bool = False,
                 depth_unit: float = 1.0,
                 **_):
        """
        Args:
            path: Root directory containing scene subdirectories.
            split: Optional name of a text file under ``path`` listing scene
                names (one per line). If absent/None, all valid scene dirs
                under ``path`` are used, sorted by name.
            has_sharp_boundary: Passed through on every sample.
            include_segmentation: Accepted for interface compatibility; unused.
            depth_unit: Multiplier converting stored depth to metric meters
                (Infinigen is already metric, so the default is 1.0).
            **_: Extra config keys are accepted and ignored.
        """
        root = Path(path)
        if split is not None and (root / split).exists():
            # Explicit split file: one scene name per non-empty line.
            names = [s.strip() for s in (root / split).read_text().splitlines() if s.strip()]
            self.scene_dirs = [root / n for n in names]
        else:
            # No split file: take every subdirectory that has both inputs.
            self.scene_dirs = sorted([
                d for d in root.iterdir()
                if d.is_dir() and (d / 'rgb.png').exists() and (d / 'gt_depth.npz').exists()
            ])
        self.has_sharp_boundary = has_sharp_boundary
        self.depth_unit = depth_unit
        self._idx = 0  # cursor for sequential get(); not reset on re-entry

    def __enter__(self):
        return self

    def __exit__(self, *a):
        pass

    def __len__(self):
        return len(self.scene_dirs)

    def get(self):
        """Load and return the next scene as a MoGe-style sample dict.

        Raises:
            IndexError: when called more than ``len(self)`` times.
        """
        scene = self.scene_dirs[self._idx]
        self._idx += 1

        rgb = np.array(Image.open(scene / 'rgb.png').convert('RGB'))  # (H, W, 3) uint8
        H, W = rgb.shape[:2]

        gt = np.load(scene / 'gt_depth.npz')
        depth = gt['depth'].astype(np.float32)
        intr = gt['intr'].astype(np.float32)   # [fx, fy, cx, cy] in pixels
        valid = gt['valid'].astype(bool)
        # EvalMDE convention (evalmde/utils/depth.py:load_data): replace invalid/NaN
        # with 1.0 so depth-derived quantities (pointmap, etc.) stay finite.
        depth = np.where(valid & np.isfinite(depth), depth, np.float32(1.0))
        # BUGFIX: apply the unit conversion BEFORE deriving the pointmap.
        # The previous version scaled depth after computing `points`, so
        # points[..., 2] stayed in the raw file unit while `depth` was
        # metric — inconsistent whenever depth_unit != 1.0.
        depth = depth * self.depth_unit
        fx, fy, cx, cy = float(intr[0]), float(intr[1]), float(intr[2]), float(intr[3])

        # MoGe convention: 3x3 normalized intrinsics (fx/W, fy/H, cx/W, cy/H)
        K = np.array([
            [fx / W, 0.0,    cx / W],
            [0.0,    fy / H, cy / H],
            [0.0,    0.0,    1.0]
        ], dtype=np.float32)

        # Back-project to a 3D pointmap in the camera frame using the native
        # pixel intrinsics; points[..., 2] == depth by construction.
        u, v = np.meshgrid(np.arange(W), np.arange(H))
        x = (u.astype(np.float32) - cx) / fx * depth
        y = (v.astype(np.float32) - cy) / fy * depth
        points = np.stack([x, y, depth], axis=-1).astype(np.float32)

        return {
            'image':       torch.from_numpy(rgb.astype(np.float32) / 255.0).permute(2, 0, 1),
            'depth':       torch.from_numpy(depth).float(),
            'depth_mask':  torch.from_numpy(valid).bool(),
            'intrinsics':  torch.from_numpy(K).float(),
            'points':      torch.from_numpy(points).float(),
            'is_metric':   True,
            'has_sharp_boundary': self.has_sharp_boundary,
            'filename':    scene.name,
            # Carry raw pixel intrinsics for downstream EvalMDE metric usage
            '_intr_px':    intr,
        }