OzzyGT HF Staff commited on
Commit
88c28fc
·
1 Parent(s): 61253a4

initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +10 -0
  2. README.md +118 -3
  3. block.py +214 -0
  4. modular_config.json +5 -0
  5. pyproject.toml +16 -0
  6. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
README.md CHANGED
@@ -1,3 +1,118 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: diffusers
3
+ license: apache-2.0
4
+ tags:
5
+ - modular-diffusers
6
+ - diffusers
7
+ - depth-estimation
8
+ ---
9
+ # Depth Pro Estimator Block
10
+
11
+ A custom [Modular Diffusers](https://huggingface.co/docs/diffusers/modular_diffusers/overview) block for monocular depth estimation using Apple's [Depth Pro](https://huggingface.co/apple/DepthPro-hf) model. Supports both images and videos.
12
+
13
+ ## Features
14
+
15
+ - **Metric depth estimation** in real-world meters using Depth Pro
16
+ - **Image and video** input support
17
+ - **Grayscale or turbo colormap** visualization
18
+ - Inverse depth normalization (following Apple's reference implementation) for robust handling of outdoor/sky scenes
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ # Using uv
24
+ uv sync
25
+
26
+ # Using pip
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ ## Quick Start
31
+
32
+ ### Load the block
33
+
34
+ ```python
35
+ from diffusers import ModularPipelineBlocks
36
+ import torch
37
+
38
+ blocks = ModularPipelineBlocks.from_pretrained(
39
+ "your-username/depth-pro-estimator", # or local path "."
40
+ trust_remote_code=True,
41
+ )
42
+ pipeline = blocks.init_pipeline()
43
+ pipeline.load_components(torch_dtype=torch.float16)
44
+ pipeline.to("cuda")
45
+ ```
46
+
47
+ ### Single image - grayscale depth
48
+
49
+ ```python
50
+ from PIL import Image
51
+
52
+ image = Image.open("photo.jpg")
53
+ output = pipeline(image=image)
54
+
55
+ # Save depth map
56
+ output.depth_image.save("photo_depth.png")
57
+
58
+ # Access raw metric depth tensor (in meters)
59
+ print(output.predicted_depth.shape) # (H, W)
60
+ print(output.field_of_view) # estimated FOV
61
+ print(output.focal_length) # estimated focal length
62
+ ```
63
+
64
+ ### Single image - turbo colormap
65
+
66
+ ```python
67
+ output = pipeline(image=image, colormap="turbo")
68
+ output.depth_image.save("photo_depth_turbo.png")
69
+ ```
70
+
71
+ ### Video - grayscale depth
72
+
73
+ ```python
74
+ from block import save_video
75
+
76
+ output = pipeline(video_path="input.mp4", colormap="grayscale")
77
+ save_video(output.depth_frames, output.fps, "output_depth.mp4")
78
+ ```
79
+
80
+ ### Video - turbo colormap
81
+
82
+ ```python
83
+ output = pipeline(video_path="input.mp4", colormap="turbo")
84
+ save_video(output.depth_frames, output.fps, "output_depth_turbo.mp4")
85
+ ```
86
+
87
+ ## Inputs
88
+
89
+ | Parameter | Type | Default | Description |
90
+ |-----------|------|---------|-------------|
91
+ | `image` | `PIL.Image` | - | Image to estimate depth for |
92
+ | `video_path` | `str` | - | Path to input video. When provided, `image` is ignored |
93
+ | `colormap` | `str` | `"grayscale"` | `"grayscale"` or `"turbo"` (colormapped) |
94
+
95
+ ## Outputs
96
+
97
+ ### Image mode
98
+
99
+ | Output | Type | Description |
100
+ |--------|------|-------------|
101
+ | `depth_image` | `PIL.Image` | Normalized depth visualization |
102
+ | `predicted_depth` | `torch.Tensor` | Raw metric depth in meters (H x W) |
103
+ | `field_of_view` | `float` | Estimated horizontal FOV |
104
+ | `focal_length` | `float` | Estimated focal length |
105
+
106
+ ### Video mode
107
+
108
+ | Output | Type | Description |
109
+ |--------|------|-------------|
110
+ | `depth_frames` | `List[PIL.Image]` | Per-frame depth visualizations |
111
+ | `fps` | `float` | Source video frame rate |
112
+
113
+ ## Depth Normalization
114
+
115
+ Depth visualization uses inverse depth clipped to [0.1m, 250m], following [Apple's reference implementation](https://github.com/apple/ml-depth-pro). This prevents sky/infinity values (clamped at 10,000m by the model) from crushing near-field detail into a binary mask.
116
+
117
+ - **Bright = close**, **dark = far** (grayscale)
118
+ - **Warm (red/yellow) = close**, **cool (blue) = far** (turbo)
block.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fractions import Fraction
from typing import List, Union

import av
import numpy as np
import torch
from diffusers.modular_pipelines import (
    ComponentSpec,
    InputParam,
    ModularPipelineBlocks,
    OutputParam,
    PipelineState,
)
from matplotlib import colormaps
from PIL import Image
from transformers import DepthProForDepthEstimation, DepthProImageProcessor
16
+
17
+ TURBO_CMAP = colormaps["turbo"]
18
+
19
+
20
def save_video(frames: List[Image.Image], fps: float, output_path: str) -> None:
    """Encode a list of PIL Image frames as an H.264 MP4 video.

    Args:
        frames: Non-empty list of same-sized PIL images (RGB or L mode).
        fps: Source frame rate. Fractional rates (e.g. 29.97) are preserved
            instead of being truncated to an integer.
        output_path: Destination path for the .mp4 file.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if not frames:
        raise ValueError("frames must contain at least one image")

    # yuv420p (required below for broad player compatibility) only supports
    # even dimensions; round down so libx264 does not reject odd-sized input.
    width = frames[0].width - frames[0].width % 2
    height = frames[0].height - frames[0].height % 2

    container = av.open(output_path, mode="w")
    try:
        # Fraction preserves NTSC-style rates (29.97 -> 30000/1001) that
        # int(fps) would truncate, which slowed the output video down.
        stream = container.add_stream(
            "libx264", rate=Fraction(fps).limit_denominator(65535)
        )
        stream.pix_fmt = "yuv420p"
        stream.width = width
        stream.height = height

        for frame in frames:
            if frame.size != (width, height):
                # Trim at most one row/column so every frame matches the
                # even-sized stream.
                frame = frame.crop((0, 0, width, height))
            video_frame = av.VideoFrame.from_image(frame)
            for packet in stream.encode(video_frame):
                container.mux(packet)

        # Flush any frames still buffered inside the encoder.
        for packet in stream.encode():
            container.mux(packet)
    finally:
        container.close()
36
+
37
+
38
class DepthProEstimatorBlock(ModularPipelineBlocks):
    """Modular Diffusers block for metric depth estimation with Apple Depth Pro.

    Two modes, selected by which inputs are present on the pipeline state:

    * image mode -- estimates depth for a single PIL image and writes
      ``depth_image``, ``predicted_depth``, ``field_of_view`` and
      ``focal_length`` to the state;
    * video mode -- decodes ``video_path`` frame by frame and writes
      ``depth_frames`` and ``fps`` to the state. When ``video_path`` is set,
      ``image`` is ignored.

    Outputs belonging to the inactive mode are explicitly set to ``None`` so
    downstream blocks can rely on every output attribute existing.
    """

    # Minimum runtime package versions this block was written against.
    _requirements = {
        "transformers": ">=5.1.0",
        "torch": ">=2.9.0",
        "torchvision": ">=0.16.0",
        "av": ">=12.0.0",
        "matplotlib": ">=3.7.0",
    }

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Model and processor specs resolved by ``pipeline.load_components``."""
        return [
            ComponentSpec(
                name="depth_estimator",
                type_hint=DepthProForDepthEstimation,
                pretrained_model_name_or_path="apple/DepthPro-hf",
            ),
            ComponentSpec(
                name="depth_estimator_processor",
                type_hint=DepthProImageProcessor,
                pretrained_model_name_or_path="apple/DepthPro-hf",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        """User-facing inputs accepted by the pipeline call."""
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=False,
                description="Image(s) to estimate depth for",
            ),
            InputParam(
                "video_path",
                type_hint=str,
                required=False,
                description="Path to input video file. When provided, image is ignored.",
            ),
            # NOTE(review): declared but never read by __call__; kept for
            # backward compatibility of the pipeline interface.
            InputParam(
                "output_type",
                type_hint=str,
                default="depth_image",
                description="Reserved for future use; currently has no effect.",
            ),
            InputParam(
                "colormap",
                type_hint=str,
                default="grayscale",
                description="Depth visualization format: 'grayscale' or 'turbo' (colormapped)",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values this block writes onto the pipeline state."""
        return [
            OutputParam(
                "depth_image",
                type_hint=Image.Image,
                description="Normalized depth map as a grayscale PIL image (single image mode)",
            ),
            OutputParam(
                "predicted_depth",
                type_hint=torch.Tensor,
                description="Raw metric depth tensor (H x W) (single image mode)",
            ),
            OutputParam(
                "field_of_view",
                type_hint=float,
                description="Estimated horizontal field of view (single image mode)",
            ),
            OutputParam(
                "focal_length",
                type_hint=float,
                description="Estimated focal length (single image mode)",
            ),
            OutputParam(
                "depth_frames",
                type_hint=list,
                description="List of per-frame depth PIL images (video mode)",
            ),
            OutputParam(
                "fps",
                type_hint=float,
                description="Source video frame rate (video mode)",
            ),
        ]

    def _estimate_depth(self, image: Image.Image, processor, model) -> dict:
        """Run Depth Pro on one image.

        Returns the post-processed result dict for the image, containing
        ``predicted_depth`` (metric depth tensor resized to the input size)
        plus ``field_of_view`` / ``focal_length`` when the processor provides
        them. (The previous ``-> np.ndarray`` annotation was incorrect.)
        """
        inputs = processor(images=[image], return_tensors="pt").to(model.device)
        outputs = model(**inputs)
        post_processed = processor.post_process_depth_estimation(
            outputs, target_sizes=[(image.height, image.width)]
        )
        return post_processed[0]

    def _normalize_depth(self, depth: np.ndarray) -> np.ndarray:
        """Map metric depth to [0, 1] via clipped inverse depth.

        Clipping to [0.1m, 250m] before inverting (per Apple's reference
        implementation) keeps sky/infinity values from crushing near-field
        detail. The epsilon guards against division by zero on constant maps.
        """
        inverse_depth = 1.0 / np.clip(depth, 0.1, 250.0)
        inv_min = inverse_depth.min()
        inv_max = inverse_depth.max()
        return (inverse_depth - inv_min) / (inv_max - inv_min + 1e-8)

    def _apply_colormap(self, normalized: np.ndarray, mode: str) -> np.ndarray:
        """Convert a [0, 1] depth map to uint8 pixels (RGB turbo or grayscale)."""
        if mode == "turbo":
            # Drop the alpha channel produced by matplotlib colormaps.
            colored = (TURBO_CMAP(normalized)[..., :3] * 255).astype(np.uint8)
            return colored
        return (normalized * 255.0).astype(np.uint8)

    def _depth_to_image(self, depth_np: np.ndarray, colormap: str) -> Image.Image:
        """Normalize a metric depth map and render it as a PIL image.

        Shared by image and video modes (previously duplicated in both paths).
        """
        normalized = self._normalize_depth(depth_np)
        colored = self._apply_colormap(normalized, colormap)
        pil_mode = "RGB" if colormap == "turbo" else "L"
        return Image.fromarray(colored, mode=pil_mode)

    def _process_video(self, video_path, processor, model, colormap):
        """Estimate depth for every frame of a video.

        Returns ``(depth_frames, fps)`` where ``depth_frames`` is a list of
        PIL depth visualizations and ``fps`` is the source frame rate.
        """
        input_container = av.open(video_path)
        try:
            video_stream = input_container.streams.video[0]
            fps = video_stream.average_rate
            if fps is None:
                # Some containers carry no frame-rate metadata; fall back to
                # PyAV's guessed rate so float(fps) downstream cannot fail.
                fps = video_stream.guessed_rate or 30

            depth_frames = []
            for frame in input_container.decode(video=0):
                pil_image = frame.to_image().convert("RGB")
                result = self._estimate_depth(pil_image, processor, model)
                depth_np = result["predicted_depth"].float().cpu().numpy()
                depth_frames.append(self._depth_to_image(depth_np, colormap))
        finally:
            # Close the container even if decoding or inference fails.
            input_container.close()

        return depth_frames, fps

    @torch.no_grad()
    def __call__(self, components, state: PipelineState):
        """Populate the pipeline state with depth-estimation outputs.

        Video mode takes precedence when ``video_path`` is set; otherwise
        image mode runs. Returns ``(components, state)`` as required by the
        Modular Diffusers block protocol (the previous ``-> PipelineState``
        annotation did not match the actual return value).
        """
        block_state = self.get_block_state(state)

        processor = components.depth_estimator_processor
        model = components.depth_estimator

        video_path = getattr(block_state, "video_path", None)

        if video_path:
            depth_frames, fps = self._process_video(
                video_path, processor, model, block_state.colormap
            )
            block_state.depth_frames = depth_frames
            block_state.fps = float(fps)
            block_state.depth_image = None
            block_state.predicted_depth = None
            block_state.field_of_view = None
            block_state.focal_length = None
        else:
            image = block_state.image
            if not isinstance(image, list):
                image = [image]

            # NOTE(review): only the first image is processed; the outputs
            # below are single-image values even when a list is passed.
            result = self._estimate_depth(image[0], processor, model)
            predicted_depth = result["predicted_depth"]

            block_state.predicted_depth = predicted_depth
            block_state.field_of_view = result.get("field_of_view")
            block_state.focal_length = result.get("focal_length")

            depth_np = predicted_depth.float().cpu().numpy()
            block_state.depth_image = self._depth_to_image(
                depth_np, block_state.colormap
            )

            block_state.depth_frames = None
            block_state.fps = None

        self.set_block_state(state, block_state)

        return components, state
modular_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "ModularPipelineBlocks": "block.DepthProEstimatorBlock"
4
+ }
5
+ }
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "depth-pro-estimator"
3
+ version = "0.1.0"
4
+ description = "Modular Diffusers custom block for monocular depth estimation using Apple Depth Pro"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "accelerate>=1.0.0",
9
+ "av>=12.0.0",
10
+ "diffusers>=0.37.0",
11
+ "matplotlib>=3.7.0",
12
+ "pillow>=10.0.0",
13
+ "torch>=2.9.0",
14
+ "torchvision>=0.16.0",
15
+ "transformers>=5.1.0",
16
+ ]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch>=2.9.0
2
+ torchvision>=0.16.0
3
+ transformers>=5.1.0
4
+ diffusers>=0.37.0
5
+ accelerate>=1.0.0
6
+ av>=12.0.0
7
+ matplotlib>=3.7.0
8
+ pillow>=10.0.0