OzzyGT HF Staff commited on
Commit
88c28fc
·
1 Parent(s): 61253a4

initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +10 -0
  2. README.md +118 -3
  3. block.py +214 -0
  4. modular_config.json +5 -0
  5. pyproject.toml +16 -0
  6. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
README.md CHANGED
@@ -1,3 +1,118 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: diffusers
3
+ license: apache-2.0
4
+ tags:
5
+ - modular-diffusers
6
+ - diffusers
7
+ - depth-estimation
8
+ ---
9
+ # Depth Pro Estimator Block
10
+
11
+ A custom [Modular Diffusers](https://huggingface.co/docs/diffusers/modular_diffusers/overview) block for monocular depth estimation using Apple's [Depth Pro](https://huggingface.co/apple/DepthPro-hf) model. Supports both images and videos.
12
+
13
+ ## Features
14
+
15
+ - **Metric depth estimation** in real-world meters using Depth Pro
16
+ - **Image and video** input support
17
+ - **Grayscale or turbo colormap** visualization
18
+ - Inverse depth normalization (following Apple's reference implementation) for robust handling of outdoor/sky scenes
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ # Using uv
24
+ uv sync
25
+
26
+ # Using pip
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ ## Quick Start
31
+
32
+ ### Load the block
33
+
34
+ ```python
35
+ from diffusers import ModularPipelineBlocks
36
+ import torch
37
+
38
+ blocks = ModularPipelineBlocks.from_pretrained(
39
+ "your-username/depth-pro-estimator", # or local path "."
40
+ trust_remote_code=True,
41
+ )
42
+ pipeline = blocks.init_pipeline()
43
+ pipeline.load_components(torch_dtype=torch.float16)
44
+ pipeline.to("cuda")
45
+ ```
46
+
47
+ ### Single image - grayscale depth
48
+
49
+ ```python
50
+ from PIL import Image
51
+
52
+ image = Image.open("photo.jpg")
53
+ output = pipeline(image=image)
54
+
55
+ # Save depth map
56
+ output.depth_image.save("photo_depth.png")
57
+
58
+ # Access raw metric depth tensor (in meters)
59
+ print(output.predicted_depth.shape) # (H, W)
60
+ print(output.field_of_view) # estimated FOV
61
+ print(output.focal_length) # estimated focal length
62
+ ```
63
+
64
+ ### Single image - turbo colormap
65
+
66
+ ```python
67
+ output = pipeline(image=image, colormap="turbo")
68
+ output.depth_image.save("photo_depth_turbo.png")
69
+ ```
70
+
71
+ ### Video - grayscale depth
72
+
73
+ ```python
74
+ from block import save_video
75
+
76
+ output = pipeline(video_path="input.mp4", colormap="grayscale")
77
+ save_video(output.depth_frames, output.fps, "output_depth.mp4")
78
+ ```
79
+
80
+ ### Video - turbo colormap
81
+
82
+ ```python
83
+ output = pipeline(video_path="input.mp4", colormap="turbo")
84
+ save_video(output.depth_frames, output.fps, "output_depth_turbo.mp4")
85
+ ```
86
+
87
+ ## Inputs
88
+
89
+ | Parameter | Type | Default | Description |
90
+ |-----------|------|---------|-------------|
91
+ | `image` | `PIL.Image` | - | Image to estimate depth for |
92
+ | `video_path` | `str` | - | Path to input video. When provided, `image` is ignored |
93
+ | `colormap` | `str` | `"grayscale"` | `"grayscale"` or `"turbo"` (colormapped) |
94
+
95
+ ## Outputs
96
+
97
+ ### Image mode
98
+
99
+ | Output | Type | Description |
100
+ |--------|------|-------------|
101
+ | `depth_image` | `PIL.Image` | Normalized depth visualization |
102
+ | `predicted_depth` | `torch.Tensor` | Raw metric depth in meters (H x W) |
103
+ | `field_of_view` | `float` | Estimated horizontal FOV |
104
+ | `focal_length` | `float` | Estimated focal length |
105
+
106
+ ### Video mode
107
+
108
+ | Output | Type | Description |
109
+ |--------|------|-------------|
110
+ | `depth_frames` | `List[PIL.Image]` | Per-frame depth visualizations |
111
+ | `fps` | `float` | Source video frame rate |
112
+
113
+ ## Depth Normalization
114
+
115
+ Depth visualization uses inverse depth clipped to [0.1m, 250m], following [Apple's reference implementation](https://github.com/apple/ml-depth-pro). This prevents sky/infinity values (clamped at 10,000m by the model) from crushing near-field detail into a binary mask.
116
+
117
+ - **Bright = close**, **dark = far** (grayscale)
118
+ - **Warm (red/yellow) = close**, **cool (blue) = far** (turbo)
block.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fractions import Fraction
from typing import List, Union

import av
import numpy as np
import torch
from diffusers.modular_pipelines import (
    ComponentSpec,
    InputParam,
    ModularPipelineBlocks,
    OutputParam,
    PipelineState,
)
from matplotlib import colormaps
from PIL import Image
from transformers import DepthProForDepthEstimation, DepthProImageProcessor
16
+
17
+ TURBO_CMAP = colormaps["turbo"]
18
+
19
+
20
def save_video(frames: List[Image.Image], fps: float, output_path: str) -> None:
    """Encode a list of PIL Image frames as an H.264 MP4 video.

    Args:
        frames: Non-empty list of same-sized PIL images (RGB or L mode).
        fps: Source frame rate. Fractional rates (e.g. 29.97) are preserved
            instead of being truncated to an integer.
        output_path: Destination path for the .mp4 file.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if not frames:
        raise ValueError("frames must contain at least one image")

    # yuv420p (required below for broad player compatibility) only supports
    # even dimensions; round down so libx264 does not reject odd-sized input.
    width = frames[0].width - frames[0].width % 2
    height = frames[0].height - frames[0].height % 2

    container = av.open(output_path, mode="w")
    try:
        # Fraction preserves NTSC-style rates (29.97 -> 30000/1001) that
        # int(fps) would truncate, which slowed the output video down.
        stream = container.add_stream(
            "libx264", rate=Fraction(fps).limit_denominator(65535)
        )
        stream.pix_fmt = "yuv420p"
        stream.width = width
        stream.height = height

        for frame in frames:
            if frame.size != (width, height):
                # Trim at most one row/column so every frame matches the
                # even-sized stream.
                frame = frame.crop((0, 0, width, height))
            video_frame = av.VideoFrame.from_image(frame)
            for packet in stream.encode(video_frame):
                container.mux(packet)

        # Flush any frames still buffered inside the encoder.
        for packet in stream.encode():
            container.mux(packet)
    finally:
        container.close()
36
+
37
+
38
class DepthProEstimatorBlock(ModularPipelineBlocks):
    """Modular Diffusers block for metric depth estimation with Apple Depth Pro.

    Two modes, selected by which inputs are present on the pipeline state:

    * image mode -- estimates depth for a single PIL image and writes
      ``depth_image``, ``predicted_depth``, ``field_of_view`` and
      ``focal_length`` to the state;
    * video mode -- decodes ``video_path`` frame by frame and writes
      ``depth_frames`` and ``fps`` to the state. When ``video_path`` is set,
      ``image`` is ignored.

    Outputs belonging to the inactive mode are explicitly set to ``None`` so
    downstream blocks can rely on every output attribute existing.
    """

    # Minimum runtime package versions this block was written against.
    _requirements = {
        "transformers": ">=5.1.0",
        "torch": ">=2.9.0",
        "torchvision": ">=0.16.0",
        "av": ">=12.0.0",
        "matplotlib": ">=3.7.0",
    }

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Model and processor specs resolved by ``pipeline.load_components``."""
        return [
            ComponentSpec(
                name="depth_estimator",
                type_hint=DepthProForDepthEstimation,
                pretrained_model_name_or_path="apple/DepthPro-hf",
            ),
            ComponentSpec(
                name="depth_estimator_processor",
                type_hint=DepthProImageProcessor,
                pretrained_model_name_or_path="apple/DepthPro-hf",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        """User-facing inputs accepted by the pipeline call."""
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=False,
                description="Image(s) to estimate depth for",
            ),
            InputParam(
                "video_path",
                type_hint=str,
                required=False,
                description="Path to input video file. When provided, image is ignored.",
            ),
            # NOTE(review): declared but never read by __call__; kept for
            # backward compatibility of the pipeline interface.
            InputParam(
                "output_type",
                type_hint=str,
                default="depth_image",
                description="Reserved for future use; currently has no effect.",
            ),
            InputParam(
                "colormap",
                type_hint=str,
                default="grayscale",
                description="Depth visualization format: 'grayscale' or 'turbo' (colormapped)",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values this block writes onto the pipeline state."""
        return [
            OutputParam(
                "depth_image",
                type_hint=Image.Image,
                description="Normalized depth map as a grayscale PIL image (single image mode)",
            ),
            OutputParam(
                "predicted_depth",
                type_hint=torch.Tensor,
                description="Raw metric depth tensor (H x W) (single image mode)",
            ),
            OutputParam(
                "field_of_view",
                type_hint=float,
                description="Estimated horizontal field of view (single image mode)",
            ),
            OutputParam(
                "focal_length",
                type_hint=float,
                description="Estimated focal length (single image mode)",
            ),
            OutputParam(
                "depth_frames",
                type_hint=list,
                description="List of per-frame depth PIL images (video mode)",
            ),
            OutputParam(
                "fps",
                type_hint=float,
                description="Source video frame rate (video mode)",
            ),
        ]

    def _estimate_depth(self, image: Image.Image, processor, model) -> dict:
        """Run Depth Pro on one image.

        Returns the post-processed result dict for the image, containing
        ``predicted_depth`` (metric depth tensor resized to the input size)
        plus ``field_of_view`` / ``focal_length`` when the processor provides
        them. (The previous ``-> np.ndarray`` annotation was incorrect.)
        """
        inputs = processor(images=[image], return_tensors="pt").to(model.device)
        outputs = model(**inputs)
        post_processed = processor.post_process_depth_estimation(
            outputs, target_sizes=[(image.height, image.width)]
        )
        return post_processed[0]

    def _normalize_depth(self, depth: np.ndarray) -> np.ndarray:
        """Map metric depth to [0, 1] via clipped inverse depth.

        Clipping to [0.1m, 250m] before inverting (per Apple's reference
        implementation) keeps sky/infinity values from crushing near-field
        detail. The epsilon guards against division by zero on constant maps.
        """
        inverse_depth = 1.0 / np.clip(depth, 0.1, 250.0)
        inv_min = inverse_depth.min()
        inv_max = inverse_depth.max()
        return (inverse_depth - inv_min) / (inv_max - inv_min + 1e-8)

    def _apply_colormap(self, normalized: np.ndarray, mode: str) -> np.ndarray:
        """Convert a [0, 1] depth map to uint8 pixels (RGB turbo or grayscale)."""
        if mode == "turbo":
            # Drop the alpha channel produced by matplotlib colormaps.
            colored = (TURBO_CMAP(normalized)[..., :3] * 255).astype(np.uint8)
            return colored
        return (normalized * 255.0).astype(np.uint8)

    def _depth_to_image(self, depth_np: np.ndarray, colormap: str) -> Image.Image:
        """Normalize a metric depth map and render it as a PIL image.

        Shared by image and video modes (previously duplicated in both paths).
        """
        normalized = self._normalize_depth(depth_np)
        colored = self._apply_colormap(normalized, colormap)
        pil_mode = "RGB" if colormap == "turbo" else "L"
        return Image.fromarray(colored, mode=pil_mode)

    def _process_video(self, video_path, processor, model, colormap):
        """Estimate depth for every frame of a video.

        Returns ``(depth_frames, fps)`` where ``depth_frames`` is a list of
        PIL depth visualizations and ``fps`` is the source frame rate.
        """
        input_container = av.open(video_path)
        try:
            video_stream = input_container.streams.video[0]
            fps = video_stream.average_rate
            if fps is None:
                # Some containers carry no frame-rate metadata; fall back to
                # PyAV's guessed rate so float(fps) downstream cannot fail.
                fps = video_stream.guessed_rate or 30

            depth_frames = []
            for frame in input_container.decode(video=0):
                pil_image = frame.to_image().convert("RGB")
                result = self._estimate_depth(pil_image, processor, model)
                depth_np = result["predicted_depth"].float().cpu().numpy()
                depth_frames.append(self._depth_to_image(depth_np, colormap))
        finally:
            # Close the container even if decoding or inference fails.
            input_container.close()

        return depth_frames, fps

    @torch.no_grad()
    def __call__(self, components, state: PipelineState):
        """Populate the pipeline state with depth-estimation outputs.

        Video mode takes precedence when ``video_path`` is set; otherwise
        image mode runs. Returns ``(components, state)`` as required by the
        Modular Diffusers block protocol (the previous ``-> PipelineState``
        annotation did not match the actual return value).
        """
        block_state = self.get_block_state(state)

        processor = components.depth_estimator_processor
        model = components.depth_estimator

        video_path = getattr(block_state, "video_path", None)

        if video_path:
            depth_frames, fps = self._process_video(
                video_path, processor, model, block_state.colormap
            )
            block_state.depth_frames = depth_frames
            block_state.fps = float(fps)
            block_state.depth_image = None
            block_state.predicted_depth = None
            block_state.field_of_view = None
            block_state.focal_length = None
        else:
            image = block_state.image
            if not isinstance(image, list):
                image = [image]

            # NOTE(review): only the first image is processed; the outputs
            # below are single-image values even when a list is passed.
            result = self._estimate_depth(image[0], processor, model)
            predicted_depth = result["predicted_depth"]

            block_state.predicted_depth = predicted_depth
            block_state.field_of_view = result.get("field_of_view")
            block_state.focal_length = result.get("focal_length")

            depth_np = predicted_depth.float().cpu().numpy()
            block_state.depth_image = self._depth_to_image(
                depth_np, block_state.colormap
            )

            block_state.depth_frames = None
            block_state.fps = None

        self.set_block_state(state, block_state)

        return components, state
modular_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "ModularPipelineBlocks": "block.DepthProEstimatorBlock"
4
+ }
5
+ }
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "depth-pro-estimator"
3
+ version = "0.1.0"
4
+ description = "Modular Diffusers custom block for monocular depth estimation using Apple Depth Pro"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "accelerate>=1.0.0",
9
+ "av>=12.0.0",
10
+ "diffusers>=0.37.0",
11
+ "matplotlib>=3.7.0",
12
+ "pillow>=10.0.0",
13
+ "torch>=2.9.0",
14
+ "torchvision>=0.16.0",
15
+ "transformers>=5.1.0",
16
+ ]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch>=2.9.0
2
+ torchvision>=0.16.0
3
+ transformers>=5.1.0
4
+ diffusers>=0.37.0
5
+ accelerate>=1.0.0
6
+ av>=12.0.0
7
+ matplotlib>=3.7.0
8
+ pillow>=10.0.0