OzzyGT HF Staff commited on
Commit
b36de4d
·
1 Parent(s): 05ce239

initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +10 -0
  2. README.md +119 -3
  3. block.py +195 -0
  4. modular_config.json +5 -0
  5. pyproject.toml +16 -0
  6. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
README.md CHANGED
@@ -1,3 +1,119 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # Depth Anything V2 Estimator Block
6
+
7
+ A custom [Modular Diffusers](https://huggingface.co/docs/diffusers/modular_diffusers/overview) block for monocular depth estimation using [Depth Anything V2](https://huggingface.co/depth-anything/Depth-Anything-V2-Large-hf). Supports both images and videos.
8
+
9
+ ## Features
10
+
11
+ - **Relative depth estimation** using Depth Anything V2 (Large variant, 335M params)
12
+ - **Image and video** input support
13
+ - **Grayscale or turbo colormap** visualization
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ # Using uv
19
+ uv sync
20
+
21
+ # Using pip
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ### Load the block
28
+
29
+ ```python
30
+ from diffusers import ModularPipelineBlocks
31
+ import torch
32
+
33
+ blocks = ModularPipelineBlocks.from_pretrained(
34
+ "your-username/depth-anything-v2-estimator", # or local path "."
35
+ trust_remote_code=True,
36
+ )
37
+ pipeline = blocks.init_pipeline()
38
+ pipeline.load_components(torch_dtype=torch.float16)
39
+ pipeline.to("cuda")
40
+ ```
41
+
42
+ ### Single image - grayscale depth
43
+
44
+ ```python
45
+ from PIL import Image
46
+
47
+ image = Image.open("photo.jpg")
48
+ output = pipeline(image=image)
49
+
50
+ # Save depth map
51
+ output.depth_image.save("photo_depth.png")
52
+
53
+ # Access raw relative depth tensor
54
+ print(output.predicted_depth.shape) # (H, W)
55
+ ```
56
+
57
+ ### Single image - turbo colormap
58
+
59
+ ```python
60
+ output = pipeline(image=image, colormap="turbo")
61
+ output.depth_image.save("photo_depth_turbo.png")
62
+ ```
63
+
64
+ ### Video - grayscale depth
65
+
66
+ ```python
67
+ from block import save_video
68
+
69
+ output = pipeline(video_path="input.mp4", colormap="grayscale")
70
+ save_video(output.depth_frames, output.fps, "output_depth.mp4")
71
+ ```
72
+
73
+ ### Video - turbo colormap
74
+
75
+ ```python
76
+ output = pipeline(video_path="input.mp4", colormap="turbo")
77
+ save_video(output.depth_frames, output.fps, "output_depth_turbo.mp4")
78
+ ```
79
+
80
+ ## Inputs
81
+
82
+ | Parameter | Type | Default | Description |
83
+ |-----------|------|---------|-------------|
84
+ | `image` | `PIL.Image` | - | Image to estimate depth for |
85
+ | `video_path` | `str` | - | Path to input video. When provided, `image` is ignored |
86
+ | `colormap` | `str` | `"grayscale"` | `"grayscale"` or `"turbo"` (colormapped) |
87
+
88
+ ## Outputs
89
+
90
+ ### Image mode
91
+
92
+ | Output | Type | Description |
93
+ |--------|------|-------------|
94
+ | `depth_image` | `PIL.Image` | Normalized depth visualization |
95
+ | `predicted_depth` | `torch.Tensor` | Raw relative depth (H x W) |
96
+
97
+ ### Video mode
98
+
99
+ | Output | Type | Description |
100
+ |--------|------|-------------|
101
+ | `depth_frames` | `List[PIL.Image]` | Per-frame depth visualizations |
102
+ | `fps` | `float` | Source video frame rate |
103
+
104
+ ## Depth Normalization
105
+
106
+ Depth values are min-max normalized and inverted so that bright areas represent nearby surfaces and dark areas represent distant ones.
107
+
108
+ - **Bright = close**, **dark = far** (grayscale)
109
+ - **Warm (red/yellow) = close**, **cool (blue) = far** (turbo)
110
+
111
+ ## Model Variants
112
+
113
+ The block defaults to `depth-anything/Depth-Anything-V2-Large-hf`. Other available variants:
114
+
115
+ | Variant | Model ID | Params |
116
+ |---------|----------|--------|
117
+ | Small | `depth-anything/Depth-Anything-V2-Small-hf` | 24.8M |
118
+ | Base | `depth-anything/Depth-Anything-V2-Base-hf` | 97.5M |
119
+ | **Large** (default) | `depth-anything/Depth-Anything-V2-Large-hf` | 335M |
block.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ import av
4
+ import numpy as np
5
+ import torch
6
+ from diffusers.modular_pipelines import (
7
+ ComponentSpec,
8
+ InputParam,
9
+ ModularPipelineBlocks,
10
+ OutputParam,
11
+ PipelineState,
12
+ )
13
+ from matplotlib import colormaps
14
+ from PIL import Image
15
+ from transformers import DepthAnythingForDepthEstimation, DPTImageProcessor
16
+
# Matplotlib "turbo" colormap, resolved once at import time and reused for
# every colormapped depth frame.
TURBO_CMAP = colormaps["turbo"]
18
+
19
+
20
def save_video(frames: List[Image.Image], fps: float, output_path: str) -> None:
    """Save a list of PIL Image frames as an H.264 MP4 video.

    Args:
        frames: Non-empty list of equally sized PIL images, one per frame.
        fps: Source frame rate. Fractional rates (e.g. 29.97) are preserved
            instead of being truncated to an integer.
        output_path: Destination file path; the container format is inferred
            by PyAV from the extension.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    from fractions import Fraction

    if not frames:
        raise ValueError("frames must contain at least one image")

    container = av.open(output_path, mode="w")
    try:
        # Use an exact rational rate rather than int(fps): truncating 29.97
        # to 29 would subtly change playback speed and duration.
        stream = container.add_stream(
            "libx264", rate=Fraction(fps).limit_denominator(1_000_000)
        )
        stream.pix_fmt = "yuv420p"
        stream.width = frames[0].width
        stream.height = frames[0].height

        for frame in frames:
            video_frame = av.VideoFrame.from_image(frame)
            for packet in stream.encode(video_frame):
                container.mux(packet)

        # Flush any packets still buffered inside the encoder.
        for packet in stream.encode():
            container.mux(packet)
    finally:
        # Close even if encoding fails, so a partial file isn't left locked.
        container.close()
36
+
37
+
38
class DepthAnythingV2EstimatorBlock(ModularPipelineBlocks):
    """Modular Diffusers block for monocular relative depth estimation.

    Uses Depth Anything V2 (Large) to produce a relative depth map for a
    single PIL image or for every frame of a video file. Depth values are
    min-max normalized and inverted (bright = close, dark = far), then
    rendered either as grayscale or with matplotlib's "turbo" colormap.
    """

    # Minimum package versions this block was written against.
    _requirements = {
        "transformers": ">=5.1.0",
        "torch": ">=2.9.0",
        "torchvision": ">=0.16.0",
        "av": ">=12.0.0",
        "matplotlib": ">=3.7.0",
    }

    @property
    def expected_components(self) -> List[ComponentSpec]:
        # The estimator model and its preprocessor come from the same repo.
        return [
            ComponentSpec(
                name="depth_estimator",
                type_hint=DepthAnythingForDepthEstimation,
                pretrained_model_name_or_path="depth-anything/Depth-Anything-V2-Large-hf",
            ),
            ComponentSpec(
                name="depth_estimator_processor",
                type_hint=DPTImageProcessor,
                pretrained_model_name_or_path="depth-anything/Depth-Anything-V2-Large-hf",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=False,
                description="Image(s) to estimate depth for",
            ),
            InputParam(
                "video_path",
                type_hint=str,
                required=False,
                description="Path to input video file. When provided, image is ignored.",
            ),
            InputParam(
                "colormap",
                type_hint=str,
                default="grayscale",
                description="Depth visualization format: 'grayscale' or 'turbo' (colormapped)",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        # Image-mode outputs are None in video mode and vice versa.
        return [
            OutputParam(
                "depth_image",
                type_hint=Image.Image,
                description="Normalized depth map as a PIL image (single image mode)",
            ),
            OutputParam(
                "predicted_depth",
                type_hint=torch.Tensor,
                description="Raw relative depth tensor (H x W) (single image mode)",
            ),
            OutputParam(
                "depth_frames",
                type_hint=list,
                description="List of per-frame depth PIL images (video mode)",
            ),
            OutputParam(
                "fps",
                type_hint=float,
                description="Source video frame rate (video mode)",
            ),
        ]

    def _estimate_depth(self, image: Image.Image, processor, model) -> dict:
        """Run the model on one image and resize the depth map back to the
        image's original resolution. Returns the first (only) post-processed
        result dict, containing at least 'predicted_depth'."""
        inputs = processor(images=[image], return_tensors="pt").to(model.device)
        outputs = model(**inputs)
        post_processed = processor.post_process_depth_estimation(
            outputs, target_sizes=[(image.height, image.width)]
        )
        return post_processed[0]

    def _normalize_depth(self, depth: np.ndarray) -> np.ndarray:
        """Min-max normalize a depth array to [0, 1] and invert it.

        The epsilon keeps the division finite for a constant-depth input.
        """
        d_min = depth.min()
        d_max = depth.max()
        normalized = (depth - d_min) / (d_max - d_min + 1e-8)
        # Invert so bright = close, dark = far (consistent with depth_pro)
        return 1.0 - normalized

    def _apply_colormap(self, normalized: np.ndarray, mode: str) -> np.ndarray:
        """Render a normalized depth array as uint8 pixels.

        'turbo' yields an (H, W, 3) RGB array; any other value (including the
        default 'grayscale') yields an (H, W) luminance array.
        """
        if mode == "turbo":
            # Drop the alpha channel produced by the matplotlib colormap.
            colored = (TURBO_CMAP(normalized)[..., :3] * 255).astype(np.uint8)
            return colored
        return (normalized * 255.0).astype(np.uint8)

    def _process_video(self, video_path, processor, model, colormap):
        """Estimate depth for every frame of a video.

        Returns:
            Tuple of (list of per-frame PIL depth images, source frame rate).
        """
        input_container = av.open(video_path)
        try:
            video_stream = input_container.streams.video[0]
            # NOTE(review): average_rate can be None for some variable-rate
            # streams per PyAV docs — callers convert with float(), which
            # would then raise; confirm inputs are constant-rate.
            fps = video_stream.average_rate

            depth_frames = []
            for frame in input_container.decode(video=0):
                pil_image = frame.to_image().convert("RGB")

                result = self._estimate_depth(pil_image, processor, model)
                depth_np = result["predicted_depth"].float().cpu().numpy()
                normalized = self._normalize_depth(depth_np)
                colored = self._apply_colormap(normalized, colormap)

                pil_mode = "RGB" if colormap == "turbo" else "L"
                depth_frames.append(Image.fromarray(colored, mode=pil_mode))
        finally:
            # Release the demuxer even if the model raises mid-video.
            input_container.close()

        return depth_frames, fps

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        """Dispatch to video or image mode and write results to the state.

        Raises:
            ValueError: If neither ``image`` nor ``video_path`` is provided.
        """
        block_state = self.get_block_state(state)

        processor = components.depth_estimator_processor
        model = components.depth_estimator

        video_path = getattr(block_state, "video_path", None)

        if video_path:
            depth_frames, fps = self._process_video(
                video_path, processor, model, block_state.colormap
            )
            block_state.depth_frames = depth_frames
            block_state.fps = float(fps)
            # Image-mode outputs are explicitly cleared in video mode.
            block_state.depth_image = None
            block_state.predicted_depth = None
        else:
            image = block_state.image
            if image is None:
                # Fail loudly instead of crashing deep inside the processor.
                raise ValueError(
                    "Either `image` or `video_path` must be provided."
                )
            if not isinstance(image, list):
                image = [image]

            # NOTE: only the first image is processed; batched list input is
            # accepted but not yet supported beyond image[0].
            result = self._estimate_depth(image[0], processor, model)
            predicted_depth = result["predicted_depth"]

            block_state.predicted_depth = predicted_depth

            depth_np = predicted_depth.float().cpu().numpy()
            normalized = self._normalize_depth(depth_np)
            colored = self._apply_colormap(normalized, block_state.colormap)
            if block_state.colormap == "turbo":
                block_state.depth_image = Image.fromarray(colored, mode="RGB")
            else:
                block_state.depth_image = Image.fromarray(colored, mode="L")

            # Video-mode outputs are explicitly cleared in image mode.
            block_state.depth_frames = None
            block_state.fps = None

        self.set_block_state(state, block_state)

        return components, state
modular_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "ModularPipelineBlocks": "block.DepthAnythingV2EstimatorBlock"
4
+ }
5
+ }
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "depth-anything-v2-estimator"
3
+ version = "0.1.0"
4
+ description = "Modular Diffusers custom block for monocular depth estimation using Depth Anything V2"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "accelerate>=1.0.0",
9
+ "av>=12.0.0",
10
+ "diffusers>=0.37.0",
11
+ "matplotlib>=3.7.0",
12
+ "pillow>=10.0.0",
13
+ "torch>=2.9.0",
14
+ "torchvision>=0.16.0",
15
+ "transformers>=5.1.0",
16
+ ]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch>=2.9.0
2
+ torchvision>=0.16.0
3
+ transformers>=5.1.0
4
+ diffusers>=0.37.0
5
+ accelerate>=1.0.0
6
+ av>=12.0.0
7
+ matplotlib>=3.7.0
8
+ pillow>=10.0.0