OzzyGT HF Staff commited on
Commit
b36de4d
·
1 Parent(s): 05ce239

initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +10 -0
  2. README.md +119 -3
  3. block.py +195 -0
  4. modular_config.json +5 -0
  5. pyproject.toml +16 -0
  6. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
README.md CHANGED
@@ -1,3 +1,119 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # Depth Anything V2 Estimator Block
6
+
7
+ A custom [Modular Diffusers](https://huggingface.co/docs/diffusers/modular_diffusers/overview) block for monocular depth estimation using [Depth Anything V2](https://huggingface.co/depth-anything/Depth-Anything-V2-Large-hf). Supports both images and videos.
8
+
9
+ ## Features
10
+
11
+ - **Relative depth estimation** using Depth Anything V2 (Large variant, 335M params)
12
+ - **Image and video** input support
13
+ - **Grayscale or turbo colormap** visualization
14
+
15
+ ## Installation
16
+
17
+ ```bash
18
+ # Using uv
19
+ uv sync
20
+
21
+ # Using pip
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ### Load the block
28
+
29
+ ```python
30
+ from diffusers import ModularPipelineBlocks
31
+ import torch
32
+
33
+ blocks = ModularPipelineBlocks.from_pretrained(
34
+ "your-username/depth-anything-v2-estimator", # or local path "."
35
+ trust_remote_code=True,
36
+ )
37
+ pipeline = blocks.init_pipeline()
38
+ pipeline.load_components(torch_dtype=torch.float16)
39
+ pipeline.to("cuda")
40
+ ```
41
+
42
+ ### Single image - grayscale depth
43
+
44
+ ```python
45
+ from PIL import Image
46
+
47
+ image = Image.open("photo.jpg")
48
+ output = pipeline(image=image)
49
+
50
+ # Save depth map
51
+ output.depth_image.save("photo_depth.png")
52
+
53
+ # Access raw relative depth tensor
54
+ print(output.predicted_depth.shape) # (H, W)
55
+ ```
56
+
57
+ ### Single image - turbo colormap
58
+
59
+ ```python
60
+ output = pipeline(image=image, colormap="turbo")
61
+ output.depth_image.save("photo_depth_turbo.png")
62
+ ```
63
+
64
+ ### Video - grayscale depth
65
+
66
+ ```python
67
+ from block import save_video
68
+
69
+ output = pipeline(video_path="input.mp4", colormap="grayscale")
70
+ save_video(output.depth_frames, output.fps, "output_depth.mp4")
71
+ ```
72
+
73
+ ### Video - turbo colormap
74
+
75
+ ```python
76
+ output = pipeline(video_path="input.mp4", colormap="turbo")
77
+ save_video(output.depth_frames, output.fps, "output_depth_turbo.mp4")
78
+ ```
79
+
80
+ ## Inputs
81
+
82
+ | Parameter | Type | Default | Description |
83
+ |-----------|------|---------|-------------|
84
+ | `image` | `PIL.Image` | - | Image to estimate depth for |
85
+ | `video_path` | `str` | - | Path to input video. When provided, `image` is ignored |
86
+ | `colormap` | `str` | `"grayscale"` | `"grayscale"` or `"turbo"` (colormapped) |
87
+
88
+ ## Outputs
89
+
90
+ ### Image mode
91
+
92
+ | Output | Type | Description |
93
+ |--------|------|-------------|
94
+ | `depth_image` | `PIL.Image` | Normalized depth visualization |
95
+ | `predicted_depth` | `torch.Tensor` | Raw relative depth (H x W) |
96
+
97
+ ### Video mode
98
+
99
+ | Output | Type | Description |
100
+ |--------|------|-------------|
101
+ | `depth_frames` | `List[PIL.Image]` | Per-frame depth visualizations |
102
+ | `fps` | `float` | Source video frame rate |
103
+
104
+ ## Depth Normalization
105
+
106
+ Depth values are min-max normalized and inverted so that bright areas represent nearby surfaces and dark areas represent distant ones.
107
+
108
+ - **Bright = close**, **dark = far** (grayscale)
109
+ - **Warm (red/yellow) = close**, **cool (blue) = far** (turbo)
110
+
111
+ ## Model Variants
112
+
113
+ The block defaults to `depth-anything/Depth-Anything-V2-Large-hf`. Other available variants:
114
+
115
+ | Variant | Model ID | Params |
116
+ |---------|----------|--------|
117
+ | Small | `depth-anything/Depth-Anything-V2-Small-hf` | 24.8M |
118
+ | Base | `depth-anything/Depth-Anything-V2-Base-hf` | 97.5M |
119
+ | **Large** (default) | `depth-anything/Depth-Anything-V2-Large-hf` | 335M |
block.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+
3
+ import av
4
+ import numpy as np
5
+ import torch
6
+ from diffusers.modular_pipelines import (
7
+ ComponentSpec,
8
+ InputParam,
9
+ ModularPipelineBlocks,
10
+ OutputParam,
11
+ PipelineState,
12
+ )
13
+ from matplotlib import colormaps
14
+ from PIL import Image
15
+ from transformers import DepthAnythingForDepthEstimation, DPTImageProcessor
16
+
# Matplotlib "turbo" colormap, resolved once at import time and reused for
# every colormapped depth frame.
TURBO_CMAP = colormaps["turbo"]
18
+
19
+
20
def save_video(frames: List[Image.Image], fps: float, output_path: str) -> None:
    """Save a list of PIL Image frames as an H.264 MP4 video.

    Args:
        frames: Non-empty list of equally sized PIL images, one per frame.
        fps: Source frame rate. Fractional rates (e.g. 29.97) are preserved
            instead of being truncated to an integer.
        output_path: Destination file path; the container format is inferred
            by PyAV from the extension.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    from fractions import Fraction

    if not frames:
        raise ValueError("frames must contain at least one image")

    container = av.open(output_path, mode="w")
    try:
        # Use an exact rational rate rather than int(fps): truncating 29.97
        # to 29 would subtly change playback speed and duration.
        stream = container.add_stream(
            "libx264", rate=Fraction(fps).limit_denominator(1_000_000)
        )
        stream.pix_fmt = "yuv420p"
        stream.width = frames[0].width
        stream.height = frames[0].height

        for frame in frames:
            video_frame = av.VideoFrame.from_image(frame)
            for packet in stream.encode(video_frame):
                container.mux(packet)

        # Flush any packets still buffered inside the encoder.
        for packet in stream.encode():
            container.mux(packet)
    finally:
        # Close even if encoding fails, so a partial file isn't left locked.
        container.close()
36
+
37
+
38
class DepthAnythingV2EstimatorBlock(ModularPipelineBlocks):
    """Modular Diffusers block for monocular relative depth estimation.

    Uses Depth Anything V2 (Large) to produce a relative depth map for a
    single PIL image or for every frame of a video file. Depth values are
    min-max normalized and inverted (bright = close, dark = far), then
    rendered either as grayscale or with matplotlib's "turbo" colormap.
    """

    # Minimum package versions this block was written against.
    _requirements = {
        "transformers": ">=5.1.0",
        "torch": ">=2.9.0",
        "torchvision": ">=0.16.0",
        "av": ">=12.0.0",
        "matplotlib": ">=3.7.0",
    }

    @property
    def expected_components(self) -> List[ComponentSpec]:
        # The estimator model and its preprocessor come from the same repo.
        return [
            ComponentSpec(
                name="depth_estimator",
                type_hint=DepthAnythingForDepthEstimation,
                pretrained_model_name_or_path="depth-anything/Depth-Anything-V2-Large-hf",
            ),
            ComponentSpec(
                name="depth_estimator_processor",
                type_hint=DPTImageProcessor,
                pretrained_model_name_or_path="depth-anything/Depth-Anything-V2-Large-hf",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=False,
                description="Image(s) to estimate depth for",
            ),
            InputParam(
                "video_path",
                type_hint=str,
                required=False,
                description="Path to input video file. When provided, image is ignored.",
            ),
            InputParam(
                "colormap",
                type_hint=str,
                default="grayscale",
                description="Depth visualization format: 'grayscale' or 'turbo' (colormapped)",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        # Image-mode outputs are None in video mode and vice versa.
        return [
            OutputParam(
                "depth_image",
                type_hint=Image.Image,
                description="Normalized depth map as a PIL image (single image mode)",
            ),
            OutputParam(
                "predicted_depth",
                type_hint=torch.Tensor,
                description="Raw relative depth tensor (H x W) (single image mode)",
            ),
            OutputParam(
                "depth_frames",
                type_hint=list,
                description="List of per-frame depth PIL images (video mode)",
            ),
            OutputParam(
                "fps",
                type_hint=float,
                description="Source video frame rate (video mode)",
            ),
        ]

    def _estimate_depth(self, image: Image.Image, processor, model) -> dict:
        """Run the model on one image and resize the depth map back to the
        image's original resolution. Returns the first (only) post-processed
        result dict, containing at least 'predicted_depth'."""
        inputs = processor(images=[image], return_tensors="pt").to(model.device)
        outputs = model(**inputs)
        post_processed = processor.post_process_depth_estimation(
            outputs, target_sizes=[(image.height, image.width)]
        )
        return post_processed[0]

    def _normalize_depth(self, depth: np.ndarray) -> np.ndarray:
        """Min-max normalize a depth array to [0, 1] and invert it.

        The epsilon keeps the division finite for a constant-depth input.
        """
        d_min = depth.min()
        d_max = depth.max()
        normalized = (depth - d_min) / (d_max - d_min + 1e-8)
        # Invert so bright = close, dark = far (consistent with depth_pro)
        return 1.0 - normalized

    def _apply_colormap(self, normalized: np.ndarray, mode: str) -> np.ndarray:
        """Render a normalized depth array as uint8 pixels.

        'turbo' yields an (H, W, 3) RGB array; any other value (including the
        default 'grayscale') yields an (H, W) luminance array.
        """
        if mode == "turbo":
            # Drop the alpha channel produced by the matplotlib colormap.
            colored = (TURBO_CMAP(normalized)[..., :3] * 255).astype(np.uint8)
            return colored
        return (normalized * 255.0).astype(np.uint8)

    def _process_video(self, video_path, processor, model, colormap):
        """Estimate depth for every frame of a video.

        Returns:
            Tuple of (list of per-frame PIL depth images, source frame rate).
        """
        input_container = av.open(video_path)
        try:
            video_stream = input_container.streams.video[0]
            # NOTE(review): average_rate can be None for some variable-rate
            # streams per PyAV docs — callers convert with float(), which
            # would then raise; confirm inputs are constant-rate.
            fps = video_stream.average_rate

            depth_frames = []
            for frame in input_container.decode(video=0):
                pil_image = frame.to_image().convert("RGB")

                result = self._estimate_depth(pil_image, processor, model)
                depth_np = result["predicted_depth"].float().cpu().numpy()
                normalized = self._normalize_depth(depth_np)
                colored = self._apply_colormap(normalized, colormap)

                pil_mode = "RGB" if colormap == "turbo" else "L"
                depth_frames.append(Image.fromarray(colored, mode=pil_mode))
        finally:
            # Release the demuxer even if the model raises mid-video.
            input_container.close()

        return depth_frames, fps

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        """Dispatch to video or image mode and write results to the state.

        Raises:
            ValueError: If neither ``image`` nor ``video_path`` is provided.
        """
        block_state = self.get_block_state(state)

        processor = components.depth_estimator_processor
        model = components.depth_estimator

        video_path = getattr(block_state, "video_path", None)

        if video_path:
            depth_frames, fps = self._process_video(
                video_path, processor, model, block_state.colormap
            )
            block_state.depth_frames = depth_frames
            block_state.fps = float(fps)
            # Image-mode outputs are explicitly cleared in video mode.
            block_state.depth_image = None
            block_state.predicted_depth = None
        else:
            image = block_state.image
            if image is None:
                # Fail loudly instead of crashing deep inside the processor.
                raise ValueError(
                    "Either `image` or `video_path` must be provided."
                )
            if not isinstance(image, list):
                image = [image]

            # NOTE: only the first image is processed; batched list input is
            # accepted but not yet supported beyond image[0].
            result = self._estimate_depth(image[0], processor, model)
            predicted_depth = result["predicted_depth"]

            block_state.predicted_depth = predicted_depth

            depth_np = predicted_depth.float().cpu().numpy()
            normalized = self._normalize_depth(depth_np)
            colored = self._apply_colormap(normalized, block_state.colormap)
            if block_state.colormap == "turbo":
                block_state.depth_image = Image.fromarray(colored, mode="RGB")
            else:
                block_state.depth_image = Image.fromarray(colored, mode="L")

            # Video-mode outputs are explicitly cleared in image mode.
            block_state.depth_frames = None
            block_state.fps = None

        self.set_block_state(state, block_state)

        return components, state
modular_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "ModularPipelineBlocks": "block.DepthAnythingV2EstimatorBlock"
4
+ }
5
+ }
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "depth-anything-v2-estimator"
3
+ version = "0.1.0"
4
+ description = "Modular Diffusers custom block for monocular depth estimation using Depth Anything V2"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "accelerate>=1.0.0",
9
+ "av>=12.0.0",
10
+ "diffusers>=0.37.0",
11
+ "matplotlib>=3.7.0",
12
+ "pillow>=10.0.0",
13
+ "torch>=2.9.0",
14
+ "torchvision>=0.16.0",
15
+ "transformers>=5.1.0",
16
+ ]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch>=2.9.0
2
+ torchvision>=0.16.0
3
+ transformers>=5.1.0
4
+ diffusers>=0.37.0
5
+ accelerate>=1.0.0
6
+ av>=12.0.0
7
+ matplotlib>=3.7.0
8
+ pillow>=10.0.0