zeyuren2002 committed 9 days ago

Commit

4b7b610

verified ·

1 Parent(s): aca0d59

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

DepthMaster/ckpt/eval/.gitattributes +35 -0
DepthMaster/ckpt/eval/README.md +97 -0
DepthMaster/ckpt/eval/model_index.json +28 -0
DepthMaster/ckpt/eval/text_encoder/config.json +25 -0
DepthMaster/ckpt/eval/tokenizer/merges.txt +0 -0
DepthMaster/ckpt/eval/tokenizer/special_tokens_map.json +24 -0
DepthMaster/ckpt/eval/tokenizer/tokenizer_config.json +34 -0
DepthMaster/ckpt/eval/tokenizer/vocab.json +0 -0
DepthMaster/ckpt/eval/unet/config.json +73 -0
DepthMaster/ckpt/eval/vae/config.json +30 -0
DepthMaster/data_split/kitti/eigen_train_files_with_gt.txt +0 -0
DepthMaster/depthmaster/modules/__pycache__/unet_2d_blocks.cpython-310.pyc +0 -0
DepthMaster/depthmaster/modules/__pycache__/unet_2d_condition_s2.cpython-310.pyc +0 -0
DepthMaster/external_encoder/dinov2/dinov2_layers/attention.py +83 -0
DepthMaster/external_encoder/dinov2/dinov2_layers/block.py +252 -0
DepthMaster/external_encoder/dinov2/dinov2_layers/drop_path.py +35 -0
DepthMaster/external_encoder/dinov2/dinov2_layers/layer_scale.py +28 -0
DepthMaster/external_encoder/dinov2/dinov2_layers/mlp.py +41 -0
DepthMaster/external_encoder/dinov2/dinov2_layers/patch_embed.py +89 -0
DepthMaster/external_encoder/dinov2/dinov2_layers/swiglu_ffn.py +63 -0
DepthMaster/external_encoder/dinov2/util/transform.py +160 -0
DepthMaster/in_the_wild_example/input/06.jpg +0 -0
DepthMaster/scripts/eval_diode.sh +13 -0
DepthMaster/scripts/eval_eth3d.sh +13 -0
DepthMaster/scripts/eval_hypersim.sh +13 -0
DepthMaster/scripts/eval_kitti.sh +13 -0
DepthMaster/scripts/eval_nyu.sh +13 -0
DepthMaster/scripts/eval_scannet.sh +13 -0
DepthMaster/scripts/infer.sh +10 -0
DepthMaster/scripts/train_s1.sh +9 -0
DepthMaster/scripts/train_s2.sh +9 -0
DepthMaster/src/dataset/__init__.py +71 -0
DepthMaster/src/dataset/base_depth_dataset.py +303 -0
DepthMaster/src/dataset/diode_dataset.py +94 -0
DepthMaster/src/dataset/eth3d_dataset.py +68 -0
DepthMaster/src/dataset/hypersim_dataset.py +48 -0
DepthMaster/src/dataset/kitti_dataset.py +127 -0
DepthMaster/src/dataset/mixed_sampler.py +151 -0
DepthMaster/src/dataset/nyu_dataset.py +64 -0
DepthMaster/src/dataset/scannet_dataset.py +47 -0
DepthMaster/src/dataset/vkitti_dataset.py +100 -0
DepthMaster/src/trainer/__init__.py +15 -0
DepthMaster/src/trainer/trainer_s1.py +671 -0
DepthMaster/src/trainer/trainer_s2.py +630 -0
DepthMaster/src/util/alignment.py +180 -0
DepthMaster/src/util/boundary_metrics.py +332 -0
DepthMaster/src/util/build_mlp.py +10 -0
DepthMaster/src/util/config_util.py +70 -0
DepthMaster/src/util/data_loader.py +111 -0
DepthMaster/src/util/depth_transform.py +124 -0

DepthMaster/ckpt/eval/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

DepthMaster/ckpt/eval/README.md ADDED Viewed

	@@ -0,0 +1,97 @@

+---
+license: apache-2.0
+language:
+- en
+base_model:
+- stabilityai/stable-diffusion-2
+pipeline_tag: depth-estimation
+---
+<!-- # DepthMaster: Taming Diffusion Models for Monocular Depth Estimation
+This repository represents the official implementation of the paper titled "DepthMaster: Taming Diffusion Models for Monocular Depth Estimation". -->
+<!-- [![Website](doc/badges/badge-website.svg)](https://marigoldmonodepth.github.io)
+[![Paper](https://img.shields.io/badge/arXiv-PDF-b31b1b)](https://arxiv.org/abs/2312.02145) -->
+<!-- [![License](https://img.shields.io/badge/License-Apache--2.0-929292)](https://www.apache.org/licenses/LICENSE-2.0) -->
+<h1 align="center"><strong>DepthMaster: Taming Diffusion Models for Monocular Depth Estimation</strong></h1>
+    <p align="center">
+        <a href="https://indu1ge.github.io/ziyangsong">Ziyang Song*</a>,
+        <a href="https://orcid.org/0009-0001-6677-0572">Zerong Wang*</a>,
+        <a href="https://orcid.org/0000-0001-7817-0665">Bo Li</a>,
+        <a href="https://orcid.org/0009-0007-1175-5918">Hao Zhang</a>,
+        <a href="https://ruijiezhu94.github.io/ruijiezhu/">Ruijie Zhu</a>,
+        <a href="https://orcid.org/0009-0004-3280-8490">Li Liu</a>,
+        <a href="https://pengtaojiang.github.io/">Peng-Tao Jiang†</a>,
+        <a href="http://staff.ustc.edu.cn/~tzzhang/">Tianzhu Zhang†</a>,
+        <br>
+        *Equal Contribution, †Corresponding Author
+        <br>
+        University of Science and Technology of China, vivo Mobile Communication Co., Ltd.
+        <br>
+        <b>Arxiv 2025</b>
+</p>
+<!-- [Ziyang Song*](https://indu1ge.github.io/ziyangsong),
+[Zerong Wang*](),
+[Bo Li](https://orcid.org/0000-0001-7817-0665),
+[Hao Zhang](https://orcid.org/0009-0007-1175-5918),
+[Ruijie Zhu](https://ruijiezhu94.github.io/ruijiezhu/),
+[Li Liu](https://orcid.org/0009-0004-3280-8490)
+[Tianzhu Zhang](http://staff.ustc.edu.cn/~tzzhang/)
+[Peng-Tao Jiang](https://pengtaojiang.github.io/) -->
+<div align="center">
+  <a href='https://arxiv.org/abs/2501.02576'>
+    <img src='https://img.shields.io/badge/Paper-arXiv-red'>
+  </a>
+  <a href='https://indu1ge.github.io/DepthMaster_page/'>
+    <img src='https://img.shields.io/badge/Project-Page-Green'>
+  </a>
+  <a href='https://github.com/indu1ge/DepthMaster'>
+    <img src='https://img.shields.io/badge/GitHub-Repository-blue?logo=github'>
+  </a>
+  <a href='https://www.apache.org/licenses/LICENSE-2.0'>
+    <img src='https://img.shields.io/badge/License-Apache--2.0-929292'>
+  </a>
+</div>
+<!-- We present Marigold, a diffusion model, and associated fine-tuning protocol for monocular depth estimation. Its core principle is to leverage the rich visual knowledge stored in modern generative image models. Our model, derived from Stable Diffusion and fine-tuned with synthetic data, can zero-shot transfer to unseen data, offering state-of-the-art monocular depth estimation results. -->
+![teaser](assets/framework.png)
+<!-- >We present DepthMaster, a tamed single-step diffusion model designed to enhance the generalization and detail preservation abilities of depth estimation models. Through feature alignment, we effectively prevent the overfitting to texture details. By adaptively enhance  -->
+>We present DepthMaster, a tamed single-step diffusion model that customizes generative features in diffusion models to suit the discriminative depth estimation task. We introduce a Feature Alignment module to mitigate overfitting to texture and a Fourier Enhancement module to refine fine-grained details. DepthMaster exhibits state-of-the-art zero-shot performance and superior detail preservation ability, surpassing
+other diffusion-based methods across various datasets.
+## 🎓 Citation
+Please cite our paper:
+```bibtex
+@article{song2025depthmaster,
+  title={DepthMaster: Taming Diffusion Models for Monocular Depth Estimation},
+  author={Song, Ziyang and Wang, Zerong and Li, Bo and Zhang, Hao and Zhu, Ruijie and Liu, Li and Jiang, Peng-Tao and Zhang, Tianzhu},
+  journal={arXiv preprint arXiv:2501.02576},
+  year={2025}
+}
+```
+## Acknowledgements
+The code is based on [Marigold](https://github.com/prs-eth/Marigold).
+## 🎫 License
+This work is licensed under the Apache License, Version 2.0 (as defined in the [LICENSE](LICENSE.txt)).
+By downloading and using the code and model you agree to the terms in the [LICENSE](LICENSE.txt).
+[![License](https://img.shields.io/badge/License-Apache--2.0-929292)](https://www.apache.org/licenses/LICENSE-2.0)

DepthMaster/ckpt/eval/model_index.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "_class_name":"MarigoldPipeline",
+    "_diffusers_version":"0.24.0",
+    "scale_invariant": true,
+    "shift_invariant": true,
+    "default_denoising_steps": 10,
+    "default_processing_resolution": 768,
+    "unet":[
+       "diffusers",
+       "UNet2DConditionModel"
+    ],
+    "vae":[
+       "diffusers",
+       "AutoencoderKL"
+    ],
+    "scheduler":[
+       "diffusers",
+       "DDIMScheduler"
+    ],
+    "text_encoder":[
+       "transformers",
+       "CLIPTextModel"
+    ],
+    "tokenizer":[
+       "transformers",
+       "CLIPTokenizer"
+    ]
+ }

DepthMaster/ckpt/eval/text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "hf-models/stable-diffusion-v2-768x768/text_encoder",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 1024,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 23,
+  "pad_token_id": 1,
+  "projection_dim": 512,
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.0.dev0",
+  "vocab_size": 49408
+}

DepthMaster/ckpt/eval/tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

DepthMaster/ckpt/eval/tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "!",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

DepthMaster/ckpt/eval/tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 77,
+  "name_or_path": "hf-models/stable-diffusion-v2-768x768/tokenizer",
+  "pad_token": "<|endoftext|>",
+  "special_tokens_map_file": "./special_tokens_map.json",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

DepthMaster/ckpt/eval/tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

DepthMaster/ckpt/eval/unet/config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "/data/vjuicefs_ai_camera_jgroup/11169299/Marigold_rgb2d/log/depth_preprocess/rgb2disp_bs4_sqrt_disp_cos1e-3_0.85/checkpoint/iter_014000/unet",
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 1024,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 96,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": true
+}

DepthMaster/ckpt/eval/vae/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.8.0",
+  "_name_or_path": "hf-models/stable-diffusion-v2-768x768/vae",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 768,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

DepthMaster/data_split/kitti/eigen_train_files_with_gt.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

DepthMaster/depthmaster/modules/__pycache__/unet_2d_blocks.cpython-310.pyc ADDED Viewed

Binary file (67.3 kB). View file

DepthMaster/depthmaster/modules/__pycache__/unet_2d_condition_s2.cpython-310.pyc ADDED Viewed

Binary file (40.9 kB). View file

DepthMaster/external_encoder/dinov2/dinov2_layers/attention.py ADDED Viewed

	@@ -0,0 +1,83 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import logging
+from torch import Tensor
+from torch import nn
+logger = logging.getLogger("dinov2")
+try:
+    from xformers.ops import memory_efficient_attention, unbind, fmha
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    logger.warning("xFormers not available")
+    XFORMERS_AVAILABLE = False
class Attention(nn.Module):
    """Multi-head self-attention over a ``(B, N, C)`` token sequence.

    Args:
        dim: Channel width ``C`` of the input tokens.
        num_heads: Number of attention heads; each head works on
            ``dim // num_heads`` channels.
        qkv_bias: Whether the fused q/k/v projection has a bias term.
        proj_bias: Whether the output projection has a bias term.
        attn_drop: Dropout probability applied to the attention weights.
        proj_drop: Dropout probability applied after the output projection.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        # head_dim ** -0.5; applied to the queries in forward().
        self.scale = (dim // num_heads) ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        """Return the attended tokens, same shape as ``x``."""
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim)
        fused = fused.permute(2, 0, 3, 1, 4)  # (3, B, heads, N, head_dim)
        queries = fused[0] * self.scale
        keys = fused[1]
        values = fused[2]

        weights = (queries @ keys.transpose(-2, -1)).softmax(dim=-1)
        weights = self.attn_drop(weights)

        merged = (weights @ values).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(merged))
class MemEffAttention(Attention):
    """Attention that uses the xFormers memory-efficient kernel when it is importable.

    Without xFormers the parent implementation is used, which cannot take an
    ``attn_bias``.
    """

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if XFORMERS_AVAILABLE:
            batch, tokens, channels = x.shape
            fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, channels // self.num_heads)
            queries, keys, values = unbind(fused, 2)
            attended = memory_efficient_attention(queries, keys, values, attn_bias=attn_bias)
            projected = self.proj(attended.reshape([batch, tokens, channels]))
            return self.proj_drop(projected)

        # Plain-PyTorch fallback: only valid when no attention bias is requested.
        assert attn_bias is None, "xFormers is required for nested tensors usage"
        return super().forward(x)

DepthMaster/external_encoder/dinov2/dinov2_layers/block.py ADDED Viewed

	@@ -0,0 +1,252 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+import logging
+from typing import Callable, List, Any, Tuple, Dict
+import torch
+from torch import nn, Tensor
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+logger = logging.getLogger("dinov2")
+try:
+    from xformers.ops import fmha
+    from xformers.ops import scaled_index_add, index_select_cat
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    logger.warning("xFormers not available")
+    XFORMERS_AVAILABLE = False
class Block(nn.Module):
    """Transformer block with pre-norm attention and feed-forward residual branches.

    Each branch computes ``x + drop_path(layer_scale(f(norm(x))))``.

    Args:
        dim: Token embedding width.
        num_heads: Number of attention heads, passed to ``attn_class``.
        mlp_ratio: Feed-forward hidden width as a multiple of ``dim``.
        qkv_bias: Bias flag for the attention q/k/v projection.
        proj_bias: Bias flag for the attention output projection.
        ffn_bias: Bias flag for the feed-forward layers.
        drop: Dropout rate passed to the attention projection and the
            feed-forward layer.
        attn_drop: Dropout rate on the attention weights.
        init_values: Initial LayerScale value; a falsy value replaces
            LayerScale with ``nn.Identity``.
        drop_path: Stochastic-depth rate used for both residual branches.
        act_layer: Activation factory for the feed-forward layer.
        norm_layer: Normalization factory, called with ``dim``.
        attn_class: Attention module factory.
        ffn_layer: Feed-forward module factory.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()

        # Attention branch.
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        # Feed-forward branch.
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        """Run both residual branches on ``x`` and return the updated tokens."""

        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            # The feed-forward branch uses its own drop-path module.
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x
def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    """Add ``residual_func``'s output to a random subset of the batch.

    A random subset of ``max(int(b * (1 - sample_drop_ratio)), 1)`` samples is
    passed through ``residual_func``; the result is scaled by
    ``b / subset_size`` and added back onto those samples only.

    Args:
        x: Input of shape ``(b, n, d)``.
        residual_func: Residual branch applied to the selected samples.
        sample_drop_ratio: Fraction of the batch that skips the branch.

    Returns:
        Tensor with the same shape as ``x``.
    """
    batch_size, _, _ = x.shape

    # Pick the samples that go through the residual branch (at least one).
    kept = max(int(batch_size * (1 - sample_drop_ratio)), 1)
    picked = torch.randperm(batch_size, device=x.device)[:kept]

    branch_out = residual_func(x[picked]).flatten(1)

    # Scatter-add the rescaled residual back onto the selected rows.
    combined = torch.index_add(
        x.flatten(1),
        0,
        picked,
        branch_out.to(dtype=x.dtype),
        alpha=batch_size / kept,
    )
    return combined.view_as(x)
def get_branges_scales(x, sample_drop_ratio=0.0):
    """Pick a random subset of batch indices and the matching residual scale.

    Returns a tuple ``(brange, scale)``: ``brange`` holds
    ``max(int(b * (1 - sample_drop_ratio)), 1)`` indices drawn from a random
    permutation of the batch, and ``scale`` is ``b / len(brange)``.
    """
    batch_size, _, _ = x.shape
    kept = max(int(batch_size * (1 - sample_drop_ratio)), 1)
    picked = torch.randperm(batch_size, device=x.device)[:kept]
    return picked, batch_size / kept
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Add ``residual`` (scaled by ``residual_scale_factor``) onto rows ``brange`` of ``x``.

    Without a ``scaling_vector`` the inputs are flattened from dim 1 and
    combined with ``torch.index_add``, so the result is 2-D. With a
    ``scaling_vector`` the work is delegated to xFormers' ``scaled_index_add``.
    """
    residual_cast = residual.to(dtype=x.dtype)
    if scaling_vector is not None:
        return scaled_index_add(
            x, brange, residual_cast, scaling=scaling_vector, alpha=residual_scale_factor
        )
    return torch.index_add(
        x.flatten(1), 0, brange, residual_cast.flatten(1), alpha=residual_scale_factor
    )
# Attention biases keyed by the tuple of (batch size, sequence length) pairs they were built for.
attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    Concatenate the tensors of ``x_list`` into one batch-size-1 sequence and
    return it with the matching block-diagonal attention bias.

    The bias is built once per combination of (batch size, sequence length)
    pairs and then served from ``attn_bias_cache``. When ``branges`` is given,
    only the selected batch rows of each tensor are concatenated.
    """
    if branges is None:
        batch_sizes = [x.shape[0] for x in x_list]
    else:
        batch_sizes = [b.shape[0] for b in branges]
    cache_key = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))

    if cache_key not in attn_bias_cache:
        # One sequence length entry per sample of every tensor.
        seqlens = [x.shape[1] for b, x in zip(batch_sizes, x_list) for _ in range(b)]
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[cache_key] = attn_bias

    if branges is None:
        as_single_batch = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(as_single_batch, dim=1)
    else:
        flattened = [x.flatten(1) for x in x_list]
        cat_tensors = index_select_cat(flattened, branges).view(1, -1, x_list[0].shape[-1])

    return attn_bias_cache[cache_key], cat_tensors
def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> List[Tensor]:
    """Stochastic-depth residual update for a list of tensors processed as one sequence.

    Args:
        x_list: Tensors to update; each is unpacked as ``(b, n, d)``.
        residual_func: Residual branch, called as
            ``residual_func(x_cat, attn_bias=attn_bias)``.
        sample_drop_ratio: Fraction of each tensor's batch that skips the branch.
        scaling_vector: Optional per-channel scale forwarded to ``add_residual``.

    Returns:
        A list with one updated tensor per input, each shaped like its input.
    """
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    return [
        add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)
        for x, brange, residual, residual_scale_factor in zip(
            x_list, branges, residual_list, residual_scale_factors
        )
    ]
class NestedTensorBlock(Block):
    """Block that accepts either a single tensor or a list of tensors.

    A list is concatenated into one sequence and run with an xFormers
    block-diagonal attention bias, which requires ``self.attn`` to be a
    ``MemEffAttention``.
    """

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:
            # LayerScale is not applied inside these residual functions; its gamma
            # is handed over as ``scaling_vector`` instead.

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                # Check ls2 itself before reading its gamma.
                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        """Dispatch on input type: a ``Tensor`` uses ``Block.forward``, a ``list`` uses ``forward_nested``.

        Raises:
            AssertionError: If a list is given without xFormers, or the input is
                neither a ``Tensor`` nor a ``list``.
        """
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError(f"Unsupported input type: {type(x_or_x_list).__name__}")

DepthMaster/external_encoder/dinov2/dinov2_layers/drop_path.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+from torch import nn
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    """Randomly zero whole samples of ``x`` with probability ``drop_prob``.

    Returns ``x`` unchanged when not training or when ``drop_prob`` is 0.
    Otherwise one Bernoulli draw is made per sample (dim 0); kept samples are
    divided by the keep probability when that probability is positive.
    """
    if not training or drop_prob == 0.0:
        return x

    keep_prob = 1 - drop_prob
    # One value per sample, broadcast over every remaining dimension.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        mask.div_(keep_prob)
    return x * mask
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # The default drop_prob=None means "no dropping"; passing None on to
        # drop_path() in training mode would evaluate `1 - None` and raise TypeError.
        if self.drop_prob is None:
            return x
        return drop_path(x, self.drop_prob, self.training)

DepthMaster/external_encoder/dinov2/dinov2_layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+from typing import Union
+import torch
+from torch import Tensor
+from torch import nn
class LayerScale(nn.Module):
    """Multiply the input by a learnable vector ``gamma`` of length ``dim``.

    Args:
        dim: Length of ``gamma``.
        init_values: Initial value of ``gamma`` (scalar or tensor).
        inplace: If true, scale the input tensor in place.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(torch.ones(dim) * init_values)

    def forward(self, x: Tensor) -> Tensor:
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma

DepthMaster/external_encoder/dinov2/dinov2_layers/mlp.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+from typing import Callable, Optional
+from torch import Tensor, nn
class Mlp(nn.Module):
    """Two-layer feed-forward network: linear -> activation -> dropout -> linear -> dropout.

    Args:
        in_features: Input width.
        hidden_features: Hidden width; falls back to ``in_features`` when falsy.
        out_features: Output width; falls back to ``in_features`` when falsy.
        act_layer: Activation factory, called with no arguments.
        drop: Dropout probability (one dropout module, used after both layers).
        bias: Whether the linear layers have bias terms.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        hidden_width = hidden_features or in_features
        output_width = out_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_width, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

DepthMaster/external_encoder/dinov2/dinov2_layers/patch_embed.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+from typing import Callable, Optional, Tuple, Union
+from torch import Tensor
+import torch.nn as nn
def make_2tuple(x):
    """Return ``x`` as a pair: a 2-tuple is passed through, an int ``n`` becomes ``(n, n)``."""
    if not isinstance(x, tuple):
        assert isinstance(x, int)
        return (x, x)
    assert len(x) == 2
    return x
class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer factory, called with ``embed_dim``.
            When omitted, ``nn.Identity`` is used.
        flatten_embedding: If True (default) the output is (B, N, D);
            otherwise it is reshaped to (B, H', W', D), where H' and W' are
            the spatial dimensions of the projection output.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        # Non-overlapping patches: kernel size equals stride.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Embed an image batch; H and W must be multiples of the patch size."""
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> int:
        """Return a FLOP estimate for one image of the configured ``img_size``.

        The projection costs one multiply per (output element, input channel,
        kernel position). One extra operation per output element is added for
        the normalization layer, but only when a real layer was configured.
        """
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        # self.norm is never None (it defaults to nn.Identity), so test for the
        # identity placeholder instead of for None.
        if not isinstance(self.norm, nn.Identity):
            flops += Ho * Wo * self.embed_dim
        return flops

DepthMaster/external_encoder/dinov2/dinov2_layers/swiglu_ffn.py ADDED Viewed

	@@ -0,0 +1,63 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Callable, Optional
+from torch import Tensor, nn
+import torch.nn.functional as F
class SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward layer implemented with plain PyTorch ops.

    A single linear layer (``w12``) produces both the gate and the value
    halves; the output is ``w3(silu(gate) * value)``.

    Args:
        in_features: Size of the input's last dimension.
        hidden_features: Width of each half of the gated projection;
            ``in_features`` is used when falsy.
        out_features: Size of the output's last dimension; ``in_features`` is
            used when falsy.
        act_layer: Accepted but not used by this class.
        drop: Accepted but not used by this class.
        bias: Whether both linear layers carry a bias term.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        hidden_dim = hidden_features if hidden_features else in_features
        output_dim = out_features if out_features else in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_dim, bias=bias)
        self.w3 = nn.Linear(hidden_dim, output_dim, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the gated feed-forward transform along the last dimension."""
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(gate) * value)
# Use the SwiGLU implementation from xformers when it can be imported;
# otherwise fall back to the pure-PyTorch SwiGLUFFN defined in this module.
try:
    from xformers.ops import SwiGLU
except ImportError:
    XFORMERS_AVAILABLE = False
    SwiGLU = SwiGLUFFN
else:
    XFORMERS_AVAILABLE = True
class SwiGLUFFNFused(SwiGLU):
    """SwiGLU layer whose hidden width is rescaled and aligned.

    The requested hidden width is scaled by 2/3 (truncated to an int) and then
    rounded up to the next multiple of 8 before being handed to the ``SwiGLU``
    base class.

    Args:
        in_features: Size of the input's last dimension.
        hidden_features: Requested hidden width before rescaling;
            ``in_features`` is used when falsy.
        out_features: Size of the output's last dimension; ``in_features`` is
            used when falsy.
        act_layer: Accepted but not forwarded to the base class.
        drop: Accepted but not forwarded to the base class.
        bias: Forwarded to the base class.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        resolved_out = out_features if out_features else in_features
        requested_hidden = hidden_features if hidden_features else in_features
        two_thirds = int(requested_hidden * 2 / 3)
        # Round up to a multiple of 8.
        aligned_hidden = (two_thirds + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=aligned_hidden,
            out_features=resolved_out,
            bias=bias,
        )

DepthMaster/external_encoder/dinov2/util/transform.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import numpy as np
+import cv2
+import torch.nn.functional as F
class Resize(object):
    """Resize a sample dict to a target size (width, height)."""

    def __init__(
        self,
        width,
        height,
        resize_target=True,
        keep_aspect_ratio=False,
        ensure_multiple_of=1,
        resize_method="lower_bound",
        image_interpolation_method=cv2.INTER_AREA,
    ):
        """Store the resize configuration.

        Args:
            width (int): Desired output width.
            height (int): Desired output height.
            resize_target (bool, optional):
                True: also resize ``sample["depth"]`` and ``sample["mask"]``
                when present. False: resize the image only.
                Defaults to True.
            keep_aspect_ratio (bool, optional):
                True: use one scale factor for both axes. The output may then
                differ from the requested width and height, depending on
                ``resize_method``.
                Defaults to False.
            ensure_multiple_of (int, optional):
                Output width and height are constrained to be multiples of
                this value.
                Defaults to 1.
            resize_method (str, optional):
                "lower_bound": Output will be at least as large as the given size.
                "upper_bound": Output will be at most as large as the given size.
                    (Output size might be smaller than given size.)
                "minimal": Scale as little as possible.
                    (Output size might be smaller than given size.)
                Defaults to "lower_bound".
            image_interpolation_method: Stored on the instance; ``__call__``
                does not read it (the image is resized with bilinear
                ``F.interpolate``).
        """
        self.__width = width
        self.__height = height

        self.__resize_target = resize_target
        self.__keep_aspect_ratio = keep_aspect_ratio
        self.__multiple_of = ensure_multiple_of
        self.__resize_method = resize_method
        self.__image_interpolation_method = image_interpolation_method

    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
        """Snap ``x`` to a multiple of ``ensure_multiple_of``.

        ``x`` is rounded to the nearest multiple. If that exceeds ``max_val``
        the floor multiple is used instead; if the result is then below
        ``min_val`` the ceiling multiple is used.
        """
        step = self.__multiple_of
        ratio = x / step

        y = (np.round(ratio) * step).astype(int)
        if max_val is not None and y > max_val:
            y = (np.floor(ratio) * step).astype(int)
        if y < min_val:
            y = (np.ceil(ratio) * step).astype(int)
        return y

    def get_size(self, width, height):
        """Compute the output ``(new_width, new_height)`` for an input size.

        Raises:
            ValueError: If ``resize_method`` is not one of the supported names.
        """
        scale_h = self.__height / height
        scale_w = self.__width / width
        method = self.__resize_method

        if self.__keep_aspect_ratio:
            # Decide which axis dictates the common scale factor.
            if method == "lower_bound":
                fit_width = scale_w > scale_h
            elif method == "upper_bound":
                fit_width = scale_w < scale_h
            elif method == "minimal":
                fit_width = abs(1 - scale_w) < abs(1 - scale_h)
            else:
                raise ValueError(f"resize_method {self.__resize_method} not implemented")
            shared_scale = scale_w if fit_width else scale_h
            scale_h = shared_scale
            scale_w = shared_scale

        target_h = scale_h * height
        target_w = scale_w * width

        if method == "lower_bound":
            new_height = self.constrain_to_multiple_of(target_h, min_val=self.__height)
            new_width = self.constrain_to_multiple_of(target_w, min_val=self.__width)
        elif method == "upper_bound":
            new_height = self.constrain_to_multiple_of(target_h, max_val=self.__height)
            new_width = self.constrain_to_multiple_of(target_w, max_val=self.__width)
        elif method == "minimal":
            new_height = self.constrain_to_multiple_of(target_h)
            new_width = self.constrain_to_multiple_of(target_w)
        else:
            raise ValueError(f"resize_method {self.__resize_method} not implemented")

        return (new_width, new_height)

    def __call__(self, sample):
        """Resize ``sample`` in place and return ``(sample, (height, width))``.

        The image's last two dimensions are read as (height, width), and the
        image is resized with bilinear ``F.interpolate``. Depth and mask, when
        present and ``resize_target`` is set, go through nearest-neighbour
        ``cv2.resize``.
        """
        image = sample["image"]
        width, height = self.get_size(image.shape[-1], image.shape[-2])

        # F.interpolate takes size as (height, width); cv2.resize takes (width, height).
        sample["image"] = F.interpolate(image, size=(height, width), mode='bilinear', align_corners=False)

        if self.__resize_target:
            cv_size = (width, height)
            if "depth" in sample:
                sample["depth"] = cv2.resize(sample["depth"], cv_size, interpolation=cv2.INTER_NEAREST)
            if "mask" in sample:
                sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), cv_size, interpolation=cv2.INTER_NEAREST)

        return sample, (height, width)
class NormalizeImage(object):
    """Normalize the image of a sample by a given mean and std."""

    def __init__(self, mean, std):
        self.__mean = mean
        self.__std = std

    def __call__(self, sample):
        """Replace ``sample["image"]`` with ``(image - mean) / std`` and return the sample."""
        centered = sample["image"] - self.__mean
        sample["image"] = centered / self.__std
        return sample
class PrepareForNet(object):
    """Prepare a sample for use as network input."""

    def __init__(self):
        pass

    def __call__(self, sample):
        """Convert the sample's arrays in place and return the sample.

        The image is transposed with axes (2, 0, 1), made contiguous and cast
        to float32. ``depth`` and ``mask``, when present, are cast to float32
        and made contiguous.
        """
        channels_first = np.transpose(sample["image"], (2, 0, 1))
        sample["image"] = np.ascontiguousarray(channels_first).astype(np.float32)

        for key in ("depth", "mask"):
            if key in sample:
                as_float = sample[key].astype(np.float32)
                sample[key] = np.ascontiguousarray(as_float)

        return sample

DepthMaster/in_the_wild_example/input/06.jpg ADDED Viewed

DepthMaster/scripts/eval_diode.sh ADDED Viewed

	@@ -0,0 +1,13 @@

#!/usr/bin/env bash
# Evaluate the checkpoint in ckpt/eval on DIODE.
# All paths are relative to the current working directory.
set -e
set -x

# Default to GPU 5, but respect a device selection made by the caller.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"

python evaluate.py \
    --base_data_dir path/to/basedata \
    --dataset_config config/dataset/data_diode_all.yaml \
    --alignment least_square_sqrt_disp \
    --output_dir output/diode/final \
    --checkpoint ckpt/eval \
    --processing_res 640 \
    --seed 1234

DepthMaster/scripts/eval_eth3d.sh ADDED Viewed

	@@ -0,0 +1,13 @@

#!/usr/bin/env bash
# Evaluate the checkpoint in ckpt/eval on ETH3D.
# All paths are relative to the current working directory.
set -e
set -x

# Default to GPU 5, but respect a device selection made by the caller.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"

python evaluate.py \
    --base_data_dir path/to/basedata \
    --dataset_config config/dataset/data_eth3d.yaml \
    --alignment least_square_sqrt_disp \
    --output_dir output/eth3d/final \
    --checkpoint ckpt/eval \
    --processing_res 756 \
    --seed 1234

DepthMaster/scripts/eval_hypersim.sh ADDED Viewed

	@@ -0,0 +1,13 @@

#!/usr/bin/env bash
# Evaluate the checkpoint in ckpt/eval on the Hypersim test split.
# All paths are relative to the current working directory.
set -e
set -x

# Default to GPU 5, but respect a device selection made by the caller.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"

python evaluate.py \
    --base_data_dir path/to/basedata \
    --dataset_config config/dataset/data_hypersim_test.yaml \
    --alignment least_square_sqrt_disp \
    --output_dir output/hypersim/final \
    --checkpoint ckpt/eval \
    --processing_res 0 \
    --seed 1234

DepthMaster/scripts/eval_kitti.sh ADDED Viewed

	@@ -0,0 +1,13 @@

#!/usr/bin/env bash
# Evaluate the checkpoint in ckpt/eval on the KITTI Eigen test split.
# All paths are relative to the current working directory.
set -e
set -x

# Default to GPU 5, but respect a device selection made by the caller.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"

python evaluate.py \
    --base_data_dir path/to/basedata \
    --dataset_config config/dataset/data_kitti_eigen_test.yaml \
    --alignment least_square_sqrt_disp \
    --output_dir output/kitti/final \
    --checkpoint ckpt/eval \
    --processing_res 0 \
    --seed 1234

DepthMaster/scripts/eval_nyu.sh ADDED Viewed

	@@ -0,0 +1,13 @@

#!/usr/bin/env bash
# Evaluate the checkpoint in ckpt/eval on the NYU test split.
# All paths are relative to the current working directory.
set -e
set -x

# Default to GPU 5, but respect a device selection made by the caller.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"

python evaluate.py \
    --base_data_dir path/to/basedata \
    --dataset_config config/dataset/data_nyu_test.yaml \
    --alignment least_square_sqrt_disp \
    --output_dir output/nyu/final1 \
    --checkpoint ckpt/eval \
    --processing_res 0 \
    --seed 1234

DepthMaster/scripts/eval_scannet.sh ADDED Viewed

	@@ -0,0 +1,13 @@

#!/usr/bin/env bash
# Evaluate the checkpoint in ckpt/eval on the ScanNet validation split.
# All paths are relative to the current working directory.
set -e
set -x

# Default to GPU 5, but respect a device selection made by the caller.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"

python evaluate.py \
    --base_data_dir path/to/basedata \
    --dataset_config config/dataset/data_scannet_val.yaml \
    --alignment least_square_sqrt_disp \
    --output_dir output/scannet/final \
    --checkpoint ckpt/eval \
    --processing_res 0 \
    --seed 1234

DepthMaster/scripts/infer.sh ADDED Viewed

	@@ -0,0 +1,10 @@

#!/usr/bin/env bash
# Run inference with the checkpoint in ckpt/eval on the images in
# in_the_wild_example/input.
# All paths are relative to the current working directory.
set -e
set -x

# Default to GPU 5, but respect a device selection made by the caller.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-5}"

python run.py \
    --checkpoint ckpt/eval \
    --processing_res 768 \
    --input_rgb_dir in_the_wild_example/input \
    --output_dir in_the_wild_example/output/final

DepthMaster/scripts/train_s1.sh ADDED Viewed

	@@ -0,0 +1,9 @@

#!/usr/bin/env bash
# Launch stage-1 training with config/train_s1.yaml.
# BASE_DATA_DIR, BASE_CKPT_DIR and CUDA_VISIBLE_DEVICES can be overridden
# from the environment; the values below are only defaults.
set -e

BASE_DATA_DIR="${BASE_DATA_DIR:-path/to/basedata}"
BASE_CKPT_DIR="${BASE_CKPT_DIR:-path/to/sd2_ckpt}"

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-3}"

# Quote the directories so paths containing spaces stay single arguments.
python train_s1.py --config config/train_s1.yaml \
    --base_data_dir "$BASE_DATA_DIR" \
    --base_ckpt_dir "$BASE_CKPT_DIR" \
    --output_dir log/stage1_bs8 \
    --no_wandb

DepthMaster/scripts/train_s2.sh ADDED Viewed

	@@ -0,0 +1,9 @@

#!/usr/bin/env bash
# Launch stage-2 training with config/train_s2.yaml.
# BASE_DATA_DIR, BASE_CKPT_DIR and CUDA_VISIBLE_DEVICES can be overridden
# from the environment; the values below are only defaults.
set -e

# NOTE(review): the default data directory is a machine-specific absolute
# path; set BASE_DATA_DIR when running elsewhere.
BASE_DATA_DIR="${BASE_DATA_DIR:-/zhdd/dataset}"
BASE_CKPT_DIR="${BASE_CKPT_DIR:-ori_ckpt}"

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-2}"

# Quote the directories so paths containing spaces stay single arguments.
python train_s2.py --config config/train_s2.yaml \
    --base_data_dir "$BASE_DATA_DIR" \
    --base_ckpt_dir "$BASE_CKPT_DIR" \
    --output_dir log/stage2 \
    --no_wandb

DepthMaster/src/dataset/__init__.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import os
+from .base_depth_dataset import BaseDepthDataset, get_pred_name, DatasetMode  # noqa: F401
+from .diode_dataset import DIODEDataset
+from .eth3d_dataset import ETH3DDataset
+from .hypersim_dataset import HypersimDataset
+from .kitti_dataset import KITTIDataset
+from .nyu_dataset import NYUDataset
+from .scannet_dataset import ScanNetDataset
+from .vkitti_dataset import VirtualKITTIDataset
# Registry mapping the ``name`` field of a dataset config to its dataset class.
dataset_name_class_dict = {
    "hypersim": HypersimDataset,
    "vkitti": VirtualKITTIDataset,
    "nyu_v2": NYUDataset,
    "kitti": KITTIDataset,
    "eth3d": ETH3DDataset,
    "diode": DIODEDataset,
    "scannet": ScanNetDataset,
}


def get_dataset(
    cfg_data_split, base_data_dir: str, mode: DatasetMode, **kwargs
) -> BaseDepthDataset:
    """Build the dataset described by ``cfg_data_split``.

    Args:
        cfg_data_split: Dataset config. It must expose ``name``. For a
            "mixed" config it must also expose ``dataset_list``. Otherwise it
            must expose ``filenames`` and ``dir`` and support ``**``
            unpacking, because every entry is forwarded to the dataset
            constructor as a keyword argument.
        base_data_dir: Root directory that ``cfg_data_split.dir`` is joined to.
        mode: Dataset mode passed to the dataset constructor.
        **kwargs: Extra keyword arguments forwarded to the dataset constructor.

    Returns:
        A single dataset instance, or, when ``cfg_data_split.name`` is
        "mixed", a list with one dataset per entry of
        ``cfg_data_split.dataset_list``.

    Raises:
        NotImplementedError: If the dataset name is not registered in
            ``dataset_name_class_dict``.
    """
    if "mixed" == cfg_data_split.name:
        assert DatasetMode.TRAIN == mode, "Only training mode supports mixed datasets."
        return [
            get_dataset(_cfg, base_data_dir, mode, **kwargs)
            for _cfg in cfg_data_split.dataset_list
        ]

    dataset_class = dataset_name_class_dict.get(cfg_data_split.name)
    if dataset_class is None:
        raise NotImplementedError(
            f"Unknown dataset name {cfg_data_split.name!r}; expected 'mixed' "
            f"or one of {sorted(dataset_name_class_dict)}"
        )

    return dataset_class(
        mode=mode,
        filename_ls_path=cfg_data_split.filenames,
        dataset_dir=os.path.join(base_data_dir, cfg_data_split.dir),
        **cfg_data_split,
        **kwargs,
    )

DepthMaster/src/dataset/base_depth_dataset.py ADDED Viewed

	@@ -0,0 +1,303 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import io
+import os
+import random
+import tarfile
+from enum import Enum
+from typing import Union
+import numpy as np
+import torch
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision.transforms import InterpolationMode, Resize, RandomResizedCrop
+from src.util.depth_transform import DepthNormalizerBase
+from src.util.alignment import depth2disparity
class DatasetMode(Enum):
    """Operating modes of a depth dataset."""

    # Only RGB data is loaded; depth paths are not read.
    RGB_ONLY = "rgb_only"
    # RGB and depth are loaded without training preprocessing.
    EVAL = "evaluate"
    # RGB and depth are loaded and training preprocessing is applied.
    TRAIN = "train"
class DepthFileNameMode(Enum):
    """Prediction file naming modes, keyed by the layout of the RGB file name."""

    id = 1  # id.png
    rgb_id = 2  # rgb_id.png
    i_d_rgb = 3  # i_d_1_rgb.png
    rgb_i_d = 4  # presumably rgb_i_d.png (not documented in the original) - TODO confirm
def read_image_from_tar(tar_obj, img_rel_path):
    """Read an image stored inside an open tar archive.

    Args:
        tar_obj: Open tar archive object providing ``extractfile``.
        img_rel_path: Path of the image inside the archive, without the
            leading "./" (it is prepended here).

    Returns:
        The image opened by ``PIL.Image.open`` from an in-memory copy of the
        member's bytes.
    """
    member = tar_obj.extractfile("./" + img_rel_path)
    raw_bytes = member.read()
    # The original built the image but never returned it, so callers always
    # received None.
    return Image.open(io.BytesIO(raw_bytes))
class BaseDepthDataset(Dataset):
    """Base dataset serving RGB images and, unless in RGB_ONLY mode, depth maps.

    Samples are listed in a text file with whitespace-separated columns:
    column 0 is the RGB path, column 1 the depth path and, when
    ``has_filled_depth`` is set, column 2 the filled-depth path. Paths are
    relative to ``dataset_dir``, which may be a directory or a tar archive.

    Subclasses are expected to override ``_read_depth_file`` to decode their
    own depth format.
    """

    def __init__(
        self,
        mode: DatasetMode,
        filename_ls_path: str,
        dataset_dir: str,
        disp_name: str,
        min_depth: float,
        max_depth: float,
        has_filled_depth: bool,
        has_egde_mask: bool,
        name_mode: DepthFileNameMode,
        depth_transform: Union[DepthNormalizerBase, None] = None,
        augmentation_args: dict = None,
        resize_to_hw=None,
        move_invalid_to_far_plane: bool = True,
        rgb_transform=lambda x: x / 255.0 * 2 - 1,  #  [0, 255] -> [-1, 1],
        **kwargs,
    ) -> None:
        """Store the configuration and read the filename list.

        Args:
            mode: Dataset mode; controls whether depth is loaded and whether
                training preprocessing runs.
            filename_ls_path: Path of the text file listing the samples.
            dataset_dir: Dataset directory or tar archive; must exist.
            disp_name: Stored on the instance; not read by this class.
            min_depth: Exclusive lower bound of valid depth values.
            max_depth: Exclusive upper bound of valid depth values.
            has_filled_depth: Whether a filled-depth file is listed per sample.
            has_egde_mask: Stored on the instance; not read by this class.
            name_mode: Stored on the instance; not read by this class.
            depth_transform: Depth normalizer; required in TRAIN mode.
            augmentation_args: Augmentation settings; must expose the
                attribute ``lr_flip_p`` when given.
            resize_to_hw: Optional target size for the training resize.
            move_invalid_to_far_plane: Whether invalid pixels of the
                normalized filled depth are moved to the far plane.
            rgb_transform: Stored on the instance; not read by this class.
            **kwargs: Ignored.
        """
        super().__init__()
        self.mode = mode
        # dataset info
        self.filename_ls_path = filename_ls_path
        self.dataset_dir = dataset_dir
        assert os.path.exists(
            self.dataset_dir
        ), f"Dataset does not exist at: {self.dataset_dir}"
        self.disp_name = disp_name
        self.has_filled_depth = has_filled_depth
        self.has_egde_mask = has_egde_mask
        self.name_mode: DepthFileNameMode = name_mode
        self.min_depth = min_depth
        self.max_depth = max_depth

        # training arguments
        self.depth_transform: DepthNormalizerBase = depth_transform
        self.augm_args = augmentation_args
        self.resize_to_hw = resize_to_hw
        self.rgb_transform = rgb_transform
        self.move_invalid_to_far_plane = move_invalid_to_far_plane

        # Load filenames
        with open(self.filename_ls_path, "r") as f:
            self.filenames = [
                line.split() for line in f
            ]  # [['rgb.png', 'depth.tif'], [], ...]

        # Tar dataset: the archive is opened lazily on first read.
        self.tar_obj = None
        self.is_tar = os.path.isfile(dataset_dir) and tarfile.is_tarfile(dataset_dir)

    def __len__(self):
        """Return the number of listed samples."""
        return len(self.filenames)

    def __getitem__(self, index):
        """Return one sample as a dict of rasters merged with its metadata."""
        rasters, other = self._get_data_item(index)
        if DatasetMode.TRAIN == self.mode:
            rasters = self._training_preprocess(rasters)
        # merge
        outputs = rasters
        outputs.update(other)
        return outputs

    def _get_data_item(self, index):
        """Load the rasters of sample ``index``.

        Returns:
            ``(rasters, other)``: ``rasters`` holds the RGB tensors and,
            unless in RGB_ONLY mode, depth tensors and validity masks;
            ``other`` holds the index and the relative RGB path.
        """
        rgb_rel_path, depth_rel_path, filled_rel_path = self._get_data_path(index=index)

        rasters = {}

        # RGB data
        rasters.update(self._load_rgb_data(rgb_rel_path=rgb_rel_path))

        # Depth data
        if DatasetMode.RGB_ONLY != self.mode:
            # load data
            depth_data = self._load_depth_data(
                depth_rel_path=depth_rel_path, filled_rel_path=filled_rel_path
            )
            rasters.update(depth_data)
            # valid mask (computed on the depth values as loaded, before any
            # disparity conversion below)
            rasters["valid_mask_raw"] = self._get_valid_mask(
                rasters["depth_raw_linear"]
            ).clone()
            rasters["valid_mask_filled"] = self._get_valid_mask(
                rasters["depth_filled_linear"]
            ).clone()

            if DatasetMode.TRAIN == self.mode:
                # NOTE(review): when has_filled_depth is False,
                # "depth_filled_linear" is a copy of the raw depth and is NOT
                # converted below, unlike "depth_raw_linear" - confirm this is
                # intended.
                # depth2disp
                rasters["depth_raw_linear"] = depth2disparity(rasters["depth_raw_linear"]).clone()
                if self.has_filled_depth:
                    rasters["depth_filled_linear"] = depth2disparity(rasters["depth_filled_linear"]).clone()
                # sqrt(x)
                rasters["depth_raw_linear"] = torch.sqrt(rasters["depth_raw_linear"]).clone()
                if self.has_filled_depth:
                    rasters["depth_filled_linear"] = torch.sqrt(rasters["depth_filled_linear"]).clone()

        other = {"index": index, "rgb_relative_path": rgb_rel_path}

        return rasters, other

    def _load_rgb_data(self, rgb_rel_path):
        """Return the RGB image as an int tensor and as a float tensor in [-1, 1]."""
        # Read RGB data
        rgb = self._read_rgb_file(rgb_rel_path)
        rgb_norm = rgb / 255.0 * 2.0 - 1.0  #  [0, 255] -> [-1, 1]

        outputs = {
            "rgb_int": torch.from_numpy(rgb).int(),
            "rgb_norm": torch.from_numpy(rgb_norm).float(),
        }
        return outputs

    def _load_depth_data(self, depth_rel_path, filled_rel_path):
        """Return raw and filled depth as float tensors with a leading channel dim.

        Without filled depth, ``depth_filled_linear`` is a copy of the raw depth.
        """
        # Read depth data
        outputs = {}
        depth_raw = self._read_depth_file(depth_rel_path).squeeze()
        depth_raw_linear = torch.from_numpy(depth_raw).float().unsqueeze(0)  # [1, H, W]
        outputs["depth_raw_linear"] = depth_raw_linear.clone()

        if self.has_filled_depth:
            depth_filled = self._read_depth_file(filled_rel_path).squeeze()
            depth_filled_linear = torch.from_numpy(depth_filled).float().unsqueeze(0)
            outputs["depth_filled_linear"] = depth_filled_linear
        else:
            outputs["depth_filled_linear"] = depth_raw_linear.clone()

        return outputs

    def _get_data_path(self, index):
        """Return ``(rgb, depth, filled)`` relative paths; unused ones are None."""
        filename_line = self.filenames[index]

        # Get data path
        rgb_rel_path = filename_line[0]

        depth_rel_path, filled_rel_path = None, None
        if DatasetMode.RGB_ONLY != self.mode:
            depth_rel_path = filename_line[1]
            if self.has_filled_depth:
                filled_rel_path = filename_line[2]
        return rgb_rel_path, depth_rel_path, filled_rel_path

    def _read_image(self, img_rel_path) -> np.ndarray:
        """Read an image from the dataset directory or tar archive as an array."""
        if self.is_tar:
            if self.tar_obj is None:
                self.tar_obj = tarfile.open(self.dataset_dir)
            image_to_read = self.tar_obj.extractfile("./" + img_rel_path)
            image_to_read = image_to_read.read()
            image_to_read = io.BytesIO(image_to_read)
        else:
            image_to_read = os.path.join(self.dataset_dir, img_rel_path)
        # Convert inside the context manager so the pixel data is loaded
        # before the underlying file is closed.
        with Image.open(image_to_read) as image:  # [H, W, rgb]
            return np.asarray(image)

    def _read_rgb_file(self, rel_path) -> np.ndarray:
        """Read an RGB image and return it channel-first as an int array."""
        rgb = self._read_image(rel_path)
        rgb = np.transpose(rgb, (2, 0, 1)).astype(int)  # [rgb, H, W]
        return rgb

    def _read_depth_file(self, rel_path):
        """Read a depth image; this base version returns the image undecoded."""
        depth_in = self._read_image(rel_path)
        #  Replace code below to decode depth according to dataset definition
        depth_decoded = depth_in

        return depth_decoded

    def _get_valid_mask(self, depth: torch.Tensor):
        """Return a bool mask of pixels strictly between min_depth and max_depth."""
        valid_mask = torch.logical_and(
            (depth > self.min_depth), (depth < self.max_depth)
        ).bool()
        return valid_mask

    def _training_preprocess(self, rasters):
        """Augment, normalize and optionally resize the rasters for training.

        Requires ``depth_transform`` to be set, since it is called
        unconditionally.
        """
        # Augmentation
        if self.augm_args is not None:
            rasters = self._augment_data(rasters)

        # Normalization
        rasters["depth_raw_norm"] = self.depth_transform(
            rasters["depth_raw_linear"], rasters["valid_mask_raw"]
        ).clone()
        rasters["depth_filled_norm"] = self.depth_transform(
            rasters["depth_filled_linear"], rasters["valid_mask_filled"]
        ).clone()

        # Set invalid pixel to far plane
        if self.move_invalid_to_far_plane:
            if self.depth_transform.far_plane_at_max:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_max
                )
            else:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_min
                )

        # Resize every raster (images, depths and masks) with the same transform.
        if self.resize_to_hw is not None:
            resize_transform = Resize(
                size=self.resize_to_hw, interpolation=InterpolationMode.NEAREST_EXACT
            )
            rasters = {k: resize_transform(v) for k, v in rasters.items()}

        return rasters

    def _augment_data(self, rasters_dict):
        """Flip all rasters along the last axis with probability ``lr_flip_p``."""
        # lr flipping
        lr_flip_p = self.augm_args.lr_flip_p
        if random.random() < lr_flip_p:
            rasters_dict = {k: v.flip(-1) for k, v in rasters_dict.items()}

        return rasters_dict

    def __del__(self):
        """Close the tar archive if one was opened."""
        # hasattr guards against __init__ failing before tar_obj was assigned.
        if hasattr(self, "tar_obj") and self.tar_obj is not None:
            self.tar_obj.close()
            self.tar_obj = None
def get_pred_name(rgb_basename, name_mode, suffix=".png"):
    """Derive the prediction file name from an RGB file's base name.

    Args:
        rgb_basename: Base name of the RGB file, including its extension.
        name_mode: ``DepthFileNameMode`` member describing the RGB name layout.
        suffix: Extension given to the returned name.

    Returns:
        The prediction base name with its extension replaced by ``suffix``.

    Raises:
        NotImplementedError: If ``name_mode`` is not a known mode.
    """
    if DepthFileNameMode.id == name_mode:
        renamed = "pred_" + rgb_basename
    elif DepthFileNameMode.rgb_id == name_mode:
        # Keep only the second "_"-separated token.
        renamed = "pred_" + rgb_basename.split("_")[1]
    elif DepthFileNameMode.rgb_i_d == name_mode:
        # Drop the first "_"-separated token and keep the rest.
        remaining_tokens = rgb_basename.split("_")[1:]
        renamed = "pred_" + "_".join(remaining_tokens)
    elif DepthFileNameMode.i_d_rgb == name_mode:
        renamed = rgb_basename.replace("_rgb.", "_pred.")
    else:
        raise NotImplementedError
    # change suffix
    stem, _ = os.path.splitext(renamed)
    return stem + suffix

DepthMaster/src/dataset/diode_dataset.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import os
+import tarfile
+from io import BytesIO
+import numpy as np
+import torch
+from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode, DatasetMode
class DIODEDataset(BaseDepthDataset):
    """DIODE dataset: depth and validity masks are stored as ``.npy`` files."""

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(
            # DIODE data parameter
            min_depth=0.6,
            max_depth=350,
            has_filled_depth=False,
            has_egde_mask=False,
            name_mode=DepthFileNameMode.id,
            **kwargs,
        )

    def _read_npy_file(self, rel_path):
        """Load a ``.npy`` file and return it with a leading singleton axis."""
        if not self.is_tar:
            source = os.path.join(self.dataset_dir, rel_path)
        else:
            # Open the archive lazily on first access.
            if self.tar_obj is None:
                self.tar_obj = tarfile.open(self.dataset_dir)
            member = self.tar_obj.extractfile("./" + rel_path)
            source = BytesIO(member.read())
        loaded = np.load(source)
        return loaded.squeeze()[np.newaxis, :, :]

    def _read_depth_file(self, rel_path):
        """Read the depth map from its ``.npy`` file."""
        return self._read_npy_file(rel_path)

    def _get_data_path(self, index):
        """Return the filename-list entry of sample ``index`` unchanged."""
        return self.filenames[index]

    def _get_data_item(self, index):
        """Load one sample; the validity mask is read from file, not computed.

        Returns:
            ``(rasters, other)`` with the same keys as the base class.
        """
        # Special: depth mask is read from data
        rgb_rel_path, depth_rel_path, mask_rel_path = self._get_data_path(index=index)

        # RGB data
        rasters = dict(self._load_rgb_data(rgb_rel_path=rgb_rel_path))

        # Depth data
        if DatasetMode.RGB_ONLY != self.mode:
            rasters.update(
                self._load_depth_data(
                    depth_rel_path=depth_rel_path, filled_rel_path=None
                )
            )

            # valid mask, shared by the raw and the filled entry
            mask_array = self._read_npy_file(mask_rel_path).astype(bool)
            mask = torch.from_numpy(mask_array).bool()
            rasters["valid_mask_raw"] = mask.clone()
            rasters["valid_mask_filled"] = mask.clone()

        other = {"index": index, "rgb_relative_path": rgb_rel_path}
        return rasters, other

DepthMaster/src/dataset/eth3d_dataset.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import torch
+import tarfile
+import os
+import numpy as np
+from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
+class ETH3DDataset(BaseDepthDataset):
+    HEIGHT, WIDTH = 4032, 6048
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            # ETH3D data parameter
+            min_depth=1e-5,
+            max_depth=torch.inf,
+            has_filled_depth=False,
+            has_egde_mask=False,
+            name_mode=DepthFileNameMode.id,
+            **kwargs,
+        )
+    def _read_depth_file(self, rel_path):
+        # Read special binary data: https://www.eth3d.net/documentation#format-of-multi-view-data-image-formats
+        if self.is_tar:
+            if self.tar_obj is None:
+                self.tar_obj = tarfile.open(self.dataset_dir)
+            binary_data = self.tar_obj.extractfile("./" + rel_path)
+            binary_data = binary_data.read()
+        else:
+            depth_path = os.path.join(self.dataset_dir, rel_path)
+            with open(depth_path, "rb") as file:
+                binary_data = file.read()
+        # Convert the binary data to a numpy array of 32-bit floats
+        depth_decoded = np.frombuffer(binary_data, dtype=np.float32).copy()
+        depth_decoded[depth_decoded == torch.inf] = 0.0
+        depth_decoded = depth_decoded.reshape((self.HEIGHT, self.WIDTH))
+        return depth_decoded

DepthMaster/src/dataset/hypersim_dataset.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
+class HypersimDataset(BaseDepthDataset):
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            # Hypersim data parameter
+            min_depth=1e-5,
+            max_depth=65.0,
+            has_filled_depth=False,
+            has_egde_mask=False,
+            name_mode=DepthFileNameMode.rgb_i_d,
+            **kwargs,
+        )
+    def _read_depth_file(self, rel_path):
+        depth_in = self._read_image(rel_path)
+        # Decode Hypersim depth
+        depth_decoded = depth_in / 1000.0
+        return depth_decoded

DepthMaster/src/dataset/kitti_dataset.py ADDED Viewed

	@@ -0,0 +1,127 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import torch
+from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
+class KITTIDataset(BaseDepthDataset):
+    def __init__(
+        self,
+        kitti_bm_crop,  # Crop to KITTI benchmark size
+        valid_mask_crop,  # Evaluation mask. [None, garg or eigen]
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            # KITTI data parameter
+            min_depth=1e-5,
+            max_depth=80,
+            has_filled_depth=False,
+            has_egde_mask=False,
+            name_mode=DepthFileNameMode.id,
+            **kwargs,
+        )
+        self.kitti_bm_crop = kitti_bm_crop
+        self.valid_mask_crop = valid_mask_crop
+        assert self.valid_mask_crop in [
+            None,
+            "garg",  # set evaluation mask according to Garg  ECCV16
+            "eigen",  # set evaluation mask according to Eigen NIPS14
+        ], f"Unknown crop type: {self.valid_mask_crop}"
+        # Filter out empty depth
+        self.filenames = [f for f in self.filenames if "None" != f[1]]
+    def _read_depth_file(self, rel_path):
+        depth_in = self._read_image(rel_path)
+        # Decode KITTI depth
+        depth_decoded = depth_in / 256.0
+        return depth_decoded
+    def _load_rgb_data(self, rgb_rel_path):
+        rgb_data = super()._load_rgb_data(rgb_rel_path)
+        if self.kitti_bm_crop:
+            rgb_data = {k: self.kitti_benchmark_crop(v) for k, v in rgb_data.items()}
+        return rgb_data
+    def _load_depth_data(self, depth_rel_path, filled_rel_path):
+        depth_data = super()._load_depth_data(depth_rel_path, filled_rel_path)
+        if self.kitti_bm_crop:
+            depth_data = {
+                k: self.kitti_benchmark_crop(v) for k, v in depth_data.items()
+            }
+        return depth_data
+    @staticmethod
+    def kitti_benchmark_crop(input_img):
+        """
+        Crop images to KITTI benchmark size
+        Args:
+            `input_img` (torch.Tensor): Input image to be cropped.
+        Returns:
+            torch.Tensor:Cropped image.
+        """
+        KB_CROP_HEIGHT = 352
+        KB_CROP_WIDTH = 1216
+        height, width = input_img.shape[-2:]
+        top_margin = int(height - KB_CROP_HEIGHT)
+        left_margin = int((width - KB_CROP_WIDTH) / 2)
+        if 2 == len(input_img.shape):
+            out = input_img[
+                top_margin : top_margin + KB_CROP_HEIGHT,
+                left_margin : left_margin + KB_CROP_WIDTH,
+            ]
+        elif 3 == len(input_img.shape):
+            out = input_img[
+                :,
+                top_margin : top_margin + KB_CROP_HEIGHT,
+                left_margin : left_margin + KB_CROP_WIDTH,
+            ]
+        return out
+    def _get_valid_mask(self, depth: torch.Tensor):
+        # reference: https://github.com/cleinc/bts/blob/master/pytorch/bts_eval.py
+        valid_mask = super()._get_valid_mask(depth)  # [1, H, W]
+        if self.valid_mask_crop is not None:
+            eval_mask = torch.zeros_like(valid_mask.squeeze()).bool()
+            gt_height, gt_width = eval_mask.shape
+            if "garg" == self.valid_mask_crop:
+                eval_mask[
+                    int(0.40810811 * gt_height) : int(0.99189189 * gt_height),
+                    int(0.03594771 * gt_width) : int(0.96405229 * gt_width),
+                ] = 1
+            elif "eigen" == self.valid_mask_crop:
+                eval_mask[
+                    int(0.3324324 * gt_height) : int(0.91351351 * gt_height),
+                    int(0.0359477 * gt_width) : int(0.96405229 * gt_width),
+                ] = 1
+            eval_mask.reshape(valid_mask.shape)
+            valid_mask = torch.logical_and(valid_mask, eval_mask)
+        return valid_mask

DepthMaster/src/dataset/mixed_sampler.py ADDED Viewed

	@@ -0,0 +1,151 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import torch
+from torch.utils.data import (
+    BatchSampler,
+    RandomSampler,
+    SequentialSampler,
+)
+class MixedBatchSampler(BatchSampler):
+    """Sample one batch from a selected dataset with given probability.
+    Compatible with datasets at different resolution
+    """
+    def __init__(
+        self, src_dataset_ls, batch_size, drop_last, shuffle, prob=None, generator=None
+    ):
+        self.base_sampler = None
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.generator = generator
+        self.src_dataset_ls = src_dataset_ls
+        self.n_dataset = len(self.src_dataset_ls)
+        # Dataset length
+        self.dataset_length = [len(ds) for ds in self.src_dataset_ls]
+        self.cum_dataset_length = [
+            sum(self.dataset_length[:i]) for i in range(self.n_dataset)
+        ]  # cumulative dataset length
+        # BatchSamplers for each source dataset
+        if self.shuffle:
+            self.src_batch_samplers = [
+                BatchSampler(
+                    sampler=RandomSampler(
+                        ds, replacement=False, generator=self.generator
+                    ),
+                    batch_size=self.batch_size,
+                    drop_last=self.drop_last,
+                )
+                for ds in self.src_dataset_ls
+            ]
+        else:
+            self.src_batch_samplers = [
+                BatchSampler(
+                    sampler=SequentialSampler(ds),
+                    batch_size=self.batch_size,
+                    drop_last=self.drop_last,
+                )
+                for ds in self.src_dataset_ls
+            ]
+        self.raw_batches = [
+            list(bs) for bs in self.src_batch_samplers
+        ]  # index in original dataset
+        self.n_batches = [len(b) for b in self.raw_batches]
+        self.n_total_batch = sum(self.n_batches)
+        # sampling probability
+        if prob is None:
+            # if not given, decide by dataset length
+            self.prob = torch.tensor(self.n_batches) / self.n_total_batch
+        else:
+            self.prob = torch.as_tensor(prob)
+    def __iter__(self):
+        """_summary_
+        Yields:
+            list(int): a batch of indics, corresponding to ConcatDataset of src_dataset_ls
+        """
+        for _ in range(self.n_total_batch):
+            idx_ds = torch.multinomial(
+                self.prob, 1, replacement=True, generator=self.generator
+            ).item()
+            # if batch list is empty, generate new list
+            if 0 == len(self.raw_batches[idx_ds]):
+                self.raw_batches[idx_ds] = list(self.src_batch_samplers[idx_ds])
+            # get a batch from list
+            batch_raw = self.raw_batches[idx_ds].pop()
+            # shift by cumulative dataset length
+            shift = self.cum_dataset_length[idx_ds]
+            batch = [n + shift for n in batch_raw]
+            yield batch
+    def __len__(self):
+        return self.n_total_batch
+# Unit test
+if "__main__" == __name__:
+    from torch.utils.data import ConcatDataset, DataLoader, Dataset
+    class SimpleDataset(Dataset):
+        def __init__(self, start, len) -> None:
+            super().__init__()
+            self.start = start
+            self.len = len
+        def __len__(self):
+            return self.len
+        def __getitem__(self, index):
+            return self.start + index
+    dataset_1 = SimpleDataset(0, 10)
+    dataset_2 = SimpleDataset(200, 20)
+    dataset_3 = SimpleDataset(1000, 50)
+    concat_dataset = ConcatDataset(
+        [dataset_1, dataset_2, dataset_3]
+    )  # will directly concatenate
+    mixed_sampler = MixedBatchSampler(
+        src_dataset_ls=[dataset_1, dataset_2, dataset_3],
+        batch_size=4,
+        drop_last=True,
+        shuffle=False,
+        prob=[0.6, 0.3, 0.1],
+        generator=torch.Generator().manual_seed(0),
+    )
+    loader = DataLoader(concat_dataset, batch_sampler=mixed_sampler)
+    for d in loader:
+        print(d)

DepthMaster/src/dataset/nyu_dataset.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import torch
+from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
+class NYUDataset(BaseDepthDataset):
+    def __init__(
+        self,
+        eigen_valid_mask: bool,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            # NYUv2 dataset parameter
+            min_depth=1e-3,
+            max_depth=10.0,
+            has_filled_depth=True,
+            has_egde_mask=False,
+            name_mode=DepthFileNameMode.rgb_id,
+            **kwargs,
+        )
+        self.eigen_valid_mask = eigen_valid_mask
+    def _read_depth_file(self, rel_path):
+        depth_in = self._read_image(rel_path)
+        # Decode NYU depth
+        depth_decoded = depth_in / 1000.0
+        return depth_decoded
+    def _get_valid_mask(self, depth: torch.Tensor):
+        valid_mask = super()._get_valid_mask(depth)
+        # Eigen crop for evaluation
+        if self.eigen_valid_mask:
+            eval_mask = torch.zeros_like(valid_mask.squeeze()).bool()
+            eval_mask[45:471, 41:601] = 1
+            eval_mask.reshape(valid_mask.shape)
+            valid_mask = torch.logical_and(valid_mask, eval_mask)
+        return valid_mask

DepthMaster/src/dataset/scannet_dataset.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
+class ScanNetDataset(BaseDepthDataset):
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            # ScanNet data parameter
+            min_depth=1e-3,
+            max_depth=10,
+            has_filled_depth=False,
+            has_egde_mask=False,
+            name_mode=DepthFileNameMode.id,
+            **kwargs,
+        )
+    def _read_depth_file(self, rel_path):
+        depth_in = self._read_image(rel_path)
+        # Decode ScanNet depth
+        depth_decoded = depth_in / 1000.0
+        return depth_decoded

DepthMaster/src/dataset/vkitti_dataset.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import torch
+from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
+from .kitti_dataset import KITTIDataset
+class VirtualKITTIDataset(BaseDepthDataset):
+    def __init__(
+        self,
+        kitti_bm_crop,  # Crop to KITTI benchmark size
+        valid_mask_crop,  # Evaluation mask. [None, garg or eigen]
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            # virtual KITTI data parameter
+            min_depth=1e-5,
+            max_depth=80,  # 655.35
+            has_filled_depth=False,
+            has_egde_mask=False,
+            name_mode=DepthFileNameMode.id,
+            **kwargs,
+        )
+        self.kitti_bm_crop = kitti_bm_crop
+        self.valid_mask_crop = valid_mask_crop
+        assert self.valid_mask_crop in [
+            None,
+            "garg",  # set evaluation mask according to Garg  ECCV16
+            "eigen",  # set evaluation mask according to Eigen NIPS14
+        ], f"Unknown crop type: {self.valid_mask_crop}"
+        # Filter out empty depth
+        self.filenames = [f for f in self.filenames if "None" != f[1]]
+    def _read_depth_file(self, rel_path):
+        depth_in = self._read_image(rel_path)
+        # Decode vKITTI depth
+        depth_decoded = depth_in / 100.0
+        return depth_decoded
+    def _load_rgb_data(self, rgb_rel_path):
+        rgb_data = super()._load_rgb_data(rgb_rel_path)
+        if self.kitti_bm_crop:
+            rgb_data = {
+                k: KITTIDataset.kitti_benchmark_crop(v) for k, v in rgb_data.items()
+            }
+        return rgb_data
+    def _load_depth_data(self, depth_rel_path, filled_rel_path):
+        depth_data = super()._load_depth_data(depth_rel_path, filled_rel_path)
+        if self.kitti_bm_crop:
+            depth_data = {
+                k: KITTIDataset.kitti_benchmark_crop(v) for k, v in depth_data.items()
+            }
+        return depth_data
+    def _get_valid_mask(self, depth: torch.Tensor):
+        # reference: https://github.com/cleinc/bts/blob/master/pytorch/bts_eval.py
+        valid_mask = super()._get_valid_mask(depth)  # [1, H, W]
+        if self.valid_mask_crop is not None:
+            eval_mask = torch.zeros_like(valid_mask.squeeze()).bool()
+            gt_height, gt_width = eval_mask.shape
+            if "garg" == self.valid_mask_crop:
+                eval_mask[
+                    int(0.40810811 * gt_height) : int(0.99189189 * gt_height),
+                    int(0.03594771 * gt_width) : int(0.96405229 * gt_width),
+                ] = 1
+            elif "eigen" == self.valid_mask_crop:
+                eval_mask[
+                    int(0.3324324 * gt_height) : int(0.91351351 * gt_height),
+                    int(0.0359477 * gt_width) : int(0.96405229 * gt_width),
+                ] = 1
+            eval_mask.reshape(valid_mask.shape)
+            valid_mask = torch.logical_and(valid_mask, eval_mask)
+        return valid_mask

DepthMaster/src/trainer/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# Author: Bingxin Ke
+# Last modified: 2024-05-17
+from .trainer_s1 import DepthMasterTrainerS1
+from .trainer_s2 import DepthMasterTrainerS2
+trainer_cls_name_dict = {
+    "DepthMasterTrainerS1": DepthMasterTrainerS1,
+    "DepthMasterTrainerS2": DepthMasterTrainerS2,
+}
+def get_trainer_cls(trainer_name):
+    return trainer_cls_name_dict[trainer_name]

DepthMaster/src/trainer/trainer_s1.py ADDED Viewed

	@@ -0,0 +1,671 @@

+# Last modified: 2025-07-13
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import logging
+import os
+import random
+import shutil
+from datetime import datetime
+from typing import List, Union
+import numpy as np
+import torch
+from omegaconf import OmegaConf
+from torch.optim import Adam
+from torch.optim.lr_scheduler import LambdaLR
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from PIL import Image
+import torch.nn.functional as F
+from depthmaster import DepthMasterPipeline, DepthMasterDepthOutput
+from src.util import metric
+from src.util.data_loader import skip_first_batches
+from src.util.logging_util import tb_logger, eval_dic_to_text
+from src.util.loss import get_loss, SSIM
+from src.util.lr_scheduler import IterExponential
+from src.util.metric import MetricTracker
+from src.util.alignment import (
+    align_depth_least_square,
+    depth2disparity,
+    disparity2depth,
+)
+from src.util.seeding import generate_seed_sequence
+from src.util.build_mlp import build_mlp_
+from torchvision.transforms import Normalize
+from external_encoder.dinov2.dinov2 import DINOv2
+# Per-channel mean / std passed to torchvision `Normalize` on the RGB batch
+# before it is fed to the DINOv2 encoder.
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+class DepthMasterTrainerS1:
+    def __init__(
+        self,
+        cfg: OmegaConf,
+        model: DepthMasterPipeline,
+        train_dataloader: DataLoader,
+        device,
+        base_ckpt_dir,
+        out_dir_ckpt,
+        out_dir_eval,
+        out_dir_vis,
+        accumulation_steps: int,
+        val_dataloaders: List[DataLoader] = None,
+        vis_dataloaders: List[DataLoader] = None,
+    ):
+        """Set up model components, optimizer, scheduler, loss and bookkeeping.
+        Args:
+            cfg: Training configuration; this constructor reads `trainer`, `lr`,
+                `lr_scheduler`, `loss`, `eval`, `validation`, `max_epoch`,
+                `max_iter`, `gt_depth_type` and `gt_mask_type` from it.
+            model: Pipeline providing `unet`, `vae` and `text_encoder`.
+            train_dataloader: Loader for training batches.
+            device: Device the cached empty text embedding is moved to.
+            base_ckpt_dir: Accepted but not used inside this constructor.
+            out_dir_ckpt: Stored as `self.out_dir_ckpt`.
+            out_dir_eval: Stored as `self.out_dir_eval`.
+            out_dir_vis: Stored as `self.out_dir_vis`.
+            accumulation_steps: Stored as both `self.accumulation_steps` and
+                `self.gradient_accumulation_steps`.
+            val_dataloaders: Optional loaders stored as `self.val_loaders`.
+            vis_dataloaders: Optional loaders stored as `self.vis_loaders`.
+        """
+        self.cfg: OmegaConf = cfg
+        self.model: DepthMasterPipeline = model
+        self.device = device
+        self.seed: Union[int, None] = (
+            self.cfg.trainer.init_seed
+        )  # used to generate seed sequence, set to `None` to train w/o seeding
+        self.out_dir_ckpt = out_dir_ckpt
+        self.out_dir_eval = out_dir_eval
+        self.out_dir_vis = out_dir_vis
+        self.train_loader: DataLoader = train_dataloader
+        self.val_loaders: List[DataLoader] = val_dataloaders
+        self.vis_loaders: List[DataLoader] = vis_dataloaders
+        self.accumulation_steps: int = accumulation_steps
+        # Encode empty text prompt
+        self.model.encode_empty_text()
+        # Keep a detached copy on `device`; it is repeated per batch during training
+        self.empty_text_embed = self.model.empty_text_embed.detach().clone().to(device)
+        self.model.unet.enable_xformers_memory_efficient_attention()
+        # Initialize DINOv2 encoder
+        self.dinov2_encoder = DINOv2(model_name='vitg')
+        dinov2_encoder_dict = self.dinov2_encoder.state_dict()
+        # NOTE(review): checkpoint path is relative to the working directory
+        pretrained_ckpt_dict = torch.load(f'checkpoints/depth_anything_v2_vitg.pth', map_location='cpu')
+        # Keep only checkpoint entries whose key, with 'pretrained.' removed,
+        # exists in the encoder's own state dict
+        pretrained_dict = {k.replace('pretrained.', ''): v for k, v in pretrained_ckpt_dict.items() if k.replace('pretrained.', '') in dinov2_encoder_dict}
+        self.dinov2_encoder.load_state_dict(pretrained_dict)
+        # Replace the encoder head with an identity module
+        del self.dinov2_encoder.head
+        self.dinov2_encoder.head = torch.nn.Identity()
+        self.dinov2_encoder.eval()
+        # Initialize adapter to align the feat dimension of SD and DINOv2
+        self.dinov2_adapter = build_mlp_(hidden_size=1280, projector_dim=1536, z_dim=1536)
+        # Trainability
+        # Only the UNet and the adapter receive gradients; the DINOv2 encoder,
+        # VAE and text encoder are frozen
+        self.dinov2_adapter.requires_grad_(True)
+        self.dinov2_encoder.requires_grad_(False)
+        self.model.vae.requires_grad_(False)
+        self.model.text_encoder.requires_grad_(False)
+        self.model.unet.requires_grad_(True)
+        # Optimizer !should be defined after input layer is adapted
+        lr = self.cfg.lr
+        # Two parameter groups (UNet, adapter), both starting from the same lr
+        self.optimizer = Adam([
+            {'params': self.model.unet.parameters(), 'lr': lr},
+            {'params': self.dinov2_adapter.parameters(), 'lr': lr}
+        ])
+        # LR scheduler
+        lr_func = IterExponential(
+            total_iter_length=self.cfg.lr_scheduler.kwargs.total_iter,
+            final_ratio=self.cfg.lr_scheduler.kwargs.final_ratio,
+            warmup_steps=self.cfg.lr_scheduler.kwargs.warmup_steps,
+        )
+        self.lr_scheduler = LambdaLR(optimizer=self.optimizer, lr_lambda=lr_func)
+        # Loss
+        self.loss = get_loss(loss_name=self.cfg.loss.name, **self.cfg.loss.kwargs)
+        # Eval metrics
+        # Metric functions are looked up by name in `src.util.metric`
+        self.metric_funcs = [getattr(metric, _met) for _met in cfg.eval.eval_metrics]
+        self.train_metrics = MetricTracker(*["loss", "feat_align_loss"])
+        self.val_metrics = MetricTracker(*[m.__name__ for m in self.metric_funcs])
+        # main metric for best checkpoint saving
+        self.main_val_metric = cfg.validation.main_val_metric
+        self.main_val_metric_goal = cfg.validation.main_val_metric_goal
+        assert (
+            self.main_val_metric in cfg.eval.eval_metrics
+        ), f"Main eval metric `{self.main_val_metric}` not found in evaluation metrics."
+        # Start from the worst possible value for the chosen optimisation goal
+        self.best_metric = 1e8 if "minimize" == self.main_val_metric_goal else -1e8
+        # Settings
+        self.max_epoch = self.cfg.max_epoch
+        self.max_iter = self.cfg.max_iter
+        self.gradient_accumulation_steps = accumulation_steps
+        self.gt_depth_type = self.cfg.gt_depth_type
+        self.gt_mask_type = self.cfg.gt_mask_type
+        self.save_period = self.cfg.trainer.save_period
+        self.backup_period = self.cfg.trainer.backup_period
+        self.val_period = self.cfg.trainer.validation_period
+        self.vis_period = self.cfg.trainer.visualization_period
+        # Internal variables
+        self.epoch = 1
+        self.n_batch_in_epoch = 0  # batch index in the epoch, used when resume training
+        self.effective_iter = 0  # how many times optimizer.step() is called
+        self.in_evaluation = False
+        self.global_seed_sequence: List = []  # consistent global seed sequence, used to seed random generator, to ensure consistency when resuming
+    def train(self, t_end=None):
+        logging.info("Start training")
+        device = self.device
+        self.model.to(device)
+        self.dinov2_encoder.to(device)
+        self.dinov2_adapter.to(device)
+        self.visualize()
+        if self.in_evaluation:
+            logging.info(
+                "Last evaluation was not finished, will do evaluation before continue training."
+            )
+            self.validate()
+        self.train_metrics.reset()
+        accumulated_step = 0
+        progress_bar = tqdm(
+            range(0, self.max_iter),
+            initial=self.effective_iter,
+            desc="iter"
+        )
+        for epoch in range(self.epoch, self.max_epoch + 1):
+            self.epoch = epoch
+            logging.debug(f"epoch: {self.epoch}")
+            # Skip previous batches when resume
+            for batch in skip_first_batches(self.train_loader, self.n_batch_in_epoch):
+                self.model.unet.train()
+                self.dinov2_adapter.train()
+                # >>> With gradient accumulation >>>
+                # Get data
+                rgb = batch["rgb_norm"].to(device)
+                depth_gt_for_latent = batch[self.gt_depth_type].to(device)
+                if self.gt_mask_type is not None:
+                    valid_mask_for_latent = batch[self.gt_mask_type].to(device)
+                    invalid_mask = ~valid_mask_for_latent
+                    valid_mask_down = ~torch.max_pool2d(
+                        invalid_mask.float(), 8, 8
+                    ).bool()
+                    valid_mask_down = valid_mask_down.repeat((1, 4, 1, 1))
+                else:
+                    raise NotImplementedError
+                batch_size = rgb.shape[0]
+                with torch.no_grad():
+                    # Encode image
+                    rgb_latent = self.model.encode_rgb(rgb)  # [B, 4, h, w]
+                    # Encode GT depth
+                    gt_depth_latent = self.encode_depth(
+                        depth_gt_for_latent
+                    )  # [B, 4, h, w]
+                    # DINOv2 feat
+                    dinov2_input_rgb = Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)(rgb)
+                    dinov2_input_rgb = F.interpolate(dinov2_input_rgb, scale_factor=0.875, mode='bicubic')
+                    dinov2_z = self.dinov2_encoder.forward_features(dinov2_input_rgb)['x_norm_patchtokens']
+                # Text embedding
+                text_embed = self.empty_text_embed.to(device).repeat(
+                    (batch_size, 1, 1)
+                )  # [B, 77, 1024]
+                # Predict the noise residual
+                rgb_latent = self.model.unet(
+                    rgb_latent, 1, text_embed
+                )  # [B, 4, h, w]
+                feat_16 = rgb_latent.feat_64
+                rgb_latent = rgb_latent.sample
+                if self.gt_mask_type is not None:
+                    loss = self.loss(
+                        rgb_latent[valid_mask_down].float(),
+                        gt_depth_latent[valid_mask_down].float(),
+                    ).mean()
+                else:
+                    loss = self.loss(rgb_latent.float(), gt_depth_latent.float()).mean()
+                self.train_metrics.update("loss", loss.item())
+                # feat align loss
+                b, c, h, w = feat_16.shape
+                _, _, H, W = rgb_latent.shape
+                # update dinov2_adapter
+                unet_16_feat_aligned = self.dinov2_adapter(feat_16.permute(0, 2, 3, 1).reshape(batch_size, -1, c))
+                if torch.isnan(rgb_latent).any():
+                    logging.warning("model_pred contains NaN.")
+                dinov2_z = dinov2_z.reshape(b, int(H/2), int(W/2), -1).permute(0, 3, 1, 2)
+                dinov2_z = F.interpolate(dinov2_z, size=(h, w), mode='bicubic').permute(0, 2, 3, 1).reshape(b, h*w, -1)
+                # kl loss
+                unet_16_feat_aligned = F.softmax(unet_16_feat_aligned, dim=-1)
+                dinov2_z = F.softmax(dinov2_z, dim=-1)
+                loss_feat_align = F.kl_div(unet_16_feat_aligned.log(), dinov2_z)
+                self.train_metrics.update("feat_align_loss", loss_feat_align)
+                loss += self.cfg.loss_feat_align.lamda * loss_feat_align
+                loss = loss / self.gradient_accumulation_steps
+                loss.backward()
+                accumulated_step += 1
+                self.n_batch_in_epoch += 1
+                # Practical batch end
+                # Perform optimization step
+                if accumulated_step >= self.gradient_accumulation_steps:
+                    self.optimizer.step()
+                    self.lr_scheduler.step()
+                    self.optimizer.zero_grad()
+                    accumulated_step = 0
+                    self.effective_iter += 1
+                    progress_bar.update(1)
+                    # Log to tensorboard
+                    accumulated_loss = self.train_metrics.result()["loss"]
+                    logs = {"loss": accumulated_loss}
+                    progress_bar.set_postfix(**logs)
+                    tb_logger.log_dic(
+                        {
+                            f"train/{k}": v
+                            for k, v in self.train_metrics.result().items()
+                        },
+                        global_step=self.effective_iter,
+                    )
+                    tb_logger.writer.add_scalar(
+                        "lr",
+                        self.lr_scheduler.get_last_lr()[0],
+                        global_step=self.effective_iter,
+                    )
+                    tb_logger.writer.add_scalar(
+                        "n_batch_in_epoch",
+                        self.n_batch_in_epoch,
+                        global_step=self.effective_iter,
+                    )
+                    self.train_metrics.reset()
+                    # Per-step callback
+                    self._train_step_callback()
+                    # End of training
+                    if self.max_iter > 0 and self.effective_iter >= self.max_iter:
+                        self.save_checkpoint(
+                            ckpt_name=self._get_backup_ckpt_name(),
+                            save_train_state=False,
+                        )
+                        logging.info("Training ended.")
+                        return
+                    # Time's up
+                    elif t_end is not None and datetime.now() >= t_end:
+                        self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+                        logging.info("Time is up, training paused.")
+                        return
+                    torch.cuda.empty_cache()
+                    # <<< Effective batch end <<<
+            # Epoch end
+            self.n_batch_in_epoch = 0
+    def encode_depth(self, depth_in):
+        # stack depth into 3-channel
+        stacked = self.stack_depth_images(depth_in)
+        # encode using VAE encoder
+        depth_latent = self.model.encode_rgb(stacked)
+        return depth_latent
+    @staticmethod
+    def stack_depth_images(depth_in):
+        if 4 == len(depth_in.shape):
+            stacked = depth_in.repeat(1, 3, 1, 1)
+        elif 3 == len(depth_in.shape):
+            stacked = depth_in.unsqueeze(1)
+            stacked = depth_in.repeat(1, 3, 1, 1)
+        return stacked
+    def _train_step_callback(self):
+        """Executed after every iteration"""
+        # Save backup (with a larger interval, without training states)
+        if self.backup_period > 0 and 0 == self.effective_iter % self.backup_period:
+            self.save_checkpoint(
+                ckpt_name=self._get_backup_ckpt_name(), save_train_state=False
+            )
+        _is_latest_saved = False
+        # Validation
+        if self.val_period > 0 and 0 == self.effective_iter % self.val_period:
+            self.in_evaluation = True  # flag to do evaluation in resume run if validation is not finished
+            self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+            _is_latest_saved = True
+            self.validate()
+            self.in_evaluation = False
+            self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+        # Save training checkpoint (can be resumed)
+        if (
+            self.save_period > 0
+            and 0 == self.effective_iter % self.save_period
+            and not _is_latest_saved
+        ):
+            self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+        # Visualization
+        if self.vis_period > 0 and 0 == self.effective_iter % self.vis_period:
+            self.visualize()
+    def validate(self):
+        for i, val_loader in enumerate(self.val_loaders):
+            val_dataset_name = val_loader.dataset.disp_name
+            val_metric_dic = self.validate_single_dataset(
+                data_loader=val_loader, metric_tracker=self.val_metrics
+            )
+            logging.info(
+                f"Iter {self.effective_iter}. Validation metrics on `{val_dataset_name}`: {val_metric_dic}"
+            )
+            tb_logger.log_dic(
+                {f"val/{val_dataset_name}/{k}": v for k, v in val_metric_dic.items()},
+                global_step=self.effective_iter,
+            )
+            # save to file
+            eval_text = eval_dic_to_text(
+                val_metrics=val_metric_dic,
+                dataset_name=val_dataset_name,
+                sample_list_path=val_loader.dataset.filename_ls_path,
+            )
+            _save_to = os.path.join(
+                self.out_dir_eval,
+                f"eval-{val_dataset_name}-iter{self.effective_iter:06d}.txt",
+            )
+            with open(_save_to, "w+") as f:
+                f.write(eval_text)
+            # Update main eval metric
+            if 0 == i:
+                main_eval_metric = val_metric_dic[self.main_val_metric]
+                if (
+                    "minimize" == self.main_val_metric_goal
+                    and main_eval_metric < self.best_metric
+                    or "maximize" == self.main_val_metric_goal
+                    and main_eval_metric > self.best_metric
+                ):
+                    self.best_metric = main_eval_metric
+                    logging.info(
+                        f"Best metric: {self.main_val_metric} = {self.best_metric} at iteration {self.effective_iter}"
+                    )
+                    # Save a checkpoint
+                    self.save_checkpoint(
+                        ckpt_name=self._get_backup_ckpt_name(), save_train_state=False
+                    )
+    def visualize(self):
+        for val_loader in self.vis_loaders:
+            vis_dataset_name = val_loader.dataset.disp_name
+            vis_out_dir = os.path.join(
+                self.out_dir_vis, self._get_backup_ckpt_name(), vis_dataset_name
+            )
+            os.makedirs(vis_out_dir, exist_ok=True)
+            _ = self.validate_single_dataset(
+                data_loader=val_loader,
+                metric_tracker=self.val_metrics,
+                save_to_dir=vis_out_dir,
+            )
+    @torch.no_grad()
+    def validate_single_dataset(
+        self,
+        data_loader: DataLoader,
+        metric_tracker: MetricTracker,
+        save_to_dir: str = None,
+    ):
+        """Evaluate the pipeline on one data loader, one sample at a time.
+
+        For every sample the prediction is aligned to the ground truth
+        according to `cfg.eval.alignment`, clipped to the dataset depth range,
+        and scored with each function in `self.metric_funcs`.
+
+        Args:
+            data_loader: Loader to evaluate; its batch size must be 1.
+            metric_tracker: Tracker that is reset first and then updated with
+                every per-sample metric.
+            save_to_dir: If given, the un-aligned pipeline output of every
+                sample is also written there as a 16-bit PNG.
+
+        Returns:
+            `metric_tracker.result()` after all samples were processed.
+
+        Raises:
+            RuntimeError: If `cfg.eval.alignment` is not one of the handled values.
+        """
+        self.model.to(self.device)
+        metric_tracker.reset()
+        # Generate seed sequence for consistent evaluation
+        # NOTE(review): `val_seed_ls` is not read anywhere below in this method.
+        val_init_seed = self.cfg.validation.init_seed
+        val_seed_ls = generate_seed_sequence(val_init_seed, len(data_loader))
+        for i, batch in enumerate(
+            tqdm(data_loader, desc=f"evaluating on {data_loader.dataset.disp_name}"),
+            start=1,
+        ):
+            assert 1 == data_loader.batch_size
+            # Read input image
+            rgb_int = batch["rgb_int"]  # [3, H, W]
+            # GT depth
+            # Each of GT and mask is kept twice: as a numpy array for the
+            # alignment and as a tensor on the device for the metrics.
+            depth_raw_ts = batch["depth_raw_linear"].squeeze()
+            depth_raw = depth_raw_ts.numpy()
+            depth_raw_ts = depth_raw_ts.to(self.device)
+            valid_mask_ts = batch["valid_mask_raw"].squeeze()
+            valid_mask = valid_mask_ts.numpy()
+            valid_mask_ts = valid_mask_ts.to(self.device)
+            # Predict depth
+            pipe_out: DepthMasterDepthOutput = self.model(
+                rgb_int,
+                processing_res=self.cfg.validation.processing_res,
+                match_input_res=self.cfg.validation.match_input_res,
+                batch_size=1,  # use batch size 1 to increase reproducibility
+                color_map=None,
+                show_progress_bar=False,
+                resample_method=self.cfg.validation.resample_method,
+            )
+            depth_pred: np.ndarray = pipe_out.depth_np.squeeze()
+            if "least_square" == self.cfg.eval.alignment:
+                depth_pred, scale, shift = align_depth_least_square(
+                    gt_arr=depth_raw,
+                    pred_arr=depth_pred,
+                    valid_mask_arr=valid_mask,
+                    return_scale_shift=True,
+                    max_resolution=self.cfg.eval.align_max_res,
+                )
+            elif  "least_square_disparity" == self.cfg.eval.alignment:
+                # The GT array is used as disparity here -- presumably the dataset
+                # delivers GT in disparity space for this setting; confirm.
+                gt_disparity = depth_raw
+                gt_non_neg_mask = gt_disparity > 0
+                # LS alignment in disparity space
+                pred_non_neg_mask = depth_pred > 0
+                valid_nonnegative_mask = valid_mask & gt_non_neg_mask & pred_non_neg_mask
+                disparity_pred, scale, shift = align_depth_least_square(
+                    gt_arr=gt_disparity,
+                    pred_arr=depth_pred,
+                    valid_mask_arr=valid_nonnegative_mask,
+                    return_scale_shift=True,
+                )
+                # convert to depth
+                disparity_pred = np.clip(
+                    disparity_pred, a_min=1e-3, a_max=None
+                )  # avoid 0 disparity
+                depth_pred = disparity2depth(disparity_pred)
+                # The GT tensor used by the metrics is converted the same way.
+                depth_raw_ts = disparity2depth(depth_raw_ts)
+            elif "least_square_sqrt_disp" == self.cfg.eval.alignment:
+                gt_sqrt_disp = depth_raw
+                gt_non_neg_mask = gt_sqrt_disp > 0
+                # LS alignment in sqrt space
+                pred_non_neg_mask = depth_pred > 0
+                # NOTE(review): `valid_nonnegative_mask` is computed but the call
+                # below passes `valid_mask`, unlike the disparity branch -- confirm
+                # which mask is intended.
+                valid_nonnegative_mask = valid_mask & gt_non_neg_mask & pred_non_neg_mask
+                depth_sqrt_disp_pred, scale, shift = align_depth_least_square(
+                    gt_arr=gt_sqrt_disp,
+                    pred_arr=depth_pred,
+                    valid_mask_arr=valid_mask,
+                    return_scale_shift=True,
+                )
+                # convert to depth
+                # Squaring undoes the square root for both prediction and GT.
+                disparity_pred = depth_sqrt_disp_pred ** 2
+                depth_raw_ts = torch.pow(depth_raw_ts, 2)
+                # convert to depth
+                disparity_pred = np.clip(
+                    disparity_pred, a_min=1e-3, a_max=None
+                )  # avoid 0 disparity
+                depth_pred = disparity2depth(disparity_pred)
+                depth_raw_ts = disparity2depth(depth_raw_ts)
+            else:
+                raise RuntimeError(f"Unknown alignment type: {self.cfg.eval.alignment}")
+            # Clip to dataset min max
+            depth_pred = np.clip(
+                depth_pred,
+                a_min=data_loader.dataset.min_depth,
+                a_max=data_loader.dataset.max_depth,
+            )
+            # clip to d > 0 for evaluation
+            depth_pred = np.clip(depth_pred, a_min=1e-6, a_max=None)
+            # Evaluate
+            # NOTE(review): `sample_metric` is filled but not used afterwards.
+            sample_metric = []
+            depth_pred_ts = torch.from_numpy(depth_pred).to(self.device)
+            for met_func in self.metric_funcs:
+                _metric_name = met_func.__name__
+                _metric = met_func(depth_pred_ts, depth_raw_ts, valid_mask_ts).item()
+                sample_metric.append(_metric.__str__())
+                metric_tracker.update(_metric_name, _metric)
+            # Save as 16-bit uint png
+            if save_to_dir is not None:
+                # Flatten the relative path into a single file name.
+                img_name = batch["rgb_relative_path"][0].replace("/", "_")
+                png_save_path = os.path.join(save_to_dir, f"{img_name}.png")
+                # Saves the raw pipeline output, not the aligned prediction;
+                # assumes `depth_np` lies in [0, 1] -- TODO confirm.
+                depth_to_save = (pipe_out.depth_np.squeeze() * 65535.0).astype(np.uint16)
+                Image.fromarray(depth_to_save).save(png_save_path, mode="I;16")
+        return metric_tracker.result()
+    def _get_next_seed(self):
+        if 0 == len(self.global_seed_sequence):
+            self.global_seed_sequence = generate_seed_sequence(
+                initial_seed=self.seed,
+                length=self.max_iter * self.gradient_accumulation_steps,
+            )
+            logging.info(
+                f"Global seed sequence is generated, length={len(self.global_seed_sequence)}"
+            )
+        return self.global_seed_sequence.pop()
+    def save_checkpoint(self, ckpt_name, save_train_state):
+        ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
+        logging.info(f"Saving checkpoint to: {ckpt_dir}")
+        # Backup previous checkpoint
+        temp_ckpt_dir = None
+        if os.path.exists(ckpt_dir) and os.path.isdir(ckpt_dir):
+            temp_ckpt_dir = os.path.join(
+                os.path.dirname(ckpt_dir), f"_old_{os.path.basename(ckpt_dir)}"
+            )
+            if os.path.exists(temp_ckpt_dir):
+                shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
+            os.rename(ckpt_dir, temp_ckpt_dir)
+            logging.debug(f"Old checkpoint is backed up at: {temp_ckpt_dir}")
+        # Save UNet
+        unet_path = os.path.join(ckpt_dir, "unet")
+        self.model.unet.save_pretrained(unet_path, safe_serialization=False)
+        logging.info(f"UNet is saved to: {unet_path}")
+        # Save DINOv2_Adapter
+        adapter_path = os.path.join(ckpt_dir, "dinov2_adapter.pth")
+        state_dict = self.dinov2_adapter.state_dict()
+        torch.save(state_dict, adapter_path)
+        logging.info(f"dinov2_adapter is saved to: {adapter_path}")
+        if save_train_state:
+            state = {
+                "optimizer": self.optimizer.state_dict(),
+                "lr_scheduler": self.lr_scheduler.state_dict(),
+                "config": self.cfg,
+                "effective_iter": self.effective_iter,
+                "epoch": self.epoch,
+                "n_batch_in_epoch": self.n_batch_in_epoch,
+                "best_metric": self.best_metric,
+                "in_evaluation": self.in_evaluation,
+                "global_seed_sequence": self.global_seed_sequence,
+            }
+            train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
+            torch.save(state, train_state_path)
+            # iteration indicator
+            f = open(os.path.join(ckpt_dir, self._get_backup_ckpt_name()), "w")
+            f.close()
+            logging.info(f"Trainer state is saved to: {train_state_path}")
+        # Remove temp ckpt
+        if temp_ckpt_dir is not None and os.path.exists(temp_ckpt_dir):
+            shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
+            logging.debug("Old checkpoint backup is removed.")
+    def load_checkpoint(
+        self, ckpt_path, load_trainer_state=True, resume_lr_scheduler=True
+    ):
+        logging.info(f"Loading checkpoint from: {ckpt_path}")
+        # Load UNet
+        _model_path = os.path.join(ckpt_path, "unet", "diffusion_pytorch_model.bin")
+        self.model.unet.load_state_dict(
+            torch.load(_model_path, map_location=self.device)
+        )
+        self.model.unet.to(self.device)
+        logging.info(f"UNet parameters are loaded from {_model_path}")
+        # Load DINOv2_adapter
+        _model_path = os.path.join(ckpt_path, "dinov2_adapter.pth")
+        self.dinov2_adapter.load_state_dict(
+            torch.load(_model_path, map_location=self.device)
+        )
+        self.dinov2_adapter.to(self.device)
+        logging.info(f"dinov2_adapter parameters are loaded from {_model_path}")
+        # Load training states
+        if load_trainer_state:
+            checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
+            self.effective_iter = checkpoint["effective_iter"]
+            self.epoch = checkpoint["epoch"]
+            self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
+            self.in_evaluation = checkpoint["in_evaluation"]
+            self.global_seed_sequence = checkpoint["global_seed_sequence"]
+            self.best_metric = checkpoint["best_metric"]
+            self.optimizer.load_state_dict(checkpoint["optimizer"])
+            logging.info(f"optimizer state is loaded from {ckpt_path}")
+            if resume_lr_scheduler:
+                self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
+                logging.info(f"LR scheduler state is loaded from {ckpt_path}")
+        logging.info(
+            f"Checkpoint loaded from: {ckpt_path}. Resume from iteration {self.effective_iter} (epoch {self.epoch})"
+        )
+        return
+    def _get_backup_ckpt_name(self):
+        return f"iter_{self.effective_iter:06d}"

DepthMaster/src/trainer/trainer_s2.py ADDED Viewed

	@@ -0,0 +1,630 @@

+# An official reimplemented version of Marigold training script.
+# Last modified: 2024-04-29
+#
+# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
+# If you use or adapt this code, please attribute to https://github.com/prs-eth/marigold.
+# More information about the method can be found at https://marigoldmonodepth.github.io
+# --------------------------------------------------------------------------
+import logging
+import os
+import shutil
+from datetime import datetime
+from typing import List, Union
+import numpy as np
+import torch
+# from diffusers import DDPMScheduler
+from omegaconf import OmegaConf
+# from torch.nn import Conv2d
+# from torch.nn.parameter import Parameter
+from torch.optim import Adam
+from torch.optim.lr_scheduler import LambdaLR
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from PIL import Image
+from depthmaster.depthmaster_pipeline import DepthMasterPipeline, DepthMasterDepthOutput
+from src.util import metric
+from src.util.data_loader import skip_first_batches
+from src.util.logging_util import tb_logger, eval_dic_to_text
+from src.util.loss import get_loss
+from src.util.lr_scheduler import IterExponential
+from src.util.metric import MetricTracker
+from src.util.alignment import (
+    align_depth_least_square,
+    depth2disparity,
+    disparity2depth,
+    align_depth_least_square_torch_mask,
+    align_depth_medium_mask
+)
+# from src.util.alignment import align_depth_least_square
+# from src.util.alignment import align_depth_least_square
+from src.util.seeding import generate_seed_sequence
+import torch.nn.functional as F
+class DepthMasterTrainerS2:
+    def __init__(
+        self,
+        cfg: OmegaConf,
+        model: DepthMasterPipeline,
+        train_dataloader: DataLoader,
+        device,
+        base_ckpt_dir,
+        out_dir_ckpt,
+        out_dir_eval,
+        out_dir_vis,
+        accumulation_steps: int,
+        val_dataloaders: List[DataLoader] = None,
+        vis_dataloaders: List[DataLoader] = None,
+    ):
+        self.cfg: OmegaConf = cfg
+        self.model: DepthMasterPipeline = model
+        self.device = device
+        self.seed: Union[int, None] = (
+            self.cfg.trainer.init_seed
+        )  # used to generate seed sequence, set to `None` to train w/o seeding
+        self.out_dir_ckpt = out_dir_ckpt
+        self.out_dir_eval = out_dir_eval
+        self.out_dir_vis = out_dir_vis
+        self.train_loader: DataLoader = train_dataloader
+        self.val_loaders: List[DataLoader] = val_dataloaders
+        self.vis_loaders: List[DataLoader] = vis_dataloaders
+        self.accumulation_steps: int = accumulation_steps
+        # Encode empty text prompt
+        self.model.encode_empty_text()
+        self.empty_text_embed = self.model.empty_text_embed.detach().clone().to(device)
+        self.model.unet.enable_xformers_memory_efficient_attention()
+        # Trainability
+        self.model.vae.requires_grad_(False)
+        self.model.vae.decoder.requires_grad_(False)
+        self.model.text_encoder.requires_grad_(False)
+        self.model.unet.requires_grad_(True)
+        # Optimizer !should be defined after input layer is adapted
+        lr = self.cfg.lr
+        self.optimizer = Adam(self.model.unet.parameters(), lr=lr)
+        # LR scheduler
+        lr_func = IterExponential(
+            total_iter_length=self.cfg.lr_scheduler.kwargs.total_iter,
+            final_ratio=self.cfg.lr_scheduler.kwargs.final_ratio,
+            warmup_steps=self.cfg.lr_scheduler.kwargs.warmup_steps,
+        )
+        self.lr_scheduler = LambdaLR(optimizer=self.optimizer, lr_lambda=lr_func)
+        # Loss
+        self.loss = get_loss(loss_name=self.cfg.loss.name, **self.cfg.loss.kwargs)
+        self.grad_loss = get_loss(loss_name=self.cfg.grad_loss.name, ** self.cfg.grad_loss.kwargs)
+        # Eval metrics
+        self.metric_funcs = [getattr(metric, _met) for _met in cfg.eval.eval_metrics]
+        self.train_metrics = MetricTracker(*["loss", "grad_loss"])
+        self.val_metrics = MetricTracker(*[m.__name__ for m in self.metric_funcs])
+        # main metric for best checkpoint saving
+        self.main_val_metric = cfg.validation.main_val_metric
+        self.main_val_metric_goal = cfg.validation.main_val_metric_goal
+        assert (
+            self.main_val_metric in cfg.eval.eval_metrics
+        ), f"Main eval metric `{self.main_val_metric}` not found in evaluation metrics."
+        self.best_metric = 1e8 if "minimize" == self.main_val_metric_goal else -1e8
+        # Settings
+        self.max_epoch = self.cfg.max_epoch
+        self.max_iter = self.cfg.max_iter
+        self.gradient_accumulation_steps = accumulation_steps
+        self.gt_depth_type = self.cfg.gt_depth_type
+        self.gt_mask_type = self.cfg.gt_mask_type
+        self.save_period = self.cfg.trainer.save_period
+        self.backup_period = self.cfg.trainer.backup_period
+        self.val_period = self.cfg.trainer.validation_period
+        self.vis_period = self.cfg.trainer.visualization_period
+        # Internal variables
+        self.epoch = 1
+        self.n_batch_in_epoch = 0  # batch index in the epoch, used when resume training
+        self.effective_iter = 0  # how many times optimizer.step() is called
+        self.in_evaluation = False
+        self.global_seed_sequence: List = []  # consistent global seed sequence, used to seed random generator, to ensure consistency when resuming
+    def grad(self, x):
+        # x.shape : n, c, h, w
+        diff_x = x[..., 1:, 1:] - x[..., 1:, :-1]
+        diff_y = x[..., 1:, 1:] - x[..., :-1, 1:]
+        diff_45 = x[..., :-1, 1:] - x[..., 1:, :-1]
+        diff_135 = x[..., 1:, 1:] - x[..., :-1, :-1]
+        # mag = diff_x**2 + diff_y**2
+        # # angle_ratio
+        # angle = torch.atan(diff_y / (diff_x + 1e-10))
+        # result = torch.cat([mag, angle], dim=1)
+        result = torch.cat([diff_x, diff_y, diff_45, diff_135], dim=1)
+        return result
+    def train(self, t_end=None):
+        """Run the training loop with gradient accumulation.
+
+        Each batch is encoded to a latent, passed once through the UNet and
+        decoded back to depth. The loss is the configured loss on valid
+        pixels plus a gradient loss weighted by `cfg.grad_loss.lamda`. An
+        optimizer step is taken every `gradient_accumulation_steps` batches.
+
+        Args:
+            t_end: Optional `datetime`, checked after each optimizer step; once
+                reached, a resumable "latest" checkpoint is saved and the
+                method returns.
+
+        Raises:
+            NotImplementedError: If `self.gt_mask_type` is `None`.
+        """
+        logging.info("Start training")
+        device = self.device
+        self.model.to(device)
+        self.visualize()
+        if self.in_evaluation:
+            logging.info(
+                "Last evaluation was not finished, will do evaluation before continue training."
+            )
+            self.validate()
+        self.train_metrics.reset()
+        accumulated_step = 0
+        # The bar counts optimizer steps and starts at the restored iteration.
+        progress_bar = tqdm(
+            range(0, self.max_iter),
+            initial=self.effective_iter,
+            desc="iter"
+        )
+        for epoch in range(self.epoch, self.max_epoch + 1):
+            self.epoch = epoch
+            logging.debug(f"epoch: {self.epoch}")
+            # Skip previous batches when resume
+            for batch in skip_first_batches(self.train_loader, self.n_batch_in_epoch):
+                self.model.unet.train()
+                # >>> With gradient accumulation >>>
+                # Get data
+                rgb = batch["rgb_norm"].to(device)
+                depth_gt_for_latent = batch[self.gt_depth_type].to(device)
+                if self.gt_mask_type is not None:
+                    valid_mask_for_latent = batch[self.gt_mask_type].to(device)
+                else:
+                    raise NotImplementedError
+                batch_size = rgb.shape[0]
+                with torch.no_grad():
+                    # Encode image
+                    rgb_latent = self.model.encode_rgb(rgb)  # [B, 4, h, w]
+                # Text embedding
+                text_embed = self.empty_text_embed.to(device).repeat(
+                    (batch_size, 1, 1)
+                )  # [B, 77, 1024]
+                # One UNet forward pass; the second argument is the constant 1
+                # (presumably a fixed timestep -- confirm against the UNet signature).
+                rgb_latent = self.model.unet(
+                    rgb_latent, 1, text_embed
+                ).sample  # [B, 4, h, w]
+                # Decoding happens outside `no_grad`, so the loss below is taken
+                # on the decoded depth rather than on the latent.
+                depth_pred = self.model.decode_depth(rgb_latent)
+                # Plain aliases: no copy is made, so the in-place masking further
+                # down also changes `depth_gt_for_latent` and `depth_pred`.
+                depth_gt_for_loss = depth_gt_for_latent
+                aligned_pred = depth_pred
+                # The `else` branch cannot be reached: a missing mask type has
+                # already raised NotImplementedError above.
+                if self.gt_mask_type is not None:
+                    loss = self.loss(aligned_pred[valid_mask_for_latent].float(), depth_gt_for_loss[valid_mask_for_latent].float()).mean()
+                else:
+                    loss = self.loss(aligned_pred.float(), depth_gt_for_loss.float()).mean()
+                self.train_metrics.update("loss", loss.item())
+                # grad loss
+                # Invalid pixels are zeroed in place in both maps before the
+                # differences are taken; this runs after the pixel loss was computed.
+                depth_gt_for_loss[~valid_mask_for_latent] = 0
+                grad_gt = self.grad(depth_gt_for_loss)
+                aligned_pred[~valid_mask_for_latent] = 0
+                grad_pred = self.grad(aligned_pred)
+                grad_loss = self.grad_loss(grad_gt, grad_pred)
+                self.train_metrics.update(f"grad_loss", grad_loss.item())
+                loss += self.cfg.grad_loss.lamda * grad_loss
+                # Scale so that the accumulated gradients average over the batches.
+                loss = loss / self.gradient_accumulation_steps
+                loss.backward()
+                accumulated_step += 1
+                self.n_batch_in_epoch += 1
+                # Practical batch end
+                # Perform optimization step
+                if accumulated_step >= self.gradient_accumulation_steps:
+                    self.optimizer.step()
+                    self.lr_scheduler.step()
+                    self.optimizer.zero_grad()
+                    accumulated_step = 0
+                    self.effective_iter += 1
+                    progress_bar.update(1)
+                    # Log to tensorboard
+                    accumulated_loss = self.train_metrics.result()["loss"]
+                    logs = {"loss": accumulated_loss}
+                    progress_bar.set_postfix(**logs)
+                    tb_logger.log_dic(
+                        {
+                            f"train/{k}": v
+                            for k, v in self.train_metrics.result().items()
+                        },
+                        global_step=self.effective_iter,
+                    )
+                    tb_logger.writer.add_scalar(
+                        "lr",
+                        self.lr_scheduler.get_last_lr()[0],
+                        global_step=self.effective_iter,
+                    )
+                    tb_logger.writer.add_scalar(
+                        "n_batch_in_epoch",
+                        self.n_batch_in_epoch,
+                        global_step=self.effective_iter,
+                    )
+                    # Start a fresh metric window for the next optimizer step.
+                    self.train_metrics.reset()
+                    # Per-step callback
+                    self._train_step_callback()
+                    # End of training
+                    if self.max_iter > 0 and self.effective_iter >= self.max_iter:
+                        self.save_checkpoint(
+                            ckpt_name=self._get_backup_ckpt_name(),
+                            save_train_state=False,
+                        )
+                        logging.info("Training ended.")
+                        return
+                    # Time's up
+                    elif t_end is not None and datetime.now() >= t_end:
+                        self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+                        logging.info("Time is up, training paused.")
+                        return
+                    torch.cuda.empty_cache()
+                    # <<< Effective batch end <<<
+            # Epoch end
+            # The skip offset only applies to the epoch that was resumed.
+            self.n_batch_in_epoch = 0
+    def encode_depth(self, depth_in):
+        # stack depth into 3-channel
+        stacked = self.stack_depth_images(depth_in)
+        # encode using VAE encoder
+        depth_latent = self.model.encode_rgb(stacked)
+        return depth_latent
+    @staticmethod
+    def stack_depth_images(depth_in):
+        if 4 == len(depth_in.shape):
+            stacked = depth_in.repeat(1, 3, 1, 1)
+        elif 3 == len(depth_in.shape):
+            stacked = depth_in.unsqueeze(1)
+            stacked = depth_in.repeat(1, 3, 1, 1)
+        return stacked
+    def _train_step_callback(self):
+        """Run the periodic hooks after an effective iteration.
+        Depending on ``self.effective_iter`` and the configured periods this
+        saves a backup checkpoint, runs validation (bracketed by two saves of
+        the "latest" checkpoint), saves the resumable "latest" checkpoint and
+        runs visualization. A period that is not positive disables its hook.
+        """
+        # Save backup (with a larger interval, without training states)
+        if self.backup_period > 0 and 0 == self.effective_iter % self.backup_period:
+            self.save_checkpoint(
+                ckpt_name=self._get_backup_ckpt_name(), save_train_state=False
+            )
+        # Set when the validation branch already wrote "latest", so the regular
+        # save below does not write it again in the same iteration.
+        _is_latest_saved = False
+        # Validation
+        if self.val_period > 0 and 0 == self.effective_iter % self.val_period:
+            self.in_evaluation = True  # flag to do evaluation in resume run if validation is not finished
+            # Saved with in_evaluation=True *before* validating ...
+            self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+            _is_latest_saved = True
+            self.validate()
+            self.in_evaluation = False
+            # ... and saved again afterwards with the flag cleared (this save also
+            # stores whatever ``best_metric`` validation left behind).
+            self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+        # Save training checkpoint (can be resumed)
+        if (
+            self.save_period > 0
+            and 0 == self.effective_iter % self.save_period
+            and not _is_latest_saved
+        ):
+            self.save_checkpoint(ckpt_name="latest", save_train_state=True)
+        # Visualization
+        if self.vis_period > 0 and 0 == self.effective_iter % self.vis_period:
+            self.visualize()
+    def validate(self):
+        for i, val_loader in enumerate(self.val_loaders):
+            val_dataset_name = val_loader.dataset.disp_name
+            val_metric_dic = self.validate_single_dataset(
+                data_loader=val_loader, metric_tracker=self.val_metrics
+            )
+            logging.info(
+                f"Iter {self.effective_iter}. Validation metrics on `{val_dataset_name}`: {val_metric_dic}"
+            )
+            tb_logger.log_dic(
+                {f"val/{val_dataset_name}/{k}": v for k, v in val_metric_dic.items()},
+                global_step=self.effective_iter,
+            )
+            # save to file
+            eval_text = eval_dic_to_text(
+                val_metrics=val_metric_dic,
+                dataset_name=val_dataset_name,
+                sample_list_path=val_loader.dataset.filename_ls_path,
+            )
+            _save_to = os.path.join(
+                self.out_dir_eval,
+                f"eval-{val_dataset_name}-iter{self.effective_iter:06d}.txt",
+            )
+            with open(_save_to, "w+") as f:
+                f.write(eval_text)
+            # Update main eval metric
+            if 0 == i:
+                main_eval_metric = val_metric_dic[self.main_val_metric]
+                if (
+                    "minimize" == self.main_val_metric_goal
+                    and main_eval_metric < self.best_metric
+                    or "maximize" == self.main_val_metric_goal
+                    and main_eval_metric > self.best_metric
+                ):
+                    self.best_metric = main_eval_metric
+                    logging.info(
+                        f"Best metric: {self.main_val_metric} = {self.best_metric} at iteration {self.effective_iter}"
+                    )
+                    # Save a checkpoint
+                    self.save_checkpoint(
+                        ckpt_name=self._get_backup_ckpt_name(), save_train_state=False
+                    )
+    def visualize(self):
+        for val_loader in self.vis_loaders:
+            vis_dataset_name = val_loader.dataset.disp_name
+            vis_out_dir = os.path.join(
+                self.out_dir_vis, self._get_backup_ckpt_name(), vis_dataset_name
+            )
+            os.makedirs(vis_out_dir, exist_ok=True)
+            _ = self.validate_single_dataset(
+                data_loader=val_loader,
+                metric_tracker=self.val_metrics,
+                save_to_dir=vis_out_dir,
+            )
+    @torch.no_grad()
+    def validate_single_dataset(
+        self,
+        data_loader: DataLoader,
+        metric_tracker: MetricTracker,
+        save_to_dir: str = None,
+    ):
+        """Predict depth for every sample of ``data_loader`` and accumulate metrics.
+        Each prediction is aligned to the ground truth according to
+        ``self.cfg.eval.alignment``, clipped to the dataset depth range and
+        scored with every function in ``self.metric_funcs``.
+        Args:
+            data_loader: Loader to evaluate; its batch size must be 1.
+            metric_tracker: Tracker that is reset first and then updated with
+                every per-sample metric.
+            save_to_dir: If given, each raw prediction is also written there as
+                a 16-bit PNG.
+        Returns:
+            ``metric_tracker.result()`` after all samples were processed.
+        Raises:
+            RuntimeError: If ``self.cfg.eval.alignment`` is not one of the
+                handled alignment types.
+        """
+        self.model.to(self.device)
+        metric_tracker.reset()
+        # Generate seed sequence for consistent evaluation
+        # NOTE(review): ``val_seed_ls`` is not referenced again in this method.
+        val_init_seed = self.cfg.validation.init_seed
+        val_seed_ls = generate_seed_sequence(val_init_seed, len(data_loader))
+        for i, batch in enumerate(
+            tqdm(data_loader, desc=f"evaluating on {data_loader.dataset.disp_name}"),
+            start=1,
+        ):
+            assert 1 == data_loader.batch_size
+            # Read input image
+            rgb_int = batch["rgb_int"]  # [3, H, W]
+            # GT depth
+            # Kept twice: a numpy copy for the alignment and a device tensor for the metrics
+            depth_raw_ts = batch["depth_raw_linear"].squeeze()
+            depth_raw = depth_raw_ts.numpy()
+            depth_raw_ts = depth_raw_ts.to(self.device)
+            valid_mask_ts = batch["valid_mask_raw"].squeeze()
+            valid_mask = valid_mask_ts.numpy()
+            valid_mask_ts = valid_mask_ts.to(self.device)
+            # Predict depth
+            pipe_out: DepthMasterDepthOutput = self.model(
+                rgb_int,
+                processing_res=self.cfg.validation.processing_res,
+                match_input_res=self.cfg.validation.match_input_res,
+                batch_size=1,  # use batch size 1 to increase reproducibility
+                color_map=None,
+                show_progress_bar=False,
+                resample_method=self.cfg.validation.resample_method,
+            )
+            depth_pred: np.ndarray = pipe_out.depth_np.squeeze()
+            if "least_square" == self.cfg.eval.alignment:
+                # Affine (scale + shift) fit of the prediction to the GT depth
+                depth_pred, scale, shift = align_depth_least_square(
+                    gt_arr=depth_raw,
+                    pred_arr=depth_pred,
+                    valid_mask_arr=valid_mask,
+                    return_scale_shift=True,
+                    max_resolution=self.cfg.eval.align_max_res,
+                )
+            elif  "least_square_disparity" == self.cfg.eval.alignment:
+                # gt_disparity = depth_raw
+                gt_disparity = depth2disparity(depth_raw)
+                gt_non_neg_mask = gt_disparity > 0
+                # LS alignment in disparity space
+                pred_non_neg_mask = depth_pred > 0
+                valid_nonnegative_mask = valid_mask & gt_non_neg_mask & pred_non_neg_mask
+                disparity_pred, scale, shift = align_depth_least_square(
+                    gt_arr=gt_disparity,
+                    pred_arr=depth_pred,
+                    valid_mask_arr=valid_nonnegative_mask,
+                    return_scale_shift=True,
+                )
+                # convert to depth
+                disparity_pred = np.clip(
+                    disparity_pred, a_min=1e-3, a_max=None
+                )  # avoid 0 disparity
+                depth_pred = disparity2depth(disparity_pred)
+                # NOTE(review): this also inverts the GT tensor used by the metrics,
+                # while ``depth_pred`` was just converted back to depth - confirm
+                # that this is intended for the datasets evaluated with this mode.
+                depth_raw_ts = disparity2depth(depth_raw_ts)
+            elif "least_square_sqrt_disp" == self.cfg.eval.alignment:
+                # gt_sqrt_disp = depth_raw
+                gt_sqrt_disp = np.sqrt(depth2disparity(depth_raw))
+                gt_non_neg_mask = gt_sqrt_disp > 0
+                # LS alignment in sqrt space
+                pred_non_neg_mask = depth_pred > 0
+                valid_nonnegative_mask = valid_mask & gt_non_neg_mask & pred_non_neg_mask
+                # NOTE(review): ``valid_nonnegative_mask`` is computed above, but the
+                # call below passes the plain ``valid_mask`` - confirm which is meant.
+                depth_sqrt_disp_pred, scale, shift = align_depth_least_square(
+                    gt_arr=gt_sqrt_disp,
+                    pred_arr=depth_pred,
+                    valid_mask_arr=valid_mask,
+                    return_scale_shift=True,
+                )
+                # convert to depth
+                disparity_pred = depth_sqrt_disp_pred ** 2
+                # NOTE(review): the GT tensor is squared and then inverted below as
+                # if it held sqrt-disparity, yet ``depth_raw`` was treated as depth
+                # when building ``gt_sqrt_disp`` - confirm the expected GT format.
+                depth_raw_ts = torch.pow(depth_raw_ts, 2)
+                # convert to depth
+                disparity_pred = np.clip(
+                    disparity_pred, a_min=1e-3, a_max=None
+                )  # avoid 0 disparity
+                depth_pred = disparity2depth(disparity_pred)
+                depth_raw_ts = disparity2depth(depth_raw_ts)
+            else:
+                raise RuntimeError(f"Unknown alignment type: {self.cfg.eval.alignment}")
+            # Clip to dataset min max
+            depth_pred = np.clip(
+                depth_pred,
+                a_min=data_loader.dataset.min_depth,
+                a_max=data_loader.dataset.max_depth,
+            )
+            # clip to d > 0 for evaluation
+            depth_pred = np.clip(depth_pred, a_min=1e-6, a_max=None)
+            # Evaluate
+            # ``sample_metric`` collects the stringified values but is not used afterwards
+            sample_metric = []
+            depth_pred_ts = torch.from_numpy(depth_pred).to(self.device)
+            for met_func in self.metric_funcs:
+                _metric_name = met_func.__name__
+                _metric = met_func(depth_pred_ts, depth_raw_ts, valid_mask_ts).item()
+                sample_metric.append(_metric.__str__())
+                metric_tracker.update(_metric_name, _metric)
+            # Save as 16-bit uint png
+            if save_to_dir is not None:
+                img_name = batch["rgb_relative_path"][0].replace("/", "_")
+                png_save_path = os.path.join(save_to_dir, f"{img_name}.png")
+                # The raw pipeline output (not the aligned/clipped depth) is saved;
+                # the 65535 scaling presumably assumes values in [0, 1] - TODO confirm.
+                depth_to_save = (pipe_out.depth_np.squeeze() * 65535.0).astype(np.uint16)
+                Image.fromarray(depth_to_save).save(png_save_path, mode="I;16")
+        return metric_tracker.result()
+    def _get_next_seed(self):
+        if 0 == len(self.global_seed_sequence):
+            self.global_seed_sequence = generate_seed_sequence(
+                initial_seed=self.seed,
+                length=self.max_iter * self.gradient_accumulation_steps,
+            )
+            logging.info(
+                f"Global seed sequence is generated, length={len(self.global_seed_sequence)}"
+            )
+        return self.global_seed_sequence.pop()
+    def save_checkpoint(self, ckpt_name, save_train_state):
+        ckpt_dir = os.path.join(self.out_dir_ckpt, ckpt_name)
+        logging.info(f"Saving checkpoint to: {ckpt_dir}")
+        # Backup previous checkpoint
+        temp_ckpt_dir = None
+        if os.path.exists(ckpt_dir) and os.path.isdir(ckpt_dir):
+            temp_ckpt_dir = os.path.join(
+                os.path.dirname(ckpt_dir), f"_old_{os.path.basename(ckpt_dir)}"
+            )
+            if os.path.exists(temp_ckpt_dir):
+                shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
+            os.rename(ckpt_dir, temp_ckpt_dir)
+            logging.debug(f"Old checkpoint is backed up at: {temp_ckpt_dir}")
+        # Save UNet
+        unet_path = os.path.join(ckpt_dir, "unet")
+        self.model.unet.save_pretrained(unet_path, safe_serialization=False)
+        logging.info(f"UNet is saved to: {unet_path}")
+        if save_train_state:
+            state = {
+                "optimizer": self.optimizer.state_dict(),
+                "lr_scheduler": self.lr_scheduler.state_dict(),
+                "config": self.cfg,
+                "effective_iter": self.effective_iter,
+                "epoch": self.epoch,
+                "n_batch_in_epoch": self.n_batch_in_epoch,
+                "best_metric": self.best_metric,
+                "in_evaluation": self.in_evaluation,
+                "global_seed_sequence": self.global_seed_sequence,
+            }
+            train_state_path = os.path.join(ckpt_dir, "trainer.ckpt")
+            torch.save(state, train_state_path)
+            # iteration indicator
+            f = open(os.path.join(ckpt_dir, self._get_backup_ckpt_name()), "w")
+            f.close()
+            logging.info(f"Trainer state is saved to: {train_state_path}")
+        # Remove temp ckpt
+        if temp_ckpt_dir is not None and os.path.exists(temp_ckpt_dir):
+            shutil.rmtree(temp_ckpt_dir, ignore_errors=True)
+            logging.debug("Old checkpoint backup is removed.")
+    def load_checkpoint(
+        self, ckpt_path, load_trainer_state=True, resume_lr_scheduler=True
+    ):
+        logging.info(f"Loading checkpoint from: {ckpt_path}")
+        # Load UNet
+        _model_path = os.path.join(ckpt_path, "unet", "diffusion_pytorch_model.bin")
+        self.model.unet.load_state_dict(
+            torch.load(_model_path, map_location=self.device)
+        )
+        self.model.unet.to(self.device)
+        logging.info(f"UNet parameters are loaded from {_model_path}")
+        # Load training states
+        if load_trainer_state:
+            checkpoint = torch.load(os.path.join(ckpt_path, "trainer.ckpt"))
+            self.effective_iter = checkpoint["effective_iter"]
+            self.epoch = checkpoint["epoch"]
+            self.n_batch_in_epoch = checkpoint["n_batch_in_epoch"]
+            self.in_evaluation = checkpoint["in_evaluation"]
+            self.global_seed_sequence = checkpoint["global_seed_sequence"]
+            self.best_metric = checkpoint["best_metric"]
+            self.optimizer.load_state_dict(checkpoint["optimizer"])
+            logging.info(f"optimizer state is loaded from {ckpt_path}")
+            if resume_lr_scheduler:
+                self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
+                logging.info(f"LR scheduler state is loaded from {ckpt_path}")
+        logging.info(
+            f"Checkpoint loaded from: {ckpt_path}. Resume from iteration {self.effective_iter} (epoch {self.epoch})"
+        )
+        return
+    def _get_backup_ckpt_name(self):
+        return f"iter_{self.effective_iter:06d}"

DepthMaster/src/util/alignment.py ADDED Viewed

	@@ -0,0 +1,180 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import numpy as np
+import torch
+def align_depth_medium_mask(
+    gt: torch.Tensor,
+    valid_mask: torch.Tensor,
+    max_resolution=None,
+):
+    ori_shape = gt.shape[-2:]  # input shape
+    batch_size = gt.shape[0]
+    # print(gt.shape)
+    # Downsample
+    if max_resolution is not None:
+        scale_factor = np.min(max_resolution / np.array(ori_shape[-2:]))
+        if scale_factor < 1:
+            downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
+            gt = downscaler(gt)
+            valid_mask = downscaler(valid_mask).bool()
+    scale_ls = []
+    shift_ls = []
+    for i in range(batch_size):
+        # print('yes')
+        gt_masked = gt[i][valid_mask[i]]
+        shift = torch.median(gt_masked).unsqueeze(0)
+        scale = torch.mean(torch.abs(gt_masked - shift)).unsqueeze(0)
+        # print(scale)
+        scale_ls.append(scale)
+        shift_ls.append(shift)
+        # print(len(scale_ls))
+    scale = torch.concat(scale_ls, dim=0).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+    shift = torch.concat(shift_ls, dim=0).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+    return scale, shift
+def align_depth_least_square(
+    gt_arr: np.ndarray,
+    pred_arr: np.ndarray,
+    valid_mask_arr: np.ndarray,
+    return_scale_shift=True,
+    max_resolution=None,
+):
+    ori_shape = pred_arr.shape  # input shape
+    gt = gt_arr.squeeze()  # [H, W]
+    pred = pred_arr.squeeze()
+    valid_mask = valid_mask_arr.squeeze()
+    # Downsample
+    if max_resolution is not None:
+        scale_factor = np.min(max_resolution / np.array(ori_shape[-2:]))
+        if scale_factor < 1:
+            downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
+            gt = downscaler(torch.as_tensor(gt).unsqueeze(0)).numpy()
+            pred = downscaler(torch.as_tensor(pred).unsqueeze(0)).numpy()
+            valid_mask = (
+                downscaler(torch.as_tensor(valid_mask).unsqueeze(0).float())
+                .bool()
+                .numpy()
+            )
+    assert (
+        gt.shape == pred.shape == valid_mask.shape
+    ), f"{gt.shape}, {pred.shape}, {valid_mask.shape}"
+    gt_masked = gt[valid_mask].reshape((-1, 1))
+    pred_masked = pred[valid_mask].reshape((-1, 1))
+    # numpy solver
+    _ones = np.ones_like(pred_masked)
+    A = np.concatenate([pred_masked, _ones], axis=-1)
+    X = np.linalg.lstsq(A, gt_masked, rcond=None)[0]
+    scale, shift = X
+    aligned_pred = pred_arr * scale + shift
+    # restore dimensions
+    aligned_pred = aligned_pred.reshape(ori_shape)
+    if return_scale_shift:
+        return aligned_pred, scale, shift
+    else:
+        return aligned_pred
+# ******************** disparity space ********************
+def depth2disparity(depth, return_mask=False):
+    if isinstance(depth, torch.Tensor):
+        disparity = torch.zeros_like(depth)
+    elif isinstance(depth, np.ndarray):
+        disparity = np.zeros_like(depth)
+    non_negtive_mask = depth > 0
+    disparity[non_negtive_mask] = 1.0 / depth[non_negtive_mask]
+    if return_mask:
+        return disparity, non_negtive_mask
+    else:
+        return disparity
+def disparity2depth(disparity, **kwargs):
+    """Convert disparity back to depth.
+    Delegates to ``depth2disparity`` with the same arguments, so ``kwargs``
+    are forwarded unchanged.
+    """
+    return depth2disparity(disparity, **kwargs)
+def align_depth_least_square_torch_mask(
+    gt: torch.Tensor,
+    pred: torch.Tensor,
+    valid_mask: torch.Tensor,
+    max_resolution=None,
+):
+    ori_shape = pred.shape[-2:]  # input shape
+    batch_size = gt.shape[0]
+    # gt = gt_arr.squeeze()  # [B, H, W]
+    # pred = pred_arr.squeeze()
+    # valid_mask = valid_mask_arr.squeeze()
+    # Downsample
+    if max_resolution is not None:
+        scale_factor = np.min(max_resolution / np.array(ori_shape[-2:]))
+        if scale_factor < 1:
+            downscaler = torch.nn.Upsample(scale_factor=scale_factor, mode="nearest")
+            gt = downscaler(gt)
+            pred = downscaler(pred)
+            valid_mask = downscaler(valid_mask).bool()
+    assert (
+        gt.shape == pred.shape
+    ), f"{gt.shape}, {pred.shape}"
+    scale_ls = []
+    shift_ls = []
+    for i in range(batch_size):
+        gt_masked = gt[i][valid_mask[i]].view(-1, 1)
+        pred_masked = pred[i][valid_mask[i]].view(-1, 1)
+        # torch solver
+        ones = torch.ones_like(pred_masked)
+        A = torch.cat([pred_masked, ones], dim=-1)
+        X, *_ = torch.linalg.lstsq(A, gt_masked)
+        scale, shift = X[0, :].detach(), X[1, :].detach()
+        scale_ls.append(scale)
+        shift_ls.append(shift)
+    scale = torch.concat(scale_ls, dim=0).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+    shift = torch.concat(shift_ls, dim=0).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+    return scale, shift

DepthMaster/src/util/boundary_metrics.py ADDED Viewed

	@@ -0,0 +1,332 @@

+from typing import List, Tuple
+import numpy as np
+def connected_component(r: np.ndarray, c: np.ndarray) -> List[List[int]]: # type: ignore
+    """Find connected components in the given row and column indices.
+    Args:
+    ----
+        r (np.ndarray): Row indices.
+        c (np.ndarray): Column indices.
+    Yields:
+    ------
+        List[int]: Indices of connected components.
+    """
+    indices = [0]
+    for i in range(1, r.size):
+        if r[i] == r[indices[-1]] and c[i] == c[indices[-1]] + 1:
+            indices.append(i)
+        else:
+            yield indices
+            indices = [i]
+    yield indices
+def nms_horizontal(ratio: np.ndarray, threshold: float) -> np.ndarray:
+    """Apply Non-Maximum Suppression (NMS) horizontally on the given ratio matrix.
+    Args:
+    ----
+        ratio (np.ndarray): Input ratio matrix.
+        threshold (float): Threshold for NMS.
+    Returns:
+    -------
+        np.ndarray: Binary mask after applying NMS.
+    """
+    mask = np.zeros_like(ratio, dtype=bool)
+    r, c = np.nonzero(ratio > threshold)
+    if len(r) == 0:
+        return mask
+    for ids in connected_component(r, c):
+        values = [ratio[r[i], c[i]] for i in ids]
+        mi = np.argmax(values)
+        mask[r[ids[mi]], c[ids[mi]]] = True
+    return mask
+def nms_vertical(ratio: np.ndarray, threshold: float) -> np.ndarray:
+    """Apply Non-Maximum Suppression (NMS) vertically on the given ratio matrix.
+    Args:
+    ----
+        ratio (np.ndarray): Input ratio matrix.
+        threshold (float): Threshold for NMS.
+    Returns:
+    -------
+        np.ndarray: Binary mask after applying NMS.
+    """
+    return np.transpose(nms_horizontal(np.transpose(ratio), threshold))
+def fgbg_depth(
+    d: np.ndarray, t: float
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Find foreground-background relations between neighboring pixels.
+    Args:
+    ----
+        d (np.ndarray): Depth matrix.
+        t (float): Threshold for comparison.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
+        left, top, right, and bottom foreground-background relations.
+    """
+    right_is_big_enough = (d[..., :, 1:] / d[..., :, :-1]) > t
+    left_is_big_enough = (d[..., :, :-1] / d[..., :, 1:]) > t
+    bottom_is_big_enough = (d[..., 1:, :] / d[..., :-1, :]) > t
+    top_is_big_enough = (d[..., :-1, :] / d[..., 1:, :]) > t
+    return (
+        left_is_big_enough,
+        top_is_big_enough,
+        right_is_big_enough,
+        bottom_is_big_enough,
+    )
+def fgbg_depth_thinned(
+    d: np.ndarray, t: float
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Find foreground-background relations between neighboring pixels with Non-Maximum Suppression.
+    Args:
+    ----
+        d (np.ndarray): Depth matrix.
+        t (float): Threshold for NMS.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
+        left, top, right, and bottom foreground-background relations with NMS applied.
+    """
+    right_is_big_enough = nms_horizontal(d[..., :, 1:] / d[..., :, :-1], t)
+    left_is_big_enough = nms_horizontal(d[..., :, :-1] / d[..., :, 1:], t)
+    bottom_is_big_enough = nms_vertical(d[..., 1:, :] / d[..., :-1, :], t)
+    top_is_big_enough = nms_vertical(d[..., :-1, :] / d[..., 1:, :], t)
+    return (
+        left_is_big_enough,
+        top_is_big_enough,
+        right_is_big_enough,
+        bottom_is_big_enough,
+    )
+def fgbg_binary_mask(
+    d: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Find foreground-background relations between neighboring pixels in binary masks.
+    Args:
+    ----
+        d (np.ndarray): Binary depth matrix.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
+        left, top, right, and bottom foreground-background relations in binary masks.
+    """
+    assert d.dtype == bool
+    right_is_big_enough = d[..., :, 1:] & ~d[..., :, :-1]
+    left_is_big_enough = d[..., :, :-1] & ~d[..., :, 1:]
+    bottom_is_big_enough = d[..., 1:, :] & ~d[..., :-1, :]
+    top_is_big_enough = d[..., :-1, :] & ~d[..., 1:, :]
+    return (
+        left_is_big_enough,
+        top_is_big_enough,
+        right_is_big_enough,
+        bottom_is_big_enough,
+    )
+def edge_recall_matting(pr: np.ndarray, gt: np.ndarray, t: float) -> float:
+    """Calculate edge recall for image matting.
+    Args:
+    ----
+        pr (np.ndarray): Predicted depth matrix.
+        gt (np.ndarray): Ground truth binary mask.
+        t (float): Threshold for NMS.
+    Returns:
+    -------
+        float: Edge recall value.
+    """
+    assert gt.dtype == bool
+    ap, bp, cp, dp = fgbg_depth_thinned(pr, t)
+    ag, bg, cg, dg = fgbg_binary_mask(gt)
+    return 0.25 * (
+        np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1)
+        + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1)
+        + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1)
+        + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1)
+    )
+def boundary_f1(
+    pr: np.ndarray,
+    gt: np.ndarray,
+    t: float,
+    return_p: bool = False,
+    return_r: bool = False,
+) -> float:
+    """Calculate Boundary F1 score.
+    Args:
+    ----
+        pr (np.ndarray): Predicted depth matrix.
+        gt (np.ndarray): Ground truth depth matrix.
+        t (float): Threshold for comparison.
+        return_p (bool, optional): If True, return precision. Defaults to False.
+        return_r (bool, optional): If True, return recall. Defaults to False.
+    Returns:
+    -------
+        float: Boundary F1 score, or precision, or recall depending on the flags.
+    """
+    ap, bp, cp, dp = fgbg_depth(pr, t)
+    ag, bg, cg, dg = fgbg_depth(gt, t)
+    r = 0.25 * (
+        np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1)
+        + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1)
+        + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1)
+        + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1)
+    )
+    p = 0.25 * (
+        np.count_nonzero(ap & ag) / max(np.count_nonzero(ap), 1)
+        + np.count_nonzero(bp & bg) / max(np.count_nonzero(bp), 1)
+        + np.count_nonzero(cp & cg) / max(np.count_nonzero(cp), 1)
+        + np.count_nonzero(dp & dg) / max(np.count_nonzero(dp), 1)
+    )
+    if r + p == 0:
+        return 0.0
+    if return_p:
+        return p
+    if return_r:
+        return r
+    return 2 * (r * p) / (r + p)
+def get_thresholds_and_weights(
+    t_min: float, t_max: float, N: int
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Generate thresholds and weights for the given range.
+    Args:
+    ----
+        t_min (float): Minimum threshold.
+        t_max (float): Maximum threshold.
+        N (int): Number of thresholds.
+    Returns:
+    -------
+        Tuple[np.ndarray, np.ndarray]: Array of thresholds and corresponding weights.
+    """
+    thresholds = np.linspace(t_min, t_max, N)
+    weights = thresholds / thresholds.sum()
+    return thresholds, weights
+def invert_depth(depth: np.ndarray, eps: float = 1e-6) -> np.ndarray:
+    """Inverts a depth map with numerical stability.
+    Args:
+    ----
+        depth (np.ndarray): Depth map to be inverted.
+        eps (float): Minimum value to avoid division by zero (default is 1e-6).
+    Returns:
+    -------
+    np.ndarray: Inverted depth map.
+    """
+    inverse_depth = 1.0 / depth.clip(min=eps)
+    return inverse_depth
+def SI_boundary_F1(
+    predicted_depth: np.ndarray,
+    target_depth: np.ndarray,
+    t_min: float = 1.05,
+    t_max: float = 1.25,
+    N: int = 10,
+) -> float:
+    """Calculate Scale-Invariant Boundary F1 Score for depth-based ground-truth.
+    Args:
+    ----
+        predicted_depth (np.ndarray): Predicted depth matrix.
+        target_depth (np.ndarray): Ground truth depth matrix.
+        t_min (float, optional): Minimum threshold. Defaults to 1.05.
+        t_max (float, optional): Maximum threshold. Defaults to 1.25.
+        N (int, optional): Number of thresholds. Defaults to 10.
+    Returns:
+    -------
+        float: Scale-Invariant Boundary F1 Score.
+    """
+    assert predicted_depth.ndim == target_depth.ndim == 2
+    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
+    f1_scores = np.array(
+        [
+            boundary_f1(invert_depth(predicted_depth), invert_depth(target_depth), t)
+            for t in thresholds
+        ]
+    )
+    return np.sum(f1_scores * weights)
+def SI_boundary_Recall(
+    predicted_depth: np.ndarray,
+    target_mask: np.ndarray,
+    t_min: float = 1.05,
+    t_max: float = 1.25,
+    N: int = 10,
+    alpha_threshold: float = 0.1,
+) -> float:
+    """Calculate Scale-Invariant Boundary Recall Score for mask-based ground-truth.
+    Args:
+    ----
+        predicted_depth (np.ndarray): Predicted depth matrix.
+        target_mask (np.ndarray): Ground truth binary mask.
+        t_min (float, optional): Minimum threshold. Defaults to 1.05.
+        t_max (float, optional): Maximum threshold. Defaults to 1.25.
+        N (int, optional): Number of thresholds. Defaults to 10.
+        alpha_threshold (float, optional): Threshold for alpha masking. Defaults to 0.1.
+    Returns:
+    -------
+        float: Scale-Invariant Boundary Recall Score.
+    """
+    assert predicted_depth.ndim == target_mask.ndim == 2
+    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
+    thresholded_target = target_mask > alpha_threshold
+    recall_scores = np.array(
+        [
+            edge_recall_matting(
+                invert_depth(predicted_depth), thresholded_target, t=float(t)
+            )
+            for t in thresholds
+        ]
+    )
+    weighted_recall = np.sum(recall_scores * weights)
+    return weighted_recall

DepthMaster/src/util/build_mlp.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import torch.nn as nn
+def build_mlp_(hidden_size=640, projector_dim=1024, z_dim=768):
+    return nn.Sequential(
+                nn.Linear(hidden_size, projector_dim),
+                nn.SiLU(),
+                nn.Linear(projector_dim, projector_dim),
+                nn.SiLU(),
+                nn.Linear(projector_dim, z_dim),
+            )

DepthMaster/src/util/config_util.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import omegaconf
+from omegaconf import OmegaConf
+def recursive_load_config(config_path: str) -> OmegaConf:
+    conf = OmegaConf.load(config_path)
+    output_conf = OmegaConf.create({})
+    # Load base config. Later configs on the list will overwrite previous
+    base_configs = conf.get("base_config", default_value=None)
+    if base_configs is not None:
+        assert isinstance(base_configs, omegaconf.listconfig.ListConfig)
+        for _path in base_configs:
+            assert (
+                _path != config_path
+            ), "Circulate merging, base_config should not include itself."
+            _base_conf = recursive_load_config(_path)
+            output_conf = OmegaConf.merge(output_conf, _base_conf)
+    # Merge configs and overwrite values
+    output_conf = OmegaConf.merge(output_conf, conf)
+    return output_conf
+def find_value_in_omegaconf(search_key, config):
+    result_list = []
+    if isinstance(config, omegaconf.DictConfig):
+        for key, value in config.items():
+            if key == search_key:
+                result_list.append(value)
+            elif isinstance(value, (omegaconf.DictConfig, omegaconf.ListConfig)):
+                result_list.extend(find_value_in_omegaconf(search_key, value))
+    elif isinstance(config, omegaconf.ListConfig):
+        for item in config:
+            if isinstance(item, (omegaconf.DictConfig, omegaconf.ListConfig)):
+                result_list.extend(find_value_in_omegaconf(search_key, item))
+    return result_list
+if "__main__" == __name__:
+    conf = recursive_load_config("config/train_base.yaml")
+    print(OmegaConf.to_yaml(conf))

DepthMaster/src/util/data_loader.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# Copied from https://github.com/huggingface/accelerate/blob/e2ae254008061b3e53fc1c97f88d65743a857e75/src/accelerate/data_loader.py
+from torch.utils.data import BatchSampler, DataLoader, IterableDataset
+# Keyword arguments of `DataLoader` (as of minimum version 1.4.0) mapped to fallback values.
+# `skip_first_batches` iterates over these keys and uses the mapped value
+# whenever the source dataloader does not expose the attribute.
+_PYTORCH_DATALOADER_KWARGS = {
+    "batch_size": 1,
+    "shuffle": False,
+    "sampler": None,
+    "batch_sampler": None,
+    "num_workers": 0,
+    "collate_fn": None,
+    "pin_memory": False,
+    "drop_last": False,
+    "timeout": 0,
+    "worker_init_fn": None,
+    "multiprocessing_context": None,
+    "generator": None,
+    "prefetch_factor": 2,
+    "persistent_workers": False,
+}
+class SkipBatchSampler(BatchSampler):
+    """
+    A `torch.utils.data.BatchSampler` that skips the first `n` batches of another `torch.utils.data.BatchSampler`.
+    """
+    def __init__(self, batch_sampler, skip_batches=0):
+        self.batch_sampler = batch_sampler
+        self.skip_batches = skip_batches
+    def __iter__(self):
+        for index, samples in enumerate(self.batch_sampler):
+            if index >= self.skip_batches:
+                yield samples
+    @property
+    def total_length(self):
+        return len(self.batch_sampler)
+    def __len__(self):
+        return len(self.batch_sampler) - self.skip_batches
+class SkipDataLoader(DataLoader):
+    """
+    Subclass of a PyTorch `DataLoader` that will skip the first batches.
+    Args:
+        dataset (`torch.utils.data.dataset.Dataset`):
+            The dataset to use to build this datalaoder.
+        skip_batches (`int`, *optional*, defaults to 0):
+            The number of batches to skip at the beginning.
+        kwargs:
+            All other keyword arguments to pass to the regular `DataLoader` initialization.
+    """
+    def __init__(self, dataset, skip_batches=0, **kwargs):
+        super().__init__(dataset, **kwargs)
+        self.skip_batches = skip_batches
+    def __iter__(self):
+        for index, batch in enumerate(super().__iter__()):
+            if index >= self.skip_batches:
+                yield batch
+# Adapted from https://github.com/huggingface/accelerate
+def skip_first_batches(dataloader, num_batches=0):
+    """
+    Creates a `torch.utils.data.DataLoader` that will efficiently skip the first `num_batches`.
+    """
+    dataset = dataloader.dataset
+    sampler_is_batch_sampler = False
+    if isinstance(dataset, IterableDataset):
+        new_batch_sampler = None
+    else:
+        sampler_is_batch_sampler = isinstance(dataloader.sampler, BatchSampler)
+        batch_sampler = (
+            dataloader.sampler if sampler_is_batch_sampler else dataloader.batch_sampler
+        )
+        new_batch_sampler = SkipBatchSampler(batch_sampler, skip_batches=num_batches)
+    # We ignore all of those since they are all dealt with by our new_batch_sampler
+    ignore_kwargs = [
+        "batch_size",
+        "shuffle",
+        "sampler",
+        "batch_sampler",
+        "drop_last",
+    ]
+    kwargs = {
+        k: getattr(dataloader, k, _PYTORCH_DATALOADER_KWARGS[k])
+        for k in _PYTORCH_DATALOADER_KWARGS
+        if k not in ignore_kwargs
+    }
+    # Need to provide batch_size as batch_sampler is None for Iterable dataset
+    if new_batch_sampler is None:
+        kwargs["drop_last"] = dataloader.drop_last
+        kwargs["batch_size"] = dataloader.batch_size
+    if new_batch_sampler is None:
+        # Need to manually skip batches in the dataloader
+        dataloader = SkipDataLoader(dataset, skip_batches=num_batches, **kwargs)
+    else:
+        dataloader = DataLoader(dataset, batch_sampler=new_batch_sampler, **kwargs)
+    return dataloader

DepthMaster/src/util/depth_transform.py ADDED Viewed

	@@ -0,0 +1,124 @@

+# Last modified: 2025-01-14
+#
+# Copyright 2025 Ziyang Song, USTC. All rights reserved.
+#
+# This file has been modified from the original version.
+# Original copyright (c) 2023 Bingxin Ke, ETH Zurich. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# --------------------------------------------------------------------------
+# If you find this code useful, we kindly ask you to cite our paper in your work.
+# Please find bibtex at: https://github.com/indu1ge/DepthMaster#-citation
+# More information about the method can be found at https://indu1ge.github.io/DepthMaster_page
+# --------------------------------------------------------------------------
+import torch
+import logging
+def get_depth_normalizer(cfg_normalizer):
+    if cfg_normalizer is None:
+        def identical(x):
+            return x
+        depth_transform = identical
+    elif "scale_shift_depth" == cfg_normalizer.type:
+        depth_transform = ScaleShiftDepthNormalizer(
+            norm_min=cfg_normalizer.norm_min,
+            norm_max=cfg_normalizer.norm_max,
+            min_max_quantile=cfg_normalizer.min_max_quantile,
+            clip=cfg_normalizer.clip,
+        )
+    else:
+        raise NotImplementedError
+    return depth_transform
+class DepthNormalizerBase:
+    is_absolute = None
+    far_plane_at_max = None
+    def __init__(
+        self,
+        norm_min=-1.0,
+        norm_max=1.0,
+    ) -> None:
+        self.norm_min = norm_min
+        self.norm_max = norm_max
+        raise NotImplementedError
+    def __call__(self, depth, valid_mask=None, clip=None):
+        raise NotImplementedError
+    def denormalize(self, depth_norm, **kwargs):
+        # For metric depth: convert prediction back to metric depth
+        # For relative depth: convert prediction to [0, 1]
+        raise NotImplementedError
+class ScaleShiftDepthNormalizer(DepthNormalizerBase):
+    """
+    Use near and far plane to linearly normalize depth,
+        i.e. d' = d * s + t,
+        where near plane is mapped to `norm_min`, and far plane is mapped to `norm_max`
+    Near and far planes are determined by taking quantile values.
+    """
+    is_absolute = False
+    far_plane_at_max = True
+    def __init__(
+        self, norm_min=-1.0, norm_max=1.0, min_max_quantile=0.02, clip=True
+    ) -> None:
+        self.norm_min = norm_min
+        self.norm_max = norm_max
+        self.norm_range = self.norm_max - self.norm_min
+        self.min_quantile = min_max_quantile
+        self.max_quantile = 1.0 - self.min_quantile
+        self.clip = clip
+    def __call__(self, depth_linear, valid_mask=None, clip=None):
+        clip = clip if clip is not None else self.clip
+        if valid_mask is None:
+            valid_mask = torch.ones_like(depth_linear).bool()
+        valid_mask = valid_mask & (depth_linear > 0)
+        # Take quantiles as min and max
+        _min, _max = torch.quantile(
+            depth_linear[valid_mask],
+            torch.tensor([self.min_quantile, self.max_quantile]),
+        )
+        # scale and shift
+        depth_norm_linear = (depth_linear - _min) / (
+            _max - _min
+        ) * self.norm_range + self.norm_min
+        if clip:
+            depth_norm_linear = torch.clip(
+                depth_norm_linear, self.norm_min, self.norm_max
+            )
+        return depth_norm_linear
+    def scale_back(self, depth_norm):
+        # scale to [0, 1]
+        depth_linear = (depth_norm - self.norm_min) / self.norm_range
+        return depth_linear
+    def denormalize(self, depth_norm, **kwargs):
+        logging.warning(f"{self.__class__} is not revertible without GT")
+        return self.scale_back(depth_norm=depth_norm)