Instructions to use zeyuren2002/EvalMDE with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use zeyuren2002/EvalMDE with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("zeyuren2002/EvalMDE", dtype=torch.bfloat16, device_map="cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
- Notebooks
- Google Colab
- Kaggle
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
# General Defaults
# =============================================================================

# Default model identifier.
# NOTE(review): looks like a Hugging Face Hub repo id ("org/name") - confirm.
DEFAULT_MODEL = "depth-anything/DA3NESTED-GIANT-LARGE-1.1"

# Default output locations, given as relative paths.
# NOTE(review): presumably resolved against the current working directory -
# confirm against the code that consumes them.
DEFAULT_EXPORT_DIR = "workspace/gallery/scene"
DEFAULT_GALLERY_DIR = "workspace/gallery"
DEFAULT_GRADIO_DIR = "workspace/gradio"

# Threshold used for reference-view selection.
# NOTE(review): presumably a minimum view count below which selection is
# skipped - verify against the caller.
THRESH_FOR_REF_SELECTION = 3
# =============================================================================
# Benchmark Evaluation Constants
# =============================================================================

# Default evaluation workspace directory (relative path).
DEFAULT_EVAL_WORKSPACE = "workspace/evaluation"

# Default reference view selection strategy for evaluation.
# Use "first" for consistent and reproducible evaluation results.
# Other options: "saddle_balanced", "auto", "mid"
EVAL_REF_VIEW_STRATEGY = "first"
# -----------------------------------------------------------------------------
# DTU Dataset Configuration
# Reference: https://roboimagedata.compute.dtu.dk/
# Note: DepthAnything3 was never trained on any images from DTU.
# -----------------------------------------------------------------------------

# Root directory for DTU evaluation data (MVSNet format)
# Download from: https://drive.google.com/file/d/1rX0EXlUL4prRxrRu2DgLJv2j7-tpUD4D/view
DTU_EVAL_DATA_ROOT = "workspace/benchmark_dataset/dtu"

# List of DTU evaluation scenes (22 scan identifiers)
DTU_SCENES = [
    "scan1",
    "scan4",
    "scan9",
    "scan10",
    "scan11",
    "scan12",
    "scan13",
    "scan15",
    "scan23",
    "scan24",
    "scan29",
    "scan32",
    "scan33",
    "scan34",
    "scan48",
    "scan49",
    "scan62",
    "scan75",
    "scan77",
    "scan110",
    "scan114",
    "scan118",
]

# Point cloud fusion hyperparameters
DTU_DIST_THRESH = 0.2  # Distance threshold for geometric consistency (mm)
DTU_NUM_CONSIST = 4  # Minimum number of consistent views for a point
DTU_MAX_POINTS = 4_000_000  # Maximum points in fused point cloud

# 3D reconstruction evaluation hyperparameters
DTU_DOWN_DENSE = 0.2  # Downsample density for evaluation (mm)
DTU_PATCH_SIZE = 60  # Patch size for boundary handling
DTU_MAX_DIST = 20  # Outlier threshold for accuracy/completeness (mm)
# -----------------------------------------------------------------------------
# DTU-64 Dataset Configuration (Pose Evaluation Only)
# This is a subset of DTU with 64 images per scene for pose evaluation.
# Note: This dataset is ONLY for pose evaluation, not 3D reconstruction.
# -----------------------------------------------------------------------------

# Root directory for DTU-64 evaluation data
DTU64_EVAL_DATA_ROOT = "workspace/benchmark_dataset/dtu64"
# Camera directory; a "Cameras" sub-directory of the DTU-64 data root
DTU64_CAMERA_ROOT = "workspace/benchmark_dataset/dtu64/Cameras"

# List of DTU-64 evaluation scenes (13 scenes)
DTU64_SCENES = [
    "scan105",
    "scan114",
    "scan118",
    "scan122",
    "scan24",
    "scan37",
    "scan40",
    "scan55",
    "scan63",
    "scan65",
    "scan69",
    "scan83",
    "scan97",
]
# -----------------------------------------------------------------------------
# ETH3D Dataset Configuration
# Reference: https://www.eth3d.net/
# High-resolution multi-view stereo benchmark with laser-scanned ground truth.
# Note: DepthAnything3 was never trained on any images from ETH3D.
# -----------------------------------------------------------------------------

# Root directory for ETH3D evaluation data
ETH3D_EVAL_DATA_ROOT = "workspace/benchmark_dataset/eth3d"

# List of ETH3D evaluation scenes (indoor and outdoor)
ETH3D_SCENES = [
    "courtyard",
    "electro",
    "kicker",
    "pipes",
    "relief",
    # "terrace",  # Excluded: known issues
    "delivery_area",
    "facade",
    # "meadow",  # Excluded: known issues
    "office",
    "playground",
    "relief_2",
    "terrains",
]

# Images to filter out (known problematic views per scene).
# Maps a scene name to a list of image-name keys; scenes without an entry
# have nothing filtered.
# NOTE(review): whether keys are matched exactly or as substrings of the image
# path is decided by the consumer - confirm there.
ETH3D_FILTER_KEYS = {
    "delivery_area": ["711.JPG", "712.JPG", "713.JPG", "714.JPG"],
    "electro": ["9289.JPG", "9290.JPG", "9291.JPG", "9292.JPG", "9293.JPG", "9298.JPG"],
    "playground": ["587.JPG", "588.JPG", "589.JPG", "590.JPG", "591.JPG", "592.JPG"],
    "relief": [
        "427.JPG", "428.JPG", "429.JPG", "430.JPG", "431.JPG", "432.JPG",
        "433.JPG", "434.JPG", "435.JPG", "436.JPG", "437.JPG", "438.JPG",
    ],
    "relief_2": [
        "458.JPG", "459.JPG", "460.JPG", "461.JPG", "462.JPG", "463.JPG",
        "464.JPG", "465.JPG", "466.JPG", "467.JPG", "468.JPG",
    ],
}

# TSDF fusion hyperparameters (scaled for outdoor scenes).
# The trailing "* 5" on the values below is that outdoor scale factor.
ETH3D_VOXEL_LENGTH = 4.0 / 512.0 * 5  # Voxel size for TSDF (meters)
ETH3D_SDF_TRUNC = 0.04 * 5  # SDF truncation distance (meters)
ETH3D_MAX_DEPTH = 100000.0  # Maximum depth for integration (effectively no truncation)

# Point cloud sampling
ETH3D_SAMPLING_NUMBER = 1_000_000  # Number of points to sample from mesh

# 3D reconstruction evaluation hyperparameters
ETH3D_EVAL_THRESHOLD = 0.05 * 5  # Distance threshold for precision/recall (meters)
ETH3D_DOWN_SAMPLE = 4.0 / 512.0 * 5  # Voxel size for evaluation downsampling (meters)
# ==============================================================================
# 7Scenes Dataset Configuration
# ==============================================================================
# Reference: https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/
# Note: Indoor RGB-D dataset with ground truth poses and meshes.

# Root directory for 7Scenes evaluation data
SEVENSCENES_EVAL_DATA_ROOT = "workspace/benchmark_dataset/7scenes"

# List of 7Scenes evaluation scenes
SEVENSCENES_SCENES = [
    "chess",
    "fire",
    "heads",
    "office",
    "pumpkin",
    "redkitchen",
    "stairs",
]

# Fixed camera intrinsics for 7Scenes (all images share same intrinsics).
# NOTE(review): presumably in pixels, with (CX, CY) the centre of a 640x480
# image - confirm against the dataset loader.
SEVENSCENES_FX = 585.0
SEVENSCENES_FY = 585.0
SEVENSCENES_CX = 320.0
SEVENSCENES_CY = 240.0

# TSDF fusion hyperparameters (indoor scenes, smaller voxels)
SEVENSCENES_VOXEL_LENGTH = 4.0 / 512.0  # Voxel size for TSDF (meters)
SEVENSCENES_SDF_TRUNC = 0.04  # SDF truncation distance (meters)
SEVENSCENES_MAX_DEPTH = 1000000.0  # Maximum depth for integration (no truncation)

# Point cloud sampling
SEVENSCENES_SAMPLING_NUMBER = 1_000_000  # Number of points to sample from mesh

# 3D reconstruction evaluation hyperparameters
SEVENSCENES_EVAL_THRESHOLD = 0.05  # Distance threshold for precision/recall (meters)
SEVENSCENES_DOWN_SAMPLE = 4.0 / 512.0  # Voxel size for evaluation downsampling (meters)
# ==============================================================================
# ScanNet++ Dataset Configuration
# ==============================================================================
# Reference: https://kaldir.vc.in.tum.de/scannetpp/
# Note: High-quality indoor RGB-D dataset with iPhone and DSLR images.

# Root directory for ScanNet++ evaluation data
SCANNETPP_EVAL_DATA_ROOT = "workspace/benchmark_dataset/scannetpp"

# List of ScanNet++ evaluation scenes (20 scene ids, 10 characters each)
SCANNETPP_SCENES = [
    "09c1414f1b",
    "1ada7a0617",
    "40aec5fffa",
    "3e8bba0176",
    "acd95847c5",
    "578511c8a9",
    "5f99900f09",
    "c4c04e6d6c",
    "f3d64c30f8",
    "7bc286c1b6",
    "c5439f4607",
    "286b55a2bf",
    "fb5a96b1a2",
    "7831862f02",
    "38d58a7a31",
    "bde1e479ad",
    "9071e139d9",
    "21d970d8de",
    "bcd2436daf",
    "cc5237fd77",
]

# Input resolution for ScanNet++ (after undistortion and resize)
SCANNETPP_INPUT_H = 768
SCANNETPP_INPUT_W = 1024

# TSDF fusion hyperparameters (indoor scenes)
SCANNETPP_VOXEL_LENGTH = 0.02  # Voxel size for TSDF (meters)
SCANNETPP_SDF_TRUNC = 0.15  # SDF truncation distance (meters)
SCANNETPP_MAX_DEPTH = 5.0  # Maximum depth for integration (meters)

# Point cloud sampling
SCANNETPP_SAMPLING_NUMBER = 1_000_000  # Number of points to sample from mesh

# 3D reconstruction evaluation hyperparameters
SCANNETPP_EVAL_THRESHOLD = 0.05  # Distance threshold for precision/recall (meters)
SCANNETPP_DOWN_SAMPLE = 0.02  # Voxel size for evaluation downsampling (meters)
# ==============================================================================
# HiRoom Dataset Configuration
# ==============================================================================
# Note: Indoor RGB-D dataset.

# Root directory for HiRoom evaluation data
HIROOM_EVAL_DATA_ROOT = "workspace/benchmark_dataset/hiroom/data"
# Ground-truth root.
# NOTE(review): the "fused_pcd" path suggests fused point clouds - confirm.
HIROOM_GT_ROOT_PATH = "workspace/benchmark_dataset/hiroom/fused_pcd"
# Text file with the scene list; unlike the other datasets, the scenes are not
# hard-coded in this module.
HIROOM_SCENE_LIST_PATH = "workspace/benchmark_dataset/hiroom/selected_scene_list_val.txt"

# TSDF fusion hyperparameters (indoor scenes)
HIROOM_VOXEL_LENGTH = 4.0 / 512.0  # Voxel size for TSDF (meters)
HIROOM_SDF_TRUNC = 0.04  # SDF truncation distance (meters)
HIROOM_MAX_DEPTH = 10000.0  # Maximum depth for integration (no truncation)

# Point cloud sampling
HIROOM_SAMPLING_NUMBER = 1_000_000  # Number of points to sample from mesh

# 3D reconstruction evaluation hyperparameters
HIROOM_EVAL_THRESHOLD = 0.05  # Distance threshold for precision/recall (meters)
HIROOM_DOWN_SAMPLE = 4.0 / 512.0  # Voxel size for evaluation downsampling (meters)