Diffusers
Safetensors
zeyuren2002's picture
Add files using upload-large-folder tool
f2c12ee verified
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Model identifier used when the caller does not name one explicitly.
DEFAULT_MODEL = "depth-anything/DA3NESTED-GIANT-LARGE-1.1"

# Default output locations inside the local workspace.
DEFAULT_GALLERY_DIR = "workspace/gallery"
DEFAULT_EXPORT_DIR = "workspace/gallery/scene"
DEFAULT_GRADIO_DIR = "workspace/gradio"

# Threshold consulted during reference-view selection.
# NOTE(review): presumably a view-count cutoff -- confirm against the caller.
THRESH_FOR_REF_SELECTION = 3
# =============================================================================
# Benchmark Evaluation Constants
# =============================================================================

# Workspace directory that evaluation uses by default.
DEFAULT_EVAL_WORKSPACE = "workspace/evaluation"

# Reference-view selection strategy applied during evaluation.
# "first" is the default because it keeps evaluation results consistent and
# reproducible; the other options are "saddle_balanced", "auto" and "mid".
EVAL_REF_VIEW_STRATEGY = "first"
# -----------------------------------------------------------------------------
# DTU Dataset Configuration
#   Reference: https://roboimagedata.compute.dtu.dk/
#   Note: DepthAnything3 was never trained on any images from DTU.
# -----------------------------------------------------------------------------

# Location of the DTU evaluation data (MVSNet format).
# Download from: https://drive.google.com/file/d/1rX0EXlUL4prRxrRu2DgLJv2j7-tpUD4D/view
DTU_EVAL_DATA_ROOT = "workspace/benchmark_dataset/dtu"

# DTU evaluation scenes, each named "scan<N>", listed in ascending scan order.
DTU_SCENES = [
    f"scan{scan_id}"
    for scan_id in (
        1, 4, 9, 10, 11, 12, 13, 15, 23, 24, 29,
        32, 33, 34, 48, 49, 62, 75, 77, 110, 114, 118,
    )
]

# --- Point cloud fusion hyperparameters ---
# Distance threshold for geometric consistency (mm).
DTU_DIST_THRESH = 0.2
# Minimum number of consistent views required for a point.
DTU_NUM_CONSIST = 4
# Maximum number of points in the fused point cloud.
DTU_MAX_POINTS = 4_000_000

# --- 3D reconstruction evaluation hyperparameters ---
# Downsample density for evaluation (mm).
DTU_DOWN_DENSE = 0.2
# Patch size for boundary handling.
DTU_PATCH_SIZE = 60
# Outlier threshold for accuracy/completeness (mm).
DTU_MAX_DIST = 20
# -----------------------------------------------------------------------------
# DTU-64 Dataset Configuration (Pose Evaluation Only)
#   A subset of DTU with 64 images per scene.
#   Note: ONLY for pose evaluation, not for 3D reconstruction.
# -----------------------------------------------------------------------------

# Locations of the DTU-64 evaluation data and its camera files.
DTU64_EVAL_DATA_ROOT = "workspace/benchmark_dataset/dtu64"
DTU64_CAMERA_ROOT = "workspace/benchmark_dataset/dtu64/Cameras"

# The 13 DTU-64 evaluation scenes (original ordering preserved).
DTU64_SCENES = [
    "scan105", "scan114", "scan118", "scan122",
    "scan24", "scan37", "scan40", "scan55",
    "scan63", "scan65", "scan69", "scan83",
    "scan97",
]
# -----------------------------------------------------------------------------
# ETH3D Dataset Configuration
#   Reference: https://www.eth3d.net/
#   High-resolution multi-view stereo benchmark with laser-scanned ground truth.
#   Note: DepthAnything3 was never trained on any images from ETH3D.
# -----------------------------------------------------------------------------

# Location of the ETH3D evaluation data.
ETH3D_EVAL_DATA_ROOT = "workspace/benchmark_dataset/eth3d"

# ETH3D evaluation scenes (indoor and outdoor).
# "terrace" and "meadow" are intentionally left out because of known issues.
ETH3D_SCENES = [
    "courtyard", "electro", "kicker", "pipes", "relief",
    "delivery_area", "facade",
    "office", "playground", "relief_2", "terrains",
]

# Known problematic views to filter out, per scene. Each entry expands a run
# of frame numbers into "<frame>.JPG" keys:
#   delivery_area: 711-714        electro: 9289-9293 and 9298
#   playground:    587-592        relief:  427-438
#   relief_2:      458-468
ETH3D_FILTER_KEYS = {
    scene: [f"{frame}.JPG" for frame in frames]
    for scene, frames in (
        ("delivery_area", range(711, 715)),
        ("electro", (*range(9289, 9294), 9298)),
        ("playground", range(587, 593)),
        ("relief", range(427, 439)),
        ("relief_2", range(458, 469)),
    )
}

# --- TSDF fusion hyperparameters (scaled for outdoor scenes) ---
# Voxel size for TSDF (meters).
ETH3D_VOXEL_LENGTH = 4.0 / 512.0 * 5
# SDF truncation distance (meters).
ETH3D_SDF_TRUNC = 0.04 * 5
# Maximum depth for integration; large enough to mean "no truncation".
ETH3D_MAX_DEPTH = 100_000.0

# --- Point cloud sampling ---
# Number of points to sample from the mesh.
ETH3D_SAMPLING_NUMBER = 1_000_000

# --- 3D reconstruction evaluation hyperparameters ---
# Distance threshold for precision/recall (meters).
ETH3D_EVAL_THRESHOLD = 0.05 * 5
# Voxel size for evaluation downsampling (meters).
ETH3D_DOWN_SAMPLE = 4.0 / 512.0 * 5
# ==============================================================================
# 7Scenes Dataset Configuration
# ==============================================================================
# Reference: https://www.microsoft.com/en-us/research/project/rgb-d-dataset-7-scenes/
# Note: Indoor RGB-D dataset with ground truth poses and meshes.

# Location of the 7Scenes evaluation data.
SEVENSCENES_EVAL_DATA_ROOT = "workspace/benchmark_dataset/7scenes"

# 7Scenes evaluation scenes.
SEVENSCENES_SCENES = [
    "chess", "fire", "heads", "office", "pumpkin", "redkitchen", "stairs",
]

# Fixed camera intrinsics; every 7Scenes image shares the same values.
SEVENSCENES_FX, SEVENSCENES_FY = 585.0, 585.0
SEVENSCENES_CX, SEVENSCENES_CY = 320.0, 240.0

# --- TSDF fusion hyperparameters (indoor scenes, smaller voxels) ---
# Voxel size for TSDF (meters).
SEVENSCENES_VOXEL_LENGTH = 4.0 / 512.0
# SDF truncation distance (meters).
SEVENSCENES_SDF_TRUNC = 0.04
# Maximum depth for integration; large enough to mean "no truncation".
SEVENSCENES_MAX_DEPTH = 1_000_000.0

# --- Point cloud sampling ---
# Number of points to sample from the mesh.
SEVENSCENES_SAMPLING_NUMBER = 1_000_000

# --- 3D reconstruction evaluation hyperparameters ---
# Distance threshold for precision/recall (meters).
SEVENSCENES_EVAL_THRESHOLD = 0.05
# Voxel size for evaluation downsampling (meters).
SEVENSCENES_DOWN_SAMPLE = 4.0 / 512.0
# ==============================================================================
# ScanNet++ Dataset Configuration
# ==============================================================================
# Reference: https://kaldir.vc.in.tum.de/scannetpp/
# Note: High-quality indoor RGB-D dataset with iPhone and DSLR images.

# Location of the ScanNet++ evaluation data.
SCANNETPP_EVAL_DATA_ROOT = "workspace/benchmark_dataset/scannetpp"

# ScanNet++ evaluation scenes (original ordering preserved).
SCANNETPP_SCENES = [
    "09c1414f1b", "1ada7a0617", "40aec5fffa", "3e8bba0176",
    "acd95847c5", "578511c8a9", "5f99900f09", "c4c04e6d6c",
    "f3d64c30f8", "7bc286c1b6", "c5439f4607", "286b55a2bf",
    "fb5a96b1a2", "7831862f02", "38d58a7a31", "bde1e479ad",
    "9071e139d9", "21d970d8de", "bcd2436daf", "cc5237fd77",
]

# Input resolution (height, width) after undistortion and resize.
SCANNETPP_INPUT_H, SCANNETPP_INPUT_W = 768, 1024

# --- TSDF fusion hyperparameters (indoor scenes) ---
# Voxel size for TSDF (meters).
SCANNETPP_VOXEL_LENGTH = 0.02
# SDF truncation distance (meters).
SCANNETPP_SDF_TRUNC = 0.15
# Maximum depth for integration (meters).
SCANNETPP_MAX_DEPTH = 5.0

# --- Point cloud sampling ---
# Number of points to sample from the mesh.
SCANNETPP_SAMPLING_NUMBER = 1_000_000

# --- 3D reconstruction evaluation hyperparameters ---
# Distance threshold for precision/recall (meters).
SCANNETPP_EVAL_THRESHOLD = 0.05
# Voxel size for evaluation downsampling (meters).
SCANNETPP_DOWN_SAMPLE = 0.02
# ==============================================================================
# HiRoom Dataset Configuration
# ==============================================================================
# Note: Indoor RGB-D dataset.

# Locations of the HiRoom evaluation data, the ground-truth root
# (a "fused_pcd" directory), and the validation scene list file.
HIROOM_EVAL_DATA_ROOT = "workspace/benchmark_dataset/hiroom/data"
HIROOM_GT_ROOT_PATH = "workspace/benchmark_dataset/hiroom/fused_pcd"
HIROOM_SCENE_LIST_PATH = "workspace/benchmark_dataset/hiroom/selected_scene_list_val.txt"

# --- TSDF fusion hyperparameters (indoor scenes) ---
# Voxel size for TSDF (meters).
HIROOM_VOXEL_LENGTH = 4.0 / 512.0
# SDF truncation distance (meters).
HIROOM_SDF_TRUNC = 0.04
# Maximum depth for integration; large enough to mean "no truncation".
HIROOM_MAX_DEPTH = 10_000.0

# --- Point cloud sampling ---
# Number of points to sample from the mesh.
HIROOM_SAMPLING_NUMBER = 1_000_000

# --- 3D reconstruction evaluation hyperparameters ---
# Distance threshold for precision/recall (meters).
HIROOM_EVAL_THRESHOLD = 0.05
# Voxel size for evaluation downsampling (meters).
HIROOM_DOWN_SAMPLE = 4.0 / 512.0